In [1]:
from collections import Counter
import numpy as np
import os
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix,ConfusionMatrixDisplay
from tensorflow.keras.layers import Input, Dense,Concatenate, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, MaxPooling2D,Dropout, Add,Embedding
from keras.models import Model, load_model
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [2]:
MAX_URL_LEN_CHAR = 70
MAX_URL_LEN_WORD = 70
MAX_WORD_LEN = 15
k = 16
num_filters = 256
batch_size = 32
VAL_FREQ = 1

In [3]:
# conn = sqlite3.connect('casino.db')

# df = pd.read_sql('select * from web_pages',conn)
# df.drop(columns=["id","url","html","screenshot"],inplace=True)



# Load dmoz dataset

In [4]:
df = pd.read_csv('./datasets/dmoz.csv')
print(len(df))
#drop Nans
df = df.dropna()
print(len(df))

1562977
1562974


In [5]:
filtered_df = df[df['url'].apply(lambda x: len(x) > 70)]


In [6]:
filtered_df

Unnamed: 0,url,category
143,http://www.hotsexsites.org/free-anime-hentai-m...,Adult
146,http://www.hotsexsites.org/free-anime-hentai-m...,Adult
149,http://www.hotsexsites.org/free-anime-hentai-m...,Adult
151,http://www.hotsexsites.org/free-anime-hentai-m...,Adult
152,http://www.hotsexsites.org/free-anime-hentai-m...,Adult
...,...,...
1561981,http://www.goviks.com/sportselect.dbml?temp_si...,Sports
1561984,http://www.cmuchippewas.com/sportselect.dbml?d...,Sports
1562057,http://www.negaunee.k12.mi.us/hs_web/athletics...,Sports
1562649,http://sports.dir.groups.yahoo.com/dir/recreat...,Sports


In [7]:
# shuffle the data
df = df.sample(frac=1, random_state=44).reset_index(drop=True)
half_df = int((len(df)*0.5))
df = df[:half_df]
print(len(df))

781487


In [8]:
classes = sorted(list(set(df.category.values)))
num_classes = len(classes)

In [9]:
print(num_classes)

15


In [10]:
class_to_int = { word:i for i,word in enumerate(classes)}
int_to_class = {i:word for i,word in enumerate(classes)}

encode = lambda class_: class_to_int[class_]

df['category'] = df['category'].apply(encode)

In [11]:
train_split = int(0.8 * len(df))
x_train = df.url.values[:train_split]
y_train = df.category.values[:train_split]

x_test = df.url.values[train_split: ]
y_test =  df.category.values[train_split:]

print("for training: " + str(len(x_train))+ " for testing: " + str(len(x_test)))
print(x_train[0],y_train[0])

for training: 625189 for testing: 156298
http://www.newadvent.org/cathen/11535a.htm 13


In [12]:
filtered_df = df[df['url'].apply(lambda x: len(x) > 70)]
filtered_df


Unnamed: 0,url,category
27,http://www.preplogic.com/products/exams/practi...,3
49,http://rogerebert.suntimes.com/apps/pbcs.dll/a...,1
53,http://www.women.com/entertain/celeb/articles/...,1
74,http://www.admin.cam.ac.uk/univ/gsprospectus/c...,10
86,http://www.uwbadgers.com/sport_news/mxc/headli...,14
...,...,...
781380,http://web.clas.ufl.edu/users/rhatch/pages/02-...,13
781382,http://www.free-porn-pics-free-porn-pics-free-...,0
781432,http://open-site.org/health/conditions_and_dis...,5
781462,ftp://ftp.apple.com/developer/tool_chest/local...,3


# Tokenizers and functions

## Character level tokenizer

In [13]:
# Character tokenization
char_tokenizer = Tokenizer(char_level=True, oov_token='<OOV>')
char_tokenizer.fit_on_texts(x_train)


In [14]:
def text_to_word_sequence(
    input_text,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
):
    
    if lower:
        input_text = input_text.lower()

    translate_dict = {c: split for c in filters}
    translate_map = str.maketrans(translate_dict)
    input_text = input_text.translate(translate_map)

    seq = input_text.split(split)
    return [i for i in seq if i]

### Word level tokenizer

#### Calculate number of words which appear more than once

In [15]:
def frequency_calculation():
    word_sequences = [text_to_word_sequence(sentence) for sentence in x_train]
    all_words_concat = [item for sublist in word_sequences for item in sublist]
    freq_dict_words = Counter(all_words_concat)
    
    low_freq_words_count = 0
    for key in freq_dict_words:
        if freq_dict_words[key] == 1:
            low_freq_words_count += 1
    
    high_freq_word_count = int(len(freq_dict_words) * (1-(low_freq_words_count/len(freq_dict_words))))
    print(high_freq_word_count)
    return high_freq_word_count

#### Url into sequence of word ids

In [16]:
high_freq_word_count = frequency_calculation()
word_tokenizer = Tokenizer(oov_token='<OOV>',num_words=high_freq_word_count)
word_tokenizer.fit_on_texts(x_train)


86766


## define DataGenerator for preprocessing(tokenizing and padding) inputs on the fly

In [17]:
class DataGenerator(Sequence):
    def __init__(self, urls, labels, batch_size, char_tokenizer, word_tokenizer):
        self.urls = urls
        self.labels = labels
        self.batch_size = batch_size
        self.char_tokenizer = char_tokenizer
        self.word_tokenizer = word_tokenizer
        self.indexes = np.arange(len(urls))
    
    def __len__(self):
        return int(np.ceil(len(self.urls) / self.batch_size))
    
    def __getitem__(self, index):
        batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_urls = [self.urls[i] for i in batch_indexes]
        batch_labels = [self.labels[i] for i in batch_indexes]
        
        char_input_data = self._preprocess_char_input(batch_urls)
        word_input_data = self._preprocess_word_input(batch_urls)
        word_char_input_data = self._preprocess_word_char_input(batch_urls)
        
        return [char_input_data, word_input_data, word_char_input_data], np.array(batch_labels)
    
    def _preprocess_char_input(self, urls):
        char_sequences = self.char_tokenizer.texts_to_sequences(urls)
        return pad_sequences(char_sequences, maxlen=MAX_URL_LEN_CHAR, padding='post',truncating='post')
    
    def _preprocess_word_input(self, urls):
        word_sequences = self.word_tokenizer.texts_to_sequences(urls)
        return pad_sequences(word_sequences, maxlen=MAX_URL_LEN_WORD, padding='post',truncating='post')
    
    def _preprocess_word_char_input(self, urls):
        sentences_splitted = [text_to_word_sequence(sentence) for sentence in urls]
    
        for i in range(len(sentences_splitted)):
            while len(sentences_splitted[i])<MAX_URL_LEN_CHAR:
                sentences_splitted[i].append('<OOV>')
        
        words_splitted = [[word.split() for word in sentence] for sentence in sentences_splitted]

        word_char_sequences = []
        
        for sentence in words_splitted:
            sentence_tokenized = []
            for i in range(len(sentence)):
                word = self.char_tokenizer.texts_to_sequences(sentence[i])
                word_char_padded = pad_sequences(word,maxlen=MAX_WORD_LEN,padding='post',truncating='post')
                sentence_tokenized.append(word_char_padded)
            word_char_sequences.append(sentence_tokenized)
      
        word_char_sequences = np.array(word_char_sequences)
        word_char_sequences = word_char_sequences.reshape(word_char_sequences.shape[0],word_char_sequences.shape[1],word_char_sequences.shape[-1])
        
        return word_char_sequences


    def on_epoch_end(self):
        np.random.shuffle(self.indexes)


In [18]:
training_generator = DataGenerator(x_train, y_train, batch_size, char_tokenizer, word_tokenizer)
test_generator = DataGenerator(x_test, y_test, batch_size, char_tokenizer, word_tokenizer)

In [19]:
item = training_generator.__getitem__(27)

In [20]:
len(item[0][0][0])

70

# Character level CNN

### Character embedding

In [21]:
char_input = Input((MAX_URL_LEN_CHAR,))

# Embedding layer
char_embedding = Embedding(input_dim=len(char_tokenizer.word_index) + 1, output_dim=k, input_length=MAX_URL_LEN_CHAR)(char_input)
char_embedding = tf.expand_dims(char_embedding, -1)  # Add channel dimension

# CNN Block

In [22]:
# Convolution and pooling for character-level
# h = 3
conv_3_char = Conv2D(num_filters, (3, k), activation='relu')(char_embedding)
conv_3_char = MaxPooling2D((2, 1), strides=(2, 1))(conv_3_char)
# h = 4
conv_4_char = Conv2D(num_filters, (4, k), activation='relu')(char_embedding)
conv_4_char = MaxPooling2D((2, 1), strides=(2, 1))(conv_4_char)
# h = 5
conv_5_char = Conv2D(num_filters, (5, k), activation='relu')(char_embedding)
conv_5_char = MaxPooling2D((2, 1), strides=(2, 1))(conv_5_char)
# h = 6
conv_6_char = Conv2D(num_filters, (6, k), activation='relu')(char_embedding)
conv_6_char = MaxPooling2D((2, 1), strides=(2, 1))(conv_6_char)

### Fully connected layer

In [23]:
#concatenate all convolutional layer outputs
concatenated = Concatenate(axis=1)([conv_3_char,conv_4_char,conv_5_char,conv_6_char])
flattened = Flatten()(concatenated)

# feed concatenated conv layers to fully conected layer
dense_char = Dense(512,activation='relu')(flattened)
dropout = Dropout(0.5)(dense_char)

# Word-level Block

##### word embedding

In [24]:
word_input = Input(shape=(MAX_URL_LEN_WORD,))

# Embedding layer
word_embedding = Embedding(input_dim=len(word_tokenizer.word_index) + 1, output_dim=k, input_length=MAX_URL_LEN_WORD)(word_input)

word_embedding.shape

TensorShape([None, 70, 16])

##### char embedding 2

In [25]:
word_char_input = Input(shape=(MAX_URL_LEN_CHAR,MAX_WORD_LEN))

# Embedding layer
word_char_embedding = Embedding(input_dim=len(char_tokenizer.word_index) + 1, output_dim=k, input_length=MAX_URL_LEN_CHAR)(word_char_input)
word_char_embedding.shape

TensorShape([None, 70, 15, 16])

##### Sum over characters

In [26]:
pooled_layer = tf.keras.layers.Lambda(lambda x: tf.math.reduce_sum(x, axis=2))(word_char_embedding)
pooled_layer.shape

TensorShape([None, 70, 16])

#### Element-wise Addition

In [27]:
addition_layer = Add()([pooled_layer, word_embedding])
addition_layer = tf.expand_dims(addition_layer,-1)

addition_layer.shape

TensorShape([None, 70, 16, 1])

## Word-level CNN Block

In [28]:
# Convolution and pooling for word-level
# h = 3
conv_3_word = Conv2D(num_filters, (3, k), activation='relu')(addition_layer)
conv_3_word = MaxPooling2D((2, 1), strides=(2, 1))(conv_3_word)
# h = 4
conv_4_word = Conv2D(num_filters, (4, k), activation='relu')(addition_layer)
conv_4_word = MaxPooling2D((2, 1), strides=(2, 1))(conv_4_word)
# h = 5
conv_5_word = Conv2D(num_filters, (5, k), activation='relu')(addition_layer)
conv_5_word = MaxPooling2D((2, 1), strides=(2, 1))(conv_5_word)
# h = 6
conv_6_word = Conv2D(num_filters, (6, k), activation='relu')(addition_layer)
conv_6_word = MaxPooling2D((2, 1), strides=(2, 1))(conv_6_word)

## Word level fully connected layer

In [29]:
#concatenate all convolutional layer outputs
concatenated_word = Concatenate(axis=1)([conv_3_word,conv_4_word,conv_5_word,conv_6_word])
flattened_word = Flatten()(concatenated_word)

# feed concatenated conv layers to fully conected layer
dense_word = Dense(512,activation='relu')(flattened_word)
dropout_word = Dropout(0.5)(dense_word)
dropout_word.shape

TensorShape([None, 512])

## concatenate outputs of char-level and word-level blocks

In [30]:
concatenate_all = Concatenate()([dropout,dropout_word])
concatenate_all.shape

TensorShape([None, 1024])

# last fully connected layers

In [31]:
fc_1 = Dense(512,activation='relu')(concatenate_all)
dropout_fc_1 = Dropout(0.5)(fc_1)

fc_2 = Dense(256,activation='relu')(dropout_fc_1)
dropout_fc_2 = Dropout(0.5)(fc_2)

fc_3 = Dense(128,activation='relu')(dropout_fc_2)
dropout_fc_3 = Dropout(0.5)(fc_3)
dropout_fc_3.shape

TensorShape([None, 128])

#### output layer with softmax

In [32]:
output = Dense(num_classes, activation='softmax')(dropout_fc_3)

# Model compile

In [33]:
# Define callbacks
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

In [34]:
model = Model(inputs=[char_input, word_input, word_char_input],outputs=output)
model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy']
             )

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 70, 15)]     0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 70, 15, 16)   1104        input_3[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 70)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 70)]         0                                            
______________________________________________________________________________________________

In [35]:
# x_train = x_train[:2000]
# y_train = y_train[:2000]
# len(set(y_train))

In [36]:
# urls = x_train

In [37]:

    
# def preprocess_char_input(urls):
#     char_sequences =char_tokenizer.texts_to_sequences(urls)
#     return pad_sequences(char_sequences, maxlen=MAX_URL_LEN_CHAR, padding='post')

# def preprocess_word_input( urls):
#     word_sequences = word_tokenizer.texts_to_sequences(urls)
#     return pad_sequences(word_sequences, maxlen=MAX_URL_LEN_WORD, padding='post')

# def preprocess_word_char_input( urls):
#     sentences_splitted = [text_to_word_sequence(sentence) for sentence in urls]

#     for i in range(len(sentences_splitted)):
#         while len(sentences_splitted[i])<100:
#             sentences_splitted[i].append('<PAD>')
    
#     words_splitted = [[word.split() for word in sentence] for sentence in sentences_splitted]

#     word_char_sequences = []
    
#     for sentence in words_splitted:
#         sentence_tokenized = []
#         for i in range(len(sentence)):
#             word = char_tokenizer.texts_to_sequences(sentence[i])
#             word_char_padded = pad_sequences(word,maxlen=MAX_WORD_LEN,padding='post')
#             sentence_tokenized.append(word_char_padded)
#         word_char_sequences.append(sentence_tokenized)
  
#     word_char_sequences = np.array(word_char_sequences)
#     word_char_sequences = word_char_sequences.reshape(word_char_sequences.shape[0],word_char_sequences.shape[1],word_char_sequences.shape[-1])
    
#     return word_char_sequences
# char_input_data = preprocess_char_input(urls)
# word_input_data = preprocess_word_input(urls)
# word_char_input_data = preprocess_word_char_input(urls)
        
# print(char_input_data)
# print(word_input_data)
# print(word_char_input_data)
# print(y_train)

In [38]:
history = model.fit(
    training_generator,
    # [char_input_data, word_input_data, word_char_input_data],
    # y_train,
    validation_data=test_generator,
    epochs=2,
    callbacks=[checkpoint, early_stopping])

Epoch 1/2
 4041/19538 [=====>........................] - ETA: 7:14 - loss: 2.2045 - accuracy: 0.3030

  word_char_sequences = np.array(word_char_sequences)


UnknownError: 2 root error(s) found.
  (0) Unknown:  IndexError: tuple index out of range
Traceback (most recent call last):

  File "C:\Users\warfa\.conda\envs\AI\lib\site-packages\tensorflow\python\ops\script_ops.py", line 249, in __call__
    ret = func(*args)

  File "C:\Users\warfa\.conda\envs\AI\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 645, in wrapper
    return func(*args, **kwargs)

  File "C:\Users\warfa\.conda\envs\AI\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 892, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "C:\Users\warfa\.conda\envs\AI\lib\site-packages\keras\engine\data_adapter.py", line 822, in wrapped_generator
    for data in generator_fn():

  File "C:\Users\warfa\.conda\envs\AI\lib\site-packages\keras\engine\data_adapter.py", line 948, in generator_fn
    yield x[i]

  File "C:\Users\warfa\AppData\Local\Temp\ipykernel_24416\3722087896.py", line 20, in __getitem__
    word_char_input_data = self._preprocess_word_char_input(batch_urls)

  File "C:\Users\warfa\AppData\Local\Temp\ipykernel_24416\3722087896.py", line 52, in _preprocess_word_char_input
    word_char_sequences = word_char_sequences.reshape(word_char_sequences.shape[0],word_char_sequences.shape[1],word_char_sequences.shape[-1])

IndexError: tuple index out of range


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
  (1) Unknown:  IndexError: tuple index out of range
Traceback (most recent call last):

  File "C:\Users\warfa\.conda\envs\AI\lib\site-packages\tensorflow\python\ops\script_ops.py", line 249, in __call__
    ret = func(*args)

  File "C:\Users\warfa\.conda\envs\AI\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 645, in wrapper
    return func(*args, **kwargs)

  File "C:\Users\warfa\.conda\envs\AI\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 892, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "C:\Users\warfa\.conda\envs\AI\lib\site-packages\keras\engine\data_adapter.py", line 822, in wrapped_generator
    for data in generator_fn():

  File "C:\Users\warfa\.conda\envs\AI\lib\site-packages\keras\engine\data_adapter.py", line 948, in generator_fn
    yield x[i]

  File "C:\Users\warfa\AppData\Local\Temp\ipykernel_24416\3722087896.py", line 20, in __getitem__
    word_char_input_data = self._preprocess_word_char_input(batch_urls)

  File "C:\Users\warfa\AppData\Local\Temp\ipykernel_24416\3722087896.py", line 52, in _preprocess_word_char_input
    word_char_sequences = word_char_sequences.reshape(word_char_sequences.shape[0],word_char_sequences.shape[1],word_char_sequences.shape[-1])

IndexError: tuple index out of range


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
	 [[IteratorGetNext/_6]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_2276]

Function call stack:
train_function -> train_function


In [None]:
validation_freq=VAL_FREQ

In [59]:
len(classes)
training_generator.__getitem__(175)

([array([[ 8,  3,  3,  9, 17,  2,  2, 20, 14,  5, 13, 12, 19, 22,  5, 10,
           7, 12,  2,  3, 13,  3, 18,  6,  2,  3,  3, 28, 28, 29, 41, 38,
          42, 41,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0]]),
  array([[ 2, 24, 32,  4, 49,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0]]),
  array([[[ 8,  3,  3, ...,  0,  0,  0],
          [20, 14,  0, ...,  0,  0,  0],
          [13, 12, 19, ...,  0,  0,  0],
          ...,
          [ 1,  9, 11, ...,  0,  0,  0],
          [ 1,  9, 11, ...,  0,  0,  0],
          [ 1,  9, 11, ...,  0,  0,  0]]])],
 array([1], dtype=int64))

In [66]:
item = training_generator.__getitem__(17571)[0]
print(item)

[array([[ 8,  3,  3,  9, 17,  2,  2,  4,  4,  4,  5, 23, 18, 11,  3,  4,
         7,  7, 19, 14, 10,  8, 20, 15, 10,  8,  7, 23, 10,  8, 15, 13,
        14,  3,  5,  7, 15, 21,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0]]), array([[2, 3, 1, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0]]), array([[[ 8,  3,  3, ...,  0,  0,  0],
        [ 4,  4,  4, ...,  0,  0,  0],
        [14, 10,  8, ..., 13, 14,  3],
        ...,
        [ 1,  9, 11, ...,  0,  0,  0],
        [ 1,  9, 11, ...,  0,  0,  0],
        [ 1,  9, 11, ...,  0,  0,  0]]])]


In [73]:
# char_tokenizer.sequences_to_texts(item[2])
# word_tokenizer.sequences_to_texts(item[1])


(1, 70, 15)

In [54]:
char_tokenizer.texts_to_sequences(['gamarjoba'])

[[21, 11, 12, 11, 15, 33, 7, 22, 11]]

In [55]:
char_tokenizer.sequences_to_texts([[21, 11, 12, 11, 15, 33, 7, 22, 11]])

['g a m a r j o b a']

In [None]:
def preprocess_word_char_input(urls):
        sentences_splitted = [text_to_word_sequence(sentence) for sentence in urls]
    
        for i in range(len(sentences_splitted)):
            while len(sentences_splitted[i])<MAX_URL_LEN_CHAR:
                sentences_splitted[i].append('<OOV>')
        
        words_splitted = [[word.split() for word in sentence] for sentence in sentences_splitted]

        word_char_sequences = []
        
        for sentence in words_splitted:
            sentence_tokenized = []
            for i in range(len(sentence)):
                word = self.char_tokenizer.texts_to_sequences(sentence[i])
                word_char_padded = pad_sequences(word,maxlen=MAX_WORD_LEN,padding='post',truncating='post')
                sentence_tokenized.append(word_char_padded)
            word_char_sequences.append(sentence_tokenized)
      
        word_char_sequences = np.array(word_char_sequences)
        word_char_sequences = word_char_sequences.reshape(word_char_sequences.shape[0],word_char_sequences.shape[1],word_char_sequences.shape[-1])
        
        return word_char_sequences
