In [1]:
from collections import Counter
import numpy as np
import os
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix,ConfusionMatrixDisplay
from tensorflow.keras.layers import Input, Dense,Concatenate, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, MaxPooling2D,Dropout, Add,Embedding
from keras.models import Model, load_model
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

# Report how many GPU devices TensorFlow can see in this environment.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  0


In [2]:
# Padded length (in characters) of every URL fed to the character-level branch.
MAX_URL_LEN_CHAR = 150
# Padded length (in words) of every URL fed to the word-level branch.
MAX_URL_LEN_WORD = 150
# Padded length (in characters) of a single word when words are encoded as character ids.
MAX_WORD_LEN = 20
# Embedding dimension; the convolution kernels below are also k columns wide.
k = 32
# Number of filters in every convolution layer.
num_filters = 256

In [3]:
# conn = sqlite3.connect('casino.db')

# df = pd.read_sql('select * from web_pages',conn)
# df.drop(columns=["id","url","html","screenshot"],inplace=True)



# Load dmoz dataset

In [4]:
# Read the dmoz CSV and report how many rows it holds.
df = pd.read_csv('./datasets/dmoz.csv')
print(len(df.index))

# Discard every row that contains a missing value, then report what is left.
df = df.dropna()
print(len(df.index))

# Shuffle all rows with a fixed seed and rebuild a contiguous 0..n-1 index.
df = df.sample(frac=1, random_state=44).reset_index(drop=True)

1562977
1562974


In [5]:
# Distinct category labels in sorted order; their count is the size of the output layer.
classes = sorted(set(df["category"].values))
num_classes = len(classes)

In [6]:
# The rows were shuffled above, so a positional cut gives an 80% / 20% train/test split.
train_split = int(0.8 * len(df))
urls, labels = df["url"].values, df["category"].values

x_train, x_test = urls[:train_split], urls[train_split:]
y_train, y_test = labels[:train_split], labels[train_split:]

# Sanity check: split sizes and the first training example.
print(f"for training: {len(x_train)} for testing: {len(x_test)}")
print(x_train[0], y_train[0])

for training: 1250379 for testing: 312595
http://www.newadvent.org/cathen/11535a.htm Society


# Character level CNN

## Character level tokenizer

In [20]:
# Fit a character-level vocabulary on the training URLs; unseen characters map to '<OOV>'.
char_tokenizer = Tokenizer(oov_token='<OOV>', char_level=True)
char_tokenizer.fit_on_texts(x_train)

# Encode every training URL as character ids, padded at the end up to MAX_URL_LEN_CHAR.
char_sequences = char_tokenizer.texts_to_sequences(x_train)
char_padded = pad_sequences(
    char_sequences,
    maxlen=MAX_URL_LEN_CHAR,
    padding='post',
)

In [19]:
# The character branch receives one fixed-length vector of character ids per URL.
input_shape = MAX_URL_LEN_CHAR
print('input shape: {}'.format(input_shape))


input shape: 150


### Character embedding

In [9]:
# Input: one row of MAX_URL_LEN_CHAR character ids per URL.
char_input = Input(shape=(MAX_URL_LEN_CHAR,))

# Embed each character id into a k-dimensional vector (+1 because id 0 is not in word_index).
char_vocab_size = len(char_tokenizer.word_index) + 1
char_embedding_layer = Embedding(
    input_dim=char_vocab_size,
    output_dim=k,
    input_length=MAX_URL_LEN_CHAR,
)
char_embedding = char_embedding_layer(char_input)

# Conv2D expects a trailing channel axis, so append one of size 1.
char_embedding = tf.expand_dims(char_embedding, axis=-1)

### CNN Block

In [10]:
# Convolution and pooling for character-level.
# One branch per kernel height h; every kernel is k columns wide, matching the embedding size.
char_conv_branches = {}
for kernel_height in (3, 4, 5, 6):
    char_feature_map = Conv2D(num_filters, (kernel_height, k), activation='relu')(char_embedding)
    char_conv_branches[kernel_height] = MaxPooling2D(pool_size=(2, 1), strides=(2, 1))(char_feature_map)

# Expose the branches under the names the later cells refer to.
conv_3_char = char_conv_branches[3]
conv_4_char = char_conv_branches[4]
conv_5_char = char_conv_branches[5]
conv_6_char = char_conv_branches[6]

### Fully connected layer

In [11]:
# Join the four pooled feature maps along axis 1 and flatten them into a single vector.
char_branch_outputs = [conv_3_char, conv_4_char, conv_5_char, conv_6_char]
concatenated = Concatenate(axis=1)(char_branch_outputs)
flattened = Flatten()(concatenated)

# Fully connected layer on the flattened convolution features, followed by dropout.
dense_char = Dense(units=1024, activation='relu')(flattened)
dropout = Dropout(rate=0.5)(dense_char)

# Word-level CNN

### Word level tokenizer

In [24]:
def text_to_word_sequence(
    input_text,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
):
    """Split a string into a list of non-empty word tokens.

    Args:
        input_text: The string to tokenize.
        filters: Characters that are treated as separators; each one is
            replaced by ``split`` before the text is cut into pieces.
        lower: When true, the text is lower-cased first.
        split: The separator string used to cut the text.

    Returns:
        The pieces of the text between separators, in order, with empty
        pieces removed.
    """
    text = input_text.lower() if lower else input_text

    # Turn every filtered character into the separator so one split handles them all.
    separator_table = str.maketrans(dict.fromkeys(filters, split))
    pieces = text.translate(separator_table).split(split)

    # Adjacent separators produce empty strings; drop them.
    return list(filter(None, pieces))

#### Calculate number of words which appear more than once

In [42]:
# Tokenize every training URL into words, then pool all words into one flat list.
word_sequences = list(map(text_to_word_sequence, x_train))
all_words_concat = []
for url_words in word_sequences:
    all_words_concat.extend(url_words)
# all_words_concat = " ".join([' '.join([word for word in sentence ]) for sentence in word_sequences])

In [44]:
# Count how often each word occurs across all training URLs.
freq_dict_words = Counter(all_words_concat)

# Words that occur exactly once.
low_freq_words_count = sum(1 for count in freq_dict_words.values() if count == 1)

# Words that occur more than once. Plain integer subtraction is used on purpose:
# the previous len * (1 - low / len) form went through floats and int() truncation,
# which can come out one too small (e.g. 10 words with 9 singletons gave 0) and
# raised ZeroDivisionError for an empty vocabulary.
high_freq_word_count = len(freq_dict_words) - low_freq_words_count
print(high_freq_word_count)

# The intermediate word lists are large and no longer needed.
del word_sequences, all_words_concat

168710


#### URL into a sequence of word ids

In [16]:
# Build the word-level vocabulary, keeping only the words that occur more than once.
# Keras never assigns id 0 to a word and gives id 1 to the OOV token, and
# texts_to_sequences only keeps ids strictly below num_words. Two extra slots are
# therefore required so that all high_freq_word_count words keep their own id;
# every rarer word is mapped to '<OOV>'.
word_tokenizer = Tokenizer(oov_token='<OOV>', num_words=high_freq_word_count + 2)
word_tokenizer.fit_on_texts(x_train)

# Encode every training URL as word ids, padded at the end up to MAX_URL_LEN_WORD.
word_sequences = word_tokenizer.texts_to_sequences(x_train)
word_padded = pad_sequences(word_sequences, maxlen=MAX_URL_LEN_WORD, padding='post')

1220802


#### URL into a sequence of words, each encoded as character ids

In [None]:
# word_tokenizer_for_chars = Tokenizer(oov_token='<OOV>')
# word_tokenizer_for_chars.fit_on_texts(x)

# sentences_to_sequences = word_tokenizer_for_chars.texts_to_sequences(x)
# print(sentences_to_sequences)
# sentences_padded = pad_sequences(sentences_to_sequences,maxlen=MAX_URL_LEN_WORD,padding='post')
# # print(sentences_padded)
# words_splitted = word_tokenizer_for_chars.sequences_to_texts(sentences_padded)
# print(words_splitted)

# x = ['gamarjoba!me?saba.var','gamarjoba!me?saba.var']

# Encode every training URL as a (MAX_URL_LEN_WORD, MAX_WORD_LEN) grid of character ids:
# one row per word, one column per character. The array is allocated once and starts
# as all zeros, so missing words and missing characters are genuine zero padding
# (previously a literal '<PAD>' string was appended and tokenised as the characters
# '<', 'p', 'a', 'd', '>').
# Size: len(x_train) * MAX_URL_LEN_WORD * MAX_WORD_LEN int32 values.
word_char_sequences = np.zeros(
    (len(x_train), MAX_URL_LEN_WORD, MAX_WORD_LEN),
    dtype=np.int32,
)

for url_index, url in enumerate(x_train):
    # Keep at most MAX_URL_LEN_WORD words. The last words are kept, matching
    # pad_sequences' default truncation used for the word-id sequences, so row j
    # here and position j there describe the same word.
    words = text_to_word_sequence(url)[-MAX_URL_LEN_WORD:]
    if not words:
        # URL made only of separators: leave the all-zero grid.
        continue

    # One list of character ids per word, padded at the end to MAX_WORD_LEN.
    char_ids_per_word = char_tokenizer.texts_to_sequences(words)
    word_char_sequences[url_index, :len(words)] = pad_sequences(
        char_ids_per_word, maxlen=MAX_WORD_LEN, padding='post'
    )

print(word_char_sequences.shape)

(506, 200, 20)

##### word embedding

In [24]:
# Input: one row of MAX_URL_LEN_WORD word ids per URL.
word_input = Input(shape=(MAX_URL_LEN_WORD,))

# Size the embedding table to the ids the tokenizer can actually emit.
# word_index lists every word ever seen, but when num_words is set,
# texts_to_sequences replaces every id >= num_words with the OOV id, so rows
# beyond num_words would never be looked up.
word_vocab_size = len(word_tokenizer.word_index) + 1
if word_tokenizer.num_words:
    word_vocab_size = min(word_vocab_size, word_tokenizer.num_words)

# Embedding layer
word_embedding = Embedding(input_dim=word_vocab_size, output_dim=k, input_length=MAX_URL_LEN_WORD)(word_input)

word_embedding.shape

##### char embedding 2

In [26]:
# Input: for each URL, MAX_URL_LEN_WORD words of MAX_WORD_LEN character ids each.
# The first axis counts words, so it must use MAX_URL_LEN_WORD (the previous
# MAX_URL_LEN_CHAR only worked because both constants are currently equal).
word_char_input = Input(shape=(MAX_URL_LEN_WORD, MAX_WORD_LEN))

# Embedding layer: every character id becomes a k-dimensional vector.
word_char_embedding = Embedding(input_dim=len(char_tokenizer.word_index) + 1, output_dim=k, input_length=MAX_URL_LEN_WORD)(word_char_input)
word_char_embedding.shape

##### Sum over characters

In [28]:
# Collapse the character axis (axis 2) by summing the character embeddings of each word.
sum_over_characters = tf.keras.layers.Lambda(
    lambda char_vectors: tf.math.reduce_sum(char_vectors, axis=2)
)
pooled_layer = sum_over_characters(word_char_embedding)
pooled_layer.shape

TensorShape([None, 200, 32])

#### Element-wise Addition

In [33]:
# Add the summed character embeddings to the word embeddings, element by element.
word_representations = [pooled_layer, word_embedding]
addition_layer = Add()(word_representations)

# Append a channel axis of size 1 for the Conv2D layers that follow.
addition_layer = tf.expand_dims(addition_layer, axis=-1)

addition_layer.shape

TensorShape([None, 200, 32, 1])

## CNN Block

In [35]:
# Convolution and pooling for word-level.
# One branch per kernel height h; every kernel is k columns wide, matching the embedding size.
word_conv_branches = {}
for kernel_height in (3, 4, 5, 6):
    word_feature_map = Conv2D(num_filters, (kernel_height, k), activation='relu')(addition_layer)
    word_conv_branches[kernel_height] = MaxPooling2D(pool_size=(2, 1), strides=(2, 1))(word_feature_map)

# Expose the branches under the names the later cells refer to.
conv_3_word = word_conv_branches[3]
conv_4_word = word_conv_branches[4]
conv_5_word = word_conv_branches[5]
conv_6_word = word_conv_branches[6]

## Word level fully connected layer

In [36]:
# Join the four pooled feature maps along axis 1 and flatten them into a single vector.
word_branch_outputs = [conv_3_word, conv_4_word, conv_5_word, conv_6_word]
concatenated_word = Concatenate(axis=1)(word_branch_outputs)
flattened_word = Flatten()(concatenated_word)

# Fully connected layer on the flattened convolution features, followed by dropout.
dense_word = Dense(units=1024, activation='relu')(flattened_word)
dropout_word = Dropout(rate=0.5)(dense_word)
dropout_word.shape

TensorShape([None, 1024])

## concatenate outputs of char-level and word-level blocks

In [42]:
# Join the character-branch and word-branch feature vectors along the last axis.
branch_outputs = [dropout, dropout_word]
concatenate_all = Concatenate(axis=-1)(branch_outputs)
concatenate_all.shape

TensorShape([None, 2048])

# last fully connected layers

In [45]:
# Three fully connected layers of decreasing width, each followed by dropout.
# Each Dense layer consumes the *dropout* output of the previous stage; feeding it
# the raw fc_1 / fc_2 tensors (as before) left dropout_fc_1 and dropout_fc_2
# disconnected from the graph, so they never regularised anything.
fc_1 = Dense(512, activation='relu')(concatenate_all)
dropout_fc_1 = Dropout(0.5)(fc_1)

fc_2 = Dense(256, activation='relu')(dropout_fc_1)
dropout_fc_2 = Dropout(0.5)(fc_2)

fc_3 = Dense(128, activation='relu')(dropout_fc_2)
dropout_fc_3 = Dropout(0.5)(fc_3)
dropout_fc_3.shape

TensorShape([None, 128])

#### output layer with softmax

In [46]:
# Output layer: one softmax probability per category.
# num_classes comes from the class-listing cell near the top; run that cell first.
softmax = Dense(units=num_classes, activation='softmax')(dropout_fc_3)

NameError: name 'num_classes' is not defined