In [131]:
from collections import Counter
import numpy as np
import os
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix,ConfusionMatrixDisplay
from tensorflow.keras.layers import Input, Dense,Concatenate, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, MaxPooling2D,Dropout, Add,Embedding
from keras.models import Model, load_model
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

# Report how many GPU devices TensorFlow detects on this machine.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  0


In [76]:
# Sequence-length limits and CNN hyperparameters shared by the cells below.
MAX_URL_LEN_CHAR = 200  # characters per URL; shape of the character-level Input
MAX_URL_LEN_WORD = 200  # words per URL; maxlen used when padding word-id sequences
MAX_WORD_LEN = 20  # characters per word; maxlen used when padding each word's character ids
k = 32  # embedding output dimension; also the width of every Conv2D kernel
num_filters = 256  # number of filters in each Conv2D layer

In [14]:
# Read the web_pages table and drop the columns that are not used below.
conn = sqlite3.connect('casino.db')

unused_columns = ["id", "url", "html", "screenshot"]
df = pd.read_sql('select * from web_pages', conn).drop(columns=unused_columns)

In [16]:
# Raw URL strings, one entry per row of the table.
urls = df["real_url"].values

In [18]:
# Display the first URL as a quick sanity check of the loaded data.
urls[0]

'https://bc.game/?stag=6954_666ec6ad9782f44562dc5ce0&i=4cxse6dr&utm_source=4cxse6dr'

# Character level CNN

## Character level tokenizer

In [72]:
# Character tokenization: every character of a URL becomes one integer id.
char_tokenizer = Tokenizer(char_level=True, oov_token='<OOV>')
char_tokenizer.fit_on_texts(urls)

char_sequences = char_tokenizer.texts_to_sequences(urls)
# Pad to MAX_URL_LEN_CHAR (instead of a literal 200) so the padded width always
# matches the Input(shape=(MAX_URL_LEN_CHAR,)) layer of the character-level model.
char_padded = pad_sequences(char_sequences, maxlen=MAX_URL_LEN_CHAR, padding='post')

In [73]:
# Width of the padded character matrix (second axis of char_padded).
input_shape = np.shape(char_padded)[1]
print(f'input shape: {input_shape}')

input shape: 200


### Character embedding

In [None]:
# Model input: one integer character id per URL position.
char_input = Input(shape=(MAX_URL_LEN_CHAR,))

# Embedding layer
# +1 leaves room for index 0 (presumably the padding id written by pad_sequences).
n_char_ids = len(char_tokenizer.word_index) + 1
char_embedding_layer = Embedding(
    input_dim=n_char_ids,
    output_dim=k,
    input_length=MAX_URL_LEN_CHAR,
)
char_embedding = char_embedding_layer(char_input)
# Add a trailing channel dimension so the Conv2D layers can consume the embedding.
char_embedding = tf.expand_dims(char_embedding, -1)

### CNN Block

In [None]:
# Convolution and pooling for character-level
def conv_pool_char(embedded, h):
    """Apply one convolution + max-pooling branch to the embedded URL.

    Args:
        embedded: embedding tensor with a trailing channel dimension
            (here ``char_embedding``).
        h: kernel height, i.e. how many consecutive characters one filter spans.
            The kernel width is always ``k``, the full embedding dimension.

    Returns:
        The output tensor of the MaxPooling2D layer applied to the convolution.
    """
    conv = Conv2D(num_filters, (h, k), activation='relu')(embedded)
    return MaxPooling2D((2, 1), strides=(2, 1))(conv)


# One branch per kernel height; the names are kept because the concatenation
# cell refers to them individually.
conv_3_char = conv_pool_char(char_embedding, 3)  # h = 3
conv_4_char = conv_pool_char(char_embedding, 4)  # h = 4
conv_5_char = conv_pool_char(char_embedding, 5)  # h = 5
conv_6_char = conv_pool_char(char_embedding, 6)  # h = 6

### Fully connected layer

In [88]:
# Concatenate the outputs of all convolutional branches along axis 1, then flatten.
char_branches = [conv_3_char, conv_4_char, conv_5_char, conv_6_char]
concatenated = Concatenate(axis=1)(char_branches)
flattened = Flatten()(concatenated)

# Feed the flattened features to a fully connected layer followed by dropout.
dense_char_layer = Dense(1024, activation='relu')
dense_char = dense_char_layer(flattened)
dropout = Dropout(0.5)(dense_char)

# Word-level CNN

### Word level tokenizer

In [99]:
# Word tokenization
# Fit a word-level tokenizer (no vocabulary limit yet) on the raw URLs.
# NOTE(review): this relies on the Tokenizer's default filters to split a URL
# into words — confirm the defaults cover the URL delimiters that matter.
word_tokenizer = Tokenizer(oov_token='<OOV>')
word_tokenizer.fit_on_texts(urls)

# One list of word ids per URL.
word_sequences = word_tokenizer.texts_to_sequences(urls)

#### Calculate number of words which appear more than once

In [176]:
# Map the word-id sequences back to text so the words can be counted.
all_sequences = word_tokenizer.sequences_to_texts(word_sequences)

# Word -> number of occurrences across all URLs.
freq_dict_words = Counter(
    word for sequence in all_sequences for word in sequence.split()
)

# Words seen exactly once are treated as low-frequency.
low_freq_words_count = sum(1 for count in freq_dict_words.values() if count == 1)

# Exact integer arithmetic: the previous int(len * (1 - low / len)) went through
# floating point and could truncate to one less than the true count.
high_freq_word_count = len(freq_dict_words) - low_freq_words_count
print(high_freq_word_count)

315


1861


#### Url into sequence of word ids

In [None]:
# Re-instantiate the tokenizer so that only the high-frequency words keep their own id.
# Keras only keeps word indices strictly below num_words; index 0 is never assigned
# to a word and index 1 is taken by the OOV token, so num_words must be the number
# of words to keep plus 2. Without the +2, two high-frequency words become OOV.
word_tokenizer = Tokenizer(oov_token='<OOV>', num_words=high_freq_word_count + 2)
word_tokenizer.fit_on_texts(urls)
# Total number of distinct words seen while fitting (not limited by num_words).
print(len(word_tokenizer.word_counts))

word_sequences = word_tokenizer.texts_to_sequences(urls)
word_padded = pad_sequences(word_sequences, maxlen=MAX_URL_LEN_WORD, padding='post')

#### URL into a sequence of words, each encoded as character ids

In [None]:
# Word-level tokenizer used here only to split every URL into its words.
word_tokenizer_for_chars = Tokenizer(oov_token='<OOV>')
word_tokenizer_for_chars.fit_on_texts(urls)

# Pad the word-id sequences to MAX_URL_LEN_WORD, then map the ids back to text.
sentences_to_sequences = word_tokenizer_for_chars.texts_to_sequences(urls)
sentences_padded = pad_sequences(sentences_to_sequences,maxlen=MAX_URL_LEN_WORD,padding='post')
# NOTE(review): the padded positions (id 0) are decoded back to text here as well;
# confirm how sequences_to_texts renders them before relying on the padding rows.
words_splitted = word_tokenizer_for_chars.sequences_to_texts(sentences_padded)

words_splitted = [sentence.split() for sentence in words_splitted]
# Wrap every word in its own one-element list: texts_to_sequences takes a list of texts.
words_splitted = [[word.split() for word in sentence] for sentence in words_splitted]

word_char_sequences = []

for sentence in words_splitted:
    sentence_tokenized = []
    for word_texts in sentence:
        # Use the character-level tokenizer fitted on the URLs earlier in the notebook.
        # The name used here before (char_tokenizer_for_words) is never defined, so
        # the cell raised NameError on a fresh kernel.
        word = char_tokenizer.texts_to_sequences(word_texts)
        word_char_padded = pad_sequences(word,maxlen=MAX_WORD_LEN,padding='post')
        sentence_tokenized.append(word_char_padded)
    word_char_sequences.append(sentence_tokenized)

# One entry per URL, each holding the padded character ids of its words.
word_char_sequences = np.array(word_char_sequences)

##### word embedding

In [None]:
# Model input: one integer word id per URL position (see word_padded).
# This cell used to be a verbatim copy of the character embedding cell, which
# overwrote char_input / char_embedding and never embedded the word ids.
word_input = Input(shape=(MAX_URL_LEN_WORD,))

# Embedding layer
# The tokenizer only emits ids strictly below its num_words, so num_words is the
# number of embedding rows needed (assumes word_tokenizer was built with num_words set).
word_embedding = Embedding(input_dim=word_tokenizer.num_words, output_dim=k, input_length=MAX_URL_LEN_WORD)(word_input)
word_embedding = tf.expand_dims(word_embedding, -1)  # Add channel dimension

##### char embedding 2

##### Sum over characters

#### Element-wise Addition

## CNN Block

In [None]:





# Quick visual check of the three padded inputs.
print("Character Padded Sequences:")
print(char_padded)
print("\nWord Padded Sequences:")
print(word_padded)
print("\nWord-Character Padded Sequences:")
# Print the full word/character array; word_char_padded is only the loop variable
# left over from the last word of the last URL.
print(word_char_sequences)


In [None]:
def charTokenize(string):
    """Split ``string`` into a list of its individual characters."""
    return list(string)
# Build the character vocabulary from the lowercased URLs.
concat_urls = ''.join(url.lower() for url in urls)

chars = sorted(set(concat_urls))
char_vocab_size = len(chars)
print(char_vocab_size)

# Character <-> integer id lookup tables (ids start at 0, in sorted character order).
char_to_int = {char: i for i, char in enumerate(chars)}
int_to_char = {i: char for i, char in enumerate(chars)}


def encode(url):
    """Encoder: take a string, output a list of integer character ids.

    The URL is lowercased first because the vocabulary only contains lowercased
    characters; without this any uppercase letter raised KeyError.
    """
    return [char_to_int[ch] for ch in charTokenize(url.lower())]


def decode(l):
    """Decoder: take a list of integer character ids, output the string."""
    return ''.join(int_to_char[i] for i in l)


# Round-trip check on the first URL.
decode(encode(urls[0]))
def encode_url(url, max_len=200, pad_value=None, vocab=None):
    """Encode a URL as a list of character ids, right-padded to ``max_len``.

    The previous version could not run: ``len(url<200)`` compared a list with an
    int and ``url.append()`` was called without an argument.

    Args:
        url: URL string. It is lowercased before lookup because the vocabulary
            is built from lowercased URLs.
        max_len: minimum length of the returned list (200 is the value that was
            hard-coded before).
        pad_value: id appended as padding. Defaults to ``len(vocab)``, the first
            id not assigned to any character, because the vocabulary ids start
            at 0 and padding with 0 would collide with a real character.
        vocab: character -> id mapping. Defaults to the notebook-level
            ``char_to_int``.

    Returns:
        A list of ints with at least ``max_len`` entries. URLs longer than
        ``max_len`` are returned unpadded and are not truncated.

    Raises:
        KeyError: if the URL contains a character missing from the vocabulary.
    """
    if vocab is None:
        vocab = char_to_int
    if pad_value is None:
        pad_value = len(vocab)

    encoded = [vocab[ch] for ch in url.lower()]
    # A non-positive difference multiplies to an empty list, so long URLs are left as is.
    encoded.extend([pad_value] * (max_len - len(encoded)))
    return encoded