In [1]:
import os
import json
import pickle
import random
import sqlite3
import jsonlines
from collections import Counter
from unicodedata import normalize
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from tokenizers import ByteLevelBPETokenizer
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Embedding
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix,ConfusionMatrixDisplay





# Sanity check: report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices(device_type='GPU')))

Num GPUs Available:  1


In [2]:
# Notebook-wide hyperparameters.
batch_size = 32                 # samples per batch
embed_dimension = 4             # output_dim of the Embedding layer
html_max_length = 1_000         # token ids kept per document (pad/truncate target)
tokenizer_vocab_size = 20_000   # vocabulary size requested from the BPE trainer

In [3]:
def get_median_and_average(seq_list):
    """Print the median and the average length of a collection of encodings.

    Args:
        seq_list: iterable of objects exposing an ``ids`` sequence; the length
            of an item is ``len(item.ids)``.

    Prints:
        ``median: <value>`` and ``average length: <value>``. The average is
        truncated to an int. For an even number of items the median is the
        mean of the two middle lengths (so it may be a float); for an odd
        number it is the middle length itself.

    An empty input prints a short notice instead of raising
    ``ZeroDivisionError``.
    """
    lengths = sorted(len(doc.ids) for doc in seq_list)
    count = len(lengths)
    if count == 0:
        print("no sequences to summarize")
        return

    mid = count // 2
    if count % 2:
        median = lengths[mid]
    else:
        # Even count: there is no single middle element, average the two.
        median = (lengths[mid - 1] + lengths[mid]) / 2

    lengths_avg = sum(lengths) / count
    print(f"median: {median}")
    print(f"average length: {int(lengths_avg)}")

In [4]:
# Load the list from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a pickle that you produced yourself or otherwise trust.
with open('htmls_parsed_text_list.pkl', 'rb') as f:
    loaded_html_documents = pickle.load(f)
print("list loaded successfully")

list loaded successfully


In [6]:
# Train a byte-level BPE tokenizer directly on the loaded documents.
# NOTE(review): the padding helper in this notebook hard-codes pad id 3,
# which presumably relies on the special tokens receiving ids in the order
# listed here ("<pad>" being the fourth) -- confirm before reordering.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    loaded_html_documents,
    vocab_size=tokenizer_vocab_size,
    min_frequency=2,
    special_tokens=["<unk>", "<s>", "</s>", "<pad>"],
)

In [7]:
# encoded = 
# encoded_html_docs = [tokenizer.encode(doc) for doc in loaded_html_documents]

In [9]:
def save_tokenizer(path, tok=None):
    """Write a tokenizer's model files into the directory ``path``.

    Args:
        path: target directory; created (including parents) if missing.
        tok: object exposing ``save_model(path)``. When omitted, the
            notebook-level ``tokenizer`` variable is used, which is the
            original behaviour of this helper.

    Prints a confirmation message once the model has been written.
    """
    # Create the directory if it does not exist
    os.makedirs(path, exist_ok=True)
    if tok is None:
        # Fall back to the global so existing one-argument calls keep working.
        tok = tokenizer
    tok.save_model(path)
    print("tokenizer saved successfully")

# Load tokenizer
def load_tokenizer(path):
    """Build a ByteLevelBPETokenizer from the files stored under ``path``.

    Args:
        path: directory containing ``vocab.json`` and ``merges.txt``.

    Returns:
        The ByteLevelBPETokenizer constructed from those two files.
    """
    vocab_file = f"{path}/vocab.json"
    merges_file = f"{path}/merges.txt"
    return ByteLevelBPETokenizer(vocab_file, merges_file)

In [10]:
# Persist the trained tokenizer into the bpe_tokenizer_20k directory.
save_tokenizer(path="bpe_tokenizer_20k")

tokenizer saved successfully


In [53]:

def encode_and_pad_sequence(sequence, max_length, pad_token="<pad>", tok=None):
    """Encode ``sequence`` and pad/truncate the ids to ``max_length`` entries.

    Args:
        sequence: text passed to the tokenizer's ``encode``.
        max_length: number of token ids to return; longer encodings are cut
            off at the end, shorter ones are right-padded.
        pad_token: token whose id is used for padding. It is looked up in the
            tokenizer vocabulary instead of assuming a fixed id, so this
            argument is now actually honoured.
        tok: tokenizer exposing ``encode(text).ids`` and ``token_to_id``.
            When omitted, the notebook-level ``tokenizer`` variable is used.

    Returns:
        list of token ids, truncated and right-padded as described above.

    Raises:
        ValueError: if ``pad_token`` is not known to the tokenizer.
    """
    if tok is None:
        # Fall back to the global so existing calls keep working unchanged.
        tok = tokenizer

    pad_token_id = tok.token_to_id(pad_token)
    if pad_token_id is None:
        raise ValueError(f"pad token {pad_token!r} is not in the tokenizer vocabulary")

    token_ids = tok.encode(sequence).ids[:max_length]
    shortfall = max_length - len(token_ids)
    if shortfall > 0:
        # Pad on the right with the pad token id
        token_ids.extend([pad_token_id] * shortfall)
    return token_ids


In [54]:
# Encode every document into a list of exactly html_max_length token ids.
encoded_padded_html_docs = [
    encode_and_pad_sequence(html_text, html_max_length)
    for html_text in loaded_html_documents
]

In [64]:
# Drop the references to the raw documents so their memory can be reclaimed.
# Binding both names to None first guarantees that `del` cannot raise
# NameError even if one of them was never defined in this kernel session.
encoded_html_docs = loaded_html_documents = None
del encoded_html_docs, loaded_html_documents

In [None]:
# One row of html_max_length token ids per document.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept because other cells may refer to it.
input = Input(shape=(html_max_length,))

# Embedding lookup over the tokenizer vocabulary.
embedding = Embedding(
    input_dim=tokenizer_vocab_size,
    output_dim=embed_dimension,
    input_length=html_max_length,
)(input)
print(embedding.shape)

# NOTE(review): there is no Flatten/pooling between the embedding and this
# Dense layer, so the 2-way softmax is presumably produced per token position
# rather than once per document -- confirm this is intended.
output_layer = Dense(2, activation='softmax')(embedding)

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that yields batches of preprocessed HTML inputs and labels.

    Each batch is ``([char_input, word_input, word_char_input], labels)``.

    NOTE(review): as written, ``__getitem__`` cannot succeed yet:
      * ``_preprocess_word_input`` and ``_preprocess_word_char_input`` are
        called but not defined in this class;
      * ``_preprocess_char_input`` reads a module-level ``MAX_URL_LEN_CHAR``
        that is not defined in the visible notebook.
    Both need to be supplied before the generator is used.
    """

    def __init__(self, htmls, labels, batch_size, char_tokenizer, word_tokenizer):
        """Store the data, batch size and tokenizers.

        Args:
            htmls: indexable collection of HTML texts.
            labels: indexable collection of labels, aligned with ``htmls``.
            batch_size: maximum number of samples per batch.
            char_tokenizer: object exposing ``texts_to_sequences``.
            word_tokenizer: tokenizer kept for the word-level preprocessing.
        """
        # Let the Keras base class set up its own state.
        super().__init__()
        self.htmls = htmls
        self.labels = labels
        self.batch_size = batch_size
        self.char_tokenizer = char_tokenizer
        # Previously accepted but silently dropped; keep it on the instance.
        self.word_tokenizer = word_tokenizer
        # Sample order; reshuffled in place by on_epoch_end.
        self.indexes = np.arange(len(htmls))

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.htmls) / self.batch_size))

    def __getitem__(self, index):
        """Build batch number ``index``.

        Returns:
            A ``(inputs, labels)`` pair where ``inputs`` is the list
            ``[char_input, word_input, word_char_input]`` and ``labels`` is a
            NumPy array of the batch labels.
        """
        batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_htmls = [self.htmls[i] for i in batch_indexes]
        batch_labels = [self.labels[i] for i in batch_indexes]

        char_input_data = self._preprocess_char_input(batch_htmls)
        # NOTE(review): the two helpers below are not defined in this class.
        word_input_data = self._preprocess_word_input(batch_htmls)
        word_char_input_data = self._preprocess_word_char_input(batch_htmls)

        return [char_input_data, word_input_data, word_char_input_data], np.array(batch_labels)

    def _preprocess_char_input(self, htmls):
        """Tokenize ``htmls`` with the char tokenizer and pad/truncate at the end."""
        char_sequences = self.char_tokenizer.texts_to_sequences(htmls)
        # NOTE(review): MAX_URL_LEN_CHAR is not defined in the visible notebook.
        return pad_sequences(char_sequences, maxlen=MAX_URL_LEN_CHAR, padding='post', truncating='post')

    def on_epoch_end(self):
        """Shuffle the sample order in place."""
        np.random.shuffle(self.indexes)


In [1]:
# save model
# 
