In [1]:
import os
import json
import pickle
import random
import sqlite3
import jsonlines
from collections import Counter
from unicodedata import normalize
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from tokenizers import ByteLevelBPETokenizer
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Embedding, Conv2D, MaxPooling2D, Concatenate, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix,ConfusionMatrixDisplay
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from tensorflow.keras import backend as K

# Clear memory
K.clear_session()



print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

  from .autonotebook import tqdm as notebook_tqdm


Num GPUs Available:  1


In [22]:
tokenizer_vocab_size = 15000
html_max_length = 128
embed_dimension = 2
batch_size = 8
num_filters = 256


In [3]:
def get_median_and_average(seq_list):
    lengths = [len(doc.ids) for doc in seq_list]
    lengths.sort()
    lengths_avg = sum(lengths)/len(lengths)
    median = lengths[len(lengths)//2]
    print(f"median: {median}")
    print(f"average length: {int(lengths_avg)}")

In [4]:
# Load the list from the pickle file
with open('training_data.pkl', 'rb') as f:
    training_data = pickle.load(f)
    # random.shuffle(training_data)
    print("list loaded successfully")
    print(len(training_data))



list loaded successfully
2000


In [5]:
# split the dataset into training and test sets
train_split = 0.8
test_split = 1-train_split


In [6]:
train = training_data[: int(len(training_data) * train_split)]
test = training_data[int(len(training_data) * train_split):]
print(len(train),len(test))

1600 400


In [7]:
# load X and Y s of dataset of tuples which are (html,label) 
x_train = []
y_train = []

x_test = []
y_test = []

for elem in train:
    x_train.append(elem[0])
    if elem[1] == "Adult":
        y_train.append(1)
    elif elem[1] == "Benign":
        y_train.append(0)

for elem in test:
    x_test.append(elem[0])
    if elem[1] == "Adult":
        y_test.append(1)
    elif elem[1] == "Benign":
        y_test.append(0)


In [8]:

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(x_train+x_test,
                              vocab_size=tokenizer_vocab_size, 
                              min_frequency=2,
                              special_tokens=["<unk>", "<s>", "</s>", "<pad>"])

In [9]:
# encoded = 
# encoded_html_docs = [tokenizer.encode(doc) for doc in loaded_html_documents]

In [10]:
def save_tokenizer(path):
    # Create the directory if it does not exist
    os.makedirs(path, exist_ok=True)
    tokenizer.save_model(path)
    print("tokenizer saved successfully")

# Load model
def load_tokenizer(path):
    tokenizer = ByteLevelBPETokenizer(f"{path}/vocab.json", f"{path}/merges.txt")
    return tokenizer

In [11]:
save_tokenizer("bpe_tokenizer_20k")

tokenizer saved successfully


In [12]:
class DataGenerator(Sequence):
    def __init__(self, htmls, labels, batch_size, tokenizer):
        self.htmls = htmls
        self.labels = labels
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.indexes = np.arange(len(htmls))
    
    def __len__(self):
        return int(np.ceil(len(self.htmls) / self.batch_size))
    
    def __getitem__(self, index):
        batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_htmls = [self.htmls[i] for i in batch_indexes]
        batch_labels = [self.labels[i] for i in batch_indexes]
        
        
        html_input_data = self._preprocess_html_doc(batch_htmls)
        
        return html_input_data, np.array(batch_labels)

    
    def encode_and_pad_sequence(self,sequence, max_length, pad_token="<pad>"):
        pad_token_id = 3
    
       
        token_ids =  tokenizer.encode(sequence).ids
        if len(token_ids) < max_length:
            # Pad with <pad> token
            token_ids.extend([pad_token_id] * (max_length - len(token_ids)))
        return token_ids[:max_length]
    
    def _preprocess_html_doc(self, htmls):
        html_sequences = [self.encode_and_pad_sequence(sequence,html_max_length) for sequence in htmls]
        
        return html_sequences
    


    def on_epoch_end(self):
        np.random.shuffle(self.indexes)


In [13]:
training_generator = DataGenerator(x_train, y_train, batch_size, tokenizer)
test_generator = DataGenerator(x_test, y_test, batch_size, tokenizer)

In [None]:

# Load the pre-trained MobileBERT model and tokenizer
model = TFAutoModelForSequenceClassification.from_pretrained("google/mobilebert-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")



In [14]:
input = Input(shape=(html_max_length,))

embedding = Embedding(input_dim=tokenizer_vocab_size, input_length=html_max_length, output_dim=embed_dimension)(input)
embedding = tf.expand_dims(embedding, -1)  # Add channel dimension
print(embedding.shape)

(None, 700, 4, 1)


In [22]:
conv_3 = Conv2D(num_filters, (3, embed_dimension))(embedding)
conv_3 = BatchNormalization()(conv_3)  # Batch normalization after convolution
conv_3 = Activation('relu')(conv_3) 
conv_3 = MaxPooling2D((2, 1), strides=(2, 1))(conv_3)
# h = 5
conv_5 = Conv2D(num_filters, (5, embed_dimension), activation='relu')(embedding)
conv_5 = BatchNormalization()(conv_5)  # Batch normalization after convolution
conv_5 = Activation('relu')(conv_5) 
conv_5 = MaxPooling2D((2, 1), strides=(2, 1))(conv_5)


In [23]:
#concatenate all convolutional layer outputs
concatenated = Concatenate(axis=1)([conv_3,conv_5])
flattened = Flatten()(concatenated)
print(flattened.shape)

# feed concatenated conv layers to fully conected layer
dense = Dense(512,activation='relu',kernel_regularizer=l2(0.01))(flattened)
dropout = Dropout(0.5)(dense)
dropout.shape

(None, 178432)


ResourceExhaustedError: OOM when allocating tensor with shape[178432,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]

In [None]:
output_layer = Dense(2, activation='softmax')(dropout)

# MobileBERT implementation


In [37]:
# texts = [elem[0] for elem in training_data]
# labels = [elem[1] for elem in training_data]

In [23]:
tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")

# Tokenize the texts
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=html_max_length, return_tensors="tf")

# Tokenize your dataset
inputs = tokenize_function(x_train)
test_inputs = tokenize_function(x_test)


In [24]:
# Convert to TensorFlow Dataset
dataset = tf.data.Dataset.from_tensor_slices((dict(inputs), y_train))
dataset_test =  tf.data.Dataset.from_tensor_slices((dict(test_inputs), y_test))

# Shuffle and batch the dataset
train_dataset = dataset.shuffle(len(x_train)).batch(batch_size)
test_dataset = dataset.shuffle(len(x_test)).batch(batch_size)

In [25]:
# Load MobileBERT for classification
model = TFAutoModelForSequenceClassification.from_pretrained("google/mobilebert-uncased", num_labels=2)

All model checkpoint layers were used when initializing TFMobileBertForSequenceClassification.

Some layers of TFMobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# checkpoint = ModelCheckpoint('mobileBERT_alpha_2k.h5', monitor='val_loss', save_best_only=True, mode='min')
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

In [27]:
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


In [29]:
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 11/200 [>.............................] - ETA: 37s - loss: 0.3669 - accuracy: 0.9545


KeyboardInterrupt



In [47]:
tokenize_function([""])


{'input_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[  101, 22555,  6979,  2497, 22555,  3348,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 