# THIS CODE WAS RUN ON GOOGLE COLAB FOR COMPATIBILITY.

# Notebook Setup

In [None]:
# Debug switch: when True, the dataset cell prints one training batch as a sanity check.
DEBUG = False

In [None]:
# Colab only: mount Google Drive so the dataset and checkpoint paths under
# /content/drive/MyDrive used later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -U "tensorflow-text==2.13.*"
!pip install "tf-models-official==2.13.*"

import os
import shutil

import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text



In [58]:
# Model handles consumed by hub.KerasLayer in build_classifier_model():
# the text preprocessing model and the BERT encoder it feeds.
# NOTE(review): the encoder name suggests 8 layers / hidden size 512 / 8 attention
# heads — confirm against the model page before quoting it in the write-up.
tfhub_handle_preprocess = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
tfhub_handle_encoder = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-8-h-512-a-8/2"

In [59]:
# Display the TensorFlow version in use; the pip pins in the setup cell target the 2.13.* series.
tf.__version__

'2.13.1'

# Preprocessing

In [60]:
# Load the cleaned spam-detection dataset from Drive and discard the
# 'Unnamed: 0' column (presumably a row index written out with the CSV).
df = (
    pd.read_csv("/content/drive/MyDrive/coursework/p_spam_detection/data/data_clean.csv")
    .drop(columns='Unnamed: 0')
)
df.head()

Unnamed: 0,label,text
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i don't think he goes to usf he lives arou...


## Tensors

In [61]:
# Number of examples per batch for both the training and validation datasets.
BATCH_SIZE = 64

In [62]:
# Force every entry of the text column to a Python string so it can be
# turned into a tf.string tensor later on.
df = df.astype({'text': str})

In [63]:
from sklearn.model_selection import train_test_split

# Fixed seed so the stratified train/validation split -- and therefore the
# metrics reported below -- is the same after every Restart & Run All.
SPLIT_SEED = 42

x_train, x_val, y_train, y_val = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=SPLIT_SEED,
)

train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
val = tf.data.Dataset.from_tensor_slices((x_val, y_val))

# Only the training data is shuffled. The validation metrics do not depend on
# example order, and leaving `val_batch` unshuffled keeps its order aligned
# with `x_val` / `y_val`.
train_batch = train.shuffle(len(train)).batch(BATCH_SIZE)
val_batch = val.batch(BATCH_SIZE)

# preview data
if DEBUG:
    # The loop variables are deliberately not called `text`: that name is the
    # module alias for tensorflow_text imported in the setup cell.
    for sample_text, sample_label in train_batch.take(1):
        print(sample_text, sample_label)

# Model

In [64]:
def build_classifier_model(preprocess_handle=None, encoder_handle=None,
                           dropout_rate=0.1, trainable=False):
    """Build a binary text classifier on top of a TF Hub BERT encoder.

    The model takes raw strings, runs them through the Hub preprocessing
    layer and the BERT encoder, and feeds the encoder's ``pooled_output``
    through dropout into a single sigmoid unit.

    Called with no arguments it builds exactly the model this notebook has
    always built; the parameters exist so that other encoder sizes can be
    tried without editing the module-level handles.

    Args:
        preprocess_handle: Hub handle of the preprocessing model. Defaults to
            the notebook-level ``tfhub_handle_preprocess``.
        encoder_handle: Hub handle of the BERT encoder. Defaults to the
            notebook-level ``tfhub_handle_encoder``.
        dropout_rate: Dropout rate applied to the pooled output.
        trainable: Whether the encoder weights are updated during training.
            ``False`` keeps the encoder frozen so only the classifier head
            is trained.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping a batch of strings to one
        sigmoid output per example.
    """
    # Resolve the defaults at call time so that re-running the handle cell
    # with different values is picked up by the next call.
    if preprocess_handle is None:
        preprocess_handle = tfhub_handle_preprocess
    if encoder_handle is None:
        encoder_handle = tfhub_handle_encoder

    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(preprocess_handle, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(encoder_handle, trainable=trainable, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)
    return tf.keras.Model(text_input, net)

bert_model = build_classifier_model()

In [65]:
# Adam with a starting learning rate of 0.1. The classifier head ends in a
# sigmoid, so the loss is plain binary cross-entropy on probabilities.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
bert_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=optimizer,
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.Precision(),
    ],
)


In [None]:
def scheduler(epoch, lr):
    """Return half of the current learning rate.

    Intended as the schedule function for
    ``tf.keras.callbacks.LearningRateScheduler``, which passes in the epoch
    index and the current learning rate.

    Args:
        epoch: Index of the epoch about to run. Not used by this schedule.
        lr: Current learning rate.

    Returns:
        ``lr / 2``.
    """
    halved = lr / 2
    return halved

# Checkpoint callback: writes the model weights only (not the full model)
# to the Drive path below.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath="/content/drive/MyDrive/coursework/p_spam_detection/model_checkpoints",
)
# Learning-rate callback driven by `scheduler`.
cp_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Train for 10 epochs, evaluating on the validation batches after each one.
history = bert_model.fit(
    train_batch,
    epochs=10,
    validation_data=val_batch,
    callbacks=[cp_callback, cp_lr],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn.metrics import f1_score

# Run the model on the held-out texts and turn its sigmoid outputs into hard
# 0/1 labels: anything not strictly below 0.5 becomes 1.
val_pred = np.logical_not(bert_model.predict(x_val) < 0.5).astype(int)

f1_score(y_val, val_pred)



0.9387755102040817

In [None]:
# Export the trained model to Drive.
# NOTE(review): this is the same path string that was given to ModelCheckpoint
# during training — consider a separate export directory to keep the per-epoch
# weight checkpoints and the final export clearly apart.
bert_model.save("/content/drive/MyDrive/coursework/p_spam_detection/model_checkpoints")

**Note:** To keep training fast, we started with one of the lightest versions of BERT. That model follows the typical BERT architecture with 2 layers and a hidden size of 128 (embedding dimension). Its predictive performance was middling, with an F1-score of 0.85714.

Switching to a heavier model (8 layers, hidden size 512) improved the F1-score to 0.93878 (the value reported by the evaluation cell above). **TODO:** state how this compares with our homebrew model (F1-score 0.XXxxx — value still to be filled in).