# Notebook Setup

In [52]:
# Data
import pandas as pd
import numpy as np

# ML/DL
import tensorflow as tf
import tensorflow.keras as k

# Technical
import os
import time
import typing
from IPython.display import clear_output

In [53]:
# Load the cleaned dataset (columns: label, text).
# The path is built with os.path.join so the notebook also runs on non-Windows
# machines; the original hard-coded a backslash separator.
# 'Unnamed: 0' is a stray index column written into the CSV; drop it without
# mutating in place so re-running this cell is idempotent.
df = pd.read_csv(os.path.join('data', 'data_clean.csv')).drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,label,text
0,0,"jurong point , crazy .. available bugis n grea..."
1,0,ok lar ... joke wif u oni ...
2,1,free entry 2 wkly comp win FA Cup final tkts 2...
3,0,u dun early hor ... u c ...
4,0,"nah I think usf , live"


# Preprocessing

## Tokenization

In [54]:
# Vocabulary cap passed to the tokenizer as `num_words`; also reused as the embedding input size.
NUM_WORDS = 5_000
# Number of examples per batch when batching the train/validation datasets.
BATCH_SIZE = 64

In [55]:
# Force every entry of the text column to `str` before tokenizing
# (presumably guards against NaN / non-string rows — TODO confirm; NaN would become the literal 'nan').
df['text'] = df['text'].astype(str)

def tokenize(text, tokenizer, num_words):
    """Fit a fresh tokenizer on `text` and return `text` encoded as integer sequences.

    Args:
        text: Iterable of strings; used both to fit the vocabulary and as the
            input that gets encoded.
        tokenizer: Tokenizer *class* (not an instance). It is called as
            `tokenizer(num_words=num_words, oov_token='oov')`.
        num_words: Vocabulary cap forwarded to the tokenizer constructor.

    Returns:
        The result of `texts_to_sequences(text)` on the fitted instance.
        The fitted tokenizer itself is not returned.
    """
    # Keep the instance under its own name instead of rebinding the class argument.
    fitted = tokenizer(num_words=num_words, oov_token='oov')
    fitted.fit_on_texts(text)
    sequences = fitted.texts_to_sequences(text)
    return sequences

# Tokenize the whole corpus and store the integer sequences next to the text.
# NOTE(review): the tokenizer is fit on all rows here, before the train/validation
# split performed later in the notebook, so validation vocabulary leaks into the
# fit — consider fitting on the training rows only.
df['token'] = tokenize(df['text'], k.preprocessing.text.Tokenizer, NUM_WORDS)

## Padding

In [56]:
# Pad every token sequence with trailing zeros ('post'); with maxlen=None the
# target length is taken from the longest sequence.
padded = k.preprocessing.sequence.pad_sequences(
    df['token'],
    padding='post',
    maxlen=None,
)

## Tensors

In [57]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and therefore every metric reported below, is
# reproducible under Restart & Run All.
SPLIT_SEED = 42

# Hold out 20% for validation. Stratify on the label so both splits keep the
# same class ratio (the sample batch suggests the positive class is a minority).
x_train, x_val, y_train, y_val = train_test_split(
    padded,
    df['label'],
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=df['label'],
)

train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
val = tf.data.Dataset.from_tensor_slices((x_val, y_val))

# Shuffle only the training set, using a buffer that covers the full dataset.
# Validation metrics are aggregates and do not depend on example order, so the
# validation set is just batched.
train_batch = train.shuffle(len(train), seed=SPLIT_SEED).batch(BATCH_SIZE)
val_batch = val.batch(BATCH_SIZE)

# Sanity check: inspect one (text, label) batch.
for text, label in train_batch.take(1):
    print(text, label)

tf.Tensor(
[[3639    1  370 ...    0    0    0]
 [ 101  183 1168 ...    0    0    0]
 [4885  364    3 ...    0    0    0]
 ...
 [  33 1657  323 ...    0    0    0]
 [2777   12   73 ...    0    0    0]
 [   2   19  382 ...    0    0    0]], shape=(64, 78), dtype=int32) tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0], shape=(64,), dtype=int64)


# Model

In [58]:
import tensorflow.keras.layers as l

# Embedding table size: one row per token id, capped at the tokenizer's NUM_WORDS.
vocab_size = NUM_WORDS

# Small binary classifier: 8-dim embeddings averaged over the sequence axis,
# one hidden layer with dropout, then a sigmoid output.
# NOTE(review): padding zeros are not masked, so they take part in the average — confirm this is intended.
model = k.Sequential()
model.add(l.Embedding(vocab_size, 8, input_shape=[padded.shape[1],]))
model.add(l.GlobalAveragePooling1D())
model.add(l.Dense(8, activation='swish'))
model.add(l.Dropout(0.2))
model.add(l.Dense(1, activation='sigmoid'))

# Initial learning rate; the LearningRateScheduler callback used during training decays it.
optimizer = k.optimizers.Adam(learning_rate=0.1)

# The order of `metrics` determines the order of the values returned by model.evaluate().
model.compile(
    loss=k.losses.BinaryCrossentropy(),
    optimizer=optimizer,
    metrics=[k.metrics.BinaryAccuracy(), k.metrics.Recall(), k.metrics.Precision()],
)

In [59]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 78, 8)             40000     
                                                                 
 global_average_pooling1d_6   (None, 8)                0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_12 (Dense)            (None, 8)                 72        
                                                                 
 dropout_6 (Dropout)         (None, 8)                 0         
                                                                 
 dense_13 (Dense)            (None, 1)                 9         
                                                                 
Total params: 40,081
Trainable params: 40,081
Non-trainable params: 0
__________________________________________________

In [60]:
def scheduler(epoch, lr):
    """Return the learning rate decayed by a constant factor of exp(-0.1).

    Args:
        epoch: Epoch index supplied by the callback; not used by this schedule.
        lr: Current learning rate.

    Returns:
        `lr * exp(-0.1)`, i.e. roughly a 9.5% reduction per call.
    """
    decay_factor = np.exp(-0.1)
    return lr * decay_factor
# Callback that passes the current learning rate through `scheduler` each epoch.
cp_lr = k.callbacks.LearningRateScheduler(scheduler)

# Train for 100 epochs, validating on the held-out batches after each epoch.
hist = model.fit(
    train_batch,
    epochs=100,
    validation_data=val_batch,
    callbacks=[cp_lr],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [78]:
# Evaluate on the validation batches. The returned list follows the compile order:
# [loss, binary accuracy, recall, precision].
model.evaluate(val_batch)



[0.16267839074134827,
 0.9802690744400024,
 0.8888888955116272,
 0.9806451797485352]

Even with a very basic model (40,000 embedding params [5,000 vocab × 8 dim] and 81 dense-layer params) we get very satisfying results.

Reminder:  
accuracy = (TP + TN) / (TP + TN + FP + FN) (% of all predictions that are correct)  
recall = TP / (TP + FN) (% of actual positives that were correctly predicted)  
precision = TP / (TP + FP) (% of positive predictions that were correct)

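We detect about 89% of all spam cases (recall), about 98% of the messages we flag as spam really are spam (precision), and about 98% of all our predictions are correct (accuracy).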

Let's see whether transfer learning from a more complex model can improve our performance.