# Notebook Setup

In [139]:
# Data
import pandas as pd
import numpy as np

# ML/DL
import tensorflow as tf
import tensorflow.keras as k

# Technical
import os
import time
import typing
from IPython.display import clear_output

In [140]:
df = pd.read_csv(r'data\data_lemmatized_stopped_clean.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,label,text
0,0,jurong point crazy available bugis n great wor...
1,0,ok lar joke wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun early hor u c
4,0,nah think usf live


# Preprocessing

## Tokenization

In [141]:
NUM_WORDS = 1_000
BATCH_SIZE = 64

In [142]:
df['text'] = df['text'].astype(str)

def tokenize(text, tokenizer, num_words):
    tokenizer = tokenizer(num_words=num_words, oov_token='oov')
    tokenizer.fit_on_texts(text)
    return tokenizer.texts_to_sequences(text)

df['token'] = tokenize(df['text'], k.preprocessing.text.Tokenizer, NUM_WORDS)

## Padding

In [143]:
padded = k.preprocessing.sequence.pad_sequences(df['token'], maxlen=None, padding='post')

## Tensors

In [144]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(padded, df['label'], test_size=0.2)
train, val = tf.data.Dataset.from_tensor_slices((x_train, y_train)), tf.data.Dataset.from_tensor_slices((x_val, y_val))
train_batch, val_batch = train.shuffle(len(train)).batch(BATCH_SIZE), val.shuffle(len(val)).batch(BATCH_SIZE)

for text, label in train_batch.take(1):
    print(text, label)

tf.Tensor(
[[  2 126   3 ...   0   0   0]
 [360 371  70 ...   0   0   0]
 [143   1   0 ...   0   0   0]
 ...
 [119   2   1 ...   0   0   0]
 [ 66   1   0 ...   0   0   0]
 [178   1  67 ...   0   0   0]], shape=(64, 74), dtype=int32) tf.Tensor(
[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0], shape=(64,), dtype=int64)


# Model

In [145]:
import tensorflow.keras.layers as l

vocab_size = NUM_WORDS
model = k.Sequential([
    l.Embedding(vocab_size, 8, input_shape=[padded.shape[1],]),
    l.GlobalAveragePooling1D(),
    l.Dense(8, activation='swish'),
    l.Dropout(0.2),
    l.Dense(1, activation='sigmoid')
])

optimizer = k.optimizers.Adam()
model.compile(
    optimizer=optimizer,
    loss=k.losses.BinaryCrossentropy(),
    metrics=[k.metrics.BinaryAccuracy(), k.metrics.Recall(), k.metrics.Precision()]
    )

In [146]:
model.summary()

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_22 (Embedding)    (None, 74, 8)             8000      
                                                                 
 global_average_pooling1d_22  (None, 8)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_62 (Dense)            (None, 8)                 72        
                                                                 
 dropout_40 (Dropout)        (None, 8)                 0         
                                                                 
 dense_63 (Dense)            (None, 1)                 9         
                                                                 
Total params: 8,081
Trainable params: 8,081
Non-trainable params: 0
___________________________________________________

In [147]:
hist = model.fit(
    train_batch,
    validation_data=val_batch,
    epochs=50
    )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

KeyboardInterrupt: 

In [148]:
print(
    'Best accuracy: {}\nBest recall: {}\nBest precision: {}'.format(np.max(hist.history['val_binary_accuracy']), np.max(hist.history['val_recall_1']), np.max(hist.history['val_precision_1'])),
)

Best accuracy: 0.9838564991950989
Best recall: 0.9066666960716248
Best precision: 0.9774436354637146


Even with a very basic model (8000 embedding params [1000 vocab, 8 dim] and 81 layer params) we get very satisfying results.

Reminder:  
accuracy TP + TN / TP+TN+FP+FN (% of correct predictions)  
recall TP / TP + FN (% of positive values that were correctly predicted)  
precision TP / TP + FP (% of positive predictions that were correct)

We can detect 90% of all spam cases, and 97% of our predictions are correct.

Let's try seeing if using transfer learning of a more complex model can improve our performance.