# Convolution neural network
Building a CNN on top of the techniques we've described :
- Word embedding layer
- Upsampling the dataset using back-translations

In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#Text processing
import tensorflow as tf # conda install -c conda-forge tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#Deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.layers import Conv1D, MaxPooling1D #CNN specific

In [2]:
url = "https://raw.githubusercontent.com/ThinhNguyendai/SMSSpamDetection/main/SMSSpamCollection" #Use the RAW one
messages = pd.read_csv(url, sep ='\t', names=["label", "message"])

#Oversampled part
url2 = "https://raw.githubusercontent.com/ThinhNguyendai/SMSSpamDetection/main/Spam"
new_spam = pd.read_csv(url2, sep ='\t', names=["message"]) # No label like the usual file here
spam_labels = ["spam" for i in range(len(new_spam))]
new_spam.insert(0, "label", spam_labels, allow_duplicates=True)

In [3]:
ham_msg = messages[messages.label =='ham']
spam_msg = messages[messages.label=='spam']
new_spam_df = new_spam.sample(n = len(ham_msg) - len(spam_msg), random_state = 754)
msg_df = pd.concat([ham_msg, spam_msg, new_spam_df])
msg_df.reset_index(drop=True)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,ham,Even my brother is not like to speak with me. ...
...,...,...
9645,spam,You have a secret admirer who is looking 2 con...
9646,spam,25p 4 Alfie Moon's Children in need song on ur...
9647,spam,Block Breaker now comes in deluxe format with ...
9648,spam,"Sun vacation. To claim your medical holiday, s..."


In [4]:
msg_labels = (msg_df['label'].map({'ham': 0, 'spam': 1})).values
train_msg, test_msg, train_labels, test_labels = train_test_split(msg_df['message'],
                                                                  msg_labels,
                                                                  test_size=0.2,
                                                                  random_state=705)

In [8]:
#Tokenizer : turn words into integers
oov_tok = "<OOV>" # What to replace words that are not in the vocabulary with
vocab_size = 500 # Maximum number of words for tokenizer

tokenizer = Tokenizer(num_words = vocab_size,
                      char_level=False, # Work words by word
                      oov_token = oov_tok)
tokenizer.fit_on_texts(train_msg)

In [24]:
word_index = tokenizer.word_index
len(word_index) #Before using data augmentation : had 4194 words

8947

In [9]:
# Sequencing and padding on training and testing 
max_len = 50 # Max number of tokens, used with truncating and padding
trunc_type = "post" # Truncates sequences of tokens that are longer than max_len, post=right side
padding_type = "post" # Pads AFTER (with post) if sequence is shorter than max_len

training_sequences = tokenizer.texts_to_sequences(train_msg)
training_padded = pad_sequences (training_sequences, maxlen = max_len,
                                 padding = padding_type, truncating = trunc_type)

testing_sequences = tokenizer.texts_to_sequences(test_msg)
testing_padded = pad_sequences(testing_sequences, maxlen = max_len,
                               padding = padding_type, truncating = trunc_type)

In [12]:
print('Shape of training array: ', training_padded.shape)
print('Shape of testing array: ', testing_padded.shape)
print(type(training_padded))
print(type(train_labels))

Shape of training array:  (7720, 50)
Shape of testing array:  (1930, 50)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [18]:
#For some reason, sensitivity and specificity are not by default in Keras
#Source of this code : https://www.sabinasz.net/unbalanced-classes-machine-learning/
from tensorflow.keras import backend as K

def sensitivity(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def specificity(y_true, y_pred):
    true_negatives = K.sum(K.round(K.clip((1-y_true) * (1-y_pred), 0, 1)))
    possible_negatives = K.sum(K.round(K.clip(1-y_true, 0, 1)))
    return true_negatives / (possible_negatives + K.epsilon())

### Keras functions to define neural network
Embedding layer takes as input a vector of length *input_length* of **integers**, where the integers are between 0 and *vocab_size-1* (both bounds included). The output is a matrix of dimensions *input_length* X *output_dim*. In other words, each input neuron is projected into a space of dimension *output_dim*.

The flatten layer flattens the 2D output into a 1D array

Dense layer is another name for the regular fully connected layer.

In [22]:
#vocab_size = 500
#max_len = 50
embedding_dim = 32

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length = max_len))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', sensitivity, specificity])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 32)            16000     
_________________________________________________________________
flatten_3 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               204928    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 221,057
Trainable params: 221,057
Non-trainable params: 0
_________________________________________________________________


In [23]:
hist = model.fit(training_padded, train_labels, validation_split=0.2, epochs=15, batch_size=20)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Commenting on results
This new neural network performs significantly better.
It's difficult to say whether it's due to simply using more parameters or the embedding layer, but the results are there.

**Might be worth checking whether the choices of hyperparameters make sense**. Max_len and vocab_size might no longer be appropriate for this new dataset.

# Convolutional neural network
It would be nice to obtain a model with significantly less parameters but similar or even better performance.
The size of the model hasn't been an issue in our case, but can quickly become one as we work with more complicated language processing problems.

This is a 1D convolutional neural network. **I'm not sure why it would work well for language processing problems, since convolutional neural networks were designed for image processing**. The basic idea of a convolutional layer is that it allows to extract features from the input image, and pooling layers allow you to combine those features. I guess I could vaguely see how that would be useful for text classification, but we'll see.

I'm going to work with the simple layers that Keras gives, such as Conv1D layer, MaxPooling1D layer and AveragePooling1D layer.

In [53]:
max_len = 50
embedding_dim = 32
batch_size = 20

conv1_size = 24 #No ideas bro
conv2_size = 32
fc_size = 64  # Reduced size of this

In [41]:
CNN = Sequential()
CNN.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
# No flatten because Conv1D input is formatted this way
CNN.add(Conv1D(conv1_size, 1, activation='relu'))
CNN.add(Conv1D(conv2_size, 3, activation='relu'))
CNN.add(MaxPooling1D(pool_size=2))
CNN.add(Flatten())
CNN.add(Dense(fc_size, activation='relu'))
CNN.add(Dense(1, activation='sigmoid'))
CNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', sensitivity, specificity])
CNN.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 50, 32)            16000     
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 50, 24)            792       
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 48, 32)            2336      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 24, 32)            0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 768)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 64)                49216     
_________________________________________________________________
dense_17 (Dense)             (None, 1)               

In [43]:
histCNN = CNN.fit(training_padded, train_labels, validation_split=0.2, epochs=15, batch_size=batch_size)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


# Results
The training is slower at first, not sure why.

We obtain pretty similar results, but the number of parameters is way down. Let's try more convolutional and pooling and less fully connected.

In [54]:
conv1_size = 24
conv2_size = 32
conv3_size = 32 #Second set of (Conv -> Conv) -> Pooling
conv4_size = 48
fc_size = 32  # Reduced size of this again

In [51]:
CNN2 = Sequential()
CNN2.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
CNN2.add(Conv1D(conv1_size, 1, activation='relu'))
CNN2.add(Conv1D(conv2_size, 3, activation='relu'))
CNN2.add(MaxPooling1D(pool_size=2))
CNN2.add(Conv1D(conv3_size, 1, activation='relu'))
CNN2.add(Conv1D(conv4_size, 3, activation='relu'))
CNN2.add(MaxPooling1D(pool_size=2))
CNN2.add(Flatten())
CNN2.add(Dense(fc_size, activation='relu'))
CNN2.add(Dense(1, activation='sigmoid'))
CNN2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', sensitivity, specificity])
CNN2.summary()

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 50, 32)            16000     
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 50, 24)            792       
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 48, 32)            2336      
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 24, 32)            0         
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 24, 32)            1056      
_________________________________________________________________
conv1d_31 (Conv1D)           (None, 22, 48)            4656      
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 11, 48)          

In [55]:
histCNN2 = CNN2.fit(training_padded, train_labels, validation_split=0.2, epochs=15, batch_size=batch_size)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


# Results of 2nd CNN
Even though we have less parameters, the initial epochs are slower to train. I do not know why.

The training is less stable than the other 2 models, but the achieved sensitivity and specificity seem better.
**I straight up have no idea how to choose architectures, I'm just trying stuff out**.