In [23]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#import tensorflow_model_analysis as tfma
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras.callbacks import ModelCheckpoint
#from keras.models import Sequential
#from keras.layers.embeddings import Embedding
from keras.layers import TextVectorization
from keras.layers import Embedding, Dense, Dropout, Flatten, GRU, Input, LSTM, SimpleRNN
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv('/content/drive/My Drive/spam.csv', encoding= 'latin_1')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1, inplace=True)
data.rename(columns={'v1': 'Target', 'v2': 'Email'}, inplace=True)
data['Target']=data['Target'].map({'ham': 0, 'spam': 1})
data.head()

texts = data['Email']
labels = data['Target']

print("Text count: " , len(texts))
print("Label count: ", len(labels))

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
email_sequences = tokenizer.texts_to_sequences(texts)
email_sequences = sorted(email_sequences, key=len)

word_index = tokenizer.word_index
print("{0} unique words found".format(len(word_index)))

#padded_emails = pad_sequences(email_sequences)

#print("Shape of data: ", padded_emails.shape)
#print(padded_emails)

Text count:  5572
Label count:  5572
8920 unique words found


In [25]:
#Data splitting
X_train, x_test, Y_train, y_test = train_test_split(email_sequences, labels, test_size=0.3, random_state=7)

X_train = pad_sequences(X_train)
x_test = pad_sequences(sorted(x_test, key=len))

split0 = int(len(x_test)/3)
split1 = int((2*len(x_test))/3)
print(split0,split1)

#Test dataset small
x_test_small = x_test[:split0]
y_test_small = y_test[:split0]

print(y_test_small.shape)
#Test dataset mid
x_test_mid = x_test[split0:split1]
y_test_mid = y_test[split0:split1]

print(y_test_mid.shape)
#Test dataset large
x_test_large = x_test[split1:]
y_test_large = y_test[split1:]

print(y_test_large.shape)

557 1114
(557,)
(557,)
(558,)


In [26]:
##Debugging splits
print(x_test_small)
print(x_test_mid)
print(x_test_large)

print(y_test_small)
print(y_test_mid)
print(y_test_large)

[[   0    0    0 ...    0    0  743]
 [   0    0    0 ...    0    0  570]
 [   0    0    0 ...    0    0  647]
 ...
 [   0    0    0 ... 6613    2 1210]
 [   0    0    0 ...    8    5 1289]
 [   0    0    0 ...  198   55   20]]
[[   0    0    0 ... 5098 5099    3]
 [   0    0    0 ...   86  569   44]
 [   0    0    0 ...   45 3869 1102]
 ...
 [   0    0    0 ...  283 3469  187]
 [   0    0    0 ...    2   32 2133]
 [   0    0    0 ...  154  655  847]]
[[   0    0    0 ...   29 4190  388]
 [   0    0    0 ...   15  810 8392]
 [   0    0    0 ...   17  108  196]
 ...
 [   0    0    0 ... 2640 1603  428]
 [   0    0    0 ...  132  249  811]
 [ 607  615   15 ... 1198  798 1373]]
83      0
2235    0
2746    0
246     0
3120    0
       ..
4051    0
3285    0
2885    0
3903    1
2103    0
Name: Target, Length: 557, dtype: int64
3412    0
4458    1
2748    0
1078    0
3205    0
       ..
5529    0
2460    0
2661    0
2086    0
5126    0
Name: Target, Length: 557, dtype: int64
2886    0
4459  

In [27]:
#Metric definitions from Stack Overflow
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [28]:
#Simple RNN
EMBEDDING_SIZE=512
model = keras.Sequential()
model.add(Embedding(len(tokenizer.word_index)+1, EMBEDDING_SIZE))
model.add(SimpleRNN(EMBEDDING_SIZE, input_shape=(X_train.shape[1],1)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss = 'binary_crossentropy', optimizer ='adam', metrics = ["accuracy", f1_m, precision_m, recall_m])

#Train and save the best model
filepath = "SimpleRNN_model.h1"
checkpoint = ModelCheckpoint(filepath, monitor = "loss", mode = "min", verbose =1, save_best_only = True)
history = model.fit(X_train, Y_train, epochs = 5, batch_size = 100, callbacks = [checkpoint])

#1/3 of test data
print("1/3 test data score")
score = model.evaluate(x_test_small, y_test_small, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#2/3 of test data
print("2/3 test data score")
score = model.evaluate(x_test_mid, y_test_mid, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#3/3 of test data
print("Full test data score")
score = model.evaluate(x_test_large, y_test_large, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 512)         4567552   
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 512)               524800    
                                                                 
 dropout_2 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 513       
                                                                 
Total params: 5,092,865
Trainable params: 5,092,865
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 1: loss improved from inf to 0.43913, saving model to SimpleRNN_model.h1
Epoch 2/5
Epoch 2: loss improved from 0.43913 to 0.40165, saving model to SimpleRNN_mod

In [30]:
#LSTM
EMBEDDING_SIZE=512
model = keras.Sequential()
model.add(Embedding(len(tokenizer.word_index)+1, EMBEDDING_SIZE))
model.add(LSTM(EMBEDDING_SIZE, input_shape=(X_train.shape[1],1)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss = 'binary_crossentropy', optimizer ='adam', metrics = ["accuracy", f1_m, precision_m, recall_m])

#Train and save the best model
filepath = "LSTM_model.h1"
checkpoint = ModelCheckpoint(filepath, monitor = "loss", mode = "min", verbose =1, save_best_only = True)
history = model.fit(X_train, Y_train, epochs = 5, batch_size = 100, callbacks = [checkpoint])

#1/3 of test data
print("1/3 test data score")
score = model.evaluate(x_test_small, y_test_small, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#2/3 of test data
print("2/3 test data score")
score = model.evaluate(x_test_mid, y_test_mid, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#3/3 of test data
print("Full test data score")
score = model.evaluate(x_test_large, y_test_large, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 512)         4567552   
                                                                 
 lstm_2 (LSTM)               (None, 512)               2099200   
                                                                 
 dropout_4 (Dropout)         (None, 512)               0         
                                                                 
 dense_4 (Dense)             (None, 1)                 513       
                                                                 
Total params: 6,667,265
Trainable params: 6,667,265
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 1: loss improved from inf to 0.42724, saving model to LSTM_model.h1




Epoch 2/5
Epoch 2: loss improved from 0.42724 to 0.36753, saving model to LSTM_model.h1




Epoch 3/5
Epoch 3: loss improved from 0.36753 to 0.26405, saving model to LSTM_model.h1




Epoch 4/5
Epoch 4: loss improved from 0.26405 to 0.16935, saving model to LSTM_model.h1




Epoch 5/5
Epoch 5: loss improved from 0.16935 to 0.12002, saving model to LSTM_model.h1




1/3 test data score
Test loss: 0.6504
Test accuracy: 82.05
Test f1_score: 0.09
Test precision: 0.15
Test recall: 0.06
2/3 test data score
Test loss: 0.5772
Test accuracy: 82.76
Test f1_score: 0.05
Test precision: 0.07
Test recall: 0.04
Full test data score
Test loss: 0.6479
Test accuracy: 81.28
Test f1_score: 0.10
Test precision: 0.16
Test recall: 0.09


In [None]:
#GRU
EMBEDDING_SIZE=512
model = keras.Sequential()
model.add(Embedding(len(tokenizer.word_index)+1, EMBEDDING_SIZE))
model.add(GRU(EMBEDDING_SIZE, input_shape=(X_train.shape[1],1)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss = 'binary_crossentropy', optimizer ='adam',metrics = ["accuracy", f1_m, precision_m, recall_m])

#Train and save the best model
filepath = "GRU_model.h1"
checkpoint = ModelCheckpoint(filepath, monitor = "loss", mode = "min", verbose =1, save_best_only = True)
history = model.fit(X_train, Y_train, epochs = 5, batch_size = 100, callbacks = [checkpoint])

#1/3 of test data
print("1/3 test data score")
score = model.evaluate(x_test_small, y_test_small, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#2/3 of test data
print("2/3 test data score")
score = model.evaluate(x_test_mid, y_test_mid, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#3/3 of test data
print("3/3 test data score")
score = model.evaluate(x_test_large, y_test_large, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

In [None]:
##Part Two

#Download and unzip the Stanford GloVe model (pretrained word embeddings)
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2022-10-10 19:03:22--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-10-10 19:03:22--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-10-10 19:03:22--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
!wget https://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip -q glove.twitter.27B.zip

--2022-10-10 19:25:38--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2022-10-10 19:25:38--  https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [application/zip]
Saving to: ‘glove.twitter.27B.zip’


2022-10-10 19:30:23 (5.08 MB/s) - ‘glove.twitter.27B.zip’ saved [1520408563/1520408563]



In [None]:
# Upgrade pip to the latest, and install TFMA.
!pip install -U pip
!pip install tensorflow-model-analysis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.2.2-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 8.5 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-model-analysis
  Downloading tensorflow_model_analysis-0.41.1-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
Collecting attrs<22,>=19.3.0
  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow!=2

In [None]:
#Install Gensim
!pip install --upgrade gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 64.9 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [None]:
#2. Use builtin function in Gensim to convert glove to word2vec format
# Gensim works on Word2Vec and has built in function to convert Glove to Word2Vec

from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = "glove.6B.100d.txt"
word2vec_output_file = "glove.6B.100d.word2vec.txt"
glove2word2vec(glove_input_file, word2vec_output_file)

  import sys


(400000, 100)

In [None]:
big_glove_input_file = "glove.twitter.27B.100d.txt"
big_word2vec_output_file = "glove.twitter.27B.100d.word2vec.txt"
glove2word2vec(big_glove_input_file, big_word2vec_output_file)

  This is separate from the ipykernel package so we can avoid doing imports until


(1193514, 100)

In [None]:
#10. Read the embeddings in the pretrained model 

import os
path_to_glove_file = "glove.6B.100d.word2vec.txt"

embeddings_index = {}

with open(path_to_glove_file) as f:
  for line in f:
    word, coefs = line.split(maxsplit = 1)
    coefs = np.fromstring(coefs, "f", sep = " ")
    embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400001 word vectors.


In [None]:
#11. Create "embedding_matrix" to index our vocab using the GloVe model

# Prepare embedding_matrix for our word list
embedding_matrix = np.zeros((len(tokenizer.word_index)+1, 100)) #Not ", EMBEDDING_SIZE"?
for word, i in word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

In [None]:
# Embedding again for the bigger Twitter model

path_to_glove_file = "glove.twitter.27B.100d.word2vec.txt"

big_embeddings_index = {}

with open(path_to_glove_file) as f:
  for line in f:
    word, coefs = line.split(maxsplit = 1)
    coefs = np.fromstring(coefs, "f", sep = " ")
    big_embeddings_index[word] = coefs

print("Found %s word vectors." % len(big_embeddings_index))

Found 1193515 word vectors.


In [None]:
# Creating "embedding_matrix" again to index our vocab using the bigger Twitter model

# Prepare embedding_matrix for our word list
big_embedding_matrix = np.zeros((len(tokenizer.word_index)+1, 100)) #Not ", EMBEDDING_SIZE"?
for word, i in word_index.items():
  big_embedding_vector = big_embeddings_index.get(word)
  if big_embedding_vector is not None:
    big_embedding_matrix[i] = big_embedding_vector

print(embedding_matrix.shape)

In [None]:
#Intermediary steps

#vectorizer = TextVectorization(max_tokens=10000, output_sequence_length=100)
#voc = vectorizer.get_vocabulary()

## Prep the train dataset to samples and labels
#train_samples = [x['text'] for x in train]
#train_labels = [x['label'] for x in train]
#print("Classes: ", np.unique(train_labels))
#print("Number of samples in train: ", len(train_samples))
#print(train_samples[0])

#val_samples = [x['text'] for x in val]
#val_labels = [x['label'] for x in val]

#test_samples = [x['text'] for x in test]
#test_labels = [x['label'] for x in test]

#vocab_size = len(tokenizer.word_index) + 1
print((X_train.shape[1],1))

In [None]:
#Simple RNN - Glove 6B
#EMBEDDING_SIZE=512
model = keras.Sequential()
model.add(Input(shape=(None, ), dtype = "int64"))
model.add(Embedding(len(tokenizer.word_index) + 1, 100, weights=[embedding_matrix], input_length=20, trainable=False))
model.add(SimpleRNN(100, input_shape=(X_train.shape[1],1)))
#model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics = ["accuracy", f1_m, precision_m, recall_m])
# summarize the model
model.summary()

#Train and save the best model
filepath = "SimpleRNN_glove_6B.h1"
checkpoint = ModelCheckpoint(filepath, monitor = "loss", mode = "min", verbose =1, save_best_only = True)
history = model.fit(X_train, Y_train, epochs = 5, batch_size = 100, callbacks = [checkpoint])

#1/3 of test data
print("1/3 test data score")
score = model.evaluate(x_test_small, y_test_small, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#2/3 of test data
print("2/3 test data score")
score = model.evaluate(x_test_mid, y_test_mid, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#3/3 of test data
print("Full test data score")
score = model.evaluate(x_test, y_test, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))


print("Simple RNN - Glove 6B Confusion Matrix")
#mat1 = metrics.confusion_matrix(y_test, x_test)
#sns.heatmap(mat1.T, square = True, annot = True, fmt = 'd', cbar = False)
#plt.xlabel("True label")
#plt.ylabel("Predicted label")
tfma.metrics.ConfusionMatrixPlot(
    num_thresholds: int = DEFAULT_NUM_THRESHOLDS,
    name: str = CONFUSION_MATRIX_PLOT_NAME
)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 100)         892100    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 100)               20100     
                                                                 
 flatten (Flatten)           (None, 100)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 912,301
Trainable params: 20,201
Non-trainable params: 892,100
_________________________________________________________________
Epoch 1/5
Epoch 1: loss improved from inf to 0.24108, saving model to SimpleRNN_glove_6B.h1
Epoch 2/5
Epoch 2: loss improved from 0.24108 to 0.16517, saving model to SimpleRNN

In [None]:
#LSTM - Glove 6B
#EMBEDDING_SIZE=512
model = keras.Sequential()
model.add(Input(shape=(None, ), dtype = "int64"))
model.add(Embedding(len(tokenizer.word_index) + 1, 100, weights=[embedding_matrix], input_length=20, trainable=False))
model.add(LSTM(100, input_shape=(X_train.shape[1],1)))
#model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(loss = 'binary_crossentropy', optimizer ='adam', metrics = ["accuracy", f1_m, precision_m, recall_m])
# summarize the model
model.summary()

#Train and save the best model
filepath = "LSTM_glove_6B.h1"
checkpoint = ModelCheckpoint(filepath, monitor = "loss", mode = "min", verbose =1, save_best_only = True)
history = model.fit(X_train, Y_train, epochs = 5, batch_size = 100, callbacks = [checkpoint])

#1/3 of test data
print("1/3 test data score")
score = model.evaluate(x_test_small, y_test_small, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#2/3 of test data
print("2/3 test data score")
score = model.evaluate(x_test_mid, y_test_mid, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#3/3 of test data
print("Full test data score")
score = model.evaluate(x_test, y_test, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, None, 100)         892100    
                                                                 
 lstm_3 (LSTM)               (None, 100)               80400     
                                                                 
 flatten_11 (Flatten)        (None, 100)               0         
                                                                 
 dense_16 (Dense)            (None, 1)                 101       
                                                                 
Total params: 972,601
Trainable params: 80,501
Non-trainable params: 892,100
_________________________________________________________________
Epoch 1/5
Epoch 1: loss improved from inf to 0.26251, saving model to LSTM_glove_6B.h1




Epoch 2/5
Epoch 2: loss improved from 0.26251 to 0.09470, saving model to LSTM_glove_6B.h1




Epoch 3/5
Epoch 3: loss improved from 0.09470 to 0.06817, saving model to LSTM_glove_6B.h1




Epoch 4/5
Epoch 4: loss improved from 0.06817 to 0.05681, saving model to LSTM_glove_6B.h1




Epoch 5/5
Epoch 5: loss improved from 0.05681 to 0.04302, saving model to LSTM_glove_6B.h1




1/3 test data score
Test loss: 0.0526
Test accuracy: 98.20
Test f1_score: 0.93
Test precision: 0.95
Test recall: 0.93
2/3 test data score
Test loss: 0.0480
Test accuracy: 98.47
Test f1_score: 0.93
Test precision: 0.94
Test recall: 0.94
Full test data score
Test loss: 0.0555
Test accuracy: 98.21
Test f1_score: 0.92
Test precision: 0.94
Test recall: 0.93


In [None]:
#Simple RNN - Glove 27B
#EMBEDDING_SIZE=512
model = keras.Sequential()
model.add(Input(shape=(None, ), dtype = "int64"))
model.add(Embedding(len(tokenizer.word_index) + 1, 100, weights=[embedding_matrix], input_length=20, trainable=False))
model.add(SimpleRNN(100, input_shape=(X_train.shape[1],1)))
#model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(loss = 'binary_crossentropy', optimizer ='adam', metrics = ["accuracy", f1_m, precision_m, recall_m])
# summarize the model
model.summary()

#Train and save the best model
filepath = "SimpleRNN_glove_27B.h1"
checkpoint = ModelCheckpoint(filepath, monitor = "loss", mode = "min", verbose =1, save_best_only = True)
history = model.fit(X_train, Y_train, epochs = 5, batch_size = 100, callbacks = [checkpoint])

#1/3 of test data
print("1/3 test data score")
score = model.evaluate(x_test_small, y_test_small, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#2/3 of test data
print("2/3 test data score")
score = model.evaluate(x_test_mid, y_test_mid, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#3/3 of test data
print("Full test data score")
score = model.evaluate(x_test, y_test, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_18 (Embedding)    (None, None, 100)         892100    
                                                                 
 simple_rnn_12 (SimpleRNN)   (None, 100)               20100     
                                                                 
 flatten_12 (Flatten)        (None, 100)               0         
                                                                 
 dense_17 (Dense)            (None, 1)                 101       
                                                                 
Total params: 912,301
Trainable params: 20,201
Non-trainable params: 892,100
_________________________________________________________________
Epoch 1/5
Epoch 1: loss improved from inf to 0.25203, saving model to SimpleRNN_glove_27B.h1
Epoch 2/5
Epoch 2: loss improved from 0.25203 to 0.11327, saving model to SimpleR

In [None]:
#LSTM - Glove 27B
#EMBEDDING_SIZE=512
model = keras.Sequential()
model.add(Input(shape=(None, ), dtype = "int64"))
model.add(Embedding(len(tokenizer.word_index) + 1, 100, weights=[embedding_matrix], input_length=20, trainable=False))
model.add(LSTM(100, input_shape=(X_train.shape[1],1)))
#model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(loss = 'binary_crossentropy', optimizer ='adam', metrics = ["accuracy", f1_m, precision_m, recall_m])
# summarize the model
model.summary()

#Train and save the best model
filepath = "LSTM_glove_27B.h1"
checkpoint = ModelCheckpoint(filepath, monitor = "loss", mode = "min", verbose =1, save_best_only = True)
history = model.fit(X_train, Y_train, epochs = 5, batch_size = 100, callbacks = [checkpoint])

#1/3 of test data
print("1/3 test data score")
score = model.evaluate(x_test_small, y_test_small, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#2/3 of test data
print("2/3 test data score")
score = model.evaluate(x_test_mid, y_test_mid, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

#3/3 of test data
print("Full test data score")
score = model.evaluate(x_test, y_test, verbose = 0)
print("Test loss: %.4f" % score[0])
print("Test accuracy: %.2f" % (score[1] * 100.0))
print("Test f1_score: %.2f" % (score[2]))
print("Test precision: %.2f" % (score[3]))
print("Test recall: %.2f" % (score[4]))

Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_20 (Embedding)    (None, None, 100)         892100    
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 flatten_14 (Flatten)        (None, 100)               0         
                                                                 
 dense_19 (Dense)            (None, 1)                 101       
                                                                 
Total params: 972,601
Trainable params: 80,501
Non-trainable params: 892,100
_________________________________________________________________
Epoch 1/5
Epoch 1: loss improved from inf to 0.30708, saving model to LSTM_glove_27B.h1




Epoch 2/5
Epoch 2: loss improved from 0.30708 to 0.09584, saving model to LSTM_glove_27B.h1




Epoch 3/5
Epoch 3: loss improved from 0.09584 to 0.07086, saving model to LSTM_glove_27B.h1




Epoch 4/5
Epoch 4: loss improved from 0.07086 to 0.06076, saving model to LSTM_glove_27B.h1




Epoch 5/5
Epoch 5: loss improved from 0.06076 to 0.04770, saving model to LSTM_glove_27B.h1




1/3 test data score
Test loss: 0.0541
Test accuracy: 98.56
Test f1_score: 0.95
Test precision: 0.97
Test recall: 0.94
2/3 test data score
Test loss: 0.0495
Test accuracy: 98.74
Test f1_score: 0.95
Test precision: 0.95
Test recall: 0.96
Full test data score
Test loss: 0.0564
Test accuracy: 98.44
Test f1_score: 0.94
Test precision: 0.95
Test recall: 0.94
