<a href="https://colab.research.google.com/github/RedDawe/SMS_spam_classification/blob/master/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Colab setup: pin TensorFlow 2.0. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel instead of whichever `pip` is first on PATH.
%pip install tensorflow-gpu==2.0

In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [0]:
# Fail fast, with an explanatory message, if TensorFlow is not running eagerly.
assert tf.executing_eagerly(), "Eager execution is disabled; this notebook expects TensorFlow 2.x eager mode."

In [0]:
# Load the raw CSV (latin-1 encoded) and discard the three 'Unnamed' columns,
# leaving only the label column `v1` and the message text column `v2`.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)

In [5]:
# Preview the first five rows of the loaded frame.
data.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Message text is the model input.
X = data['v2']

# Encode labels as 0 = 'ham', 1 = anything else (i.e. spam).
# This builds a fresh integer Series rather than rewriting data['v1'] in place with
# two `where(..., inplace=True)` calls. The in-place version can write through to
# `data`, which makes the cell unsafe to re-run: on a second execution no 'ham'
# strings remain, so every row ends up labelled 1.
Y = (data['v1'] != 'ham').astype(np.int32)

Y.head()

0    0
1    0
2    1
3    0
4    0
Name: v1, dtype: object

In [0]:
# Positional, unshuffled split:
#   rows [0, 4000) -> train, rows [4000, 5000) -> cross-validation, rows [5000, end) -> test.
train_rows = slice(None, 4000)
cv_rows = slice(4000, 5000)
test_rows = slice(5000, None)

# Labels are materialised as int32 NumPy arrays.
Y_train, Y_cv, Y_test = (
    np.array(Y[rows], dtype=np.int32) for rows in (train_rows, cv_rows, test_rows)
)

# Features stay as slices of the text Series; they are tokenized later by preprocess().
X_train, X_cv, X_test = X[train_rows], X[cv_rows], X[test_rows]

In [8]:
# Percentage of training labels equal to 1 (spam), to show the class imbalance.
spam_perc = Y_train.sum() / len(Y_train) * 100
print(spam_perc)

# Per-class weights handed to model.fit(class_weight=...): class 1 is weighted
# twice as heavily as class 0.
weight_dict = {0: 1, 1: 2}

13.375


In [0]:
# Build the word index from the training texts only. num_words=1000 matches the
# vocabulary size given to the Embedding layer of the model.
keras_text = tf.keras.preprocessing.text
tokenizer = keras_text.Tokenizer(num_words=1000)
tokenizer.fit_on_texts(X_train)

def preprocess(untokenized, maxlen=150):
  """Turn raw texts into a fixed-width matrix of token ids.

  Args:
    untokenized: Texts to encode (e.g. a pandas Series of messages). They are
      passed unchanged to the module-level `tokenizer`.
    maxlen: Width of the returned matrix. Defaults to 150, the input length the
      model in this notebook is built with; override it only together with the
      model's input shape.

  Returns:
    The array produced by `pad_sequences`: one row per text, `maxlen` columns,
    using that function's default padding and truncation settings.
  """
  sequences = tokenizer.texts_to_sequences(untokenized)
  return tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

In [0]:
def recall_m(y_true, y_pred):
    """Recall computed from the tensors of a single call.

    True positives are counted by rounding the clipped element-wise product of
    labels and predictions; possible positives by rounding the clipped labels.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of model outputs, same shape as (or broadcastable with) y_true.

    Returns:
        Scalar tensor: true positives / (possible positives + epsilon).
    """
    backend = tf.keras.backend
    matched = backend.round(backend.clip(y_true * y_pred, 0, 1))
    true_positives = backend.sum(matched)
    actual = backend.round(backend.clip(y_true, 0, 1))
    possible_positives = backend.sum(actual)
    # epsilon keeps the ratio finite when there are no positive labels.
    return true_positives / (possible_positives + backend.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed from the tensors of a single call.

    True positives are counted by rounding the clipped element-wise product of
    labels and predictions; predicted positives by rounding the clipped predictions.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of model outputs, same shape as (or broadcastable with) y_true.

    Returns:
        Scalar tensor: true positives / (predicted positives + epsilon).
    """
    backend = tf.keras.backend
    matched = backend.round(backend.clip(y_true * y_pred, 0, 1))
    true_positives = backend.sum(matched)
    flagged = backend.round(backend.clip(y_pred, 0, 1))
    predicted_positives = backend.sum(flagged)
    # epsilon keeps the ratio finite when nothing is predicted positive.
    return true_positives / (predicted_positives + backend.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m for the given tensors.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of model outputs.

    Returns:
        Scalar tensor 2 * (P * R) / (P + R + epsilon).
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # epsilon guards against 0/0 when both precision and recall are zero.
    denominator = p + r + tf.keras.backend.epsilon()
    return 2 * ((p * r) / denominator)

In [11]:
# Architecture: 150 token ids -> 100-d embeddings (vocabulary of 1000)
#   -> 64-unit LSTM returning the full sequence -> 1-unit LSTM with sigmoid activation.
# Intermediate tensors get their own names so that `model` only ever refers to
# the finished tf.keras.Model.
inputs = tf.keras.Input(shape=[150])
embedded = tf.keras.layers.Embedding(1000, 100, input_length=150)(inputs)
sequence_features = tf.keras.layers.LSTM(64, activation='tanh', return_sequences=True)(embedded)
output_score = tf.keras.layers.LSTM(1, activation='sigmoid')(sequence_features)
model = tf.keras.Model(inputs=inputs, outputs=output_score)

# Binary cross-entropy with Adam; report accuracy and the custom F1 metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_m])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 100)          100000    
_________________________________________________________________
lstm (LSTM)                  (None, 150, 64)           42240     
_________________________________________________________________
lstm_1 (LSTM)                (None, 1)                 264       
Total params: 142,504
Trainable params: 142,504
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train for 10 epochs on the padded training sequences, weighting the classes
# according to weight_dict.
train_sequences = preprocess(X_train)
model.fit(train_sequences, Y_train, batch_size=128, epochs=10, class_weight=weight_dict)

Train on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f177411c198>

In [14]:
# Cross-validation split: returns [loss, accuracy, f1_m] in the order given to model.compile.
cv_sequences = preprocess(X_cv)
model.evaluate(cv_sequences, Y_cv)



[0.07758428309112787, 0.981, 0.90532064]

In [15]:
# Held-out test split: returns [loss, accuracy, f1_m] in the order given to model.compile.
test_sequences = preprocess(X_test)
model.evaluate(test_sequences, Y_test)



[0.05747152964700131, 0.986014, 0.89624816]