<a href="https://colab.research.google.com/github/RedDawe/SMS_spam_classification/blob/master/v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install autocorrect

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from autocorrect import Speller
import string
import re

In [None]:
# Load the SMS corpus and discard the three unnamed columns,
# leaving only the label column (v1) and the message column (v2).
data = pd.read_csv('spam.csv', encoding='latin-1')
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [None]:
# Preview the first rows: v1 is the ham/spam label, v2 is the raw message text.
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Fetch the WordNet corpus used by the WordNetLemmatizer created in the next cell.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Shared text-normalisation helpers used by process_sentence:
# `spell` corrects the spelling of each word, `lemmatizer` reduces it to its lemma.
lemmatizer = nltk.stem.WordNetLemmatizer()
spell = Speller(lang='en')

In [None]:
# Vocabulary: lemma -> integer id. Id 0 is left free for padding and
# id 1 is reserved for words that are not in the vocabulary.
dictionary = {'__unk__': 1}
# Highest id handed out so far; process_sentence increments it for each new lemma.
last_value_used = 1

In [None]:
def process_sentence(text, fit=True, max_len=200, vocab_size=12000):
  """Encode one message as a fixed-length list of integer word ids.

  The text is stripped of unsupported characters, the emoticons ``:D``,
  ``:)`` and ``:(`` are replaced by word tokens, and every whitespace-separated
  word is spell-corrected, lower-cased and lemmatized before being looked up
  in the module-level ``dictionary``.

  Args:
    text: raw message string.
    fit: if True, unseen lemmas are added to ``dictionary`` (and
      ``last_value_used`` is advanced); if False they are mapped to the
      ``'__unk__'`` id instead and the vocabulary is left untouched.
    max_len: exact length of the returned list. Shorter messages are
      left-padded with 0; longer ones keep only their last ``max_len`` ids.
    vocab_size: number of ids the downstream embedding can hold (ids must be
      < vocab_size). Only used to emit a warning. The default mirrors the
      12000 used by the Embedding layers in this notebook.

  Returns:
    A list of exactly ``max_len`` integers.
  """
  global last_value_used

  text = re.sub(r"[^a-zA-Z0-9.?!:()' ]+", "", text)
  # Extra whitespace around the tokens because in the dataset emoticons are
  # often written without any separating space at all:)
  text = re.sub(':D', ' smiley_face ', text)
  text = re.sub(':\\)', ' happy_face ', text)
  text = re.sub(':\\(', ' sad_face ', text)

  lemmed = []
  for word in text.split():
    spelled = spell(word).lower()
    lem = lemmatizer.lemmatize(spelled)

    if lem not in dictionary:
      if fit:
        last_value_used += 1
        dictionary[lem] = last_value_used

        # Valid embedding indices are 0..vocab_size-1, so the first id that
        # no longer fits is vocab_size itself.
        if last_value_used >= vocab_size:
          print('size_exceeded, we\'re at:', len(dictionary))
      else:
        lem = '__unk__'

    lemmed.append(dictionary[lem])

  # Enforce a fixed length: without truncation a message with more than
  # max_len tokens would produce a longer row than the model input accepts.
  if len(lemmed) > max_len:
    lemmed = lemmed[len(lemmed) - max_len:]

  return [0] * (max_len - len(lemmed)) + lemmed

In [None]:
# Smoke test on a hand-written message with typos and emoticons.
# NOTE: fit defaults to True, so this call also adds the demo words to `dictionary`.
process_sentence("i'm Helo:( :D, yup :) s my caar's great :D")

In [None]:
# Encode every message as word ids. Only the first 4000 rows (the training
# split) may add words to the vocabulary; later rows map unseen words to
# '__unk__'. The two parts are built separately and assigned to the column in
# one step: a chained `data['v2'].iloc[...] = ...` assignment writes through an
# intermediate Series and is not guaranteed to reach `data`.
train_ids = data['v2'].iloc[:4000].apply(process_sentence)
heldout_ids = data['v2'].iloc[4000:].apply(process_sentence, fit=False)
data['v2'] = pd.concat([train_ids, heldout_ids])
data.head()

Unnamed: 0,v1,v2
0,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,spam,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
X = data['v2']

# Binary target: 0 for 'ham', 1 for everything else (i.e. spam).
# Derived without mutating data['v1'], so re-running this cell gives the same
# labels (an in-place where() chain would turn every label into 1 on a re-run).
Y = (data['v1'] != 'ham').astype(int)

Y.head()

0    0
1    0
2    1
3    0
4    0
Name: v1, dtype: object

In [None]:
# Convert the pandas Series into a NumPy array so it can be sliced positionally below.
# NOTE(review): a 2-D (n_messages, 200) array results only if every row is an
# equal-length list of ids; the recorded output shows a 1-D shape - confirm
# which preprocessing path was active when this cell was last run.
X = np.array(X.tolist())
X.shape

(5572,)

In [None]:
# Positional split: rows [0, 4000) train, [4000, 5000) cross-validation,
# [5000, end) test. Labels are cast to int32 before being cut.
X_train, X_cv, X_test = np.split(X, [4000, 5000])
Y_train, Y_cv, Y_test = np.split(np.array(Y, dtype=np.int32), [4000, 5000])

In [None]:
# Share of spam (label 1) in the training split, as a percentage.
spam_count = Y_train.sum()
spam_perc = spam_count / len(Y_train) * 100
print(spam_perc)

# Loss weights per class: spam examples count twice as much as ham.
weight_dict = {0: 1, 1: 2}

13.375


In [None]:
# Keras tokenizer limited to 12000 words, fitted on the training split only.
# NOTE(review): fit_on_texts expects text; if the cell that applies
# process_sentence to data['v2'] has been run, X_train already holds integer
# ids - confirm which of the two preprocessing pipelines is intended.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=12000)
tokenizer.fit_on_texts(X_train)

def preprocess(untokenized):
  """Turn texts into a padded id matrix using the fitted module-level tokenizer.

  Each text is converted to a sequence of word ids, then all sequences are
  passed through pad_sequences with maxlen=200.
  """
  return tf.keras.preprocessing.sequence.pad_sequences(
      tokenizer.texts_to_sequences(untokenized),
      maxlen=200,
  )

In [None]:
def recall_m(y_true, y_pred):
    """Recall of the given tensors: true positives / actual positives.

    Values are clipped to [0, 1] and rounded before counting; backend epsilon
    in the denominator avoids division by zero.
    """
    K = tf.keras.backend
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the given tensors: true positives / predicted positives.

    Values are clipped to [0, 1] and rounded before counting; backend epsilon
    in the denominator avoids division by zero.
    """
    K = tf.keras.backend
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m on the same tensors."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    eps = tf.keras.backend.epsilon()  # keeps the ratio finite when p + r == 0
    return 2*((p*r)/(p+r+eps))

In [None]:
# Experiment 1: two stacked LSTMs; the final single-unit LSTM uses a sigmoid
# activation so its output can be read directly as the spam probability.
inputs = tf.keras.Input(shape=[200])
# input_length is deliberately not passed: the sequence length comes from the
# Input above (200), and the previous value of 150 contradicted it.
model = tf.keras.layers.Embedding(12000, 100)(inputs)
model = tf.keras.layers.LSTM(64, activation='tanh', return_sequences=True)(model)
model = tf.keras.layers.LSTM(1, activation='sigmoid')(model)
model = tf.keras.Model(inputs=inputs, outputs=model)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_m, tf.keras.metrics.AUC()])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 100)          1200000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200, 64)           42240     
_________________________________________________________________
lstm_2 (LSTM)                (None, 1)                 264       
Total params: 1,242,504
Trainable params: 1,242,504
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Experiment 2: same stack as before, but the final single-unit LSTM keeps the
# tanh activation and a separate sigmoid is applied on top of it.
inputs = tf.keras.Input(shape=[200])
# input_length is deliberately not passed: the sequence length comes from the
# Input above (200), and the previous value of 150 contradicted it.
model = tf.keras.layers.Embedding(12000, 100)(inputs)
model = tf.keras.layers.LSTM(64, activation='tanh', return_sequences=True)(model)
model = tf.keras.layers.LSTM(1, activation='tanh')(model)
model = tf.keras.layers.Activation('sigmoid')(model)
model = tf.keras.Model(inputs=inputs, outputs=model)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_m, tf.keras.metrics.AUC()])
model.summary()

Model: "model_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_23 (InputLayer)        [(None, 200)]             0         
_________________________________________________________________
embedding_22 (Embedding)     (None, 200, 100)          1200000   
_________________________________________________________________
lstm_28 (LSTM)               (None, 200, 64)           42240     
_________________________________________________________________
lstm_29 (LSTM)               (None, 1)                 264       
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total params: 1,242,504
Trainable params: 1,242,504
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Experiment 3 (the model trained below): a single 16-unit LSTM that returns
# only its last state, followed by a sigmoid Dense unit for the spam probability.
# Earlier notes in this cell mention 32 LSTM units, Dropout(0.5) around the
# LSTM, and an LSTM(1)+sigmoid head as alternatives.
inputs = tf.keras.Input(shape=[200])
# input_length is deliberately not passed: the sequence length comes from the
# Input above (200), and the previous value of 150 contradicted it.
model = tf.keras.layers.Embedding(12000, 100)(inputs)
model = tf.keras.layers.LSTM(16, activation='tanh', return_sequences=False)(model)
model = tf.keras.layers.Dense(1, activation='sigmoid')(model)
model = tf.keras.Model(inputs=inputs, outputs=model)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_m, tf.keras.metrics.AUC()])
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 100)          1200000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 16)                7488      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 1,207,505
Trainable params: 1,207,505
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the tokenized training split; class_weight applies weight_dict
# (ham = 1, spam = 2) to the per-class loss contribution.
model.fit(preprocess(X_train), Y_train, batch_size=128, epochs=10, class_weight=weight_dict)
# Alternative call that feeds X_train directly, bypassing preprocess()
# (presumably for the process_sentence encoding - confirm):
#model.fit(X_train, Y_train, batch_size=128, epochs=10, class_weight=weight_dict)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f8a7e74ada0>

In [None]:
# Cross-validation split. The returned list follows the compile() order:
# [loss, accuracy, f1_m, AUC].
model.evaluate(preprocess(X_cv), Y_cv)
#model.evaluate(X_cv, Y_cv)



[0.05900684744119644,
 0.9850000143051147,
 0.9152867197990417,
 0.9879451394081116]

In [None]:
# Held-out test split. The returned list follows the compile() order:
# [loss, accuracy, f1_m, AUC].
model.evaluate(preprocess(X_test), Y_test)
#model.evaluate(X_test, Y_test)



[0.056248318403959274,
 0.9877622127532959,
 0.8901469111442566,
 0.9915608167648315]

# Future work:
- Try a Transformer-based model instead of the LSTM.