In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [48]:
import re, string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import *

#nltk.download('punkt')

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [52]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import (LSTM, 
                          Embedding, 
                          BatchNormalization,
                          Dense, 
                          TimeDistributed, 
                          Dropout, 
                          Bidirectional,
                          Flatten, 
                          GlobalMaxPool1D)
#from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam

In [None]:
from keras.layers.embeddings import Embedding

In [4]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.pipeline import Pipeline

In [9]:
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb

In [11]:
# Load the SMS data (decoded as Latin-1) and drop every column that contains
# at least one missing value. Exactly two columns are expected to remain;
# they are renamed to 'target' and 'message'.
df = pd.read_csv("data/spam.csv", encoding="latin-1").dropna(axis="columns", how="any")
df.columns = ["target", "message"]

df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Number of pieces per message when splitting on single spaces
# (consecutive spaces produce empty pieces, which are counted too).
df['message_len'] = [len(message.split(' ')) for message in df['message']]

In [13]:
# Longest message, measured in space-separated pieces
int(df['message_len'].max())

171

### Pre-process data

In [14]:
# Stop words: NLTK's English list plus a few extra tokens to filter.
# Needs the NLTK 'stopwords' corpus: nltk.download('stopwords')
add_words = ['hehe', 'im', 'c', 'u']
stop_words = [*stopwords.words('english'), *add_words]

# English Snowball stemmer used by preprocess_text
stemmer = SnowballStemmer('english')

def preprocess_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order: lower-case; remove square-bracketed spans, URLs,
    angle-bracketed tags, punctuation and every token containing a digit;
    drop the words listed in the module-level ``stop_words``; stem what is
    left with the module-level ``stemmer``.

    Args:
        text: The message to clean. It is passed through ``str()`` first, so
            non-string values are cleaned as their string representation.

    Returns:
        str: The surviving stemmed tokens joined by single spaces, or ``''``
        when nothing survives the filters.
    """
    # Set to lower case, remove symbols, numbers, breaks
    text = str(text).lower()
    # Raw-string patterns: '\[', '\S', '\w' and '\d' are not valid string
    # escapes and raise an invalid-escape warning in plain literals.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # Remove stop words.
    # split() with no argument treats any run of whitespace, line breaks
    # included, as one separator. Words on either side of a newline are no
    # longer fused together, and no empty tokens (hence no doubled spaces)
    # reach the output.
    tokens = [word for word in text.split() if word not in stop_words]

    # Apply stemmer
    return ' '.join(stemmer.stem(word) for word in tokens)

In [15]:
# Keep the cleaned text next to the raw message
df['message_clean'] = df['message'].map(preprocess_text)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say earli hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though


In [16]:
# Integer-encode the class labels; the fitted encoder stays available as `le`
le = LabelEncoder()
df['target_encoded'] = le.fit_transform(df['target'])

df.head()

Unnamed: 0,target,message,message_len,message_clean,target_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...,0
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...,1
3,ham,U dun say so early hor... U c already then say...,11,dun say earli hor alreadi say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though,0


In [17]:
# Features are the cleaned messages, labels are the encoded targets
X, y = df['message_clean'], df['target_encoded']

print(len(X), len(y))

5572 5572


In [18]:
# Split the data into train and test.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)
# Sanity check: features and labels must have matching lengths in each split
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

4179 4179
1393 1393


In [19]:
# Vectorize: learn the vocabulary from the training messages only.
# Unigrams and bigrams are counted, scikit-learn's built-in English stop
# words are excluded, and the vocabulary is capped at 100 terms.
vect = CountVectorizer(stop_words='english', ngram_range=(1,2), max_features=100)
vect.fit(X_train)

CountVectorizer(max_features=100, ngram_range=(1, 2), stop_words='english')

In [20]:
# Create document-term matrices for both splits, using the vocabulary that
# was fitted on the training messages
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)

In [21]:
# TF-IDF: fit the weighting on the training counts and re-weight them in one step
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_dtm)


In [22]:
# Display the TF-IDF training matrix summary (shape, dtype, stored elements)
X_train_tfidf

<4179x100 sparse matrix of type '<class 'numpy.float64'>'
	with 9160 stored elements in Compressed Sparse Row format>

### GloVe

In [36]:
# Length of the vocabulary
# The tokenizer is fitted on every cleaned message in df, not on a training
# split only.
texts = df['message_clean']
target = df['target_encoded']


word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts)

# +1 leaves room for index 0, which the padded sequences use as filler
# (assumes word_index numbering starts at 1 — TODO confirm against the Keras
# Tokenizer documentation).
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

6769

In [49]:
# Pad sequences
def embed(corpus):
    """Turn each text in ``corpus`` into its list of integer word indices via ``word_tokenizer``."""
    return word_tokenizer.texts_to_sequences(corpus)

# The target length is the NLTK token count of the longest cleaned message.
# NOTE(review): the length comes from word_tokenize while the sequences come
# from the Keras tokenizer — confirm the two never disagree on token counts.
token_counts = [len(word_tokenize(sentence)) for sentence in texts]
length_long_sentence = max(token_counts)
longest_train = texts.iloc[token_counts.index(length_long_sentence)]

# Shorter sequences are filled at the end ('post') up to the target length
train_padded_sentences = pad_sequences(
    embed(texts),
    maxlen=length_long_sentence,
    padding='post',
)

train_padded_sentences

array([[   2, 3199,  276, ...,    0,    0,    0],
       [   8,  239,  532, ...,    0,    0,    0],
       [   9,  361,  591, ...,    0,    0,    0],
       ...,
       [6767, 1007, 6768, ...,    0,    0,    0],
       [ 139, 1257, 1612, ...,    0,    0,    0],
       [1998,  382,  171, ...,    0,    0,    0]], dtype=int32)

In [None]:
# Prepare embedding dictionary: word -> float32 vector of its coefficients
embeddings_dict = {}
embedding_dim = 50

# An explicit encoding keeps decoding independent of the platform's default
# locale. Iterating over the file object streams it line by line instead of
# loading the entire file into memory with readlines().
with open("data/glove.6B/glove.6B.50d.txt", encoding="utf-8") as fp:
    for line in fp:
        records = line.split()
        if not records:
            # Skip blank lines rather than failing on records[0]
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype = 'float32')
        embeddings_dict[word] = vector_dimensions

In [37]:
# Build embedding matrix: one row per vocabulary index and one column per
# embedding dimension, initialised to zeros
embedding_matrix = np.zeros((vocab_length, embedding_dim))

In [38]:
# Copy the pre-trained vector of every tokenizer word that has one; words
# missing from embeddings_dict keep their all-zero row.
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.098201  ,  0.39923999,  0.25696999, ...,  0.70283997,
         0.32207   ,  0.77503002],
       [ 0.14827999,  0.17761   ,  0.42346001, ..., -0.2182    ,
         0.12971   ,  0.32953   ],
       ...,
       [-0.68614   , -0.20372   , -0.12739   , ..., -0.18347   ,
         0.54004002,  0.77217001],
       [ 0.21509001, -0.2832    ,  0.16023999, ...,  0.15110999,
        -0.12344   ,  1.00170004],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

### Naive Bayes, Bag of Words

In [40]:
# Fit a multinomial Naive Bayes classifier on the bag-of-words training counts
NB = MultinomialNB()
NB.fit(X_train_dtm, y_train)

MultinomialNB()

In [59]:
# Hard class predictions for the test document-term matrix
y_pred = NB.predict(X_test_dtm)
# Second predict_proba column (index 1), used below as the score for ROC AUC
y_pred_prob = NB.predict_proba(X_test_dtm)[:,1]

In [52]:
# Share of test messages whose predicted label equals the true label
print(metrics.accuracy_score(y_test, y_pred))

0.9361091170136396


In [53]:
# Confusion matrix of the true test labels against the current y_pred
print(metrics.confusion_matrix(y_test, y_pred))

[[1180   22]
 [  67  124]]


In [54]:
# Per-class precision, recall and F1 for the current y_pred
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1202
           1       0.85      0.65      0.74       191

    accuracy                           0.94      1393
   macro avg       0.90      0.82      0.85      1393
weighted avg       0.93      0.94      0.93      1393



In [61]:
# ROC AUC is computed from the predicted scores, not from the hard labels
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.9340475298586126


### Naive Bayes, TF-IDF

In [63]:
# Chain token counts -> TF-IDF weighting -> multinomial Naive Bayes so the
# whole model is fitted and applied directly on the cleaned text.
# The TF-IDF step is named 'tfidf', matching the XGBoost pipeline further
# down, so named-step lookups work the same way on both pipelines.
pipeline = Pipeline([('bow', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', MultinomialNB())])

In [70]:
# Fit every pipeline step on the training messages
pipeline.fit(X_train, y_train)

# Hard labels and the second predict_proba column (index 1) for the test
# messages; both names are rebound from the earlier bag-of-words model.
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:,1]

In [71]:
# Per-class precision, recall and F1 for the current y_pred
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1202
           1       1.00      0.71      0.83       191

    accuracy                           0.96      1393
   macro avg       0.98      0.85      0.90      1393
weighted avg       0.96      0.96      0.96      1393



In [72]:
# ROC AUC from the predicted scores in y_pred_prob
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.9745624656985303


### XGBoost, TF-IDF

In [80]:
# Same count -> TF-IDF front end, with an XGBoost classifier as the final
# step. This rebinds `pipeline`, replacing the Naive Bayes pipeline object.
pipeline = Pipeline([('bow', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('model',xgb.XGBClassifier(
                 learning_rate = 0.1,
                 max_depth = 7,
                 n_estimators = 100,
                 use_label_encoder = False,
                 eval_metric = 'auc'))])

In [81]:
# Fit the vectoriser, the TF-IDF weighting and the classifier on the training messages
pipeline.fit(X_train, y_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               eval_metric='auc', gamma=0, gpu_id=-1,
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.1, max_delta_step=0, max_depth=7,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=4, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', use_label_encoder=False,
                  

In [82]:
# Hard labels and the second predict_proba column (index 1) for the test messages
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:,1]

In [83]:
# Per-class precision, recall and F1 for the current y_pred
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1202
           1       0.98      0.77      0.87       191

    accuracy                           0.97      1393
   macro avg       0.97      0.89      0.92      1393
weighted avg       0.97      0.97      0.97      1393



In [85]:
# ROC AUC from the predicted scores in y_pred_prob
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.977243425007187


### LSTM

In [53]:
# Split data into train and test sets
# NOTE: this rebinds X_train / X_test / y_train / y_test, which the earlier
# text-based models used; from here on they hold padded index sequences.
# random_state is fixed (same seed as the first split in this notebook) so
# the LSTM results can be reproduced from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences,
    target,
    test_size=0.25,
    random_state=42
)

In [54]:
def glove_lstm():
    """Build and compile the bidirectional-LSTM classifier.

    The embedding layer is sized from, and given the weights of, the
    module-level ``embedding_matrix``. The input length, the LSTM width and
    both hidden dense layers all use ``length_long_sentence``.

    Returns:
        A compiled ``Sequential`` model ending in a single sigmoid unit,
        with the 'rmsprop' optimizer, binary cross-entropy loss and accuracy
        as its metric.
    """
    vocab_size, vector_size = embedding_matrix.shape
    width = length_long_sentence

    layers = [
        Embedding(
            input_dim=vocab_size,
            output_dim=vector_size,
            weights=[embedding_matrix],
            input_length=width,
        ),
        # return_sequences=True keeps the per-timestep outputs that the
        # global max-pooling layer then reduces over.
        Bidirectional(LSTM(width, return_sequences=True, recurrent_dropout=0.2)),
        GlobalMaxPool1D(),
        BatchNormalization(),
        Dropout(0.5),
        Dense(width, activation="relu"),
        Dropout(0.5),
        Dense(width, activation="relu"),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Build one instance to inspect the layer stack and parameter counts
model = glove_lstm()
model.summary()

2023-03-19 10:52:18.676937: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 50)            338450    
                                                                 
 bidirectional (Bidirectiona  (None, 80, 160)          83840     
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 160)              0         
 lMaxPooling1D)                                                  
                                                                 
 batch_normalization (BatchN  (None, 160)              640       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 160)               0         
                                                        

In [56]:
# Load the model and train
model = glove_lstm()

# Save the model to 'model.h5' whenever val_loss improves on its best value
checkpoint = ModelCheckpoint(
    'model.h5',
    monitor = 'val_loss',
    verbose = 1,
    save_best_only = True
)
# Multiply the learning rate by 0.2 after 5 epochs without val_loss
# improvement. min_lr has to sit below the optimizer's starting rate: the
# model is compiled with 'rmsprop' by name (Keras default learning rate
# 0.001), so the previous floor of 0.001 left no room for any reduction and
# turned this callback into a no-op.
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.2,
    verbose = 1,
    patience = 5,
    min_lr = 1e-5
)
# NOTE(review): validation_data is the test split, so checkpoint selection
# sees the same data the model is later evaluated on — confirm this is intended.
history = model.fit(
    X_train,
    y_train,
    epochs = 7,
    batch_size = 32,
    validation_data = (X_test, y_test),
    verbose = 1,
    callbacks = [reduce_lr, checkpoint]
)

Epoch 1/7
Epoch 1: val_loss improved from inf to 0.29293, saving model to model.h5
Epoch 2/7
Epoch 2: val_loss improved from 0.29293 to 0.17593, saving model to model.h5
Epoch 3/7
Epoch 3: val_loss improved from 0.17593 to 0.13843, saving model to model.h5
Epoch 4/7
Epoch 4: val_loss improved from 0.13843 to 0.10567, saving model to model.h5
Epoch 5/7
Epoch 5: val_loss did not improve from 0.10567
Epoch 6/7
Epoch 6: val_loss did not improve from 0.10567
Epoch 7/7
Epoch 7: val_loss did not improve from 0.10567


In [64]:
# Evaluate model
# Outputs above 0.5 become label 1, everything else label 0.
# NOTE(review): this scores the in-memory `model` as left by fit(); the
# checkpoint written to 'model.h5' is not reloaded here.
y_preds = (model.predict(X_test) > 0.5).astype("int32")


print(metrics.classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1208
           1       0.92      0.89      0.90       185

    accuracy                           0.97      1393
   macro avg       0.95      0.94      0.94      1393
weighted avg       0.97      0.97      0.97      1393

