# Spam Detection Model: Final Model Selection

In [1]:
import nltk
#import os
import string 
import re 
import pandas as pd 
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

## Read Data

In [2]:
path = r"C:\Users\sbhati\OneDrive - George Weston Limited-6469347-MTCAD\sbhati\Documents\Personal\LinkedIn_NLP"
data = pd.read_csv(path+'\SMSSpamCollection.tsv', sep='\t', )
data.columns = ['label', 'body_text']
data.head()

Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


## Clean Text 

In [3]:
# Import the SnowballStemmer, get list of punctuations
ss = nltk.SnowballStemmer('english')
punct = string.punctuation

In [4]:
# Clean text function
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = ''.join([word.lower() for word in text if word not in string.punctuation]) # Remove punctuation
    tokens = re.split('\W+',text) # Tokenize: Split on any character that is not alphanumeric
    text = [ss.stem(word) for word in tokens if word not in stopwords] # Remove stopwords & stem
    #text = [ss.stem(word) for word in tokenized_text] # Stemming
    
    return text

# Function to count punctuation
def punct_count(text): # Count the percent of text that are punctuation
    return round((sum([1 for char in text if char in punct])/(len(text) - text.count(" ")))*100,3)

In [5]:
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct_percent'] = data['body_text'].apply(lambda x: punct_count(x))
data.head()

Unnamed: 0,label,body_text,body_len,punct_percent
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,128,4.688
1,ham,"Nah I don't think he goes to usf, he lives around here though",49,4.082
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,62,3.226
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.143
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,135,4.444


## Split Train/Test

In [6]:
x_train, x_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct_percent']], 
                                                    data['label'],
                                                    test_size=0.2,
                                                    random_state=2)

In [18]:
x_train.head()

Unnamed: 0,body_text,body_len,punct_percent
2163,Yes.he have good crickiting mind,28,3.571
2821,"Congratulations - Thanks to a good friend U have WON the £2,000 Xmas prize. 2 claim is easy, jus...",131,6.107
4361,Ujhhhhhhh computer shipped out with address to sandiago and parantella lane. Wtf. Poop.,75,4.0
2238,Nope wif my sis lor... Aft bathing my dog then i can bathe... Looks like it's going 2 rain soon.,77,10.39
2808,"Say this slowly.? GOD,I LOVE YOU &amp; I NEED YOU,CLEAN MY HEART WITH YOUR BLOOD.Send this to Te...",135,11.111


In [19]:
x_test.head()

Unnamed: 0,body_text,body_len,punct_percent
1209,"Yeah, probably but not sure. Ilol let u know, but personally I wuldnt bother, then again if ur g...",100,6.0
1583,"Dont search love, let love find U. Thats why its called falling in love, bcoz U dont force yours...",128,5.469
3972,I wish u were here. I feel so alone,27,3.704
4938,"I'm eatin now lor, but goin back to work soon... E mountain deer show huh... I watch b4 liao, ve...",86,13.953
4750,Ok lor... Or u wan me go look 4 u?,25,16.0


## Vectorize Text 

- Train the vectorizer on the Training data 
- Use the trained vectorizer to transform the train & test data 
    - This means that unlike before, we are not vectorizing the entire dataset first and then splitting into train/test afterwards 
    - Since we are only fitting/training the vectorizer on the training data, some words that are only in the test data would be unrecognized by the vectorizer (as it is only fitted/trained on the training data)
    - This process is the correct way to vectorize NLP data 
    - Some insight into this topic https://stackoverflow.com/questions/47778403/computing-tf-idf-on-the-whole-dataset-or-only-on-training-data

In [8]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(x_train['body_text'])

In [10]:
# Vectorize Training Data
train_arr = tfidf_vect_fit.transform(x_train['body_text'])
test_arr = tfidf_vect_fit.transform(x_test['body_text'])

In [None]:
# Create training & testing data DataFrames

In [14]:
df_train = pd.DataFrame(train_arr.toarray())
df_test = pd.DataFrame(test_arr.toarray())

In [22]:
df_train = pd.concat([x_train[['body_len','punct_percent']].reset_index(drop=True), df_train], axis=1)

In [24]:
df_test = pd.concat([x_test[['body_len','punct_percent']].reset_index(drop=True), df_test], axis=1)

In [28]:
df_train.head()

Unnamed: 0,body_len,punct_percent,0,1,2,3,4,5,6,7,...,7188,7189,7190,7191,7192,7193,7194,7195,7196,7197
0,28,3.571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,131,6.107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,75,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,77,10.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,11.111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
df_test.head()

Unnamed: 0,body_len,punct_percent,0,1,2,3,4,5,6,7,...,7188,7189,7190,7191,7192,7193,7194,7195,7196,7197
0,100,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,5.469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,27,3.704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,86,13.953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model Evaluation

In [30]:
import time 

In [37]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1 )

start_time = time.time() # start timer

rf_model = rf.fit(df_train, y_train) # fitted model
rf_pred = rf_model.predict(df_test) # predict using fitted model

fit_time = (time.time() - start_time) # fit time

rf_prec, rf_recall, rf_fscore, rf_support = score(y_test, rf_pred, pos_label='spam', average='binary') # evaluation metrics

print(f'Fit Time: {fit_time} / Precision: {rf_prec} / Recall: {rf_recall} / Accuracy: {round((rf_pred==y_test).sum()/len(rf_pred),3)}')

Fit Time: 6.965034246444702 / Precision: 1.0 / Recall: 0.821656050955414 / Accuracy: 0.975


In [38]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=7)

gb_model = rf.fit(df_train, y_train) # fitted model
gb_pred = rf_model.predict(df_test) # predict using fitted model

fit_time = (time.time() - start_time) # fit time

gb_prec, gb_recall, gb_fscore, gb_support = score(y_test, gb_pred, pos_label='spam', average='binary') # evaluation metrics

print(f'Fit Time: {fit_time} / Precision: {gb_prec} / Recall: {gb_recall} / Accuracy: {round((gb_pred==y_test).sum()/len(gb_pred),3)}')

Fit Time: 14.04383111000061 / Precision: 1.0 / Recall: 0.8407643312101911 / Accuracy: 0.978


## Results
- Both of the models are very close in performance 
- While they are both similar, the RandomForest model takes less time to fit 
- By small margins, the GB model does perform better, with higher scores for Recall and Accuracy
    - However, it is good to keep in mind that over several iterations of fiting & predicting, the results may sway to indicate the other model performs better. Testing more hyperparamaters, test variations, and more data can potentially indicate the better overall model