# Spam Detection Model

In [1]:
import nltk
import string 
import re 
import pandas as pd 
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
pd.set_option('display.max_colwidth', 100)

In [2]:
import os
cwd = os.getcwd()
print(cwd)

C:\Users\sbhati\OneDrive - George Weston Limited-6469347-MTCAD\sbhati\Documents\Personal\LinkedIn_NLP\Final_Project


In [3]:
path = r"C:\Users\sbhati\OneDrive - George Weston Limited-6469347-MTCAD\sbhati\Documents\Personal\LinkedIn_NLP"

In [4]:
data = pd.read_csv(path+'\SMSSpamCollection.tsv', sep='\t', )
data.columns = ['label', 'body_text']
data.head()

Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


## Clean data 
- Remove punctuation
- Tokenize
- Remove stopwords
- Use the Snowball Stemmer to truncate the words 
    - The Snowball Stemmer is an updrage to the Porter Stemmer available in the NLTK library 

In [5]:
# Import the SnowballStemmer, get list of punctuations

ss = nltk.SnowballStemmer('english')
punct = string.punctuation

In [6]:
# Custom Functions 

def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = ''.join([word.lower() for word in text if word not in string.punctuation]) # Remove punctuation
    tokens = re.split('\W+',text) # Tokenize: Split on any character that is not alphanumeric
    text = [ss.stem(word) for word in tokens if word not in stopwords] # Remove stopwords & stem
    #text = [ss.stem(word) for word in tokenized_text] # Stemming
    
    return text

In [7]:
data['body_clean'] = data['body_text'].apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,label,body_text,body_clean
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goe, usf, live, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aid, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, mell, mell, oru, minnaminungint, nurungu, vettam, set, callertun, caller, press, ..."


## Apply TF-IDF Vectorizer

In [8]:
idf_vect = TfidfVectorizer(analyzer=clean_text)
idf_arr = idf_vect.fit_transform(data['body_text'])

In [9]:
idf_df = pd.DataFrame(idf_arr.toarray())
idf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8092,8093,8094,8095,8096,8097,8098,8099,8100,8101
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Engineering
- % of punctuation per document
- Body length of each document

Note:  These features were explored in another notebook

In [10]:
def punct_count(text): # Count the percent of text that are punctuation
    return round((sum([1 for char in text if char in punct])/(len(text) - text.count(" ")))*100,3)

In [11]:
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct_percent'] = data['body_text'].apply(lambda x: punct_count(x))
data.head()

Unnamed: 0,label,body_text,body_clean,body_len,punct_percent
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...",128,4.688
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goe, usf, live, around, though]",49,4.082
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aid, patent]",62,3.226
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]",28,7.143
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, mell, mell, oru, minnaminungint, nurungu, vettam, set, callertun, caller, press, ...",135,4.444


In [12]:
print(data.shape)
print(idf_df.shape)

(5567, 5)
(5567, 8102)


In [13]:
idf_df = pd.concat([data['body_len'], data['punct_percent'], idf_df], axis=1)

In [14]:
idf_df.head()

Unnamed: 0,body_len,punct_percent,0,1,2,3,4,5,6,7,...,8092,8093,8094,8095,8096,8097,8098,8099,8100,8101
0,128,4.688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Random-Forest Classifier 

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
print(dir(RandomForestClassifier))

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_check_n_features', '_estimator_type', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_oob_score', '_validate_X_predict', '_validate_data', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']


In [17]:
rf = RandomForestClassifier()
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Cross-Validation 

In [18]:
from sklearn.model_selection import KFold, cross_val_score

In [19]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)

In [20]:
# model, features, labels, cross-validation split, scoring method, n_jobs)
cv_score = cross_val_score(rf, idf_df, data['label'], cv=k_fold, scoring='accuracy',n_jobs=-1)

In [21]:
print(cv_score)

[0.97576302 0.97935368 0.97304582 0.96495957 0.96855346]


- The scores above represent accuracy achieved for each iteration of the k-fold 

## Random Forest on a holdout test set 

In [22]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [23]:
x_train, x_test, y_train, y_test = train_test_split(idf_df, data['label'], test_size=0.2)

In [24]:
# We will creation another random forest classifier 
rf_2 = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf2_model = rf_2.fit(x_train,y_train) # train the model

In [25]:
sorted(zip(rf2_model.feature_importances_, x_train.columns), reverse=True)[0:10]

[(0.06979623473773314, 'body_len'),
 (0.04604655470142143, 7346),
 (0.0374109876463628, 4796),
 (0.03599494763107086, 3134),
 (0.026766501014024023, 2036),
 (0.0229853197500519, 1808),
 (0.02239104871720843, 5723),
 (0.017207110849646972, 7020),
 (0.016573598425197088, 1366),
 (0.015696357184651143, 394)]

- Use the **'seed'** paramater in the train_test_split to keep the same random sample each time the code is run or the scores & model paramters will change each time 

**Predict**

In [26]:
y_pred = rf2_model.predict(x_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam',average='binary')

In [27]:
print(f'Precision: {precision}\nRecall: {round(recall,3)}\nfscore: {round(fscore,3)}\nAccuracy:{round((y_pred==y_test).sum()/len(y_pred),3)}')

Precision: 1.0
Recall: 0.572
fscore: 0.728
Accuracy:0.944


## Grid Search on Model

### Manually build Grid-search

In [28]:
def train_RF(n_est, depth):
    rf_3 = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf3_model = rf_3.fit(x_train,y_train)
    y_pred = rf3_model.predict(x_test)
    precision, recall, fscore, support = score(y_test,y_pred,pos_label='spam', average='binary')
    print(f'Est:{n_est} / Depth:{depth} ---- Precision: {precision} / Recall: {recall} / Accuracy: {round((y_pred==y_test).sum()/len(y_pred),3)}')

In [29]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est:10 / Depth:10 ---- Precision: 1.0 / Recall: 0.27586206896551724 / Accuracy: 0.906
Est:10 / Depth:20 ---- Precision: 1.0 / Recall: 0.5172413793103449 / Accuracy: 0.937
Est:10 / Depth:30 ---- Precision: 1.0 / Recall: 0.6689655172413793 / Accuracy: 0.957
Est:10 / Depth:None ---- Precision: 0.9818181818181818 / Recall: 0.7448275862068966 / Accuracy: 0.965
Est:50 / Depth:10 ---- Precision: 1.0 / Recall: 0.2206896551724138 / Accuracy: 0.899
Est:50 / Depth:20 ---- Precision: 1.0 / Recall: 0.5655172413793104 / Accuracy: 0.943
Est:50 / Depth:30 ---- Precision: 1.0 / Recall: 0.6551724137931034 / Accuracy: 0.955
Est:50 / Depth:None ---- Precision: 0.9818181818181818 / Recall: 0.7448275862068966 / Accuracy: 0.965
Est:100 / Depth:10 ---- Precision: 1.0 / Recall: 0.2482758620689655 / Accuracy: 0.902
Est:100 / Depth:20 ---- Precision: 1.0 / Recall: 0.5793103448275863 / Accuracy: 0.945
Est:100 / Depth:30 ---- Precision: 1.0 / Recall: 0.6689655172413793 / Accuracy: 0.957
Est:100 / Depth:None ---- P

## Random Forest w/ GridSearchCV

In [30]:
from sklearn.model_selection import GridSearchCV

In [32]:
rf4 = RandomForestClassifier()
param = {'n_estimators':[10, 150, 300], # Parameters stored in dictionary. The GridSearchCV will loop over these parameters
        'max_depth':[30, 60, 90, None]}

gs = GridSearchCV(rf4, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(idf_df, data['label'])

In [33]:
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,46.012506,0.398973,0.572669,0.018172,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.977558,0.977558,0.974843,0.967655,0.972147,0.973952,0.003734,1
11,85.804452,1.505496,0.661002,0.058976,,300,"{'max_depth': None, 'n_estimators': 300}",0.978456,0.976661,0.973944,0.968553,0.97035,0.973593,0.003717,2
8,91.79839,3.075617,0.922462,0.154095,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.979354,0.975763,0.973944,0.968553,0.969452,0.973413,0.00401,3
3,3.902802,0.21866,0.358968,0.027883,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.979354,0.978456,0.968553,0.965858,0.973944,0.973233,0.00532,4
10,49.782437,1.83675,0.560596,0.043342,,150,"{'max_depth': None, 'n_estimators': 150}",0.974865,0.975763,0.975741,0.967655,0.97035,0.972875,0.003291,5


**A 'scoring' parameter can be passed to the GridSearchCV to output the best model based on the score you chose --> reclass, precision, ROC AUC, etc**