# Spam Detection Model: Final Model Selection

In [8]:
import nltk
#import os
import string 
import re 
import pandas as pd 
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

## Read Data

In [2]:
path = r"C:\Users\sbhati\OneDrive - George Weston Limited-6469347-MTCAD\sbhati\Documents\Personal\LinkedIn_NLP"
data = pd.read_csv(path+'\SMSSpamCollection.tsv', sep='\t', )
data.columns = ['label', 'body_text']
data.head()

Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


## Clean Text 

In [3]:
# Import the SnowballStemmer, get list of punctuations
ss = nltk.SnowballStemmer('english')
punct = string.punctuation

In [4]:
# Clean text function
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = ''.join([word.lower() for word in text if word not in string.punctuation]) # Remove punctuation
    tokens = re.split('\W+',text) # Tokenize: Split on any character that is not alphanumeric
    text = [ss.stem(word) for word in tokens if word not in stopwords] # Remove stopwords & stem
    #text = [ss.stem(word) for word in tokenized_text] # Stemming
    
    return text

# Function to count punctuation
def punct_count(text): # Count the percent of text that are punctuation
    return round((sum([1 for char in text if char in punct])/(len(text) - text.count(" ")))*100,3)

In [5]:
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct_percent'] = data['body_text'].apply(lambda x: punct_count(x))
data.head()

Unnamed: 0,label,body_text,body_len,punct_percent
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,128,4.688
1,ham,"Nah I don't think he goes to usf, he lives around here though",49,4.082
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,62,3.226
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.143
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,135,4.444


## Split Train/Test

In [11]:
x_train, x_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct_percent']], 
                                                    data['label'],
                                                    test_size=0.2,
                                                    random_state=2)

## Vectorize Text 

- Train the vectorizer on the Training data 
- Use the trained vectorizer to transform the train & test data 
    - This means that unlike before, we are not vectorizing the entire dataset first and then splitting into train/test afterwards 
    - Since we are only fitting/training the vectorizer on the training data, some words that are only in the test data would be unrecognized by the vectorizer (as it is only fitted/trained on the training data)
    - This process is the correct way to vectorize NLP data 
    - Some insight into this topic https://stackoverflow.com/questions/47778403/computing-tf-idf-on-the-whole-dataset-or-only-on-training-data

## 