# Building Machine Learning Classifier - Model Selection and Evaluation

## Reading and Cleaning the data

In [3]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords


dtext = pd.read_csv('SMSSpamCollection.tsv',sep='\t',header= None)
dtext.columns = ['label','data']

def per_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")),3)*100

dtext['data_len'] = dtext['data'].apply(lambda x: len(x) - x.count(" ")) #length of each SMS
dtext['percent_punct'] = dtext['data'].apply(lambda x: per_punct(x))

#string.punctuation
stopword = stopwords.words('english')
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

def clean_data(text):
    text_nopunc = "".join([char for char in text  if char not in string.punctuation])
    tokens = re.split('\W+',text_nopunc) #'\W+' indicates non word character
    text_stemmed = [ps.stem(word) for word in tokens if word not in stopword]
    #text_lemma = [wn.lemmatize(word) for word in text] #lemmatization
    return text_stemmed

In [4]:
dtext.head()

Unnamed: 0,label,data,data_len,percent_punct
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


## Split into train/test set

In [8]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(dtext[['data','data_len','percent_punct']],dtext['label'],test_size = 0.2)

## Vectorize Text

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

count_vect_tfidf = TfidfVectorizer(analyzer = clean_data)
x_tfidf = count_vect_tfidf.fit(x_train['data'])

tfidf_train = x_tfidf.transform(x_train['data'])
tfidf_test = x_tfidf.transform(x_test['data'])

X_train_features = pd.concat([x_train[['data_len','percent_punct']].reset_index(drop=True),pd.DataFrame(tfidf_train.toarray())],axis=1)
X_test_features = pd.concat([x_test[['data_len','percent_punct']].reset_index(drop=True),pd.DataFrame(tfidf_test.toarray())],axis=1)

X_train_features.head()

Unnamed: 0,data_len,percent_punct,0,1,2,3,4,5,6,7,...,7411,7412,7413,7414,7415,7416,7417,7418,7419,7420
0,57,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,45,8.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.310864,0.0,0.0,0.0
3,33,15.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,108,4.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Building the model - Random Forest and Gradient Boosting

In [17]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time


rf = RandomForestClassifier(n_estimators = 150, max_depth = None, n_jobs = -1)

start = time.time()
rf_model =rf.fit(X_train_features, y_train)
end = time.time()
fit_time = (end - start)


start = time.time()
y_pred = rf_model.predict(X_test_features)
end = time.time()
pred_time = (end - start)

precision,recall,fscore,support = score(y_test,y_pred, pos_label = 'spam', average='binary')
print('Fit Time {} / Prediction: {} ------ Precision: {} / Recall: {} / Accuracy: {}'.format(round(fit_time,3),
                                                                                             round(pred_time,3),
                                                                                             round(precision,3),
                                                         round(recall,3),
                                                         round(((y_test==y_pred).sum()/len(y_pred)),3)))

Fit Time 5.941 / Prediction: 0.209 ------ Precision: 1.0 / Recall: 0.836 / Accuracy: 0.978


In [19]:

gbs = GradientBoostingClassifier(n_estimators = 150, max_depth = 11)

start = time.time()
gbs_model = gbs.fit(X_train_features, y_train)
end = time.time()
fit_time = (end - start)


start = time.time()
y_pred = gbs_model.predict(X_test_features)
end = time.time()
pred_time = (end - start)

precision,recall,fscore,support = score(y_test,y_pred, pos_label = 'spam', average='binary')
print('Fit Time {} / Prediction: {} ------ Precision: {} / Recall: {} / Accuracy: {}'.format(round(fit_time,3),
                                                                                             round(pred_time,3),
                                                                                             round(precision,3),
                                                         round(recall,3),
                                                         round(((y_test==y_pred).sum()/len(y_pred)),3)))

Fit Time 224.528 / Prediction: 0.094 ------ Precision: 0.946 / Recall: 0.836 / Accuracy: 0.972
