Reading and Cleaning of Data

In [7]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv('SMSSpamCollection.tsv', sep='\t',header=None)
data.columns = ['Label','Text']


def clean_text(text):
    no_punct = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+',no_punct)
    stemmed = [ps.stem(word) for word in tokens if word not in stopwords] 
    return stemmed

data['Text_len'] = data['Text'].apply(lambda x: len(x)-x.count(" "))
data["punct_%"] = data['Text'].apply(lambda x: round((len([char for char in x if char in string.punctuation])/(len(x)-x.count(" ")))*100,3))

data.head()


Unnamed: 0,Label,Text,Text_len,punct_%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.688
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.082
3,ham,Even my brother is not like to speak with me. ...,62,3.226
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.143


In [8]:
# Using the TF-IDF Vectorizer to convert text into numerical form.

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['Text'])

X_tfidf_features = pd.concat([data['Text_len'], data['punct_%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_tfidf_features.columns = X_tfidf_features.columns.astype(str)
X_tfidf_features.head()

Unnamed: 0,Text_len,punct_%,0,1,2,3,4,5,6,7,...,8181,8182,8183,8184,8185,8186,8187,8188,8189,8190
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Using the Count Vectorizer for vectirization of text data.

count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['Text'])

X_count_features = pd.concat([data['Text_len'], data['punct_%'], pd.DataFrame(X_count.toarray())], axis=1)
X_count_features.columns = X_count_features.columns.astype(str)
X_count_features.head()

Unnamed: 0,Text_len,punct_%,0,1,2,3,4,5,6,7,...,8181,8182,8183,8184,8185,8186,8187,8188,8189,8190
0,160,2.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.688,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,49,4.082,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62,3.226,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28,7.143,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

print(RandomForestClassifier().get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [11]:
data['Label'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 5568 entries, 0 to 5567
Series name: Label
Non-Null Count  Dtype 
--------------  ----- 
5568 non-null   object
dtypes: object(1)
memory usage: 43.6+ KB


In [12]:
# Random Forest with 5-fold cross validation

rf = RandomForestClassifier(n_jobs=-1)

k_fold = KFold(n_splits=5)
cross_val_score(rf, X_tfidf_features, data['Label'], cv=k_fold, scoring='accuracy', n_jobs=-1)

array([0.97666068, 0.98025135, 0.97666068, 0.96585804, 0.97394429])

In [13]:
# Random Forest with a holdout set
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf_features, data['Label'], test_size=0.2)
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

In [14]:
sorted(zip(rf.feature_importances_,X_train.columns),reverse=True)[0:10]

[(np.float64(0.05841429714977207), 'Text_len'),
 (np.float64(0.04842010899834462), '1819'),
 (np.float64(0.035709875351445104), '4838'),
 (np.float64(0.031026661833053647), '3159'),
 (np.float64(0.030004886017932417), '2048'),
 (np.float64(0.019180499381649427), '7422'),
 (np.float64(0.01624436393889786), '397'),
 (np.float64(0.0155366858725338), '5779'),
 (np.float64(0.01547773991724206), '5123'),
 (np.float64(0.014981658001607548), '6343')]

In [15]:
y_pred = rf_model.predict(X_test)
precision,recall,fscore,support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision,3),
                                                         round(recall,3),
                                                         round((y_pred==y_test).sum()/len(y_pred),3)))

Precision: 1.0 / Recall: 0.548 / Accuracy: 0.937


In [16]:
# Random Forest with Grid Search

def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision,recall,fscore,support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision,3), round(recall,3), round((y_pred==y_test).sum()/len(y_pred),3)))

In [17]:
for n_est in [10,50,100]:
    for depth in [10,20,40,None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 1.0 / Recall: 0.219 / Accuracy: 0.891
Est: 10 / Depth: 20 ---- Precision: 1.0 / Recall: 0.516 / Accuracy: 0.933
Est: 10 / Depth: 40 ---- Precision: 1.0 / Recall: 0.71 / Accuracy: 0.96
Est: 10 / Depth: None ---- Precision: 0.983 / Recall: 0.729 / Accuracy: 0.961
Est: 50 / Depth: 10 ---- Precision: 1.0 / Recall: 0.168 / Accuracy: 0.884
Est: 50 / Depth: 20 ---- Precision: 1.0 / Recall: 0.581 / Accuracy: 0.942
Est: 50 / Depth: 40 ---- Precision: 0.981 / Recall: 0.684 / Accuracy: 0.954
Est: 50 / Depth: None ---- Precision: 0.984 / Recall: 0.774 / Accuracy: 0.967
Est: 100 / Depth: 10 ---- Precision: 1.0 / Recall: 0.213 / Accuracy: 0.89
Est: 100 / Depth: 20 ---- Precision: 1.0 / Recall: 0.6 / Accuracy: 0.944
Est: 100 / Depth: 40 ---- Precision: 0.982 / Recall: 0.703 / Accuracy: 0.957
Est: 100 / Depth: None ---- Precision: 1.0 / Recall: 0.787 / Accuracy: 0.97


In [18]:
# Random Forest using Grid Search CV

from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
param = {'n_estimators':[10,150,300],
         'max_depth':[30,60,90,None]}
gs = GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_tfidf = gs.fit(X_tfidf_features,data['Label'])
pd.DataFrame(gs_tfidf.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,54.154308,0.45445,0.507985,0.020298,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978456,0.977558,0.973968,0.967655,0.972147,0.973957,0.003904,1
6,3.1864,0.274719,0.229581,0.018783,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.977558,0.978456,0.973968,0.96496,0.974843,0.973957,0.004795,1
10,29.58242,1.036349,0.376584,0.029611,,150,"{'max_depth': None, 'n_estimators': 150}",0.976661,0.978456,0.975763,0.966757,0.972147,0.973957,0.004146,3
11,49.724098,0.579079,0.353867,0.057864,,300,"{'max_depth': None, 'n_estimators': 300}",0.977558,0.974865,0.974865,0.967655,0.973944,0.973778,0.003292,4
7,27.418532,0.609725,0.351145,0.019441,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.976661,0.978456,0.975763,0.96496,0.973046,0.973777,0.004742,5


In [19]:
gs_count = gs.fit(X_count_features,data['Label'])
pd.DataFrame(gs_count.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,48.872508,0.709928,0.373385,0.046213,,300,"{'max_depth': None, 'n_estimators': 300}",0.978456,0.976661,0.97307,0.965858,0.973046,0.973418,0.00432,1
7,27.403618,0.479911,0.368512,0.010677,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.979354,0.97307,0.974865,0.965858,0.973046,0.973239,0.004349,2
8,53.112026,0.571892,0.533696,0.020611,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.975763,0.973968,0.974865,0.967655,0.973046,0.973059,0.00285,3
10,29.049883,0.814027,0.388999,0.031546,,150,"{'max_depth': None, 'n_estimators': 150}",0.976661,0.974865,0.973968,0.967655,0.971249,0.97288,0.003143,4
4,23.00085,0.255744,0.345485,0.023144,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.974865,0.973968,0.972172,0.963163,0.973046,0.971443,0.004237,5
