In [2]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv('SMSSpamCollection.tsv', sep='\t',header=None)
data.columns = ['Label','Text']


def clean_text(text):
    no_punct = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+',no_punct)
    stemmed = [ps.stem(word) for word in tokens if word not in stopwords] 
    return stemmed

data['Text_len'] = data['Text'].apply(lambda x: len(x)-x.count(" "))
data["punct_%"] = data['Text'].apply(lambda x: round((len([char for char in x if char in string.punctuation])/(len(x)-x.count(" ")))*100,3))

data.head()

Unnamed: 0,Label,Text,Text_len,punct_%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.688
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.082
3,ham,Even my brother is not like to speak with me. ...,62,3.226
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.143


In [None]:
# Using the TF-IDF Vectorizer to convert text into numerical form.

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['Text'])

X_tfidf_feat = pd.concat([data['Text_len'], data['punct_%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_tfidf_feat.columns = X_tfidf_feat.columns.astype(str)
X_tfidf_feat.head()

Unnamed: 0,Text_len,punct_%,0,1,2,3,4,5,6,7,...,8181,8182,8183,8184,8185,8186,8187,8188,8189,8190
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['Text'])
X_count_feat = pd.concat([data['Text_len'], data['punct_%'], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feat.head()

Unnamed: 0,Text_len,punct_%,0,1,2,3,4,5,6,7,...,8181,8182,8183,8184,8185,8186,8187,8188,8189,8190
0,160,2.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.688,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,49,4.082,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62,3.226,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28,7.143,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_feat, data['Label'], test_size=0.2)

In [6]:
def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} / LR: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        est, max_depth, lr, round(precision, 3), round(recall, 3), 
        round((y_pred==y_test).sum()/len(y_pred), 3)))

In [7]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1]:
            train_GB(n_est, max_depth, lr)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Est: 50 / Depth: 3 / LR: 0.01 ---- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.87
Est: 50 / Depth: 3 / LR: 0.1 ---- Precision: 0.908 / Recall: 0.745 / Accuracy: 0.957
Est: 50 / Depth: 7 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.007 / Accuracy: 0.871
Est: 50 / Depth: 7 / LR: 0.1 ---- Precision: 0.9 / Recall: 0.807 / Accuracy: 0.963
Est: 50 / Depth: 11 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.028 / Accuracy: 0.873
Est: 50 / Depth: 11 / LR: 0.1 ---- Precision: 0.889 / Recall: 0.828 / Accuracy: 0.964
Est: 50 / Depth: 15 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.007 / Accuracy: 0.871
Est: 50 / Depth: 15 / LR: 0.1 ---- Precision: 0.876 / Recall: 0.828 / Accuracy: 0.962
Est: 100 / Depth: 3 / LR: 0.01 ---- Precision: 0.879 / Recall: 0.552 / Accuracy: 0.932
Est: 100 / Depth: 3 / LR: 0.1 ---- Precision: 0.919 / Recall: 0.779 / Accuracy: 0.962
Est: 100 / Depth: 7 / LR: 0.01 ---- Precision: 0.895 / Recall: 0.703 / Accuracy: 0.951
Est: 100 / Depth: 7 / LR: 0.1 ---- Precision: 0.91 / Recall: 0.84

In [None]:
# Using Grid-Search CV for parameter tuning
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150], 
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

clf = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_tfidf_feat, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]