# Email Classifier

In [4]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import os

In [5]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [8]:

data_path = "{0}/spam_ham_dataset.csv".format(os.getcwd())
#Read csv file
data = pd.read_csv(data_path)[['label', 'text']]
print(data.head())

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in text if word not in stopwords]
    return text

  label                                               text
0   ham   enron methanol ; meter # : 988291\r\nthis is ...
1   ham   hpl nom for january 9 , 2001\r\n( see attache...
2   ham   neon retreat\r\nho ho ho , we ' re around to ...
3  spam   photoshop , windows , office . cheap . main t...
4   ham   re : indian springs\r\nthis deal is to book t...


## Split into train/test

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['text', 'body_len', 'punct%']], data['label'], test_size=0.2)

## Vectorize text

In [10]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['text'])

tfidf_train = tfidf_vect_fit.transform(X_train['text'])
tfidf_test = tfidf_vect_fit.transform(X_test['text'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
0,411,6.6,0.0,0.0,0.0,0.084564,0.084564,0.0,0.0,0.0,...,0.197625,0.187939,0.105454,0.0,0.239137,0.11625,0.050764,0.029308,0.0,0.0
1,605,14.7,0.0,0.0,0.0,0.06533,0.06533,0.0,0.0,0.0,...,0.084083,0.083596,0.018104,0.034113,0.109968,0.07369,0.020916,0.03019,0.0,0.0
2,185,5.4,0.0,0.0,0.0,0.085515,0.085515,0.0,0.0,0.0,...,0.104268,0.103665,0.035546,0.0,0.172733,0.054257,0.041068,0.019759,0.0,0.035992
3,195,4.1,0.0,0.0,0.0,0.15471,0.15471,0.0,0.0,0.0,...,0.216147,0.156289,0.080387,0.0,0.117188,0.102249,0.0,0.178733,0.0,0.0
4,135,0.7,0.0,0.0,0.0,0.023959,0.023959,0.0,0.0,0.0,...,0.121721,0.266237,0.124489,0.0,0.169382,0.050671,0.028765,0.138396,0.0,0.0


## Final evaluation of models

In [11]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [16]:
### Random ForestClassifier

In [12]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 1.146 / Predict time: 0.082 ---- Precision: 0.863 / Recall: 0.736 / Accuracy: 0.893


### Gradient Boosting Classifier

In [13]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 21.224 / Predict time: 0.02 ---- Precision: 0.86 / Recall: 0.777 / Accuracy: 0.901


### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
lor = LogisticRegression()
start = time.time()
lor_model = lor.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = lor_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 0.117 / Predict time: 0.003 ---- Precision: 0.787 / Recall: 0.582 / Accuracy: 0.838


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### K Nearest Neighbour

In [15]:
from sklearn.neighbors import KNeighborsClassifier

weights = ['uniform', 'distance']
algos = ['ball_tree', 'kd_tree', 'brute']
neighs = [5, 10, 15, 20, 30, 50]
for weight in weights:
    for algo in algos:
        for neigh in neighs:
            knn = KNeighborsClassifier(weights = weight, algorithm = algo, n_neighbors = neigh)
            start = time.time()
            knn_model = knn.fit(X_train_vect, y_train)
            end = time.time()
            fit_time = (end - start)

            start = time.time()
            y_pred = knn_model.predict(X_test_vect)
            end = time.time()
            pred_time = (end - start)

            precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
            print('Weight: {} / Algo: {} / Neighbors: {} / Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(weight,
                algo, neigh, round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Weight: uniform / Algo: ball_tree / Neighbors: 5 / Fit time: 0.05 / Predict time: 0.082 ---- Precision: 0.469 / Recall: 0.308 / Accuracy: 0.706
Weight: uniform / Algo: ball_tree / Neighbors: 10 / Fit time: 0.045 / Predict time: 0.062 ---- Precision: 0.523 / Recall: 0.192 / Accuracy: 0.723
Weight: uniform / Algo: ball_tree / Neighbors: 15 / Fit time: 0.035 / Predict time: 0.133 ---- Precision: 0.468 / Recall: 0.199 / Accuracy: 0.71
Weight: uniform / Algo: ball_tree / Neighbors: 20 / Fit time: 0.037 / Predict time: 0.067 ---- Precision: 0.516 / Recall: 0.161 / Accuracy: 0.721
Weight: uniform / Algo: ball_tree / Neighbors: 30 / Fit time: 0.035 / Predict time: 0.072 ---- Precision: 0.529 / Recall: 0.123 / Accuracy: 0.722
Weight: uniform / Algo: ball_tree / Neighbors: 50 / Fit time: 0.041 / Predict time: 0.086 ---- Precision: 0.596 / Recall: 0.116 / Accuracy: 0.729
Weight: uniform / Algo: kd_tree / Neighbors: 5 / Fit time: 0.066 / Predict time: 0.087 ---- Precision: 0.469 / Recall: 0.308 / 