In [1]:
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import glob
import numpy as np
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline


np.random.seed(42)


In [2]:
def load_dataset(dir):
    dataset = []
    for file in glob.glob(dir + "/*"):
        with open(file, 'rb') as f:
            dataset.append(f.read().decode('utf-8', errors='ignore'))
    return dataset


In [3]:
spam_dir = 'Data\spam'
easy_ham_dir = 'Data\easy_ham'
hard_ham_dir = 'Data\hard_ham'


In [4]:
spam = load_dataset(spam_dir)
spam = spam[1:]
easy_ham = load_dataset(easy_ham_dir)
hard_ham = load_dataset(hard_ham_dir)


In [5]:
def remove_header(email):
    try:
        return email[email.index('\n\n')+1:]
    except:
        print('cannot find header')
        print(email)
        return email


In [6]:
def to_lower_case(email):
    return email.lower()


In [7]:
def find_url_regex(email):
    return re.findall(r'(https?://[^\s]+)', email)


In [8]:
def replace_url(email):
    urls = find_url_regex(email)
    for url in urls:
        email = email.replace(url, 'URL')
    return email


In [9]:
def find_numbers_regex(email):
    return re.findall(r'\d+', email)


def replace_numbers(email):
    numbers = find_numbers_regex(email)
    for number in numbers:
        email = email.replace(number, 'NUM')
    return email


In [10]:
def remove_punctuation(email):
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~\n\t'''
    for letter in email:
        if letter in punctuations:
            email = email.replace(letter, '')
    return email


In [11]:
def stem_words(email):
    ps = PorterStemmer()
    words = word_tokenize(email)
    stemmed_words = [ps.stem(word) for word in words]
    return ' '.join(stemmed_words)


In [12]:
class EmailPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, header=True, footer=True, num=True, punct=True, url=True, stem=True, to_lower=True):
        self.header = header
        self.footer = footer
        self.num = num
        self.punct = punct
        self.url = url
        self.stem = stem
        self.to_lower = to_lower

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.header:
            X = [remove_header(email) for email in X]
        if self.to_lower:
            X = [to_lower_case(email) for email in X]
        if self.url:
            X = [replace_url(email) for email in X]
        if self.num:
            X = [replace_numbers(email) for email in X]
        if self.punct:
            X = [remove_punctuation(email) for email in X]
        if self.stem:
            X = [stem_words(email) for email in X]
        return X


In [13]:
pipeline = Pipeline([('email_preprocessor', EmailPreprocessor()),
                    ('vectorizer', CountVectorizer(decode_error='ignore'))])


In [14]:
X = np.concatenate([spam, easy_ham, hard_ham])
y = np.concatenate([np.ones(len(spam)), np.zeros(
    len(easy_ham)), np.zeros(len(hard_ham))])


In [15]:
newX = pipeline.fit_transform(X)
Xtrain, Xtest, ytrain, ytest = train_test_split(
    newX, y, test_size=0.2, random_state=42)


In [16]:
svm_clf = SVC()
svm_clf.fit(Xtrain, ytrain)
pred = svm_clf.predict(Xtest)
accuracy_score(ytest, pred)


0.8819969742813918

In [17]:
lr_clf = LogisticRegression(max_iter=10000)
lr_clf.fit(Xtrain, ytrain)
pred = lr_clf.predict(Xtest)
accuracy_score(ytest, pred)


0.9818456883509834

In [18]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(Xtrain, ytrain)
pred = dt_clf.predict(Xtest)
accuracy_score(ytest, pred)


0.9576399394856279

In [32]:
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'penalty': ['l1', 'l2'], }


In [36]:
grid = GridSearchCV(LogisticRegression(max_iter=10000),param_grid,scoring='accuracy',verbose=5,cv=5,n_jobs=1)
grid.fit(Xtrain,ytrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END .................C=0.1, penalty=l2;, score=0.966 total time=   4.6s
[CV 2/5] END .................C=0.1, penalty=l2;, score=0.962 total time=   3.8s
[CV 3/5] END .................C=0.1, penalty=l2;, score=0.956 total time=   2.9s
[CV 4/5] END .................C=0.1, penalty=l2;, score=0.964 total time=   4.5s
[CV 5/5] END .................C=0.1, penalty=l2;, score=0.968 total time=   4.0s
[CV 1/5] END .....................C=1, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END .....................C=1, penalty=l

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP User\anaconda3\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP User\anaconda3\envs\ml\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\HP User\anaconda3\envs\ml\lib\site-packages\sklearn\linear_model\_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalt

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=10000), n_jobs=1,
             param_grid={'C': [0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']},
             scoring='accuracy', verbose=7)

In [37]:
grid.best_params_

{'C': 1, 'penalty': 'l2'}

In [38]:
grid.best_score_

0.9753787878787878

In [39]:
lr_best = LogisticRegression(**grid.best_params_,max_iter=10000)
lr_best.fit(Xtrain,ytrain)
pred = lr_best.predict(Xtest)
accuracy_score(ytest,pred)

0.9818456883509834

In [40]:
confusion_matrix(ytest,pred)

array([[535,   4],
       [  8, 114]], dtype=int64)

In [41]:
precision_score(ytest,pred)

0.9661016949152542

In [42]:
recall_score(ytest,pred)

0.9344262295081968