# Preparation


In [2]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold
from sklearn.pipeline import make_pipeline
import nltk
import re
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.extmath import density
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from time import time
import matplotlib.pyplot as plt
import seaborn.objects as so

nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data = np.array(pd.read_csv('drive/MyDrive/DataSet/Processed_Email').append(pd.read_csv('drive/MyDrive/DataSet/Processed_Spam')).dropna())

  data = np.array(pd.read_csv('drive/MyDrive/DataSet/Processed_Email').append(pd.read_csv('drive/MyDrive/DataSet/Processed_Spam')).dropna())


In [5]:
data = data.T

words = set(nltk.corpus.words.words())

for i in range(data.shape[1]):
  data[0,i] = re.sub(r'[0-9]+', '', " ".join(w for w in nltk.wordpunct_tokenize(str(data[0,i])) if w in words and len(w) != 1))

In [6]:
temp = data
print(temp)

[['user id original message highlight sent june highlight subject june daily labor report daily labor report highlight table content june registered web subscriber access full text article link information becoming subscriber free web trial available call customer relation mon highlight circuit overturn decision workplace conduct policy firm two workplace policy barring abusive threatening language solicitation distribution constitute unfair labor practice district circuit rule national labor relation board decision abb na board workplace conduct policy abb transportation na employee exercise right national labor relation act board ordered second election determine whether employee machinist automotive trade district lodge northern appeal court find lack jurisdiction consider appeal order new election final order court reject board view ban abusive threatening language unfair labor practice regardless whether actually chilled employee exercise right judge find rule legal conduct workin

In [7]:
data = pd.DataFrame(temp)

In [8]:
data = data.T
data.drop_duplicates(inplace=True)
data[0].replace('', np.nan, inplace=True)
data.dropna(inplace=True)
data = data.T

In [9]:
data = np.array(data)
X = data[0]
# Made 1 to signify spam instead of 0
y = 1 - data[1].astype(int)
print(data.shape)
print(np.unique(y, return_counts=True))

(2, 100620)
(array([0, 1]), array([61476, 39144]))


In [10]:
print(X[-100:])
print(y[-100:])

['drug male impotence duty result god men woman study discover ignorance nothing great world accomplished without passion'
 'think see light end zone boyer know youve landed take full power taxi arbitrary system system nothing general said save nothing general said hero get girl villain get girl everybody movie get girl one problem internalize everything cant express anger grow tumor instead woody measure hail golf ball would rather trust woman instinct man reason well reality year doctor happy state finally dont read doesnt say anything profound anyway man cynical capital sin common'
 'probably aware traditional animator long time worked magic drawing layer president specular security tight opening show rock roll hall fame museum fear material might stolen town sitting comfortably python nothing beat good day second ridicule philosophy really philosophize drive roomie nut shave one eyebrow sharp shiny like icicle true love nice catch stone tour wave know'
 'world record holder blowing

In [11]:
print(np.count_nonzero(data[1]==0)/data.shape[1] * 100)

38.90280262373286


# Feature Extraction

In [12]:
def benchmark2(X, y, vec, r):
    # Using stratified K-folds CV due to class imbalance, ensures each fold contains good distribution of ham and spam
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=r)

    # Dictionaries to store the performance data, feel free to add more
    ext_t = 0
    n_features = 0
    mnb_metrics = {"train_t":0, "train_acc":0, "test_acc":0, "f1_score":0, "mcc":0, "tcr":0, "precision":0, "recall":0}
    sgd_metrics = {"train_t":0, "train_acc":0, "test_acc":0, "f1_score":0, "mcc":0, "tcr":0, "precision":0, "recall":0}
    # TCR (Total cost ratio) is defined:
    # Real total number of spams / (cost * false positives + false negatives)
    # TCR < 1 implies the model misclassifies hams in an undesirable rate. The larger the number, the better.
    # Idea is that misclassifying hams as spams are much more damaging than misclassifying spams as hams

    # Iterate the K folds
    for i, (train_index, test_index) in tqdm(enumerate(kf.split(X, y)), total=kf.get_n_splits()):
      # Initializing datasets, classweights and models
      X_train = X[train_index]
      y_train = y[train_index]
      X_test = X[test_index]
      y_test = y[test_index]
      classWeight = compute_class_weight(class_weight="balanced",classes=np.unique(y_train),y=y_train)
      classWeight = {0:classWeight[0], 1:classWeight[1]}
      clf1 = MultinomialNB(alpha=0.001)
      clf2 = SGDClassifier(tol=1e-4, class_weight=classWeight)

      # Feature extraction
      t0 = time()
      X_train = vec.fit_transform(X_train)
      X_test = vec.transform(X_test)
      ext_t += time() - t0
      n_features += X_train.shape[1]

      # Feature selection
      fs = SelectKBest(chi2, k=100000)
      X_train = fs.fit_transform(X_train, y_train)
      X_test = fs.transform(X_test)

      # Fitting MNB
      t0 = time()
      clf1.fit(X_train, y_train)
      mnb_metrics['train_t'] += time() - t0

      # Fitting SGD
      t0 = time()
      clf2.fit(X_train, y_train)
      sgd_metrics['train_t'] += time() - t0

      # Testing MNB
      pred = clf1.predict(X_test)
      mnb_metrics['train_acc'] += clf1.score(X_train, y_train)
      mnb_metrics['test_acc'] += clf1.score(X_test, y_test)
      mnb_metrics['f1_score'] += metrics.f1_score(y_test, pred)
      mnb_metrics['mcc'] += metrics.matthews_corrcoef(y_test, pred)
      mnb_metrics['precision'] += metrics.precision_score(y_test, pred)
      mnb_metrics['recall'] += metrics.recall_score(y_test, pred)
      tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
      mnb_metrics['tcr'] += np.count_nonzero((y_test==1).astype(int))/(9*fp+fn)

      # Testing SGD
      pred = clf2.predict(X_test)
      sgd_metrics['train_acc'] += clf2.score(X_train, y_train)
      sgd_metrics['test_acc'] += clf2.score(X_test, y_test)
      sgd_metrics['f1_score'] += metrics.f1_score(y_test, pred)
      sgd_metrics['mcc'] += metrics.matthews_corrcoef(y_test, pred)
      sgd_metrics['precision'] += metrics.precision_score(y_test, pred)
      sgd_metrics['recall'] += metrics.recall_score(y_test, pred)
      tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
      sgd_metrics['tcr'] += np.count_nonzero((y_test==1).astype(int))/(9*fp+fn)

    # Averaging metrics over K folds
    ext_t /= kf.get_n_splits()
    n_features /= kf.get_n_splits()
    for key in mnb_metrics.keys():
      mnb_metrics[key] /= kf.get_n_splits()
      sgd_metrics[key] /= kf.get_n_splits()

    return ext_t, n_features, mnb_metrics, sgd_metrics

In [13]:
names = ["bag of n-gram", "TF-IDF", "hashing vectorizor"]
funcs = [CountVectorizer, TfidfVectorizer, HashingVectorizer]
r = 1337
for i in range(4,5):
  for j in range(2,3):
    if i < 3 and j < 2:
      continue
    X = data[0]
    y = data[1].astype(int)
    for k in range(1,i+1):
      print("="*50)
      print("Performing {0} with ngram_range = ({1}, {2})".format(names[j], k ,i))
      try:
        vec = funcs[j](n_features=100000*(101*i-100), ngram_range = (k,i), alternate_sign=False)
      except:
        vec = funcs[j](ngram_range = (k,i))
      ext_t, n_features, mnb_metrics, sgd_metrics = benchmark2(X, y, vec, r)
      print()
      print("Extraction time: {0:0.5f}, No. of features: {1:0.3f}".format(ext_t, n_features))
      print("Naive Bayes:")
      print("Train time: {0:0.5f}, Train accuracy: {1:0.5f}, Test accuracy: {2:0.5f}, F1 score: {3:0.5f}".format(mnb_metrics['train_t'], mnb_metrics['train_acc'], mnb_metrics['test_acc'], mnb_metrics['f1_score']))
      print("MCC: {0:0.5f}, TCR: {1:0.5f} Precision: {2:0.5f}, Recall: {3:0.5f},".format(mnb_metrics['mcc'], mnb_metrics['tcr'], mnb_metrics['precision'], mnb_metrics['recall']))
      print("SGDClassifier: ")
      print("Train time: {0:0.5f}, Train accuracy: {1:0.5f}, Test accuracy: {2:0.5f}, F1 score: {3:0.5f}".format(sgd_metrics['train_t'], sgd_metrics['train_acc'], sgd_metrics['test_acc'], sgd_metrics['f1_score']))
      print("MCC: {0:0.5f}, TCR: {1:0.5f} Precision: {2:0.5f}, Recall: {3:0.5f},".format(sgd_metrics['mcc'], sgd_metrics['tcr'], sgd_metrics['precision'], sgd_metrics['recall']))

Performing hashing vectorizor with ngram_range = (1, 4)


100%|██████████| 10/10 [1:01:09<00:00, 366.97s/it]



Extraction time: 28.82798, No. of features: 30400000.000
Naive Bayes:
Train time: 0.12800, Train accuracy: 0.94947, Test accuracy: 0.93654, F1 score: 0.95039
MCC: 0.86949, TCR: 1.12286 Precision: 0.90984, Recall: 0.99473,
SGDClassifier: 
Train time: 1.12451, Train accuracy: 0.94800, Test accuracy: 0.94510, F1 score: 0.95412
MCC: 0.88708, TCR: 3.52117 Precision: 0.97462, Recall: 0.93448,
Performing hashing vectorizor with ngram_range = (2, 4)


100%|██████████| 10/10 [1:00:20<00:00, 362.03s/it]



Extraction time: 24.96698, No. of features: 30400000.000
Naive Bayes:
Train time: 0.06568, Train accuracy: 0.93683, Test accuracy: 0.91544, F1 score: 0.93489
MCC: 0.82717, TCR: 0.83987 Precision: 0.88286, Recall: 0.99346,
SGDClassifier: 
Train time: 0.76039, Train accuracy: 0.86656, Test accuracy: 0.86211, F1 score: 0.87436
MCC: 0.74915, TCR: 3.18656 Precision: 0.98611, Recall: 0.78538,
Performing hashing vectorizor with ngram_range = (3, 4)


100%|██████████| 10/10 [1:03:30<00:00, 381.02s/it]



Extraction time: 18.76336, No. of features: 30400000.000
Naive Bayes:
Train time: 0.04623, Train accuracy: 0.92620, Test accuracy: 0.88347, F1 score: 0.91114
MCC: 0.75894, TCR: 0.65046 Precision: 0.85304, Recall: 0.97773,
SGDClassifier: 
Train time: 0.57690, Train accuracy: 0.88308, Test accuracy: 0.85937, F1 score: 0.89642
MCC: 0.71832, TCR: 0.49087 Precision: 0.81500, Recall: 0.99593,
Performing hashing vectorizor with ngram_range = (4, 4)


100%|██████████| 10/10 [1:11:41<00:00, 430.16s/it]


Extraction time: 12.29830, No. of features: 30400000.000
Naive Bayes:
Train time: 0.03979, Train accuracy: 0.93906, Test accuracy: 0.87269, F1 score: 0.90341
MCC: 0.73626, TCR: 0.59909 Precision: 0.84208, Recall: 0.97438,
SGDClassifier: 
Train time: 0.58308, Train accuracy: 0.81905, Test accuracy: 0.81011, F1 score: 0.86544
MCC: 0.62450, TCR: 0.35811 Precision: 0.76311, Recall: 0.99946,



