In [1]:
# DAT640 - Assignment 1b
# Name:    Rabbir Bin Rabbani
# ID:      247988
# Team:    004

In [2]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from IPython.display import clear_output # Using IPython.display.clear_output to clear the output of a cell.

#### Please run the preprocessor notebook file to get the preprocessed datasets.
I have separated the files for simplicity, convinience and saving time. The datasets can also be downloaded from [this dropbox link](https://www.dropbox.com/sh/htywy8h8rtbm3u7/AAD_zpLznXeBnMMQIzPJa8lda?dl=0).

* https://www.dropbox.com/sh/htywy8h8rtbm3u7/AAD_zpLznXeBnMMQIzPJa8lda?dl=0

In [3]:
# Load data from drive
data = pd.read_csv("processed_train_dataset.csv")
data['email'] = data['email'].replace(np.nan, '', regex=True) 

data.head()

Unnamed: 0,Id,email,subject_length,body_length,hasReturnPath,Label
0,train/000/000,fw june bna inc daily labor reportuse...,9,2219,1,ham
1,train/000/002,re intranet siterika these new orig...,3,71,1,ham
2,train/000/003,fw ena upstream company informationjohn geral...,5,228,1,ham
3,train/000/004,new master physicalgerald and stacy attache...,3,49,1,ham
4,train/000/005,fw ena upstream company mirant gisbfyi below...,5,196,1,ham


In [4]:
# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(data, data['Label'], test_size=0.20, random_state = 20)

In [5]:
# Create the Vectorizer vocabulary
vectorizer = CountVectorizer()
vectorizer.fit(data['email'])

len(vectorizer.vocabulary_)

9572788

In [6]:
# Count Vectors
X_train_vec = vectorizer.transform(X_train['email'])
X_val_vec = vectorizer.transform(X_val['email'])

print(X_train_vec.shape, X_val_vec.shape)

(66324, 9572788) (16582, 9572788)


In [7]:
X_train_vec

<66324x9572788 sparse matrix of type '<class 'numpy.int64'>'
	with 20042132 stored elements in Compressed Sparse Row format>

In [8]:
# TF Vectors
tf_transformer = TfidfTransformer(norm='l1', use_idf=False)

X_train_vec_tf = tf_transformer.fit_transform(X_train_vec)
X_val_vec_tf = tf_transformer.fit_transform(X_val_vec)

print(X_train_vec_tf.shape, X_val_vec_tf.shape)

(66324, 9572788) (16582, 9572788)


In [9]:
# TF-IDF Vectors
tfidf_transformer = TfidfTransformer(norm='l1', use_idf=True)

X_train_vec_tfidf = tfidf_transformer.fit_transform(X_train_vec)
X_val_vec_tfidf = tfidf_transformer.fit_transform(X_val_vec)

print(X_train_vec_tfidf.shape, X_val_vec_tfidf.shape)

(66324, 9572788) (16582, 9572788)


In [10]:
scores = {
    'Algorithm': [],
    'Term Weighting': [],
    'Acc': [],
    'Prec': [],
    'FPR': []
}

In [11]:
# Naive-Bayes Classifier
classifier = MultinomialNB()

# Naive-Bayes with Count
classifier.fit(X_train_vec, y_train)
predictions = classifier.predict(X_val_vec)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
scores['Algorithm'].append('Naive Bayes')
scores['Term Weighting'].append('Count')
scores['Acc'].append((tp + tn) / (tp + tn + fp + fn))
scores['Prec'].append((tp / (tp + fp)))
scores['FPR'].append((fp / (fp + tn)))

# Naive-Bayes with Term Frequency
classifier.fit(X_train_vec_tf, y_train)
predictions = classifier.predict(X_val_vec_tf)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
scores['Algorithm'].append('Naive Bayes')
scores['Term Weighting'].append('TF')
scores['Acc'].append((tp + tn) / (tp + tn + fp + fn))
scores['Prec'].append((tp / (tp + fp)))
scores['FPR'].append((fp / (fp + tn)))

# Naive-Bayes with Term Frequency-Inverse Document Frequency
classifier.fit(X_train_vec_tfidf, y_train)
predictions = classifier.predict(X_val_vec_tfidf)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
scores['Algorithm'].append('Naive Bayes')
scores['Term Weighting'].append('TF-IDF')
scores['Acc'].append((tp + tn) / (tp + tn + fp + fn))
scores['Prec'].append((tp / (tp + fp)))
scores['FPR'].append((fp / (fp + tn)))

In [12]:
# Support Vector Machine Classifier
classifier = LinearSVC(random_state = 0)

# SVM with Count
classifier.fit(X_train_vec, y_train)
predictions = classifier.predict(X_val_vec)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
scores['Algorithm'].append('SVM')
scores['Term Weighting'].append('Count')
scores['Acc'].append((tp + tn) / (tp + tn + fp + fn))
scores['Prec'].append((tp / (tp + fp)))
scores['FPR'].append((fp / (fp + tn)))

# SVM with Term Frequency
classifier.fit(X_train_vec_tf, y_train)
predictions = classifier.predict(X_val_vec_tf)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
scores['Algorithm'].append('SVM')
scores['Term Weighting'].append('TF')
scores['Acc'].append((tp + tn) / (tp + tn + fp + fn))
scores['Prec'].append((tp / (tp + fp)))
scores['FPR'].append((fp / (fp + tn)))

# SVM with Term Frequency-Inverse Document Frequency
classifier.fit(X_train_vec_tfidf, y_train)
predictions = classifier.predict(X_val_vec_tfidf)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
scores['Algorithm'].append('SVM')
scores['Term Weighting'].append('TF-IDF')
scores['Acc'].append((tp + tn) / (tp + tn + fp + fn))
scores['Prec'].append((tp / (tp + fp)))
scores['FPR'].append((fp / (fp + tn)))



In [13]:
pd.DataFrame(scores)

Unnamed: 0,Algorithm,Term Weighting,Acc,Prec,FPR
0,Naive Bayes,Count,0.924979,0.895366,0.155033
1,Naive Bayes,TF,0.895489,0.847713,0.241399
2,Naive Bayes,TF-IDF,0.910264,0.866947,0.206145
3,SVM,Count,0.98227,0.98092,0.02591
4,SVM,TF,0.961163,0.951562,0.067393
5,SVM,TF-IDF,0.967314,0.956102,0.061164


#### Adding the new Features to the training data

In [14]:
vectorizer = CountVectorizer()
vectorizer.fit(data['email'])
len(vectorizer.vocabulary_)

9572788

In [15]:
data_vec = vectorizer.transform(data['email'])
data_vec.shape

(82906, 9572788)

#### Add the additional features to the sparse matrix to get the count data

In [16]:
# Combine the data sparse matrix with the additional featurer
data_vec = sparse.hstack((data_vec, data[['subject_length', 'body_length', 'hasReturnPath']].to_numpy()))

In [17]:
# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(data_vec, data['Label'], test_size=0.20, random_state = 20)

In [18]:
scores = {
    'Algorithm': [],
    'Term Weighting': [],
    'Acc': [],
    'Prec': [],
    'FPR': []
}

In [19]:
# Naive-Bayes
classifier = MultinomialNB().fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
scores['Algorithm'].append('Naive Bayes')
scores['Term Weighting'].append('Count')
scores['Acc'].append((tp + tn) / (tp + tn + fp + fn))
scores['Prec'].append((tp / (tp + fp)))
scores['FPR'].append((fp / (fp + tn)))

# Linear SVM
classifier = LinearSVC(random_state = 0).fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
scores['Algorithm'].append('SVM')
scores['Term Weighting'].append('Count')
scores['Acc'].append((tp + tn) / (tp + tn + fp + fn))
scores['Prec'].append((tp / (tp + fp)))
scores['FPR'].append((fp / (fp + tn)))

# Logistic Regression
classifier = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
scores['Algorithm'].append('Logistic Regression')
scores['Term Weighting'].append('Count')
scores['Acc'].append((tp + tn) / (tp + tn + fp + fn))
scores['Prec'].append((tp / (tp + fp)))
scores['FPR'].append((fp / (fp + tn)))

# Decision Tree Classifier
classifier = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
scores['Algorithm'].append('Decision Tree')
scores['Term Weighting'].append('Count')
scores['Acc'].append((tp + tn) / (tp + tn + fp + fn))
scores['Prec'].append((tp / (tp + fp)))
scores['FPR'].append((fp / (fp + tn)))



In [20]:
pd.DataFrame(scores)

Unnamed: 0,Algorithm,Term Weighting,Acc,Prec,FPR
0,Naive Bayes,Count,0.878241,0.831272,0.270423
1,SVM,Count,0.990894,0.991501,0.011468
2,Logistic Regression,Count,0.975697,0.98707,0.017132
3,Decision Tree,Count,0.989446,0.990859,0.012318


#### Train SVM with entire train dataset

In [21]:
# Linear SVM
classifier = LinearSVC(random_state = 0).fit(data_vec, data['Label'])



In [22]:
# Read and transform the test dataset.
test_data = pd.read_csv("processed_test_dataset.csv")
test_data['email'] = test_data['email'].replace(np.nan, '', regex=True) 

test_data_vec = vectorizer.transform(test_data['email'])
test_data_vec = sparse.hstack((test_data_vec, test_data[['subject_length', 'body_length', 'hasReturnPath']].to_numpy()))

In [23]:
# Output the predictions to csv for submitting to Kaggle.
test_data['Label'] = classifier.predict(test_data_vec)
test_data[['Id','Label']].to_csv('submission.csv', index = False)