In [1]:
# DAT640 - Assignment 1a
# Name:    Rabbir Bin Rabbani
# ID:      247988
# Team:    004

In [2]:
# Uncomment and run if the words library isn't already downloaded in this PC
# import nltk
# nltk.download('words')

In [3]:
import nltk
import numpy as np
import pandas as pd
import email
import re
import math
import os
from collections import Counter

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix


# define wordlist and stopwords in English.
stopwords = set(nltk.corpus.stopwords.words('english'))
wordlists = set(nltk.corpus.words.words())


from IPython.display import clear_output # Using IPython.display.clear_output to clear the output of a cell.

In [4]:
main_file_path = 'data/' # Change this to main data directory (keep the ending '/')

data = pd.read_csv(main_file_path+'train/labels.csv')

## Preprocessing

In [5]:
def process_email_data(file_path):
    print(file_path)
    email_body, email_subject = "", ""
    
    mail = email.message_from_string(open(file_path, encoding = 'ISO-8859-1').read())
    
    # Get E-mail subject
    email_subject = mail['subject']
    
    # Get E-mail body
    if mail.is_multipart():
        for payload in mail.get_payload():
            # if payload.is_multipart() then.. 
            email_body = payload.get_payload()
    else:
        email_body = mail.get_payload()
    
    # If body has no value
    if type(email_body) is not str:
        # If body has no value then make it an empty string
        if not email_body:
            email_body = ""
        # If body was multipart then the payload will be 
        # returned as an array with the first index as 
        # the actual message in the mail
        elif type(email_body) is list:
            email_body = email_body[0].as_string()
    
    if type(email_subject) is not str:
        if not email_subject:
            email_subject = ""
        elif type(email_subject) is list:            
            email_subject = email_subject[0]
    
    # Remove numbers and all special characters except space
    email_body = re.sub(r'[^a-zA-Z]', ' ', email_body).lower()
    email_subject = re.sub(r'[^a-zA-Z]', ' ', email_subject).lower()
    
    full_email = email_subject + email_body
    
    # Remove stopwords
    full_email = ' '.join([word for word in full_email.split() if word not in (stopwords)])
    
    # remove words that are not part of the english dictionary - using nltk tokenizer (Wordlist can be found in cell 3)
    full_email = " ".join(w for w in nltk.wordpunct_tokenize(full_email) if w.lower() in wordlists or not w.isalpha())
    
    # Remove single letter words like 'b', 'j', etc..
    full_email = re.sub(r"\b[a-zA-Z]\b", "", full_email)
    
    clear_output()
    return full_email

In [6]:
data['email'] = data.apply(lambda row: process_email_data(main_file_path+row['Id']), axis=1).apply(pd.Series)
data.head()

Unnamed: 0,Id,Label,email
0,train/000/000,ham,june daily labor id original message sent june...
1,train/000/002,ham,new original message sent june mark subject s...
2,train/000/003,ham,upstream company currently trading spot el ups...
3,train/000/004,ham,new master attached new master physical berry ...
4,train/000/005,ham,upstream company copy communication regarding ...


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82906 entries, 0 to 82905
Data columns (total 3 columns):
Id       82906 non-null object
Label    82906 non-null object
email    82906 non-null object
dtypes: object(3)
memory usage: 1.9+ MB


## Training and Validation

In [8]:
# Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(data[['email']], data[['Label']], test_size=0.20, random_state = 20)

In [9]:
# Assumption -
#     Spam emails contains certain words more than Non-spam emails and vice-versa.

# Idea for the Classifier -
#     For training - Find the most common words in spam and non-spam emails.
#     For prediction -
#         1. Count how many spam and non-spam words are in the E-mail.
#         2. If the mail has more spam words than non-spam words, then label the mail as spam, otherwise label it as ham.

class EmailClassifier:
    __frequent_spam_words = None
    __frequent_ham_words = None
    __word_percentage = 5 # What percent of most frequent words to take for checking during training (Default - 5%).
    __model_is_trained = False
    
    def __init__(self, frequent_word_percentage = 5):
        self.__word_percentage = frequent_word_percentage
    
    def train(self, X_train, y_train):
        self.__frequent_spam_words = None
        self.__frequent_ham_words = None
        self.__model_is_trained = False
        
        # Get the indexes of all labeled spam and ham mails
        spam_indx = list(y_train[y_train['Label'] == 'spam'].index.values)
        ham_indx = list(y_train[y_train['Label'] == 'ham'].index.values)

        # Get all words in spam and ham emails
        spam_words = X_train['email'][spam_indx].str.cat().split()
        ham_words = X_train['email'][ham_indx].str.cat().split()
        
        uniq_word_count = len(set(X_train['email'].str.cat().split()))

        # Take (__word_percentage)% of the most frequent spam and ham words
        self.__frequent_spam_words = Counter(spam_words).most_common(round(uniq_word_count*(self.__word_percentage/100)))
        self.__frequent_spam_words = [w[0] for w in self.__frequent_spam_words]
        
        self.__frequent_ham_words = Counter(ham_words).most_common(round(uniq_word_count*(self.__word_percentage/100)))
        self.__frequent_ham_words = [w[0] for w in self.__frequent_ham_words]
        
        # Indicate that the model has been trained
        self.__model_is_trained = True

    def predict(self, X_vals):
        predictions = []
        
        if self.__model_is_trained:
            
            total_size = X_vals.shape[0]
            s = 0
            for mail in X_vals['email']:
                spam_words_count = 0
                ham_words_count = 0
                
                unique_words_in_email = set(mail.split())
                
                # Calculate number of spam and ham matches with test set words
                for word in unique_words_in_email:
                    spam_words_count += 1 if word in self.__frequent_spam_words else 0
                    ham_words_count += 1 if word in self.__frequent_ham_words else 0
                
                # If there are more spam matches than ham, then predict as spam, otherwise predict as ham
                if(spam_words_count > ham_words_count):
                    predictions.append('spam')
                else:
                    predictions.append('ham')
                
                clear_output()
                s += 1
                print(str(s)+" out of "+str(total_size)+"("+str(100*s/total_size)+"%)")
                
        else:
            print("Model must be trained first.")
        
        print("Prediction Complete.")
        
        return predictions        

In [10]:
classifier = EmailClassifier(frequent_word_percentage = 2)
classifier.train(X_train, y_train)

In [11]:
predictions = classifier.predict(X_validation)

16582 out of 16582(100.0%)
Prediction Complete.


In [12]:
cnf_matrix = confusion_matrix(y_validation, predictions)

FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)  
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)

FP = FP.astype(float)
FN = FN.astype(float)
TP = TP.astype(float)
TN = TN.astype(float)
FPR = FP/(FP+TN)

print("Accuracy Score:", accuracy_score(y_validation, predictions))
print("Precision (Macro | Micro):", precision_score(y_validation, predictions, average='macro'), " | ", precision_score(y_validation, predictions, average='micro'))
print("False Positive Rate (FPR):", FPR[0], " | ", FPR[1])
cnf_matrix


Accuracy Score: 0.8285490290676637
Precision (Macro | Micro): 0.8417988488213207  |  0.8285490290676637
False Positive Rate (FPR): 0.26452358441012713  |  0.04601444145547218


array([[6738,  325],
       [2518, 7001]], dtype=int64)

## Testing

In [13]:
# Get test datapaths
path = main_file_path + 'test'
file_paths = pd.Series(['test/' + filename + '/' + fname for filename in os.listdir(path) for fname in os.listdir(main_file_path + 'test/' + filename)])

In [14]:
# Get test emails using the filepaths
test_data = file_paths.apply(lambda row: process_email_data(main_file_path + row)).apply(pd.Series)
test_data.columns = ['email']
test_data.head()

Unnamed: 0,email
0,plan hi tonight rolling new report currently a...
1,advertising working east power desk purchase s...
2,oil ready bill us oil couple want know send in...
3,per eric moon attached find slide prepay north
4,truly return path full name message id date...


In [15]:
# Train with all training data
classifier.train(data[['email']], data[['Label']])

In [19]:
# Predictions for Test Data
predictions = classifier.predict(test_data)

9283 out of 9283(100.0%)
Prediction Complete.


In [17]:
# Create the submission file for Kaggle
submission = pd.DataFrame({
    'Id': file_paths,
    'Label': predictions
})

submission.to_csv('submission.csv', index = False)

In [18]:
# Gives 0.82958 score on Kaggle