# EL-GY-9133 Machine Learning for Cyber-Security

## Lab 2: E-mail Spam Filtering

#### Name: Varuni Buereddy
#### Net-ID: vb2386


### Overview
In this lab, you will design an e-mail spam filter using a Naïve Bayes and SVM based classification on the ling-spam dataset. You will explore the impact of feature selection and compare the performance of different variants of an NB classifier and also implement your own SVM based classifier. (Note: You may use the scikitt learn classifiers to only compare the accuracy of their model to yours).

### Dataset
The ling-spam corpus contains e-mails from the Linguist mailing list categorized as either legitimate or spam emails. The corpus is divided into four sub-folders that contain the same emails that are pre-processed with/without lemmatization and with/without stop-word removal. The e-mails in each sub-folder partitioned into 10 "folds."
In this lab, we will use the first 9 folds from the ling-spam corpus as training data, and the 10th fold as test data.



In [14]:
import os
import numpy as np
from collections import Counter

In [2]:
path_to_dataset = './lingspam_public/lemm_stop'

In [3]:
trainfolder = os.listdir(path_to_dataset)
trainfolder.remove('part10')
testfolder = 'part10'        

In [5]:
# Data Loading from folder

def load_from_folder(folder, email_texts, labels):
    folder = os.path.join(path_to_dataset,folder)
    files = [os.path.join(folder,f) for f in os.listdir(folder)]
    for file in files:
        with open(file, 'r') as f:
            for i,line in enumerate(f):
                if(i==2): 
                    email_texts.append(line)
        if(file.startswith(folder+'/spmsg')):
            labels.append(1)
        else:
            labels.append(0)   


In [6]:
train_emails = []
train_labels = []
test_emails = []
test_labels = []

for folder in trainfolder:
    load_from_folder(folder, train_emails, train_labels)
    
load_from_folder(testfolder, test_emails, test_labels)

spam_train_mails = sum(train_labels)
spam_test_mails = sum(test_labels)
print("************Training dataset:**************")
print(f"Number of Spam Emails: {sum(train_labels)}")
print(f"Number of Non-Spam Emails: {len(train_labels)-sum(train_labels)}")

print("************Test dataset:**************")
print(f"Number of Spam Emails: {sum(test_labels)}")
print(f"Number of Non-Spam Emails: {len(test_labels)-sum(test_labels)}")

************Training dataset:**************
Number of Spam Emails: 432
Number of Non-Spam Emails: 2170
************Test dataset:**************
Number of Spam Emails: 49
Number of Non-Spam Emails: 242


In [11]:
## Text Preprocessing

def remove_punctuation(test_str):
    result = ''.join(filter(lambda x: x.isalpha() or x.isdigit() or x.isspace(), test_str))
    return result

def get_features(email):
    text = remove_punctuation(email)
    return set([token for token in text.split(' ')])

In [26]:
class BernoulliNBTextClassifier():
    def __init__(self):
        self._log_priors = None
        self.likelihood = None
        self.features = None
    
    def train(self, emails, labels):
        label_counts = Counter(labels)
        N = float(sum(label_counts.values()))
        self._log_priors = {k: np.log(v/N) for k, v in label_counts.items()}
        X = [set(get_features(mail)) for mail in emails]
        self.features = set([f for features in X for f in features])
        self.likelihood = {l: {f: 0. for f in self.features} for l in self._log_priors}
        
        for x, l in zip(X, labels):
            for f in x:
                self.likelihood[l][f] += 1.

        # Now, compute log probs
        for l in self.likelihood:
            N = label_counts[l]
            self.likelihood[l] = {f: (v + 1.) / (N + 2.) for f, v in self.likelihood[l].items()}
    
    
    def predict(self, text):
        # Extract features
        x = get_features(text)

        pred_class = None
        max_ = float("-inf")

        for l in self._log_priors:
            log_sum = self._log_priors[l]
            for f in self.features:
                prob = self.likelihood[l][f]
                log_sum += np.log(prob if f in x else 1. - prob)
            if log_sum > max_:
                max_ = log_sum
                pred_class = l

        return pred_class


In [28]:
nb = BernoulliNBTextClassifier()
nb.train(train_emails, train_labels)

print('Testing model...')
f = lambda doc, l: 1. if nb.predict(doc) != l else 0.
num_missed = sum([f(doc, l) for doc, l in zip(test_emails, test_labels)])

Testing model...


In [30]:
N = len(test_labels) * 1.
error_rate = round(100. * (num_missed / N), 3)

print('Error rate of {0}% ({1}/{2})'.format(error_rate, int(num_missed), int(N)))

Error rate of 10.997% (32/291)


In [10]:
X_train_spam_count = []
X_train_ham_count = []
for i in range(len(processed_train)):
    if train_labels[i]==1:
        X_train_spam_count.append(X_train_count[i])
    
    else:
        X_train_ham_count.append(X_train_count[i])

n = np.sum(np.array(X_train_spam_count), axis = 0)
m = np.sum(np.array(X_train_ham_count), axis = 0)

In [12]:
def mutual_info(feature, feature_matrix, N, N_spam):
    """
    feature - each term in the vocabulary created
    feature_matrix - 
    N - Number of train_emails
    """
    
    N11 = feature_matrix[0][feature]
    N10 = feature_matrix[1][feature]
    N01 = (N_spam - N11)
    N00 = N - (N11+N01+N10)
    N1dot = N11+N10
    Ndot1 = N_spam
    N0dot = N01+N00
    Ndot0 = N-N_spam
    keys = [N11, N10, N01, N00]
    values = [N1dot*Ndot1, N1dot*Ndot0, N0dot*Ndot1, N0dot*Ndot0]
    mi = 0
    for i in range(4):
        if keys[i]==0:
            mi+=0
        else:
            mi+= (keys[i]/N)*np.log2(keys[i]*N/values[i])
    return mi 

IG = []
for i in range(len(vocab)):
    IG.append(mutual_info(i, feature_matrix, len(train_labels), spam_train_mails))
    
sorted_index = np.argsort(np.array(IG))[::-1][:10000]
features = [vocab[s] for s in sorted_index]

In [13]:
## X, Y
def classifier(X, feature_matrix, features, N, N_spam):
    Px_spam = 1
    Px_legit = 1
    for i in range(len(features)):
        #print(feature_matrix[0][i]+1)
        p_is = (feature_matrix[0][i]+1)/(N_spam+2)
        p_il = (feature_matrix[1][i]+1)/(N-N_spam+2)
        if features[i] in X:
            Px_spam = Px_spam*p_is
            Px_legit = Px_legit*p_il
            
        else:
            Px_spam = Px_spam*(1-p_is)
            Px_legit = Px_legit*(1-p_il)
            
    return Px_spam, (1-Px_spam)#Px_legit #(1-Px_spam)#Px_legit


Px_spam, Px_legit = classifier(remove_punctuation(test_emails[0]).split(' '), feature_matrix, features, len(train_labels), spam_train_mails)


In [14]:
P_spam = spam_train_mails/len(train_labels)
P_legit = 1-P_spam
Px = P_spam*Px_spam + P_legit*Px_legit

Pspam_x = P_spam*Px_spam/Px

print(Px_spam)
print(Px_legit)
print(Px)
print(Pspam_x)

1.2183112842324327e-173
7.283366435851722e-182
2.0227151829105156e-174
0.9999999699703778


In [15]:
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()
model.fit(X_train_count, train_labels)

In [16]:
test_email_count = v.transform(processed_test)
test_pred = model.predict(test_email_count)

In [17]:
print(classification_report(test_labels, test_pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94       242
           1       1.00      0.35      0.52        49

    accuracy                           0.89       291
   macro avg       0.94      0.67      0.73       291
weighted avg       0.90      0.89      0.87       291



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
s = ['The dog  is in the   well ', 'The food is  delicious', 'the cat']

v = CountVectorizer(binary=True)
vec = v.fit_transform(s)
vocab = v.vocabulary_
vocab = dict((v,k) for k,v in vocab.items())

print(vocab)