In [None]:
import os
import re
import nltk.corpus
import numpy as np
import pandas as pd
import unicodedata
import math
import copy
from sklearn.metrics import confusion_matrix
from zipfile import ZipFile
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import warnings
warnings.filterwarnings("ignore")
stop = stopwords.words('english')

<h1>EVALUATION OF MODELS</h1>

In [None]:
def get_evaluation_metrics(Y, Y_pred):
    conf = confusion_matrix(y_true = Y, y_pred=Y_pred)
    TN, FP, FN, TP = conf.ravel()

    accuracy = (TN + TP)  / (TN + FP + FN + TP)

    precision = TP / (TP + FP)

    recall = TP / (FN + TP)

    f1 = 2 * precision * recall / (precision + recall)

    return accuracy, precision, recall, f1

In [None]:
# metrics_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}

def print_metrics_dict(metrics_dict):
    s = ""
    for t in metrics_dict:
        for k in metrics_dict[t]:
            s += f'Set: {t}, Dataset: {k} -> Accuracy: {metrics_dict[t][k][0]}, Precision: {metrics_dict[t][k][1]}, Recall: {metrics_dict[t][k][2]}, F1-Score: {metrics_dict[t][k][3]}\n'
            
    return s

<h1>FILE HANDLING</h1>
1. Run the below cell to create a folder named: rhugaved_data
2. Upload zip files of dataset to the folder


In [None]:
os.mkdir("/content/rhugaved_data")
os.chdir("/content/rhugaved_data")

Run below cell to unzip the files

In [None]:
for filename in os.listdir("/content/rhugaved_data"):
    if not os.path.isdir(filename):
        with ZipFile(filename, 'r') as zip:
            # extracting all the files
            zip.extractall()

Getting the folders containing the ham and spam datasets of different datasets

In [None]:
dataset_folders = {"hw1": {"train": [], "test": []}, 
                   "enron1": {"train": [], "test": []}, 
                   "enron4": {"train": [], "test": []}
                   }

base_folders_to_filter = ["train", "test", "enron1", "enron4"]
def get_dataset_folder_paths(dataset_folders):
    for filename in os.listdir("/content/rhugaved_data"):
        folders = [os.path.abspath(filename) for filename in os.listdir(".") if os.path.isdir(filename) and filename in base_folders_to_filter]

    # Add train and test folders to enron
    copy = folders.copy()
    for i, f in enumerate(copy):
        if "enron" in f:
            folders.append(folders[i] + "/test")
            folders[i] += "/train"
            
    # print(folders)

    for i in range(len(folders)):
        folders.append(folders[i] + "/spam")
        folders[i] += "/ham"
    # print((folders))

    for i in folders:
        flag = 0
        for k in dataset_folders:
            if k in i:
                if "train" in i:
                    dataset_folders[k]["train"].append(i)
                else:
                    dataset_folders[k]["test"].append(i)
                flag = 1
        if not flag:
            if "train" in i:
                    dataset_folders["hw1"]["train"].append(i)
            else:
                dataset_folders["hw1"]["test"].append(i)
            

    print(dataset_folders)
    return dataset_folders
dataset_folders = get_dataset_folder_paths(dataset_folders)


{'hw1': {'train': ['/content/rhugaved_data/train/ham', '/content/rhugaved_data/train/spam'], 'test': ['/content/rhugaved_data/test/ham', '/content/rhugaved_data/test/spam']}, 'enron1': {'train': ['/content/rhugaved_data/enron1/train/ham', '/content/rhugaved_data/enron1/train/spam'], 'test': ['/content/rhugaved_data/enron1/test/ham', '/content/rhugaved_data/enron1/test/spam']}, 'enron4': {'train': ['/content/rhugaved_data/enron4/train/ham', '/content/rhugaved_data/enron4/train/spam'], 'test': ['/content/rhugaved_data/enron4/test/ham', '/content/rhugaved_data/enron4/test/spam']}}


Making a copy of the structure of dataset_folders dict. We will store the actual text data in data as dataframes

In [None]:
data = copy.deepcopy(dataset_folders)
lr_data = copy.deepcopy(dataset_folders)

for k in data:
    for t in data[k]:
        data[k][t] = {"ham": dict(), "spam": dict()}
        lr_data[k][t] = dict()

dataset_folders

{'hw1': {'train': ['/content/rhugaved_data/train/ham',
   '/content/rhugaved_data/train/spam'],
  'test': ['/content/rhugaved_data/test/ham',
   '/content/rhugaved_data/test/spam']},
 'enron1': {'train': ['/content/rhugaved_data/enron1/train/ham',
   '/content/rhugaved_data/enron1/train/spam'],
  'test': ['/content/rhugaved_data/enron1/test/ham',
   '/content/rhugaved_data/enron1/test/spam']},
 'enron4': {'train': ['/content/rhugaved_data/enron4/train/ham',
   '/content/rhugaved_data/enron4/train/spam'],
  'test': ['/content/rhugaved_data/enron4/test/ham',
   '/content/rhugaved_data/enron4/test/spam']}}

In [None]:
lr_data

{'hw1': {'train': {}, 'test': {}},
 'enron1': {'train': {}, 'test': {}},
 'enron4': {'train': {}, 'test': {}}}

In [None]:
def get_text_from_dataset(dataset_folders, data):
    
    for k in dataset_folders:
        for t in dataset_folders[k]:
            for hs in dataset_folders[k][t]:
                all_text = []
                for filename in os.listdir(hs):
                    with open(hs + "/" + filename, 'r', encoding="latin-1") as text:
                        all_text.append(text.read())
                        # print(all_text)
                        # return
                if "ham" in filename:
                    data[k][t]['ham']["X"] = all_text
                    data[k][t]['ham']["Y"] = [0] * len(all_text)
                else:
                    data[k][t]['spam']["X"] = all_text
                    data[k][t]['spam']["Y"] = [1] * len(all_text)
get_text_from_dataset(dataset_folders, data)

<h1>Cleaning of Text Data:</h1>
1. Normalizing i.e lower case all letters
2. Remove unicode characters and numbers
3. Remove links and emails
3. Remove Stop words


METHOD TO CLEAN TEXT PER DOCUMENT

In [None]:
def clean_text_document(text):
    text = text.lower()
    text = re.sub(r"(\n)", " ", text)
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^A-Za-z ])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    text = re.sub(r"(\t)|( +)", " ", text)
    text = " ".join([word for word in text.split() if word not in (stop) and len(word) > 2])
    words_list = text.split()
    return text, words_list

<h1>VOCAB BUILDER</h1>

In [None]:
def vocab_builder(all_text):
    vocab = []
    for text in all_text:
        _, words_list = clean_text_document(text)
        vocab.extend(words_list)
    return sorted(set(vocab))

<h1>Bag of Words</h1>
and
<h1>Bernoulli</h1>

In [None]:

def bow_builder(local_all_text, local_vocab):
    bag_of_words_corpus = []
    bag_of_words = np.zeros(len(local_vocab))  
    for text in local_all_text:        
        _, words_list = clean_text_document(text)   
                
        for word in words_list:            
            for i, w in enumerate(local_vocab):                
                if word == w:                     
                    bag_of_words[i] += 1  
                    
    return bag_of_words


<h1>Multinomial NB</h1>

Contional Probabilites

In [None]:
def get_cond_prob_multi(class_bow, class_all_text, vocab):
    cond_prob = []
    length = 0
    for l in class_all_text:
        length += len(l)
    for i, w in enumerate(class_bow):
        p = (w + 1) /(length + len(vocab))
        cond_prob.append(p)
    return cond_prob

In [None]:
def apply_multi_NB(class_prior, cond_prob, vocab, doc):
    bag_of_words_doc = bow_builder([doc], vocab)
    # for class ham
    prob_ham = math.log(class_prior[0])
    for i, p in enumerate(cond_prob[0]):
        prob_ham += math.log(p) * bag_of_words_doc[i]

    # for class spam
    prob_spam = math.log(class_prior[1])
    for i, p in enumerate(cond_prob[1]):
        prob_spam += math.log(p) * bag_of_words_doc[i]

    return prob_ham, prob_spam

<h3>Making a function which when called will run mulitNB on all datasets and return the predictions</h3>

In [None]:
vocab_dict = {"hw1": None, "enron1": None, "enron4": None}
bag_of_words_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
prior_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
multi_cond_prob_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
Y_pred_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
metrics_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
def multi_NB(data):
    for k in data:
        print("Dataset: ", k)
        # Assign X and Y and spam and ham texts
        ham_all_text = data[k]['train']['ham']['X']
        spam_all_text = data[k]['train']['spam']['X']
        X = ham_all_text + spam_all_text
        Y = data[k]['train']['ham']['Y'] + data[k]['train']['spam']['Y']

        # Call vocab builder and save it in vocab_dict
        vocab_dict[k] = vocab_builder(X)
        # print(len(vocab_dict[k]))

        # Make bow and save respective ham and spam in the dict
        bag_of_words_dict["ham"][k] = bow_builder(ham_all_text, vocab_dict[k])
        bag_of_words_dict["spam"][k] = bow_builder(spam_all_text, vocab_dict[k])

        # calculate priors for each dataset and save in dict
        prior_dict["ham"][k] = len(ham_all_text) / (len(ham_all_text) + len(spam_all_text))
        prior_dict["spam"][k] = len(spam_all_text) / (len(ham_all_text) + len(spam_all_text))

        # calculate conditional probablity and store in dict
        multi_cond_prob_dict["ham"][k] = get_cond_prob_multi(bag_of_words_dict["ham"][k], ham_all_text, vocab_dict[k])
        multi_cond_prob_dict["spam"][k] = get_cond_prob_multi(bag_of_words_dict["spam"][k], spam_all_text, vocab_dict[k])
        
        
        correct = 0
        # calculate the training set metrics
        for i, doc in enumerate(X):
            p = apply_multi_NB([prior_dict["ham"][k], prior_dict["spam"][k]], [multi_cond_prob_dict["ham"][k], 
                                multi_cond_prob_dict["spam"][k]], vocab_dict[k], doc)
            
            if p[0] > p[1]:
                Y_pred_dict["train"][k].append(0)
                if Y[i] == 0:
                    correct += 1

            elif p[0] < p[1]:
                Y_pred_dict["train"][k].append(1)
                if Y[i] == 1:
                    correct += 1
        # print(correct, len(Y), len(Y_pred_dict["train"][k]))
        print(f'Correctly predicted: {correct}, total train examples: {len(Y)}, total predicted examples: {len(Y_pred_dict["train"][k])}')        

        # Evalute on test data
        X_test = data[k]['test']['ham']['X'] + data[k]['test']['spam']['X']
        Y_test = data[k]['test']['ham']['Y'] + data[k]['test']['spam']['Y']
        test_correct = 0
        for i, doc in enumerate(X_test):
            p = apply_multi_NB([prior_dict["ham"][k], prior_dict["spam"][k]], [multi_cond_prob_dict["ham"][k], 
                                multi_cond_prob_dict["spam"][k]], vocab_dict[k], doc)
            
            if p[0] > p[1]:
                Y_pred_dict["test"][k].append(0)
                if Y_test[i] == 0:
                    test_correct += 1

            elif p[0] < p[1]:
                Y_pred_dict["test"][k].append(1)
                if Y_test[i] == 1:
                    test_correct += 1
        print(f'Correctly predicted: {test_correct}, total test examples: {len(Y_test)}, total predicted examples: {len(Y_pred_dict["test"][k])}')        
        # metrics_dict["train"][k] = get_evaluation_metrics(Y, Y_pred_dict["train"][k])
        

        

multi_NB(data)

Dataset:  hw1
Correctly predicted: 456, total test examples: 463, total predicted examples: 463
Correctly predicted: 451, total test examples: 478, total predicted examples: 478
Dataset:  enron1
Correctly predicted: 447, total test examples: 450, total predicted examples: 450
Correctly predicted: 428, total test examples: 456, total predicted examples: 456
Dataset:  enron4
Correctly predicted: 417, total test examples: 535, total predicted examples: 535
Correctly predicted: 466, total test examples: 543, total predicted examples: 543


In [None]:
for k in data:
    # evalution on training data
    # print(k)
    Y = data[k]['train']['ham']['Y'] + data[k]['train']['spam']['Y']
    # print(np.shape(Y))
    # print(np.shape(Y_pred_dict["train"][k]))
    metrics_dict["train"][k] = get_evaluation_metrics(Y, Y_pred_dict["train"][k])

    # evaluation on testing data
    Y = data[k]['test']['ham']['Y'] + data[k]['test']['spam']['Y']
    # print(np.shape(Y))
    # print(np.shape(Y_pred_dict["test"][k]))
    metrics_dict["test"][k] = get_evaluation_metrics(Y, Y_pred_dict["test"][k])
print(print_metrics_dict(metrics_dict))

Set: train, Dataset: hw1 -> Accuracy: 0.9848812095032398, Precision: 0.953125, Recall: 0.991869918699187, F1-Score: 0.9721115537848606
Set: train, Dataset: enron1 -> Accuracy: 0.9933333333333333, Precision: 0.9776119402985075, Recall: 1.0, F1-Score: 0.9886792452830189
Set: train, Dataset: enron4 -> Accuracy: 0.7794392523364486, Precision: 1.0, Recall: 0.7064676616915423, F1-Score: 0.8279883381924198
Set: test, Dataset: hw1 -> Accuracy: 0.9435146443514645, Precision: 0.8503401360544217, Recall: 0.9615384615384616, F1-Score: 0.9025270758122743
Set: test, Dataset: enron1 -> Accuracy: 0.9385964912280702, Precision: 0.8903225806451613, Recall: 0.9261744966442953, F1-Score: 0.9078947368421052
Set: test, Dataset: enron4 -> Accuracy: 0.858195211786372, Precision: 1.0, Recall: 0.80306905370844, F1-Score: 0.8907801418439717



<h1>BERNOULLI</h1>

In [None]:

def bernoulli_builder(local_all_text, local_vocab):
    # bernoulli = np.zeros(len(local_vocab))  
    bernoulli_doc_occurances = np.zeros(len(local_vocab))  

    for text in local_all_text:        
        _, words_list = clean_text_document(text)   
        # For bernoulli we can have just unique words for calculation
        words_list = list(set(words_list))
        for word in words_list:            
            for i, w in enumerate(local_vocab):                
                if word == w:                     
                    # bernoulli[i] = 1  
                    bernoulli_doc_occurances[i] = bernoulli_doc_occurances[i] + 1
                    
    return bernoulli_doc_occurances


In [None]:
def get_cond_prob_bernoulli(class_bernoulli_doc_occurances, class_all_text, vocab):
    cond_prob = []
    no_docs = len(class_all_text)
    for i, w in enumerate(class_bernoulli_doc_occurances):
        p = (w + 1) /(no_docs + 2)
        cond_prob.append(p)
    return cond_prob

In [None]:
def apply_bernoulli_NB(class_prior, cond_prob, vocab, doc):
    bernoulli_doc = bow_builder([doc], vocab)
    # for class ham
    prob_ham = math.log(class_prior[0])
    # print(cond_prob[0])
    for i, p in enumerate(cond_prob[0]):
        if bernoulli_doc[i]:
            prob_ham += math.log(p)
        else:
            q = 1 - p
            prob_ham += math.log(q)

    # for class spam
    prob_spam = math.log(class_prior[1])
    for i, p in enumerate(cond_prob[1]):
        if bernoulli_doc[i]:
            prob_spam += math.log(p)
        else:
            q = 1 - p
            prob_spam += math.log(q)
    return prob_ham, prob_spam

Primary function to apply bernoulli NB on all datasets

In [None]:
# vocab_dict = {"hw1": None, "enron1": None, "enron4": None}
bernoulli_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
# prior_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
bernoulli_cond_prob_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
bernoulli_Y_pred_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
bernoulli_metrics_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
def bernoulli_NB(data):
    for k in data:
        print("Dataset: ", k)
        # Assign X and Y and spam and ham texts
        ham_all_text = data[k]['train']['ham']['X']
        spam_all_text = data[k]['train']['spam']['X']
        X = ham_all_text + spam_all_text
        Y = data[k]['train']['ham']['Y'] + data[k]['train']['spam']['Y']

        # Call vocab builder and save it in vocab_dict
        # vocab_dict[k] = vocab_builder(X)
        # print(len(vocab_dict[k]))

        # Make bow and save respective ham and spam in the dict
        bernoulli_dict["ham"][k] = bernoulli_builder(ham_all_text, vocab_dict[k])
        bernoulli_dict["spam"][k] = bernoulli_builder(spam_all_text, vocab_dict[k])

        # calculate priors for each dataset and save in dict
        # prior_dict["ham"][k] = len(ham_all_text) / (len(ham_all_text) + len(spam_all_text))
        # prior_dict["spam"][k] = len(spam_all_text) / (len(ham_all_text) + len(spam_all_text))

        # calculate conditional probablity and store in dict
        bernoulli_cond_prob_dict["ham"][k] = get_cond_prob_bernoulli(bernoulli_dict["ham"][k], ham_all_text, vocab_dict[k])
        bernoulli_cond_prob_dict["spam"][k] = get_cond_prob_bernoulli(bernoulli_dict["spam"][k], spam_all_text, vocab_dict[k])
        
        
        correct = 0
        # calculate the training set metrics
        for i, doc in enumerate(X):
            p = apply_bernoulli_NB([prior_dict["ham"][k], prior_dict["spam"][k]], [bernoulli_cond_prob_dict["ham"][k], 
                                bernoulli_cond_prob_dict["spam"][k]], vocab_dict[k], doc)
            
            if p[0] > p[1]:
                bernoulli_Y_pred_dict["train"][k].append(0)
                if Y[i] == 0:
                    correct += 1

            elif p[0] < p[1]:
                bernoulli_Y_pred_dict["train"][k].append(1)
                if Y[i] == 1:
                    correct += 1
        # print(correct, len(Y), len(bernoulli_Y_pred_dict["train"][k]))
        print(f'Correctly predicted: {correct}, total train examples: {len(Y)}, total predicted examples: {len(bernoulli_Y_pred_dict["train"][k])}')        


        # Evalute on test data
        X_test = data[k]['test']['ham']['X'] + data[k]['test']['spam']['X']
        Y_test = data[k]['test']['ham']['Y'] + data[k]['test']['spam']['Y']
        test_correct = 0
        for i, doc in enumerate(X_test):
            p = apply_bernoulli_NB([prior_dict["ham"][k], prior_dict["spam"][k]], [bernoulli_cond_prob_dict["ham"][k], 
                                bernoulli_cond_prob_dict["spam"][k]], vocab_dict[k], doc)
            
            if p[0] > p[1]:
                bernoulli_Y_pred_dict["test"][k].append(0)
                if Y_test[i] == 0:
                    test_correct += 1

            elif p[0] < p[1]:
                bernoulli_Y_pred_dict["test"][k].append(1)
                if Y_test[i] == 1:
                    test_correct += 1
        # print(test_correct, len(Y_test), len(bernoulli_Y_pred_dict["test"][k]))  
        print(f'Correctly predicted: {test_correct}, total test examples: {len(Y_test)}, total predicted examples: {len(bernoulli_Y_pred_dict["test"][k])}')        
      
        # metrics_dict["train"][k] = get_evaluation_metrics(Y, Y_pred_dict["train"][k])
        


bernoulli_NB(data)

Dataset:  hw1
Correctly predicted: 390, total test examples: 463, total predicted examples: 463
Correctly predicted: 369, total test examples: 478, total predicted examples: 478
Dataset:  enron1
Correctly predicted: 374, total test examples: 450, total predicted examples: 450
Correctly predicted: 333, total test examples: 456, total predicted examples: 456
Dataset:  enron4
Correctly predicted: 493, total test examples: 535, total predicted examples: 535
Correctly predicted: 496, total test examples: 543, total predicted examples: 543


In [None]:
for k in data:
    # evalution on training data
    # print(k)
    Y = data[k]['train']['ham']['Y'] + data[k]['train']['spam']['Y']
    # print(np.shape(Y))
    # print(np.shape(bernoulli_Y_pred_dict["train"][k]))
    metrics_dict["train"][k] = get_evaluation_metrics(Y, bernoulli_Y_pred_dict["train"][k])

    # evaluation on testing data
    Y = data[k]['test']['ham']['Y'] + data[k]['test']['spam']['Y']
    # print(np.shape(Y))
    # print(np.shape(bernoulli_Y_pred_dict["test"][k]))
    metrics_dict["test"][k] = get_evaluation_metrics(Y, bernoulli_Y_pred_dict["test"][k])
print(print_metrics_dict(metrics_dict))

Set: train, Dataset: hw1 -> Accuracy: 0.8423326133909287, Precision: 0.9464285714285714, Recall: 0.43089430894308944, F1-Score: 0.5921787709497207
Set: train, Dataset: enron1 -> Accuracy: 0.8311111111111111, Precision: 1.0, Recall: 0.4198473282442748, F1-Score: 0.5913978494623656
Set: train, Dataset: enron4 -> Accuracy: 0.9214953271028037, Precision: 0.9347826086956522, Recall: 0.9626865671641791, F1-Score: 0.948529411764706
Set: test, Dataset: hw1 -> Accuracy: 0.7719665271966527, Precision: 0.8888888888888888, Recall: 0.18461538461538463, F1-Score: 0.3057324840764331
Set: test, Dataset: enron1 -> Accuracy: 0.7302631578947368, Precision: 0.90625, Recall: 0.19463087248322147, F1-Score: 0.3204419889502762
Set: test, Dataset: enron4 -> Accuracy: 0.9134438305709024, Precision: 0.8926940639269406, Recall: 1.0, F1-Score: 0.9433051869722556



<h1>Logistic Regression</h1>

In [None]:

def bow_bernoulli(local_all_text, local_vocab):
    bag_of_words_corpus = []
    bernoulli_corpus = []

    for text in local_all_text:        
        _, words_list = clean_text_document(text)   
        bag_of_words = np.zeros(len(local_vocab))  
        bernoulli = np.zeros(len(local_vocab))        
        for word in words_list:            
            for i, w in enumerate(local_vocab):                
                if word == w:                     
                    bag_of_words[i] += 1  
                    bernoulli[i] = 1             
                    # print("{0}\n{1}\n".format(_, np.array(bag_of_words)))
        bag_of_words_corpus.append(bag_of_words)
        bernoulli_corpus.append(bernoulli)
        # print("{0}\n{1}\n".format(cleaned_text, np.array(bag_of_words)))
    # print(np.shape(bag_of_words_corpus))
    # print(np.shape(bernoulli_corpus))
    return bag_of_words_corpus, bernoulli_corpus


In [None]:
class log_regression():
    def __init__(self, lr = 0.1, max_iter = 64, reg_para = 0.01):
        self.lr = lr
        self.max_iter = max_iter
        self.reg_para = reg_para


    def fit(self, X, Y):
        # First lets define a weight matrix which includes w0 for bais term
        self.weights = np.zeros(X.shape[1] + 1)

        # Add x0 ie. bais term of 1 in each X
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)

        for i in range(self.max_iter):
            product = np.matmul(X, self.weights)

            h = 1 / (1 + np.exp(-product))
            # print(h)
            no_of_features = X.shape[1]

            # Update Weights
            grad = self.lr * np.matmul(X.transpose(), (Y - h))
            gradient_regularization = self.lr * self.reg_para * self.weights
            # print(grad.shape)
            # print(gradient_regularization.shape)
            self.weights = self.weights + grad - gradient_regularization
            # self.weights += self.lr * np.sum(X.transpose() * (Y - h)) - self.lr * self.reg_para * self.weights

        return self

    
    def predict(self, X):
        prob = np.matmul(X, self.weights[1:]) + self.weights[0]

        prob = 1 / (1 + np.exp(-prob))  

        return prob


<h1>VALIDATING FOR VALUES OF LAMBDA</H1>

In [None]:
from sklearn.metrics import accuracy_score

for k in lr_data:
    lr_data[k]["validation"] = dict()
    lr_data[k]["train0.7"] = dict()

# vocab_dict = {"hw1": None, "enron1": None, "enron4": None}
lr_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
# prior_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
# lr_cond_prob_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
lr_Y_pred_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "train0.7": {"hw1": [], "enron1": [], "enron4": []}, "validation": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
# metrics_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "validation": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
                  
lr_metrics_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "validation": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
def lr_for_lambda(data, lambda_list):
    seed = 69
    np.random.seed(seed)
    
    for k in data:
    # k = "hw1"
    # if k == "hw1":
        # Assign X and Y and spam and ham texts
        ham_all_text = data[k]['train']['ham']['X']
        spam_all_text = data[k]['train']['spam']['X']
        # X = ham_all_text + spam_all_text
        Y_all = data[k]['train']['ham']['Y'] + data[k]['train']['spam']['Y']

        # Call vocab builder and save it in vocab_dict
        # vocab_dict[k] = vocab_builder(X)
        # print(len(vocab_dict[k]))

        # Make bow and save respective ham and spam in the dict
        lr_dict["ham"][k] = bow_bernoulli(ham_all_text, vocab_dict[k])
        lr_dict["spam"][k] = bow_bernoulli(spam_all_text, vocab_dict[k])


        X_all0 = np.array(lr_dict["ham"][k][0] + lr_dict["spam"][k][0])
        X_all1 = np.array(lr_dict["ham"][k][1] + lr_dict["spam"][k][1])

        # Randomize the X and Y before training
        np.random.shuffle(X_all0)
        np.random.shuffle(X_all1)
        np.random.shuffle(Y_all)

        # Store complete training set in "train"
        lr_data[k]["train"]["X"] = [X_all0, X_all1]
        lr_data[k]["train"]["Y"] = Y_all


        # Split train into train0.7+validation
        lr_data[k]["train0.7"]["X"] = [X_all0[:math.floor(0.7 * len(X_all0))], X_all1[:math.floor(0.7 * len(X_all1))]]
        lr_data[k]["validation"]["X"] = [X_all0[math.floor(0.7 * len(X_all0)):], X_all1[math.floor(0.7 * len(X_all1)):]]

        lr_data[k]["train0.7"]["Y"] = Y_all[:math.floor(0.7 * len(Y_all))]
        lr_data[k]["validation"]["Y"] = Y_all[math.floor(0.7 * len(Y_all)):]

        
        # Evalute on test data
        X_test = data[k]['test']['ham']['X'] + data[k]['test']['spam']['X']
        Y_test = data[k]['test']['ham']['Y'] + data[k]['test']['spam']['Y']
        X_test = np.array(bow_bernoulli(X_test, vocab_dict[k]))

        # Save test in lr_data. X is after applying bow_bernoulli
        lr_data[k]["test"]["X"] = X_test
        lr_data[k]["test"]["Y"] = Y_test

###################################################################

    for lambd in lambda_list:
        lr_Y_pred_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "train0.7": {"hw1": [], "enron1": [], "enron4": []}, "validation": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
        # print("LAMBDA: ", lambd)
        for k in data:
            # Loop for bernoulli and BOW:
            for b in range(2):
                # print("Ber or Bow: ", b)
                # print("K: ", k)
                # # Set X and Y as training parts
                X = lr_data[k]["train0.7"]["X"][b]
                Y = lr_data[k]["train0.7"]["Y"]



                # print(X.shape)
                # print(X)
                output = log_regression(reg_para=lambd).fit(X, Y)

                
                # TRAINING ON TRAIN 0.7 PART OF DATA
                correct = 0
                lr_Y_pred_dict["train0.7"][k].append([])
                # lr_Y_pred_dict["train"][k] = Y.copy()
                for i, d in enumerate(X):
                    # print(len(d))
                    p = output.predict(d)
                    # print(p)
                    if p > 0.5:
                        lr_Y_pred_dict["train0.7"][k][b].append(1)
                        if Y[i] == 1:
                            correct += 1
                    else:
                        lr_Y_pred_dict["train0.7"][k][b].append(0)
                        if Y[i] == 0:
                            correct += 1
                    
                # print(correct, len(Y), len(lr_Y_pred_dict["train"][k]))

                # EVALUTE ON VALIDATION SET
                X = lr_data[k]["validation"]["X"][b]
                Y = lr_data[k]["validation"]["Y"]
                correct = 0
                lr_Y_pred_dict["validation"][k].append([])

                # lr_Y_pred_dict["train"][k] = Y.copy()
                for i, d in enumerate(X):
                    # print(len(d))
                    p = output.predict(d)
                    # print(p)
                    if p > 0.5:
                        lr_Y_pred_dict["validation"][k][b].append(1)
                        if Y[i] == 1:
                            correct += 1
                    else:
                        lr_Y_pred_dict["validation"][k][b].append(0)
                        if Y[i] == 0:
                            correct += 1
                    
                # print(correct, len(Y), len(lr_Y_pred_dict["validation"][k]))
                if b == 0:
                    ber_bag = "Bag Of Words model"
                else:
                    ber_bag = "Bernoulli model"

                print(f'Lambda: {lambd}, Model: {ber_bag}, Dataset: {k} -> ', end=" ")
                print('Accuracy: {:.2f}'.format(accuracy_score(Y, lr_Y_pred_dict["validation"][k][b])))

        
lambda_list = [0.0001, 0.001, 0.01, 0.1]
lr_for_lambda(data, lambda_list)

Lambda: 0.001, Model: Bag Of Words model, Dataset: hw1 ->  Accuracy: 0.59
Lambda: 0.001, Model: Bag Of Words model, Dataset: hw1 ->  Accuracy: 0.71
Lambda: 0.001, Model: Bag Of Words model, Dataset: enron1 ->  Accuracy: 0.60
Lambda: 0.001, Model: Bag Of Words model, Dataset: enron1 ->  Accuracy: 0.56
Lambda: 0.001, Model: Bag Of Words model, Dataset: enron4 ->  Accuracy: 0.63
Lambda: 0.001, Model: Bag Of Words model, Dataset: enron4 ->  Accuracy: 0.75
Lambda: 0.01, Model: Bag Of Words model, Dataset: hw1 ->  Accuracy: 0.63
Lambda: 0.01, Model: Bag Of Words model, Dataset: hw1 ->  Accuracy: 0.71
Lambda: 0.01, Model: Bag Of Words model, Dataset: enron1 ->  Accuracy: 0.61
Lambda: 0.01, Model: Bag Of Words model, Dataset: enron1 ->  Accuracy: 0.56
Lambda: 0.01, Model: Bag Of Words model, Dataset: enron4 ->  Accuracy: 0.68
Lambda: 0.01, Model: Bag Of Words model, Dataset: enron4 ->  Accuracy: 0.73
Lambda: 0.1, Model: Bag Of Words model, Dataset: hw1 ->  Accuracy: 0.66
Lambda: 0.1, Model: Ba

<H1>TRAINING ON THE BEST LAMBDA VALUE AND TESTING ON THE TEST SET</H1>

In [205]:
# for k in lr_data:
#     lr_data[k]["validation"] = dict()
#     lr_data[k]["train0.7"] = dict()

# vocab_dict = {"hw1": None, "enron1": None, "enron4": None}
# lr_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
# prior_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
# lr_cond_prob_dict = {"ham": {"hw1": None, "enron1": None, "enron4": None}, "spam": {"hw1": None, "enron1": None, "enron4": None}}
lr_Y_pred_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
# metrics_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "validation": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
                  
lr_metrics_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}
def lr(data, lambd, l_rate):
    # seed = np.random.randint(0, 10000)
    # np.random.seed(seed)
    for k in data:
        # Bow or Bernoulli:
        for b in range(2):
        
            # Set X and Y as training parts
            X = lr_data[k]["train"]["X"][b]
            Y = lr_data[k]["train"]["Y"]



            # print(X.shape)
            # print(X)
            output = log_regression(reg_para=lambd, lr=l_rate).fit(X, Y)

            
            # TRAINING ON TRAIN 0.7 PART OF DATA
            correct = 0
            lr_Y_pred_dict["train"][k].append([])
            # lr_Y_pred_dict["train"][k] = Y.copy()
            for i, d in enumerate(X):
                # print(len(d))
                p = output.predict(d)
                # print(p)
                if p > 0.5:
                    lr_Y_pred_dict["train"][k][b].append(1)
                    if Y[i] == 1:
                        correct += 1
                else:
                    lr_Y_pred_dict["train"][k][b].append(0)
                    if Y[i] == 0:
                        correct += 1
                
            # print("BOW or BION and K: TRAIN:", b, k)
            # print(correct, len(Y), len(lr_Y_pred_dict["train"][k][b]))
            if b == 0:
                ber_bag = "Bag Of Words model"
            else:
                ber_bag = "Bernoulli model"

            print(f'TRAIN: Lambda: {lambd}, Model: {ber_bag}, Dataset: {k} -> ', end=" ")
            print('Accuracy: {:.2f}'.format(accuracy_score(Y, lr_Y_pred_dict["train"][k][b])))

            # Save test in lr_data. X is after applying bow_bernoulli
            X_test = lr_data[k]["test"]["X"] 
            Y_test = lr_data[k]["test"]["Y"] 


            test_correct = 0
            lr_Y_pred_dict["test"][k].append([])
            for i, d in enumerate(X_test[0]):
                # print(len(d))
                # print(i, d)
                p = output.predict(d)
                # print(p)
                if p > 0.5:
                    lr_Y_pred_dict["test"][k][b].append(1)
                    if Y_test[i] == 1:
                        test_correct += 1
                else:
                    lr_Y_pred_dict["test"][k][b].append(0)
                    if Y_test[i] == 0:
                        test_correct += 1
                
            # print("BOW or BION and K: TEST:", b, k)
            # print(test_correct, len(Y_test), len(lr_Y_pred_dict["test"][k][b]))
            if b == 0:
                ber_bag = "Bag Of Words model"
            else:
                ber_bag = "Bernoulli model"

            print(f'TEST: Lambda: {lambd}, Model: {ber_bag}, Dataset: {k} -> ', end=" ")
            print('Accuracy: {:.2f}'.format(accuracy_score(Y_test, lr_Y_pred_dict["test"][k][b])))

            # metrics_dict["train"][k] = get_evaluation_metrics(Y, Y_pred_dict["train"][k])
        

        

lr(data, 0.1, 0.01)

TRAIN: Lambda: 0.1, Model: Bag Of Words model, Dataset: hw1 ->  Accuracy: 0.98
TEST: Lambda: 0.1, Model: Bag Of Words model, Dataset: hw1 ->  Accuracy: 0.64
TRAIN: Lambda: 0.1, Model: Bernoulli model, Dataset: hw1 ->  Accuracy: 0.97
TEST: Lambda: 0.1, Model: Bernoulli model, Dataset: hw1 ->  Accuracy: 0.69
TRAIN: Lambda: 0.1, Model: Bag Of Words model, Dataset: enron1 ->  Accuracy: 0.98
TEST: Lambda: 0.1, Model: Bag Of Words model, Dataset: enron1 ->  Accuracy: 0.53
TRAIN: Lambda: 0.1, Model: Bernoulli model, Dataset: enron1 ->  Accuracy: 0.97
TEST: Lambda: 0.1, Model: Bernoulli model, Dataset: enron1 ->  Accuracy: 0.51
TRAIN: Lambda: 0.1, Model: Bag Of Words model, Dataset: enron4 ->  Accuracy: 0.93
TEST: Lambda: 0.1, Model: Bag Of Words model, Dataset: enron4 ->  Accuracy: 0.77
TRAIN: Lambda: 0.1, Model: Bernoulli model, Dataset: enron4 ->  Accuracy: 0.97
TEST: Lambda: 0.1, Model: Bernoulli model, Dataset: enron4 ->  Accuracy: 0.68


In [203]:
for b in range(2):
    if b == 0:
        ber_bag = "Bag Of Words model"
    else:
        ber_bag = "Bernoulli model"
    print(f'Model: {ber_bag} -> ')
    metrics_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}

    for k in lr_data:
        # evalution on training data
        # print(k)
        Y = lr_data[k]["train"]["Y"]
        # print(np.shape(Y))
        # print(np.shape(lr_Y_pred_dict["train"][k]))
        metrics_dict["train"][k] = get_evaluation_metrics(Y, lr_Y_pred_dict["train"][k][b])

        # evaluation on testing data
        Y = data[k]['test']['ham']['Y'] + data[k]['test']['spam']['Y']
        # print(np.shape(Y))
        # print(np.shape(lr_Y_pred_dict["test"][k]))
        metrics_dict["test"][k] = get_evaluation_metrics(Y, lr_Y_pred_dict["test"][k][b])
    print(print_metrics_dict(metrics_dict))

Model: Bag Of Words model -> 
Set: train, Dataset: hw1 -> Accuracy: 0.9762419006479481, Precision: 0.9590163934426229, Recall: 0.9512195121951219, F1-Score: 0.9551020408163264
Set: train, Dataset: enron1 -> Accuracy: 0.9755555555555555, Precision: 0.9918032786885246, Recall: 0.9236641221374046, F1-Score: 0.9565217391304348
Set: train, Dataset: enron4 -> Accuracy: 0.9345794392523364, Precision: 0.9717223650385605, Recall: 0.9402985074626866, F1-Score: 0.9557522123893806
Set: test, Dataset: hw1 -> Accuracy: 0.6443514644351465, Precision: 0.2619047619047619, Recall: 0.16923076923076924, F1-Score: 0.205607476635514
Set: test, Dataset: enron1 -> Accuracy: 0.5307017543859649, Precision: 0.23140495867768596, Recall: 0.18791946308724833, F1-Score: 0.2074074074074074
Set: test, Dataset: enron4 -> Accuracy: 0.7716390423572744, Precision: 0.801354401805869, Recall: 0.907928388746803, F1-Score: 0.8513189448441246

Model: Bernoulli model -> 
Set: train, Dataset: hw1 -> Accuracy: 0.9654427645788337,

Testing the randomizatoin of array

<h1>SGD-CLASSIFIER</h1>

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

Using GridSearch to tune hyperparameters

In [None]:
parameters = {"penalty": ["none", "l1", "l2"],
              "alpha": [0.0001, 0.001, 0.01, 0.1],
              "loss": ["log", "squared_hinge", "log"]
              }

sgd = SGDClassifier(max_iter=64)
grid_search = GridSearchCV(sgd, param_grid=parameters)

for k in data:
    # For BOW and Bernoulli
    for b in range(2):
        grid_search.fit(lr_data[k]["train"]["X"][b], lr_data[k]["train"]["Y"])
        if b == 0:
            ber_bag = "Bag Of Words model"
        else:
            ber_bag = "Bernoulli model"
        print(f'Dataset: {k}, and Model: {ber_bag} -> Best Parameters: ', end=" ")
        print(grid_search.best_params_)

Dataset: hw1, and Model: Bag Of Words model -> Best Parameters:  {'alpha': 0.1, 'loss': 'log', 'penalty': 'l1'}
Dataset: hw1, and Model: Bernoulli model -> Best Parameters:  {'alpha': 0.01, 'loss': 'log', 'penalty': 'l2'}
Dataset: enron1, and Model: Bag Of Words model -> Best Parameters:  {'alpha': 0.1, 'loss': 'log', 'penalty': 'l1'}
Dataset: enron1, and Model: Bernoulli model -> Best Parameters:  {'alpha': 0.1, 'loss': 'log', 'penalty': 'l1'}
Dataset: enron4, and Model: Bag Of Words model -> Best Parameters:  {'alpha': 0.01, 'loss': 'log', 'penalty': 'l2'}
Dataset: enron4, and Model: Bernoulli model -> Best Parameters:  {'alpha': 0.01, 'loss': 'log', 'penalty': 'l1'}


Using parameters from GRIDSEARCH on SGDClassifier

In [155]:
sgd = SGDClassifier(loss="log", penalty="l2", alpha=0.01, max_iter=64)

# For BOW and Bernoulli
for b in range(2):
    if b == 0:
        ber_bag = "Bag Of Words model"
    else:
        ber_bag = "Bernoulli model"
    print(f'Model: {ber_bag} ->')
    metrics_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}

    for k in data:
        sgd.fit(lr_data[k]["train"]["X"][b], lr_data[k]["train"]["Y"])

        Y_pred = sgd.predict(lr_data[k]["train"]["X"][b])
        metrics_dict["train"][k] = get_evaluation_metrics(lr_data[k]["train"]["Y"], Y_pred)

        Y_pred = sgd.predict(lr_data[k]["test"]["X"][b])
        if b == 0:
            ber_bag = "Bag Of Words model"
        else:
            ber_bag = "Bernoulli model"
        print(f'Dataset: {k}, and Model: {ber_bag} -> ', end=" ")
        print('Accuracy: {:.2f}'.format(accuracy_score(lr_data[k]["test"]["Y"], Y_pred)))
        metrics_dict["test"][k] = get_evaluation_metrics(lr_data[k]["test"]["Y"], Y_pred)
    print(print_metrics_dict(metrics_dict))

Model: Bag Of Words model ->
Dataset: hw1, and Model: Bag Of Words model ->  Accuracy: 0.68
Dataset: enron1, and Model: Bag Of Words model ->  Accuracy: 0.57
Dataset: enron4, and Model: Bag Of Words model ->  Accuracy: 0.71
Set: train, Dataset: hw1 -> Accuracy: 0.9481641468682506, Precision: 1.0, Recall: 0.8048780487804879, F1-Score: 0.8918918918918919
Set: train, Dataset: enron1 -> Accuracy: 0.9622222222222222, Precision: 0.9913793103448276, Recall: 0.8778625954198473, F1-Score: 0.9311740890688259
Set: train, Dataset: enron4 -> Accuracy: 0.9439252336448598, Precision: 0.9305555555555556, Recall: 1.0, F1-Score: 0.9640287769784173
Set: test, Dataset: hw1 -> Accuracy: 0.6820083682008368, Precision: 0.25, Recall: 0.08461538461538462, F1-Score: 0.1264367816091954
Set: test, Dataset: enron1 -> Accuracy: 0.5657894736842105, Precision: 0.12307692307692308, Recall: 0.053691275167785234, F1-Score: 0.07476635514018691
Set: test, Dataset: enron4 -> Accuracy: 0.7108655616942909, Precision: 0.72762

In [147]:
sgd = SGDClassifier(loss="log", penalty="l1", alpha=0.01, max_iter=64)

# For BOW and Bernoulli
for b in range(2):
    if b == 0:
        ber_bag = "Bag Of Words model"
    else:
        ber_bag = "Bernoulli model"
    print(f'Model: {ber_bag} ->')
    metrics_dict = {"train": {"hw1": [], "enron1": [], "enron4": []}, "test": {"hw1": [], "enron1": [], "enron4": []}}

    for k in data:
        sgd.fit(lr_data[k]["train"]["X"][b], lr_data[k]["train"]["Y"])

        Y_pred = sgd.predict(lr_data[k]["train"]["X"][b])
        metrics_dict["train"][k] = get_evaluation_metrics(lr_data[k]["train"]["Y"], Y_pred)

        Y_pred = sgd.predict(lr_data[k]["test"]["X"][b])
        if b == 0:
            ber_bag = "Bag Of Words model"
        else:
            ber_bag = "Bernoulli model"
        print(f'Dataset: {k}, and Model: {ber_bag} -> ', end=" ")
        print('Accuracy: {:.2f}'.format(accuracy_score(lr_data[k]["test"]["Y"], Y_pred)))
        metrics_dict["test"][k] = get_evaluation_metrics(lr_data[k]["test"]["Y"], Y_pred)
    print(print_metrics_dict(metrics_dict))

Model: Bag Of Words model ->
Dataset: hw1, and Model: Bag Of Words model ->  Accuracy: 0.68
Dataset: enron1, and Model: Bag Of Words model ->  Accuracy: 0.63
Dataset: enron4, and Model: Bag Of Words model ->  Accuracy: 0.68
Set: train, Dataset: hw1 -> Accuracy: 0.7624190064794817, Precision: 0.76, Recall: 0.15447154471544716, F1-Score: 0.25675675675675674
Set: train, Dataset: enron1 -> Accuracy: 0.7377777777777778, Precision: 0.782608695652174, Recall: 0.13740458015267176, F1-Score: 0.2337662337662338
Set: train, Dataset: enron4 -> Accuracy: 0.7775700934579439, Precision: 0.7779960707269156, Recall: 0.9850746268656716, F1-Score: 0.8693743139407244
Set: test, Dataset: hw1 -> Accuracy: 0.6820083682008368, Precision: 0.041666666666666664, Recall: 0.007692307692307693, F1-Score: 0.012987012987012988
Set: test, Dataset: enron1 -> Accuracy: 0.6271929824561403, Precision: 0.08, Recall: 0.013422818791946308, F1-Score: 0.022988505747126436
Set: test, Dataset: enron4 -> Accuracy: 0.6777163904235