In [15]:
import numpy as np
import nltk
import os
import math
from scipy.special import expit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

nltk.download('punkt_tab')

no_of_datasets = 3

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vigha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [16]:

#To read data from the folders
train_data_folders = ["\\enron1_train\\enron1\\train", "\\enron2_train\\train", "\\enron4_train\\enron4\\train"]
test_data_folders = ["\\enron1_test\\enron1\\test", "\\enron2_test\\test", "\\enron4_test\\enron4\\test"]

train_data_folders=[ os.getcwd() + t for t in train_data_folders]
test_data_folders=[ os.getcwd() + t for t in test_data_folders]


#0, 1 ,2 indices of list will be for the 3 datasets
#Each list is vocabulary for i+1th dataset 
train_vocab_lists = [[] for i in range(no_of_datasets)]
#counter to help creating vector of words later
no_message = []
for dataset, train_folder in enumerate(train_data_folders):
    #-1 is for Subject which I am removing
    count = 0
    for filename in os.listdir(train_folder + "\\ham"):
        count += 1
        with open(os.path.join(train_folder + "\\ham\\" + filename), 'r', errors='ignore') as f:
            for sent in f.readlines():
                for word in nltk.tokenize.word_tokenize(sent):
                    if word not in train_vocab_lists[dataset]:
                        train_vocab_lists[dataset].append(word)
                
                    
    for filename in os.listdir(train_folder + "\\spam"):
        count += 1
        with open(os.path.join(train_folder + "\\spam\\" + filename), 'r', errors='ignore') as f:
            for sent in f.readlines():
                for word in nltk.tokenize.word_tokenize(sent):
                    if word not in train_vocab_lists[dataset]:
                        train_vocab_lists[dataset].append(word)
    no_message.append(count)


In [17]:
bow_matrices = []

#iterating over number of messages to create bag of word matrices for all 3 datasets
#Last column of the numpy array is the inference i.e 0 for ham and 1 for spam
for i in range(no_of_datasets):
    bow_matrices.append(np.zeros((no_message[i], len(train_vocab_lists[i])+1)))


for dataset, train_folder in enumerate(train_data_folders):
    msg_index = 0
    for filename in os.listdir(train_folder + "\\ham"): 
        with open(os.path.join(train_folder + "\\ham\\" + filename), 'r', errors='ignore') as f:
            for sent in f.readlines():
                for word in nltk.tokenize.word_tokenize(sent):
                    (bow_matrices[dataset])[msg_index][train_vocab_lists[dataset].index(word)] += 1
                    (bow_matrices[dataset])[msg_index][-1] = 0
            msg_index += 1
                
                    
    for filename in os.listdir(train_folder + "\\spam"):
        with open(os.path.join(train_folder + "\\spam\\" + filename), 'r', errors='ignore') as f:
            for sent in f.readlines():
                for word in nltk.tokenize.word_tokenize(sent):
                    (bow_matrices[dataset])[msg_index][train_vocab_lists[dataset].index(word)] += 1
                    (bow_matrices[dataset])[msg_index][-1] = 1
            msg_index += 1

print("Using the BAG OF WORD MODEL")
print("The features x examples matrices are: ")

print("Matrix for dataset 1", bow_matrices[0])
print("Matrix for dataset 2", bow_matrices[1])
print("Matrix for dataset 3", bow_matrices[2])

Using the BAG OF WORD MODEL
The features x examples matrices are: 
Matrix for dataset 1 [[1. 6. 3. ... 0. 0. 0.]
 [1. 3. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 ...
 [1. 9. 0. ... 0. 0. 1.]
 [1. 3. 0. ... 1. 0. 1.]
 [1. 1. 0. ... 1. 1. 1.]]
Matrix for dataset 2 [[ 1. 19.  2. ...  0.  0.  0.]
 [ 1. 11.  0. ...  0.  0.  0.]
 [ 1.  1.  0. ...  0.  0.  0.]
 ...
 [ 1.  1.  0. ...  0.  0.  1.]
 [ 1.  1.  0. ...  0.  0.  1.]
 [ 1.  6.  0. ...  1.  2.  1.]]
Matrix for dataset 3 [[1. 7. 3. ... 0. 0. 0.]
 [1. 2. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [1. 1. 0. ... 0. 0. 1.]
 [1. 2. 0. ... 1. 1. 1.]
 [1. 2. 0. ... 0. 0. 1.]]


In [18]:
bnouli_matrices = []

#iterating over number of messages to create bag of word matrices for all 3 datasets
#Last column of the numpy array is the inference i.e 0 for ham and 1 for spam
for i in range(no_of_datasets):
    bnouli_matrices.append(np.zeros((no_message[i], len(train_vocab_lists[i])+1)))


for dataset_index, train_folder in enumerate(train_data_folders):
    msg_index = 0
    for filename in os.listdir(train_folder + "\\ham"): 
        with open(os.path.join(train_folder + "\\ham\\" + filename), 'r', errors='ignore') as f:
            for sent in f.readlines():
                for word in nltk.tokenize.word_tokenize(sent):
                    (bnouli_matrices[dataset_index])[msg_index][train_vocab_lists[dataset_index].index(word)] = 1
                    (bnouli_matrices[dataset_index])[msg_index][-1] = 0
            msg_index += 1
                
                    
    for filename in os.listdir(train_folder + "\\spam"):
        with open(os.path.join(train_folder + "\\spam\\" + filename), 'r', errors='ignore') as f:
            for sent in f.readlines():
                for word in nltk.tokenize.word_tokenize(sent):
                        (bnouli_matrices[dataset_index])[msg_index][train_vocab_lists[dataset_index].index(word)] = 1
                        (bnouli_matrices[dataset_index])[msg_index][-1] = 1
            msg_index += 1

print("Using the BERNOULLI MODEL")
print("The features x examples matrices are: ")

print("Matrix for dataset 1", bnouli_matrices[0])
print("Matrix for dataset 2", bnouli_matrices[1])
print("Matrix for dataset 3", bnouli_matrices[2])

Using the BERNOULLI MODEL
The features x examples matrices are: 
Matrix for dataset 1 [[1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 ...
 [1. 1. 0. ... 0. 0. 1.]
 [1. 1. 0. ... 1. 0. 1.]
 [1. 1. 0. ... 1. 1. 1.]]
Matrix for dataset 2 [[1. 1. 1. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [1. 1. 0. ... 0. 0. 1.]
 [1. 1. 0. ... 0. 0. 1.]
 [1. 1. 0. ... 1. 1. 1.]]
Matrix for dataset 3 [[1. 1. 1. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [1. 1. 0. ... 0. 0. 1.]
 [1. 1. 0. ... 1. 1. 1.]
 [1. 1. 0. ... 0. 0. 1.]]


In [19]:
#Training multinomial Naive Bayes Model
#We have already extracted Vocabulary = train_vocab_list and N = no_message
class_types = [0, 1]


#Precomputing stuff for training
#No of emails in each class
message_in_class_count = []

for dataset, train_folder in enumerate(train_data_folders):
    msg_cnt = [0,0]
    #Counting ham messages
    for filename in os.listdir(train_folder + "\\ham"): 
        with open(os.path.join(train_folder + "\\ham\\" + filename), 'r', errors='ignore') as f:
            msg_cnt[0] += 1
                
    #Counting Spam messages                    
    for filename in os.listdir(train_folder + "\\spam"):
        with open(os.path.join(train_folder + "\\spam\\" + filename), 'r', errors='ignore') as f:
            msg_cnt[1] += 1
    message_in_class_count.append(msg_cnt)


prior = [[0.0, 0.0] for i in range(no_of_datasets)]


#The weight of the probabilities
cond_probability = [[[0.0 for k in range(len(train_vocab_lists[i]))] for _ in range(len(class_types))] for i in range(no_of_datasets)]

#Training the model
for i in range(no_of_datasets):
    for c in class_types:
        #prior probability i is dataset index and c is class
        prior[i][c] = message_in_class_count[i][c]/no_message[i]
        tsum = 0.0
        tt = [[0.0 for k in range(len(train_vocab_lists[i]))] for _ in range(len(class_types))]

        tsum = 0.0
        for word_index in range(len(train_vocab_lists[i])):
            for wc in range(len(bow_matrices[i])):
                if(bow_matrices[i][wc][-1] == c):
                    tt[c][word_index] += bow_matrices[i][wc][word_index]
                    tsum += bow_matrices[i][wc][word_index]
        
        for word_index in range(len(train_vocab_lists[i])):    
            cond_probability[i][c][word_index] = (tt[c][word_index] + 0.5)/(tsum + len(train_vocab_lists[i]))



In [20]:
#Applying the model and calculating scores of each email and actual values

score = [[[] for _ in class_types] for _ in range(no_of_datasets)]
actual = [[] for _ in range(no_of_datasets)]
for dataset, test_folder in enumerate(test_data_folders):
    for filename in os.listdir(test_folder + "\\ham"):
        with open(os.path.join(test_folder + "\\ham\\" + filename), 'r', errors='ignore') as f:
            file_content = f.readlines()
            actual[dataset].append(0)
            for c in class_types:
                tscore = math.log(prior[dataset][c])
                for sent in file_content:
                    for word in nltk.tokenize.word_tokenize(sent):
                        index = -1
                        try:
                            index = train_vocab_lists[dataset].index(word)
                        except:
                            continue
                        finally:
                            tscore += math.log(cond_probability[dataset][c][index])
                score[dataset][c].append(tscore)
                    
                    
    for filename in os.listdir(test_folder + "\\spam"):
        with open(os.path.join(test_folder + "\\spam\\" + filename), 'r', errors='ignore') as f:
            file_content = f.readlines()
            actual[dataset].append(1)
            for c in class_types:
                tscore = math.log(prior[dataset][c])
                for sent in file_content:
                    for word in nltk.tokenize.word_tokenize(sent):
                        try:
                            index = train_vocab_lists[dataset].index(word)
                        except:
                            continue
                        else:
                            tscore += math.log(cond_probability[dataset][c][index])
                score[dataset][c].append(tscore)


In [21]:
predictions = [[] for _ in range(no_of_datasets)]
for dataset in range(no_of_datasets):
    for i in range(len(actual[dataset])):
        predict = -1
        if score[dataset][0][i] <= score[dataset][1][i]:
            predict = 1 #It is spam
        else:
            predict = 0 #It is ham
            
        predictions[dataset].append(predict)
    
    accuracy = accuracy_score(actual[dataset], predictions[dataset])
    precision = precision_score(actual[dataset], predictions[dataset])
    recall = recall_score(actual[dataset], predictions[dataset])
    f1 = f1_score(actual[dataset], predictions[dataset])

    print("---------------------------------- For Naive Bayes Multinomial model (BOW)-----------------------------------------")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("---------------------------------------------------------------------------")
         

---------------------------------- For Naive Bayes Multinomial model (BOW)-----------------------------------------
Accuracy: 0.9057017543859649
Precision: 0.8785714285714286
Recall: 0.825503355704698
F1 Score: 0.8512110726643599
---------------------------------------------------------------------------
---------------------------------- For Naive Bayes Multinomial model (BOW)-----------------------------------------
Accuracy: 0.9205020920502092
Precision: 0.8285714285714286
Recall: 0.8923076923076924
F1 Score: 0.8592592592592593
---------------------------------------------------------------------------
---------------------------------- For Naive Bayes Multinomial model (BOW)-----------------------------------------
Accuracy: 0.9502762430939227
Precision: 0.946078431372549
Recall: 0.9872122762148338
F1 Score: 0.9662077596996246
---------------------------------------------------------------------------


In [34]:
#Training discrete Naive Bayes discrete Model
#We have already extracted Vocabulary = train_vocab_list and N = no_message
class_types = [0, 1]


#Precomputing stuff for training
#No of emails in each class
message_in_class_count = []

for dataset, train_folder in enumerate(train_data_folders):
    msg_cnt = [0,0]
    #Counting ham messages
    for filename in os.listdir(train_folder + "\\ham"): 
        with open(os.path.join(train_folder + "\\ham\\" + filename), 'r', errors='ignore') as f:
            msg_cnt[0] += 1
                
    #Counting Spam messages                    
    for filename in os.listdir(train_folder + "\\spam"):
        with open(os.path.join(train_folder + "\\spam\\" + filename), 'r', errors='ignore') as f:
            msg_cnt[1] += 1
    message_in_class_count.append(msg_cnt)


prior = [[0.0, 0.0] for i in range(no_of_datasets)]


#The weight of the probabilities
cond_probability = [[[0.0 for k in range(len(train_vocab_lists[i]))] for _ in range(len(class_types))] for i in range(no_of_datasets)]

#Training the model
for i in range(no_of_datasets):
    for c in class_types:
        #prior probability i is dataset index and c is class
        prior[i][c] = message_in_class_count[i][c]/no_message[i]
        tt = [[0.0 for k in range(len(train_vocab_lists[i]))] for _ in range(len(class_types))]

        for word_index in range(len(train_vocab_lists[i])):
            for wc in range(len(bnouli_matrices[i])):
                if(bnouli_matrices[i][wc][-1] == c):
                    tt[c][word_index] += bnouli_matrices[i][wc][word_index]
    
        
        for word_index in range(len(train_vocab_lists[i])):    
            cond_probability[i][c][word_index] = (tt[c][word_index] + 0.5)/(message_in_class_count[i][c] + 1)

In [35]:
#Applying the model and calculating scores of each email and actual values

#Calculating the sum of log(1-condprob[t][c]) for all the datasets
#For a word existing in document, I can subtract log(1-condprob[t][c]) and add condprob[t][c]
init_scores = [[0 for _ in class_types] for i in range(no_of_datasets)]
for dataset in range(no_of_datasets):
    for c in class_types:
        for word_index in range(len(train_vocab_lists[dataset])):
            init_scores[dataset][c] += math.log(1-cond_probability[dataset][c][word_index])

score = [[[] for _ in class_types] for i in range(no_of_datasets)]
actual = [[] for _ in range(no_of_datasets)]
for dataset, test_folder in enumerate(test_data_folders):
    for filename in os.listdir(test_folder + "\\ham"):
        with open(os.path.join(test_folder + "\\ham\\" + filename), 'r', errors='ignore') as f:
            file_content = f.readlines()
            actual[dataset].append(0)
            for c in class_types:
                tscore = math.log(prior[dataset][c]) + init_scores[dataset][c]
                for sent in file_content:
                    for word in nltk.tokenize.word_tokenize(sent):
                        index = -1
                        try:
                            index = train_vocab_lists[dataset].index(word)
                        except:
                            continue
                        finally:
                            tscore += math.log(cond_probability[dataset][c][index])
                            tscore -= math.log(1-cond_probability[dataset][c][index])
                score[dataset][c].append(tscore)
                    
                    
    for filename in os.listdir(test_folder + "\\spam"):
        with open(os.path.join(test_folder + "\\spam\\" + filename), 'r', errors='ignore') as f:
            file_content = f.readlines()
            actual[dataset].append(1)
            for c in class_types:
                tscore = math.log(prior[dataset][c]) + init_scores[dataset][c]
                for sent in file_content:
                    for word in nltk.tokenize.word_tokenize(sent):
                        try:
                            index = train_vocab_lists[dataset].index(word)
                        except:
                            continue
                        else:
                            tscore += math.log(cond_probability[dataset][c][index])
                            tscore -= math.log(1-cond_probability[dataset][c][index])
                score[dataset][c].append(tscore)

print("The scores of all test emails are: ")
print(score)

The scores of all test emails are: 
[[[-305.93836740307256, -226.05629770122374, -190.01724322069512, -262.420810928128, -293.3935875604103, -140.14271344800773, -170.91344562767668, -246.42652261998848, -249.98209165351193, -248.56650836683295, -651.507043665416, -428.65621458508184, -1309.3761429637111, -138.63414062551252, -487.163075742694, -345.2030485523059, -211.58718101487176, -312.84129170824065, -68.47050155193239, -291.98601952718224, -276.17196553593647, -223.08334847801663, -408.42046773672223, -786.1930478205312, -194.8070905114539, -327.3337358648066, -415.67457333689293, -294.34515209982607, -470.61826423952925, -218.6843827701223, -254.49526430573562, -164.04835016176494, -560.015476854955, -270.82301191410363, -132.17065606741784, -143.40878324636793, -192.21909693340186, -290.2160605911423, -655.9059551439846, -322.24317314540065, -1017.301915035357, -822.4785779949475, -201.20677425495, -191.2259418002553, -634.1119393518596, -537.4125672165993, -102.49150614334286,

In [36]:
predictions = [[] for _ in range(no_of_datasets)]
for dataset in range(no_of_datasets):
    for i in range(len(actual[dataset])):
        predict = -1
        if score[dataset][0][i] <= score[dataset][1][i]:
            predict = 1 #It is spam
        else:
            predict = 0 #It is ham
            
        predictions[dataset].append(predict)
    
    accuracy = accuracy_score(actual[dataset], predictions[dataset])
    precision = precision_score(actual[dataset], predictions[dataset])
    recall = recall_score(actual[dataset], predictions[dataset])
    f1 = f1_score(actual[dataset], predictions[dataset])

    print("------------------------------ For the Naive Bayes Discrete Model (Bernoulli)---------------------------------------------")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("---------------------------------------------------------------------------")

------------------------------ For the Naive Bayes Discrete Model (Bernoulli)---------------------------------------------
Accuracy: 0.7697368421052632
Precision: 0.84375
Recall: 0.3624161073825503
F1 Score: 0.5070422535211268
---------------------------------------------------------------------------
------------------------------ For the Naive Bayes Discrete Model (Bernoulli)---------------------------------------------
Accuracy: 0.801255230125523
Precision: 0.7966101694915254
Recall: 0.36153846153846153
F1 Score: 0.4973544973544973
---------------------------------------------------------------------------
------------------------------ For the Naive Bayes Discrete Model (Bernoulli)---------------------------------------------
Accuracy: 0.8950276243093923
Precision: 0.9798850574712644
Recall: 0.8721227621483376
F1 Score: 0.9228687415426252
---------------------------------------------------------------------------
