### Naive Bayes Implementation for Email Spam Detection

In [1]:
import csv
import random
import math
import operator

### Classification using a self-implemented Gaussian Naive Bayes classifier

In [2]:
def safe_div(x,y):
    """Divide ``x`` by ``y``, yielding 0 instead of raising when ``y`` equals zero.

    Args:
        x: Numerator.
        y: Denominator.

    Returns:
        ``x / y`` for a non-zero ``y``; the integer ``0`` otherwise.
    """
    return x / y if y != 0 else 0

def loadCsv(filename):
    """Read a purely numeric CSV file into a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is a list holding
        every field of that record converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If any field cannot be converted to ``float``.
    """
    # The context manager guarantees the handle is closed even when a field
    # fails to parse; newline='' is what the csv module expects of its input.
    with open(filename, newline='') as csvfile:
        return [[float(field) for field in record] for record in csv.reader(csvfile)]
 
def splitDataset(data,ratio):
    """Load a labelled CSV file and randomly split it into train and test rows.

    Every column except the last is parsed as a ``float`` feature and the last
    column is parsed as an ``int`` class label. The number of feature columns
    is taken from each row instead of being fixed, so files with any number of
    features are accepted.

    Args:
        data: Path of the CSV file to read.
        ratio: Fraction of the rows to put in the training set; the training
            set size is ``int(len(rows) * ratio)``.

    Returns:
        A two-element list ``[trainSet, testSet]``. ``trainSet`` holds the rows
        in the order they were drawn; ``testSet`` holds the rows that were not
        drawn, in their original file order.

    Raises:
        ValueError: If ``ratio`` asks for more training rows than the file
            contains, or if a field cannot be parsed as a number.
    """
    with open(data, 'r', newline='') as spamfile:
        # csv.reader yields [] for blank lines (e.g. a trailing empty line);
        # drop them rather than failing on row[-1] below.
        rows = [row for row in csv.reader(spamfile) if row]

    dataset = [[float(value) for value in row[:-1]] + [int(row[-1])] for row in rows]

    trainSize = int(len(dataset) * ratio)
    if trainSize > len(dataset):
        raise ValueError(
            'ratio %r requests %d training rows but only %d rows are available'
            % (ratio, trainSize, len(dataset))
        )

    # Draw rows without replacement using the module-level `random` generator,
    # so seeding `random` beforehand makes the split reproducible.
    remaining = list(dataset)
    trainSet = []
    while len(trainSet) < trainSize:
        index = random.randrange(len(remaining))
        trainSet.append(remaining.pop(index))
    return [trainSet, remaining]
# 
def separateByClass(dataset):
    """Group the rows of ``dataset`` by class label.

    The class label of a row is its last element.

    Args:
        dataset: Sequence of rows (indexable sequences).

    Returns:
        A dict mapping each label to the list of rows carrying that label,
        with rows kept in their original relative order.
    """
    grouped = {}
    for row in dataset:
        grouped.setdefault(row[-1], []).append(row)
    return grouped
 
def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    The division goes through ``safe_div``, so an empty sequence yields 0
    rather than raising ``ZeroDivisionError``.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return safe_div(total, count)

def stdev(numbers):
    """Return the sample standard deviation (``n - 1`` denominator) of ``numbers``.

    The division goes through ``safe_div``, so a one-element sequence, whose
    denominator is zero, yields 0.0 instead of raising.
    """
    center = mean(numbers)
    squared_deviations = sum([(value - center) ** 2 for value in numbers])
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(safe_div(squared_deviations, degrees_of_freedom))
 
def summarize(dataset):
    """Compute ``(mean, stdev)`` for every column of ``dataset`` except the last.

    Args:
        dataset: Sequence of equal-length rows; the last column is treated as
            the class label and its statistics are discarded.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per non-final column, in
        column order.
    """
    stats = []
    # zip(*dataset) transposes the rows into columns.
    for column in zip(*dataset):
        stats.append((mean(column), stdev(column)))
    # The final column is the label, not a feature: drop its summary.
    stats.pop()
    return stats
 
def summarizeByClass(dataset):
    """Build per-class feature statistics for ``dataset``.

    Rows are first grouped by class label with ``separateByClass`` and each
    group is then reduced with ``summarize``.

    Returns:
        A dict mapping each class label to the list of per-feature
        ``(mean, stdev)`` tuples computed from that class's rows only.
    """
    rows_by_label = separateByClass(dataset)
    return {label: summarize(rows) for label, rows in rows_by_label.items()}
 
    
# def calculateProbability(x,mean,std):
#     exponent=math.exp(-1*math.pow(x-mean,2)/(0.000001+2*math.pow(std,2)))
#     return (1/(0.000001+math.sqrt(2*math.pi)*std))*exponent

def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Observed feature value.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The normal density ``N(x; mean, stdev**2)``. When ``stdev`` is 0 the
        distribution is treated as a point mass at ``mean``: 1.0 is returned
        if ``x == mean`` and 0.0 otherwise.
    """
    if stdev == 0:
        # A feature that is constant within a class has no spread. Returning 0
        # for every x (which a zero-guarded division would do) eliminates the
        # whole class from the product of likelihoods even for inputs that
        # match the constant value exactly, so treat it as a point mass.
        return 1.0 if x == mean else 0.0
    # Standardising first keeps the arithmetic well-defined even when
    # stdev**2 would underflow to zero.
    z = (x - mean) / stdev
    exponent = math.exp(-0.5 * z * z)
    return exponent / (math.sqrt(2 * math.pi) * stdev)

 
def calculateClassProbabilities(summaries, inputVector):
    """Score ``inputVector`` against every class in ``summaries``.

    For each class the score starts at 1 and is multiplied by
    ``calculateProbability`` of each feature value under that class's
    ``(mean, stdev)`` summary. No class prior is applied and the scores are
    not normalised.

    Args:
        summaries: Dict mapping class label to a list of per-feature
            ``(mean, stdev)`` tuples.
        inputVector: Row to score; element ``i`` is matched with summary ``i``.
            Elements beyond the number of summaries are ignored.

    Returns:
        A dict mapping each class label to its score.
    """
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(classSummaries):
            likelihood *= calculateProbability(inputVector[position], mu, sigma)
        probabilities[classValue] = likelihood
    return probabilities

def predict(summaries, inputVector):
    """Return the class label whose score for ``inputVector`` is highest.

    Scores come from ``calculateClassProbabilities``. When several classes tie
    for the highest score, the one encountered first in ``summaries`` wins.
    ``None`` is returned when ``summaries`` contains no classes.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    if not scores:
        return None
    # max() keeps the first maximal key, matching a strict ">" comparison.
    return max(scores, key=scores.get)

 
def getPredictions(summaries, testSet):
    """Predict a class label for every row of ``testSet``.

    Returns:
        A list of labels, one per row, in the same order as ``testSet``.
    """
    return [predict(summaries, row) for row in testSet]
 
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose true label equals its prediction.

    Args:
        testSet: Rows whose last element is the true class label.
        predictions: Predicted labels, aligned by index with ``testSet``.

    Returns:
        Accuracy as a percentage in the range 0-100. An empty ``testSet``
        yields 0.0 because the division goes through ``safe_div``.
    """
    hits = sum(1 for position, row in enumerate(testSet) if row[-1] == predictions[position])
    total = float(len(testSet))
    return safe_div(hits, total) * 100.0

# Seed the module-level RNG so the random train/test split -- and therefore
# every metric printed from here on -- is reproducible across kernel restarts.
random.seed(42)

# Hold out 25% of the rows for evaluation.
trainSet,testSet=splitDataset('spambase.data',0.75)

print('Train and test set lengths are : ',len(trainSet),len(testSet))

# Fit: per-class (mean, stdev) of every feature, computed on the training rows only.
summaries=summarizeByClass(trainSet)

# Evaluate on the held-out rows.
predictions=getPredictions(summaries,testSet)
accuracy=getAccuracy(testSet,predictions)
# for i in range(len(testSet)):
#     print(testSet[i][-1],'->',predictions[i])
print('Accuracy: ',accuracy)


Train and test set lengths are :  3450 1151
Accuracy:  81.49435273675066


In [7]:

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
# True labels are the last element of each held-out row.
y_test=[row[-1] for row in testSet]
print(classification_report(y_test,predictions))
conf_matrix=confusion_matrix(y_test,predictions)
print('confusion matrix :\n',conf_matrix)
# confusion_matrix puts TRUE labels on rows and PREDICTED labels on columns,
# with labels in sorted order. Label 0 is ham (not spam) and label 1 is spam,
# so e.g. conf_matrix[0][1] counts ham messages that were predicted as spam.
print('Ham correctly classified as ham',conf_matrix[0][0])
print('Ham being classified as spam',conf_matrix[0][1])
print('Spam being classified as ham',conf_matrix[1][0])
print('Spam correctly classified as spam',conf_matrix[1][1])
print('Accuracy :\n',accuracy_score(y_test,predictions))

              precision    recall  f1-score   support

           0       0.96      0.73      0.83       702
           1       0.69      0.95      0.80       449

    accuracy                           0.81      1151
   macro avg       0.83      0.84      0.81      1151
weighted avg       0.86      0.81      0.82      1151

confusion matrix :
 [[510 192]
 [ 21 428]]
Spam correctly classified as spam 510
Ham being classified as spam 192
Ham being classified as not ham 21
Spam being classified as Ham 428
Accuracy :
 0.8149435273675065


### Classification using scikit-learn's Multinomial Naive Bayes (`MultinomialNB`)

In [8]:


from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Split every row into its feature vector (all but the last element) and its
# class label (the last element), for both partitions.
x_train=[row[:-1] for row in trainSet]
y_train=[row[-1] for row in trainSet]
x_test=[row[:-1] for row in testSet]
y_test=[row[-1] for row in testSet]

# Fit scikit-learn's multinomial Naive Bayes on the training partition.
classifier=MultinomialNB()
classifier.fit(x_train,y_train)

# Score it on the same held-out rows used for the hand-written classifier.
pred=classifier.predict(x_test)

print(classification_report(y_test,pred))

print('confusion matrix :\n',confusion_matrix(y_test,pred))
print('Accuracy :\n',accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       0.83      0.82      0.83       702
           1       0.73      0.74      0.74       449

    accuracy                           0.79      1151
   macro avg       0.78      0.78      0.78      1151
weighted avg       0.79      0.79      0.79      1151

confusion matrix :
 [[578 124]
 [116 333]]
Accuracy :
 0.7914856646394439
