## Naive Bayes Classifiers

In [1]:
import numpy as np
import pandas as pd

import urllib

import sklearn

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

In [3]:
# Download the Spambase dataset published by the University of California,
# Irvine, and parse the comma-separated numeric rows into a 2-D NumPy array.
# `urllib.request` is a submodule and has to be imported explicitly; a bare
# `import urllib` does not guarantee it is loaded.
import urllib.request

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# Open the response as a context manager so the HTTP connection is closed as
# soon as the payload has been parsed, rather than staying open until the
# object happens to be garbage-collected.
with urllib.request.urlopen(url) as raw_data:
    dataset = np.loadtxt(raw_data, delimiter=',')

# Print the first row as a quick sanity check of the parsed values.
print(dataset[0])

[  0.      0.64    0.64    0.      0.32    0.      0.      0.      0.
   0.      0.      0.64    0.      0.      0.      0.32    0.      1.29
   1.93    0.      0.96    0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.778   0.      0.
   3.756  61.    278.      1.   ]


In [4]:
# Feature matrix: restrict the analysis to the first 48 columns, which hold
# the per-word frequency features.
# Target vector: the last column of every row.
x, y = dataset[:, :48], dataset[:, -1]

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split (and therefore every accuracy below) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=17,
)

In [7]:
# Bernoulli naive Bayes works on binary features, so every feature is first
# thresholded: values greater than `binarize` become 1, all others become 0.
# `binarize` is a numeric threshold, not an on/off switch. It is written as
# an explicit 1.0 here because a boolean True would be interpreted as the
# number 1.0 anyway, which hides the threshold actually being applied.
BernNB = BernoulliNB( binarize=1.0 )

BernNB.fit( x_train, y_train )

# Ground-truth labels of the held-out split versus the model's predictions.
y_expect = y_test

y_pred = BernNB.predict( x_test )

# Fraction of held-out samples classified correctly.
print( accuracy_score( y_expect, y_pred ) )

0.8577633007600435


In [8]:
# Multinomial naive Bayes with default settings, trained on the raw
# (non-binarised) feature values. `fit` returns the estimator itself, so
# construction and training can be chained.
MultiNB = MultinomialNB().fit(x_train, y_train)

# Pair the held-out ground truth with the model's predictions and report
# the fraction classified correctly.
y_expect, y_pred = y_test, MultiNB.predict(x_test)
print(accuracy_score(y_expect, y_pred))

0.8816503800217155


In [9]:
# Every feature is numeric, so a Gaussian naive Bayes model is also worth
# trying with its default settings. `fit` returns the estimator itself, so
# construction and training can be chained.
GausNB = GaussianNB().fit(x_train, y_train)

# Pair the held-out ground truth with the model's predictions and report
# the fraction classified correctly.
y_expect, y_pred = y_test, GausNB.predict(x_test)
print(accuracy_score(y_expect, y_pred))

0.8197611292073833


In [10]:
# Of the three models above, MultinomialNB gives the highest test accuracy
# (about 0.882, versus 0.858 for BernoulliNB and 0.820 for GaussianNB).

In [11]:
# Re-run Bernoulli naive Bayes with a lower binarisation threshold (0.1);
# with this setting it gives the best accuracy of all the models tried.
# NOTE(review): the threshold looks like it was chosen by comparing scores on
# the test split itself, so this figure is probably optimistic - confirm with
# a separate validation split or cross-validation.
BernNB = BernoulliNB(binarize=0.1).fit(x_train, y_train)

# Pair the held-out ground truth with the model's predictions and report
# the fraction classified correctly.
y_expect, y_pred = y_test, BernNB.predict(x_test)
print(accuracy_score(y_expect, y_pred))

0.9109663409337676
