In [2]:
# Antonio Emanuele Cinà
# Simple spam filter using Naive Bayes classifier
import numpy as np
from sklearn.model_selection import cross_val_score

TRAINING_SET = "spambase/spambase.data"

def load_csv(filename):
    fread = open(filename, "r")
    data = np.loadtxt(fread, delimiter=",")
    return data

email = load_csv(TRAINING_SET)

In [1]:
ls

 Volume in drive C is Acer
 Volume Serial Number is C09D-00FF

 Directory of C:\Users\vishr\Downloads\spam-filter-master\spam-filter-master

19-04-2020  11:18 AM    <DIR>          .
19-04-2020  11:18 AM    <DIR>          ..
05-03-2019  08:24 PM             2,359 .gitignore
19-04-2020  11:18 AM    <DIR>          .ipynb_checkpoints
05-03-2019  08:24 PM             4,322 analysis.R
05-03-2019  08:24 PM             1,079 LICENSE
05-03-2019  08:24 PM               869 README.md
05-03-2019  08:24 PM         3,035,900 Report.pdf
05-03-2019  08:24 PM    <DIR>          spambase
19-04-2020  11:18 AM             8,088 SpamFilter_Bayes.ipynb
05-03-2019  08:24 PM            21,112 SpamFilter_SVM.ipynb
               7 File(s)      3,073,729 bytes
               4 Dir(s)  18,125,946,880 bytes free


In [24]:
from sklearn.base import BaseEstimator
class NaiveBayesClassifier(BaseEstimator):

    def score(self, X, Y):
        p_x_spam_i = (2*np.pi*self.var_spam)**(-1./2) * np.exp(-1./(2*self.var_spam)*np.power(X-self.mu_spam,2))
        p_x_ham_i = (2*np.pi*self.var_ham)**(-1./2) * np.exp(-1./(2*self.var_ham)*np.power(X-self.mu_ham,2))
        
        p_x_spam = np.prod(p_x_spam_i, axis= 1)
        p_x_ham = np.prod(p_x_ham_i, axis= 1)
        
        #p_x = (p_x_spam * self.p_spam + p_x_ham * self.p_ham)# + 1e-130
        p_spam_x = p_x_spam * self.p_spam#/p_x
        p_ham_x = p_x_ham * self.p_ham#/p_x
                           
        predicted_labels = np.argmax([p_ham_x,p_spam_x], axis = 0)
        return np.mean(predicted_labels == Y)

    def fit(self, X, Y, **kwargs):
        self.spam = X[Y == 1,:54]
        self.ham = X[Y == 0,:54]
        
        self.N = float(self.spam.shape[0] + self.ham.shape[0])
        self.k_spam = self.spam.shape[0] # frequency of spam
        self.k_ham = self.ham.shape[0] # frequency of ham

        self.p_spam = self.k_spam/self.N
        self.p_ham = self.k_ham/self.N
        
        self.mu_spam = np.mean(self.spam, axis=0)
        self.mu_ham = np.mean(self.ham, axis=0)
        
        # Avoid division by zero adding a small costant
        self.var_spam = np.var(self.spam, axis=0)+1e-128
        self.var_ham = np.var(self.ham, axis=0)+1e-128

In [25]:
np.random.shuffle(email)# shuffle dataset
Y = email[:,57] 
X = email[:,:54]

In [26]:
scores = cross_val_score(NaiveBayesClassifier(), X, Y, cv = 10)

In [27]:
print("Min Accuracy: " + str(scores.min())+"\n")
print("Mean Accuracy: " + str(scores.mean())+"\n")
print("Max Accuracy: " + str(scores.max())+"\n")
print("Variance/Std Accuracy: " + str(scores.var()) +" / " +str(scores.std())+"\n")

print("=================================")

Min Accuracy: 0.717391304347826

Mean Accuracy: 0.8011303404696785

Max Accuracy: 0.8521739130434782

Variance/Std Accuracy: 0.0010417468231695 / 0.03227610297370952



In [28]:
# Apply 10-Way Cross validation 'run' times and get all the scores 
def eval_model(data, classifier, run = 10):
    scores = np.array([])
    for i in range(run):
        np.random.shuffle(data)
        Y = email[:,57] 
        X = email[:,:54]
        scores = np.append(scores,cross_val_score(classifier, X, Y, cv = 10))
    return scores

In [29]:
scores_run = eval_model(email, NaiveBayesClassifier(),run = 20)
print("Min Accuracy: " + str(scores_run.min())+"\n")
print("Mean Accuracy: " + str(scores_run.mean())+"\n")
print("Max Accuracy: " + str(scores_run.max())+"\n")
print("Variance/Std Accuracy: " + str(scores_run.var()) +" / " +str(scores_run.std())+"\n")
print("=================================")

Min Accuracy: 0.7065217391304348

Mean Accuracy: 0.8041172073941337

Max Accuracy: 0.8586956521739131

Variance/Std Accuracy: 0.0009428050945681482 / 0.03070513140450873



In [30]:
from sklearn.model_selection import train_test_split
#from nltk import NaiveBayesClassifier
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

clf = NaiveBayesClassifier()
md = clf.fit(x_train,y_train)

print("Accuracy: "+str(clf.score(x_test, y_test)))

Accuracy: 0.8131788559015206


In [31]:
print("p(spam): "+str(clf.p_spam))
print("p(ham): "+str(clf.p_ham))

p(spam): 0.39285714285714285
p(ham): 0.6071428571428571


In [32]:
print("mu spam: "+str(np.round(clf.mu_spam,3)))
print("var spam: "+str(np.round(clf.var_spam,3)))

mu spam: [1.500e-01 1.790e-01 4.090e-01 1.490e-01 5.170e-01 1.720e-01 2.770e-01
 2.180e-01 1.720e-01 3.570e-01 1.180e-01 5.640e-01 1.500e-01 8.000e-02
 1.190e-01 4.630e-01 2.960e-01 3.260e-01 2.273e+00 1.980e-01 1.401e+00
 2.850e-01 2.420e-01 2.010e-01 1.600e-02 9.000e-03 2.000e-03 1.500e-02
 0.000e+00 2.000e-03 1.000e-03 1.000e-03 1.600e-02 2.000e-03 8.000e-03
 3.000e-02 4.700e-02 5.000e-03 1.100e-02 3.500e-02 0.000e+00 2.000e-03
 1.000e-02 7.000e-03 1.250e-01 1.500e-02 1.000e-03 2.000e-03 2.200e-02
 1.110e-01 9.000e-03 5.200e-01 1.770e-01 8.300e-02]
var spam: [9.700e-02 1.410e-01 2.270e-01 4.443e+00 4.780e-01 9.900e-02 3.130e-01
 3.470e-01 1.280e-01 4.180e-01 5.900e-02 4.130e-01 1.380e-01 8.300e-02
 1.550e-01 7.150e-01 4.130e-01 4.800e-01 2.363e+00 4.520e-01 1.534e+00
 2.615e+00 2.510e-01 2.510e-01 2.700e-02 9.000e-03 2.000e-03 6.500e-02
 0.000e+00 1.000e-03 2.000e-03 0.000e+00 1.500e-02 2.000e-03 5.000e-03
 2.200e-02 9.000e-02 3.000e-03 6.000e-03 1.900e-02 0.000e+00 1.000e-03
 3.000

In [33]:
print("mu spam: "+str(np.round(clf.mu_ham,3)))
print("var spam: "+str(np.round(clf.var_ham,3)))

mu spam: [7.100e-02 2.360e-01 2.040e-01 1.000e-03 1.870e-01 4.600e-02 1.200e-02
 4.300e-02 3.800e-02 1.640e-01 2.400e-02 5.270e-01 6.800e-02 4.000e-02
 6.000e-03 6.400e-02 4.900e-02 8.900e-02 1.284e+00 1.000e-02 4.450e-01
 3.400e-02 8.000e-03 1.900e-02 8.900e-01 4.130e-01 1.235e+00 1.970e-01
 1.710e-01 1.660e-01 1.060e-01 7.200e-02 1.440e-01 7.200e-02 1.790e-01
 1.340e-01 2.010e-01 1.500e-02 1.260e-01 7.900e-02 7.500e-02 2.310e-01
 6.800e-02 1.130e-01 4.340e-01 2.970e-01 6.000e-03 4.800e-02 4.300e-02
 1.570e-01 2.500e-02 1.290e-01 1.200e-02 2.600e-02]
var spam: [8.200e-02 2.600e+00 2.620e-01 0.000e+00 4.060e-01 5.000e-02 1.700e-02
 6.200e-02 4.000e-02 3.200e-01 2.700e-02 9.580e-01 7.600e-02 8.300e-02
 5.000e-03 1.100e-01 4.900e-02 1.220e-01 3.145e+00 1.300e-02 1.066e+00
 2.820e-01 5.000e-03 1.010e-01 4.216e+00 1.050e+00 1.790e+01 4.140e-01
 6.130e-01 3.190e-01 2.760e-01 1.490e-01 4.900e-01 1.490e-01 5.250e-01
 2.120e-01 2.470e-01 6.400e-02 2.940e-01 1.650e-01 2.370e-01 1.070e+00
 7.000