In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
import seaborn as sns
import Columns


PERCENT = lambda x: x*100

In [2]:
'''
Load the data into a pandas dataframe
'''
col_names = Columns.names
data = pd.read_csv('spambase.data', header=None, names = list(range(1,59)))
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,49,50,51,52,53,54,55,56,57,58
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [3]:
'''
Require to remove correlated features because they are voted twice in the model and it can lead to over inflating importance.

Get highly correlated columns and drop one of them. We look at only the upper triangle of the correlation matrix so as to get rid of one feature from every highly correlated pair. In our case we drop columns with 
correlation greater than 0.95
'''
corr_matrix = data.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

data.drop(data.columns[to_drop], axis=1, inplace=True)

In [4]:
'''
Split the data to keep aside a test set which'll be used once we're done training the model.
We might not need this for our particular case as we're not tuning any hyperparameters and get an average error from k-fold cross validation itself. If we were using DecisionTrees for instance, we could've tuned the max_depth value using cross_val and tested it's accuracy on the test set.
'''

X_train, X_test = train_test_split(data, test_size=0.8, shuffle=True)

X = X_train.iloc[:, :-1]
y = X_train.iloc[:,-1]

X.head()

#The reason for the indices not being in order and not starting from 0 is the train_test_split and 
#that we shiffled the data before the split

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,48,49,50,51,52,53,54,55,56,57
1068,0.16,0.0,0.67,0.0,0.33,0.16,0.33,0.84,0.16,0.5,...,0.0,0.0,0.224,0.0,1.151,0.056,0.0,4.928,63,621
3158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1
1520,0.0,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.17,0.17,0.0,6.266,41,94
2091,0.0,0.0,0.0,0.0,0.59,0.0,0.0,0.0,0.0,1.18,...,0.0,0.0,0.215,0.0,0.107,0.0,0.0,2.741,11,85
2054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.383,0.0,0.0,1.333,3,8


In [5]:
'''
1. Create k partitions of the training set
2. Go over every partition use=ing it as test set while using the rest for training
3. Get confusion matrix, which in our case can be simple done calculating y_test-y_pred :
    -1 -> False positive (0 misclassified as 1)
     1 -> False positive (1 misclassified as 0)
'''

kf = KFold(n_splits=10, shuffle=True)
kf.get_n_splits(X)

metric_columns = ["FP (0 classified as 1)", "FN (1 classified as 0)", "Error"]
final_metrics = list()
for i, split in enumerate(kf.split(X)):
    train_index, test_index = split
    
    Xtrain, Xtest = X.iloc[train_index,], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index,], y.iloc[test_index]
    
    '''Train NB Classifier and make a prediction on the test set'''
    clf = GaussianNB()
    clf.fit(Xtrain, ytrain)
    ypred = clf.predict(Xtest)
    
    misclassified = (ytest-ypred)
    FP = np.count_nonzero(misclassified==-1)
    FN = np.count_nonzero(misclassified==1)
    
    #print("FP = {}, FN = {}".format(np.sum(FP), np.sum(FN)))
    
    tn, fp, fn, tp = confusion_matrix(ytest, ypred).ravel()
    error = 1-accuracy_score(ytest, ypred) 
    final_metrics.append((fp, fn, error))

final_metrics_df = pd.DataFrame(final_metrics, columns = metric_columns)

In [6]:
'''Get the Mean of all folds and add an accuracy column'''
final_metrics_df.loc['Mean'] = final_metrics_df.mean()
final_metrics_df['Accuracy %'] = PERCENT(1-final_metrics_df['Error'])
final_metrics_df = final_metrics_df.round(4)

final_metrics_df

Unnamed: 0,FP (0 classified as 1),FN (1 classified as 0),Error,Accuracy %
0,14.0,1.0,0.163,83.6957
1,12.0,2.0,0.1522,84.7826
2,15.0,1.0,0.1739,82.6087
3,10.0,2.0,0.1304,86.9565
4,16.0,0.0,0.1739,82.6087
5,20.0,2.0,0.2391,76.087
6,12.0,0.0,0.1304,86.9565
7,10.0,3.0,0.1413,85.8696
8,18.0,3.0,0.2283,77.1739
9,14.0,4.0,0.1957,80.4348


In [7]:
x_test = X_test.iloc[:,:-1]
y_test = X_test.iloc[:,-1]
y_pred = clf.predict(x_test)

In [8]:
print("Accuracy on test set {:2.2f}%".format(PERCENT(accuracy_score(y_test, y_pred))))

Accuracy on test set 81.64%
