In [571]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
import seaborn as sns

PERCENT = lambda x: x*100

In [555]:
'''
Load the data into a pandas dataframe
'''

col_names = ['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet', 'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will', 'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference', 'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$', 'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total', 'spam']
data = pd.read_csv('spambase.data', header=None, names = list(range(1,59)))
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,49,50,51,52,53,54,55,56,57,58
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [556]:
'''
Require to remove correlated features because they are voted twice in the model and it can 
lead to over inflating importance.

Get highly correlated columns and drop one of them. We look at only the upper triangle of the correlation matrix
so so as to get rid of one feature from every highly correlated pair. In our case we drop colums with 
correlation greater than 0.95
'''
corr_matrix = data.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

data.drop(data.columns[to_drop], axis=1, inplace=True)

In [557]:
'''
Split the data to keep aside a test set which'll be used once we're done training the model. 
We might not need this for our particular case as we're not tuning any hyperparameters and get an average error from
k-fold cross validation itself. If we were using DecisionTrees for instance, we could've tuned the max_depth value 
using cross_val and tested it's accuracy on the test set.
'''

X_train, X_test = train_test_split(data, test_size=0.8, shuffle=True)

X = X_train.iloc[:, :-1]
y = X_train.iloc[:,-1]

X.head()

#The reason for the indices not being in order and not starting from 0 is the train_test_split and 
#that we shiffled the data before the split

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,48,49,50,51,52,53,54,55,56,57
1453,0.0,0.0,0.28,0.0,0.84,0.84,0.28,0.0,0.28,0.28,...,0.0,0.0,0.05,0.0,0.05,0.0,0.0,2.083,34,150
1268,0.1,0.2,1.01,0.0,0.8,0.8,0.5,0.0,0.8,0.1,...,0.0,0.0,0.11,0.0,0.49,0.158,0.015,8.55,669,1351
4344,0.0,0.6,0.0,0.0,0.6,0.0,0.0,0.0,0.0,2.43,...,0.0,0.0,0.271,0.0,0.0,0.0,0.09,6.09,71,201
2507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.142,2,8
442,0.59,0.09,0.09,0.0,0.29,0.09,0.0,0.59,0.59,2.09,...,0.0,0.044,0.078,0.0,0.334,0.133,0.011,15.493,1171,2541


In [558]:
'''
1. Create k partitions of the training set
2. Go over every partition use=ing it as test set while using the rest for training
3. Get confusion matrix, which in our case can be simple done calculating y_test-y_pred :
    -1 -> False positive (0 misclassified as 1)
     1 -> False positive (1 misclassified as 0)
'''

kf = KFold(n_splits=10, shuffle=True)
kf.get_n_splits(X)

metric_columns = ["FP (0 classified as 1)", "FN (1 classified as 0)", "Error"]
final_metrics = list()
for i, split in enumerate(kf.split(X)):
    train_index, test_index = split
    
    Xtrain, Xtest = X.iloc[train_index,], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index,], y.iloc[test_index]
    
    '''Train NB Classifier and make a prediction on the test set'''
    clf = GaussianNB()
    clf.fit(Xtrain, ytrain)
    ypred = clf.predict(Xtest)
    
    misclassified = (ytest-ypred)
    FP = np.count_nonzero(misclassified==-1)
    FN = np.count_nonzero(misclassified==1)
    
    #print("FP = {}, FN = {}".format(np.sum(FP), np.sum(FN)))
    
    tn, fp, fn, tp = confusion_matrix(ytest, ypred).ravel()
    error = 1-accuracy_score(ytest, ypred) 
    final_metrics.append((fp, fn, error))

final_metrics_df = pd.DataFrame(final_metrics, columns = metric_columns)

In [559]:
'''Get the Mean of all folds and add an accuracy column'''
final_metrics_df.loc['Mean'] = final_metrics_df.mean()
final_metrics_df['Accuracy %'] = PERCENT(1-final_metrics_df['Error'])
final_metrics_df = final_metrics_df.round(4)

final_metrics_df

Unnamed: 0,FP (0 classified as 1),FN (1 classified as 0),Error,Accuracy %
0,16.0,3.0,0.2065,79.3478
1,10.0,3.0,0.1413,85.8696
2,9.0,0.0,0.0978,90.2174
3,12.0,1.0,0.1413,85.8696
4,7.0,3.0,0.1087,89.1304
5,13.0,1.0,0.1522,84.7826
6,10.0,1.0,0.1196,88.0435
7,12.0,3.0,0.163,83.6957
8,10.0,0.0,0.1087,89.1304
9,16.0,1.0,0.1848,81.5217


In [560]:
x_test = X_test.iloc[:,:-1]
y_test = X_test.iloc[:,-1]
y_pred = clf.predict(x_test)

In [562]:
print("Accuracy on test set {:2.2f}%".format(PERCENT(accuracy_score(y_test, y_pred))))

Accuracy on test set 82.80%
