# Python Implementation of Naive Bayes Classification

In [116]:
from sklearn.metrics import confusion_matrix 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
import time
    
dataset = pd.read_csv("adultData.csv")
    
#Add column names into pandas dataframe
dataset.columns = ["AGE","WORKCLASS","FNLWGT","EDUCATION","EDUCATION-NUM","MARITAL-STATUS","OCCUPATION","RELATIONSHIP","RACE","SEX","CAPITAL-GAIN","CAPITAL-LOSS","HOURS-PER-WEEK","NATIVE-COUNTRY","CLASS"]
dataset.AGE = dataset.AGE.astype(float)
dataset = dataset.dropna(axis = 0, how ='any')
#Drop unneccesary columns for the test data
elements_to_drop = ["EDUCATION","NATIVE-COUNTRY","FNLWGT","RELATIONSHIP"]
#Encode categorical data
dataset= dataset.apply(LabelEncoder().fit_transform)
#Split age into categories
dataset.loc[(dataset.AGE >0) & (dataset.AGE <=25), 'AGE'] = 0   
dataset.loc[(dataset.AGE >25) & (dataset.AGE <=40), 'AGE'] = 1   
dataset.loc[(dataset.AGE >40) & (dataset.AGE <=60), 'AGE'] = 2
dataset.loc[(dataset.AGE >60) & (dataset.AGE <=80), 'AGE'] = 3
dataset.loc[(dataset.AGE >80),'AGE'] = 4  
y = dataset["CLASS"]
x=dataset.drop(elements_to_drop,axis=1)
 #Split Testing and Training data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.4,random_state=2) 
dataset.head()

Unnamed: 0,AGE,WORKCLASS,FNLWGT,EDUCATION,EDUCATION-NUM,MARITAL-STATUS,OCCUPATION,RELATIONSHIP,RACE,SEX,CAPITAL-GAIN,CAPITAL-LOSS,HOURS-PER-WEEK,NATIVE-COUNTRY,CLASS
0,1,6,2925,9,12,2,4,0,4,1,0,0,12,39,0
1,0,4,14085,11,8,0,6,1,4,1,0,0,39,39,0
2,1,4,15335,1,6,2,6,0,2,1,0,0,39,39,0
3,0,4,19354,9,12,2,10,5,2,0,0,0,39,5,0
4,0,4,17699,12,13,2,4,5,4,0,0,0,39,39,0


In [103]:
#Actual data classes from test data
Actual= x_test['CLASS']

In [104]:
#Split into class 0 and class 1
class1 = x_train[x_train['CLASS']==0]
num_class1=class1.shape[0]
class2 = x_train[x_train['CLASS']==1]
num_class2=class2.shape[0]

## Use dummy data from class 1

In [105]:
rec=dataset.iloc[9]
rec

AGE                   0
WORKCLASS             4
FNLWGT            17529
EDUCATION            15
EDUCATION-NUM         9
MARITAL-STATUS        2
OCCUPATION            4
RELATIONSHIP          0
RACE                  2
SEX                   1
CAPITAL-GAIN          0
CAPITAL-LOSS          0
HOURS-PER-WEEK       76
NATIVE-COUNTRY       39
CLASS                 1
Name: 9, dtype: int64

## Use dummy data from class 0


In [106]:
rec2 = dataset.iloc[1]
rec2

AGE                   0
WORKCLASS             4
FNLWGT            14085
EDUCATION            11
EDUCATION-NUM         8
MARITAL-STATUS        0
OCCUPATION            6
RELATIONSHIP          1
RACE                  4
SEX                   1
CAPITAL-GAIN          0
CAPITAL-LOSS          0
HOURS-PER-WEEK       39
NATIVE-COUNTRY       39
CLASS                 0
Name: 1, dtype: int64

In [107]:
def NaiveBayes(data1, data2,record):
    num_class1=data1.shape[0]
    num_class2=data2.shape[0]
    #Get priors from dataset
    prior_class1 =num_class1/(num_class1+num_class2)
    prior_class2 =num_class2/(num_class1+num_class2)
    columns=data2.columns
    prob_temp1=1
    prob_temp2=1
    for col in columns:
        #Get probabillities of each attribute
        prob_attr1=(data1[col].value_counts())/num_class1
        prob_attr2=(data2[col].value_counts())/num_class2
        try:
            prob_class1=prob_attr1[record[col]]
            prob_class2=prob_attr2[record[col]]
        except KeyError:
            prob_class1=1
            prob_class2=1
        #Multiply probabillities of each attribute    
        prob_temp1=prob_temp1*prob_class1
        prob_temp2=prob_temp2*prob_class2
    
    #Probability of data X under any Y (normalisation)    
    deno=(prob_temp1*prior_class1)+(prob_temp2*prior_class2) 
    prob_c1=(prob_temp1*prior_class1)/deno
    prob_c2=(prob_temp2*prior_class2)/deno
    
    #Predictions of classes based on highest probabilities
    if (prob_c1>prob_c2):
        prediction=0
    else:
        prediction=1
        
    return prediction

## Test if Naive Bayes classifies dummy data correctly

In [108]:
NaiveBayes(class1,class2,rec)


1

In [109]:
NaiveBayes(class1,class2,rec2)


0

### As seen above the algorithm classifies dummy data correctly

Now let us test the algorithm with our unbiased test and training data

In [118]:
Predicted=[]
x_test.index = range(len(x_test))
start_time = time.time()
for i in range(len(x_test)):
    Predicted.append(NaiveBayes(class1,class2,x_test.iloc[i,:]))
result=confusion_matrix(Actual,Predicted)
elapsed_time = time.time() - start_time
print("Confusion Matrix:")
print("")
print(result)
print("Total execution time: ", elapsed_time)

Confusion Matrix:

[[8854 1025]
 [1222 1923]]
Total execution time:  260.64484572410583


In [111]:
Accuracy = (result[0][0]+result[1][1])/(result[0][0]+result[1][1]+result[0][1]+result[1][0]) * 100
Accuracy

82.74723587223588