In [61]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.naive_bayes import GaussianNB
import math
import copy

In [62]:
# Load the raw loan dataset from disk.
# NOTE(review): read_csv treats the first line as a header by default — confirm the CSV has one.
df=pd.read_csv("LoanDataset/data.csv")
# Replace the header with short names; 'output' is the class label used in later cells.
df.columns = ['id','age','exp','income','zip','family','spend','education','house','output','security','certi','net','creditcard']

In [63]:
# Features: every column except the label and the 'zip' / 'id' columns.
X = df.drop(['output','zip','id'],axis=1)
# Target: the 'output' column.
Y = df['output']

In [64]:
# Hold out 20% of the rows for testing. No random_state is passed, so the split
# (and every metric printed below) changes from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)

In [65]:
# Rebind df to the training rows only (features plus label), so the statistics
# computed in the following cells are derived from the training split.
df = pd.concat([X_train,Y_train],axis=1)

In [66]:
# Columns summarised per class by (mean, standard deviation); 'output' is
# included so the rows can be grouped by class.
numeric = ['age','education','exp','income','family','spend','house','output']
# Columns summarised per class by value frequencies; 'output' is included for
# the same reason.
categorical = ['security','certi','net','creditcard','output']
# Training-set views restricted to each column group.
numericdata = df[numeric]
catdata = df[categorical]

In [67]:
# Module-level store: class label -> list of rows (each row a list of values).
# It is filled by seperateClass() and read by later cells.
distinct = {}
def seperateClass(numericdata):
    """Group the rows of ``numericdata`` by their 'output' value.

    The result is written into the module-level ``distinct`` dict, mapping
    each class label to the list of rows (as plain lists, in column order)
    that carry that label.

    The dict is emptied in place before it is refilled, so calling this
    function again (for example when a notebook cell is re-run) replaces the
    previous grouping instead of appending duplicate rows to it.

    Args:
        numericdata: DataFrame that contains an 'output' column.

    Returns:
        None. The grouping is available through ``distinct``.
    """
    # Clear in place (not rebind) so existing references to `distinct` stay valid.
    distinct.clear()
    for _, row in numericdata.iterrows():
        distinct.setdefault(row['output'], []).append(list(row))

In [68]:
def mean(attrColumn):
    """Return the arithmetic mean of the values in ``attrColumn`` as a float.

    ``attrColumn`` must support iteration and ``len()``. An empty input
    raises ZeroDivisionError.
    """
    total = sum(attrColumn)
    count = len(attrColumn)
    return float(total) / count

In [69]:
def standard_deviation(attrColumn):
    """Return the sample standard deviation of ``attrColumn``.

    The squared deviations from the mean are summed and divided by
    ``len(attrColumn) - 1``; a single-element input therefore raises
    ZeroDivisionError.
    """
    centre = mean(attrColumn)
    squared_error = 0
    for value in attrColumn:
        squared_error = squared_error + (value - centre) ** 2
    sample_variance = float(squared_error) / (len(attrColumn) - 1)
    return np.sqrt(sample_variance)

In [70]:
def find_mean_sd_for_attr(numericdata):
    """Summarise every numeric attribute of one class.

    Args:
        numericdata: rows belonging to a single class; it is wrapped in a
            DataFrame whose columns are named after the global ``numeric``
            list.

    Returns:
        dict mapping each column name except 'output' to a
        ``(mean, standard deviation)`` tuple.
    """
    frame = pd.DataFrame(numericdata)
    frame.columns = numeric
    summaries = {
        column: (mean(frame[column]), standard_deviation(frame[column]))
        for column in frame.columns
    }
    # The class label itself is not a feature.
    del summaries['output']
    return summaries

In [71]:
# Group the numeric training rows by class; this fills the global `distinct`.
seperateClass(numericdata)
# print distinct
# summary[label][attribute] -> (mean, standard deviation) of that attribute
# among the training rows of that class.
summary = {}
for label,data in distinct.items():
    summary[label] = find_mean_sd_for_attr(data)
    
# print summary

In [72]:
# One placeholder entry per class label found in the training data; each
# placeholder is overwritten with the result of find_summary_category() in a
# later cell.
summarycat = {}
for j in catdata['output'].unique():
            summarycat[j] = []
def find_summary_category(catdata,o):
    """Estimate P(value | class ``o``) for every categorical column.

    For each column named in the global ``categorical`` list (except
    'output'), and for each distinct value that column takes anywhere in
    ``catdata``, the result holds the fraction of class-``o`` rows that have
    that value. Values that never occur within the class get 0.0.

    Args:
        catdata: DataFrame holding the categorical columns and 'output'.
        o: class label to condition on.

    Returns:
        dict ``{column: {value: relative frequency within class o}}``.

    Raises:
        ZeroDivisionError: if no row of ``catdata`` has ``output == o``.
    """
    # Build the class mask and its size once; they do not depend on the column.
    in_class = catdata['output'] == o
    class_size = int(in_class.sum())

    frequencies = {}
    for column in categorical:
        if column == 'output':
            continue
        # Select with one mask that shares catdata's index, rather than
        # chaining two masks and relying on index realignment.
        counts = catdata.loc[in_class, column].value_counts()
        frequencies[column] = {
            value: int(counts.get(value, 0)) / class_size
            for value in catdata[column].unique()
        }
    return frequencies

In [73]:
# Fill summarycat[label][column][value] with the relative frequency of `value`
# in `column` among the training rows of class `label`.
for i in catdata['output'].unique():
    summarycat[i]=find_summary_category(catdata,i)
# summarycat

In [74]:
def calcProbability(x,mean,sd):
    """Return the Gaussian probability density at ``x``.

    Args:
        x: point at which the density is evaluated.
        mean: mean of the normal distribution.
        sd: standard deviation of the normal distribution.

    Raises:
        ZeroDivisionError: if ``sd`` is 0.
    """
    squared_offset = math.pow(x-mean,2)
    variance = math.pow(sd,2)
    exponent = math.exp(-(squared_offset/(2*variance)))
    normaliser = 1 / (math.sqrt(2*math.pi) * sd)
    return normaliser * exponent

In [75]:
def probability_by_class(inputdata,summary,summarycat):
    """Score one sample against every class (unnormalised Naive Bayes).

    Each class starts from its prior, taken as its share of the rows in the
    global ``catdata``. That prior is multiplied by the Gaussian density of
    every attribute listed in ``summary`` and by the stored frequency of
    every attribute listed in ``summarycat``.

    Args:
        inputdata: mapping from attribute name to the sample's value.
        summary: ``{label: {attribute: (mean, sd)}}``.
        summarycat: ``{label: {attribute: {value: frequency}}}``.

    Returns:
        dict mapping each class label to its unnormalised score, with keys
        in the order the labels first appear in ``catdata['output']``.
    """
    labels = catdata['output']
    row_count = len(catdata)

    # Class priors, inserted in order of first appearance.
    probabilities = {
        cls: len(labels[labels == cls]) / row_count
        for cls in labels.unique()
    }

    # Numeric attributes: multiply in the Gaussian density for each one.
    for label, gaussian_stats in summary.items():
        for attribute, (mu, sigma) in gaussian_stats.items():
            probabilities[label] *= calcProbability(inputdata[attribute], mu, sigma)

    # Categorical attributes: multiply in the stored within-class frequency.
    for label, frequency_tables in summarycat.items():
        for attribute in frequency_tables:
            observed = inputdata[attribute]
            probabilities[label] *= summarycat[label][attribute][observed]

    return probabilities

In [76]:
def prediction(inputdata,summary,summarycat):
    """Return the class label with the highest score for ``inputdata``.

    Scores come from ``probability_by_class``. When several labels tie for
    the highest score, the one visited last wins. An empty score table
    yields the empty string.
    """
    scores = probability_by_class(inputdata,summary,summarycat)
    best_label = ""
    best_score = 0
    for candidate, score in scores.items():
        nothing_chosen_yet = best_label == ""
        if nothing_chosen_yet or score >= best_score:
            best_label, best_score = candidate, score
    return best_label

In [77]:
# Predicted label for each test row, in X_test order.
res = []

for index,row in X_test.iterrows():
    res.append(prediction(row,summary,summarycat))

# Compare the hand-written classifier's predictions with the held-out labels.
print(confusion_matrix(Y_test,res))
print(classification_report(Y_test,res))
# Accuracy as a percentage.
print(accuracy_score(Y_test, res)*100)

[[747  52]
 [ 35  66]]
              precision    recall  f1-score   support

           0       0.96      0.93      0.94       799
           1       0.56      0.65      0.60       101

   micro avg       0.90      0.90      0.90       900
   macro avg       0.76      0.79      0.77       900
weighted avg       0.91      0.90      0.91       900

90.33333333333333


In [78]:
# Baseline for comparison: scikit-learn's GaussianNB fitted on every column of
# X_train, including the columns the hand-written model treats as categorical.
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
y_pred = gnb.predict(X_test)
print(confusion_matrix(Y_test,y_pred))
print(classification_report(Y_test,y_pred))
# Accuracy as a percentage.
print(accuracy_score(Y_test, y_pred)*100)

[[731  68]
 [ 42  59]]
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       799
           1       0.46      0.58      0.52       101

   micro avg       0.88      0.88      0.88       900
   macro avg       0.71      0.75      0.72       900
weighted avg       0.89      0.88      0.88       900

87.77777777777777
