In [84]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.naive_bayes import GaussianNB
import math
import copy

In [85]:
# Load the loan data and replace the header read from the CSV with short
# column names; 'output' is the column used as the prediction target below.
df = pd.read_csv("LoanDataset/data.csv")
df.columns = [
    'id', 'age', 'exp', 'income', 'zip', 'family', 'spend',
    'education', 'house', 'output', 'security', 'certi', 'net', 'creditcard',
]

In [86]:
# Separate the label column from the features: Y is the 'output' column,
# X is every other column.
Y = df['output']
X = df.drop(columns=['output'])

In [87]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and therefore every metric computed from it, reproducible on re-run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [88]:
# Re-attach the training labels to the training features (label becomes the
# last column). Note that this rebinds `df`: from here on it holds only the
# training rows.
df = pd.concat((X_train, Y_train), axis="columns")

In [89]:
# Rows of the training frame grouped by class label:
# {label: [row values as a list, ...]}.
distinct = {}
def seperateClass():
    """Group the rows of the module-level ``df`` by their 'output' value.

    The module-level ``distinct`` dict is rebuilt in place: each key is a
    value found in the 'output' column and each value is the list of rows
    (every row converted with ``list(row)``, so values keep ``df``'s column
    order) that carry that label.

    Returns nothing; the result is read from ``distinct``.
    """
    # Start from an empty mapping so that calling this again (for example
    # after a new train/test split) does not append a second copy of the
    # rows or mix rows from an earlier split into the groups.
    distinct.clear()
    for _, row in df.iterrows():
        label = row['output']
        distinct.setdefault(label, []).append(list(row))

In [90]:
def mean(attrColumn):
    """Return the arithmetic mean of ``attrColumn`` as a float.

    ``attrColumn`` must support both ``sum`` and ``len``; an empty input
    raises ZeroDivisionError.
    """
    total = float(sum(attrColumn))
    count = len(attrColumn)
    return total / count

In [91]:
def standard_deviation(attrColumn):
    """Return the sample standard deviation of ``attrColumn``.

    The squared deviations from ``mean(attrColumn)`` are summed and divided
    by ``len(attrColumn) - 1`` before taking the square root with
    ``np.sqrt``.
    """
    centre = mean(attrColumn)
    squared_dev_sum = 0
    for value in attrColumn:
        squared_dev_sum += (value - centre) ** 2
    variance = float(squared_dev_sum) / (len(attrColumn) - 1)
    return np.sqrt(variance)

In [92]:
def find_mean_sd_for_attr(df):
    """Compute the (mean, standard deviation) of every attribute for one class.

    Parameters
    ----------
    df : rows belonging to a single class, in any form ``pd.DataFrame``
        accepts. Each row must hold 14 values ordered as the column list
        assigned below (13 attributes followed by 'output').

    Returns
    -------
    dict
        ``{attribute name: (mean, standard deviation)}`` for every column
        except the label column 'output'.
    """
    frame = pd.DataFrame(df)
    frame.columns = [
        'id', 'age', 'exp', 'income', 'zip', 'family', 'spend', 'education',
        'house', 'security', 'certi', 'net', 'creditcard', 'output',
    ]
    # The label column is skipped up front instead of being summarised and
    # then deleted: its statistics are never used, and computing them would
    # fail for non-numeric labels.
    return {
        column: (mean(frame[column]), standard_deviation(frame[column]))
        for column in frame.columns
        if column != 'output'
    }

In [93]:
# Group the training rows by class label, then reduce each group to its
# per-attribute statistics: summary[label][attribute] == (mean, sd).
seperateClass()
summary = {}
for label in distinct:
    data = distinct[label]
    summary[label] = find_mean_sd_for_attr(data)

In [94]:
def calcProbability(x,mean,sd,min_sd=1e-9):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Parameters
    ----------
    x : number
        Point at which the density is evaluated.
    mean : number
        Mean of the distribution.
    sd : number
        Standard deviation of the distribution.
    min_sd : float, optional
        Smallest standard deviation used in the formula. Values of ``sd``
        below it are raised to ``min_sd``; values at or above it are used
        unchanged.

    Returns
    -------
    float
        ``1 / (sqrt(2*pi) * sd) * exp(-(x - mean)**2 / (2 * sd**2))``.
    """
    # A standard deviation of zero (an attribute that is constant within a
    # class) would otherwise divide by zero; flooring it yields a very
    # narrow spike around the mean instead of an exception.
    sd = max(sd, min_sd)
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(sd, 2))))
    return (1 / (math.sqrt(2 * math.pi) * sd)) * exponent

In [95]:
def probability_by_class(inputdata,summary):
    """Return, per class label, the product of the attribute densities.

    ``summary`` maps each label to ``{attribute: (mean, sd)}``. For every
    label, the value of ``inputdata[attribute]`` is passed to
    ``calcProbability`` together with that attribute's mean and sd, and the
    results are multiplied together starting from 1.

    Returns a dict ``{label: product}``.
    """
    probabilities = {}
    for label in summary:
        likelihood = 1
        for key, (mu, sigma) in summary[label].items():
            likelihood *= calcProbability(inputdata[key], mu, sigma)
        probabilities[label] = likelihood
    return probabilities

In [96]:
def prediction(inputdata,summary,priors=None):
    """Return the class label with the highest score for ``inputdata``.

    Parameters
    ----------
    inputdata : mapping-like record indexable by attribute name.
    summary : dict
        ``{label: {attribute: (mean, sd)}}`` as consumed by
        ``probability_by_class``.
    priors : mapping, optional
        ``{label: prior probability}`` covering every label in ``summary``.
        When given, each class score is the value from
        ``probability_by_class`` multiplied by that class's prior. When
        omitted, the scores are used as they are, which is equivalent to
        assuming all classes are equally likely.

    Returns
    -------
    The winning label, or ``""`` when ``summary`` contains no classes. If
    several labels share the highest score, the one iterated last wins.
    """
    probabilities = probability_by_class(inputdata,summary)
    anslabel = ""
    # None marks "nothing chosen yet"; comparing the chosen label against ""
    # would misfire if "" were itself a class label.
    maxprob = None
    for label, probability in probabilities.items():
        if priors is not None:
            probability = probability * priors[label]
        if maxprob is None or probability >= maxprob:
            maxprob = probability
            anslabel = label
    return anslabel

In [97]:
# Classify every held-out row with the hand-written model, then print the
# confusion matrix, the per-class report and the accuracy as a percentage.
res = [prediction(row, summary) for _, row in X_test.iterrows()]

print(confusion_matrix(Y_test, res))
print(classification_report(Y_test, res))
print(accuracy_score(Y_test, res) * 100)

[[705 107]
 [ 16  72]]
              precision    recall  f1-score   support

           0       0.98      0.87      0.92       812
           1       0.40      0.82      0.54        88

   micro avg       0.86      0.86      0.86       900
   macro avg       0.69      0.84      0.73       900
weighted avg       0.92      0.86      0.88       900

86.33333333333333


In [98]:
# Reference run: scikit-learn's GaussianNB fitted and scored on the same
# train/test split, reported with the same three metrics.
gnb = GaussianNB()
y_pred = gnb.fit(X_train, Y_train).predict(X_test)
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))
print(accuracy_score(Y_test, y_pred) * 100)

[[731  81]
 [ 33  55]]
              precision    recall  f1-score   support

           0       0.96      0.90      0.93       812
           1       0.40      0.62      0.49        88

   micro avg       0.87      0.87      0.87       900
   macro avg       0.68      0.76      0.71       900
weighted avg       0.90      0.87      0.88       900

87.33333333333333
