In [30]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt


In [31]:
'''
Read the data as panda dataframe named df

Setting random seed to avoid unwanted shuffle in dict and python

Training and Test data is divide in the 8:2 ratio

Setting target label  is 'Label' which is the last col in the  table


'''
df = pd.read_csv("data.csv")
df.columns
np.random.seed(0)
msk = np.random.rand(len(df)) < 0.8
X_train = df[msk]
y_train = X_train['Label']

X_test = df[~msk]
y_test = X_test['Label']
X_test = X_test.drop(['Label'],axis=1)
N, M = df.shape

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12960 entries, 0 to 12959
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   parents   12960 non-null  object
 1   has_nurs  12960 non-null  object
 2   form      12960 non-null  object
 3   children  12960 non-null  object
 4   housing   12960 non-null  object
 5   finance   12960 non-null  object
 6   social    12960 non-null  object
 7   health    12960 non-null  object
 8   Label     12960 non-null  object
dtypes: object(9)
memory usage: 911.4+ KB


In [33]:
X_train.Label.unique()

array(['recommend', 'priority', 'not_recom', 'very_recom', 'spec_prior'],
      dtype=object)

In [34]:
'''
Likelihood_Dict and Priors as global dictionary which would be updated by training and used while predicting
'''
Likelihood_Dict= {
} 
Priors={
   
}
Dict_Labels = dict()

# ---------------------------------------Training Naive Bayes--------------------------------------------    

    
def NaiveBayesTrain(train,targetCol):
    likelihood_dict= dict() 
    priors=dict()
    data = np.array(train)
    r,c= train.shape
    classes = list(train[targetCol].unique())
    columns = list(train.columns)
    
    
# Calculate prior probablity

    prior = train.groupby(targetCol)[targetCol].agg(['count'])
    priors =  ((prior).T.to_dict())  
    
    for key in priors.keys():
        val = priors.get(key)
        for elm in val:
            val = round((val.get(elm)/r),2)
            priors.update({key:val})
            
# Helper function    

    def divide(dic, count):
        n_dic = dict()
        for key in dic.keys():
            val = round(dic.get(key)/count,2)
            n_dic.update({key:val})
        return n_dic
    
# Calculate likelihood values

    def cal_likelihood(data,cls):
        n,m = data.shape
        
        for col in columns:
            dic = dict()
            att_val = (data[col].unique())
            val = (data.groupby(col)[col].count()).to_dict()
            val =divide(val,n) 
            dic.update({col:val})

            if cls in likelihood_dict.keys():
                val = likelihood_dict.get(cls)
                val.update(dic)
            else:    
                likelihood_dict.update({cls:dic})
                
#  Update likeliood dictionary

    for cls in classes:
        data_ = train.loc[train[targetCol]== cls]
        cal_likelihood(data_,cls)

    return likelihood_dict,priors


In [35]:
'''
This is a predicting block. The likelihood and prior returned by the training function would be used for predicting

'''
# Helper function to sort the prior as per the likelihood dictionary. Would be used to map columns to list in further section


def sorted_prior():
    rlist = list()
    for key in Likelihood_Dict.keys():
        if key in Priors.keys():
            rlist.append(Priors.get(key))        
    return (rlist)        

# Helper function that return the feature vs label matrix, column stacked over each other

def feature_post(data,column):
    
    result =np.zeros(shape=(len(data),))
    
    for key in Likelihood_Dict.keys():
        val = Likelihood_Dict.get(key)
        col = val.get(column)
        res =[]
        
        for dat in data:
            if dat in col.keys():
                res.append(col.get(dat))
            else:
                res.append('0.001')
                
        res = np.array(res,dtype = float)
        result =np.column_stack((result,res))
        
    return (result[:,1:]) 

# Predict function

def NaiveBayesPredict(test,target):
    
    X_test = np.array(test)
    prediction = np.ones(shape=(len(X_test),len(Priors)))
    columns = list(test.columns)
    labels = list(Likelihood_Dict.keys())
    dict_labels= dict()

    prior = list(Priors.values())
    rlist = sorted_prior()
    
    for i in range(len(columns)):
        ff = X_test[:,i]
        pred = feature_post(ff,columns[i])
        prediction  = prediction*pred
        

    prediction = prediction*rlist
    predict = np.argmax(prediction,axis=1)
    
    for i in range(len(labels)):
        dict_labels.update({i:labels[i]})

    predict =  (predict.reshape(-1,1))
    predict = pd.DataFrame(predict)
    predict = np.array(predict.replace(dict_labels))

    predict =  (predict.T[0])

    differ = np.sum(target == predict)

    print ('Accuracy:',round((differ/len(test))*100,2),'%')

In [36]:
Likelihood_Dict, Priors = NaiveBayesTrain(X_train,"Label")
print ("Priors:",Priors)
display(pd.DataFrame(Likelihood_Dict))

NaiveBayesPredict(X_test,y_test)


Priors: {'not_recom': 0.33, 'priority': 0.33, 'recommend': 0.0, 'spec_prior': 0.31, 'very_recom': 0.02}


Unnamed: 0,recommend,priority,not_recom,very_recom,spec_prior
parents,{'usual': 1.0},"{'great_pret': 0.2, 'pretentious': 0.36, 'usua...","{'great_pret': 0.33, 'pretentious': 0.34, 'usu...","{'pretentious': 0.45, 'usual': 0.55}","{'great_pret': 0.5, 'pretentious': 0.31, 'usua..."
has_nurs,{'proper': 1.0},"{'critical': 0.11, 'improper': 0.21, 'less_pro...","{'critical': 0.2, 'improper': 0.21, 'less_prop...","{'improper': 0.19, 'less_proper': 0.4, 'proper...","{'critical': 0.3, 'improper': 0.19, 'less_prop..."
form,{'complete': 1.0},"{'complete': 0.27, 'completed': 0.25, 'foster'...","{'complete': 0.25, 'completed': 0.24, 'foster'...","{'complete': 0.38, 'completed': 0.3, 'foster':...","{'complete': 0.22, 'completed': 0.24, 'foster'..."
children,{'1': 1.0},"{'1': 0.28, '2': 0.26, '3': 0.23, 'more': 0.23}","{'1': 0.25, '2': 0.25, '3': 0.25, 'more': 0.25}","{'1': 0.46, '2': 0.31, '3': 0.13, 'more': 0.11}","{'1': 0.21, '2': 0.24, '3': 0.28, 'more': 0.28}"
housing,{'convenient': 1.0},"{'convenient': 0.38, 'critical': 0.29, 'less_c...","{'convenient': 0.33, 'critical': 0.33, 'less_c...","{'convenient': 0.64, 'critical': 0.06, 'less_c...","{'convenient': 0.26, 'critical': 0.41, 'less_c..."
finance,{'convenient': 1.0},"{'convenient': 0.52, 'inconv': 0.48}","{'convenient': 0.5, 'inconv': 0.5}","{'convenient': 0.68, 'inconv': 0.32}","{'convenient': 0.46, 'inconv': 0.54}"
social,"{'nonprob': 0.5, 'slightly_prob': 0.5}","{'nonprob': 0.35, 'problematic': 0.29, 'slight...","{'nonprob': 0.33, 'problematic': 0.33, 'slight...","{'nonprob': 0.5, 'slightly_prob': 0.5}","{'nonprob': 0.3, 'problematic': 0.41, 'slightl..."
health,{'recommended': 1.0},"{'priority': 0.44, 'recommended': 0.56}",{'not_recom': 1.0},{'recommended': 1.0},"{'priority': 0.61, 'recommended': 0.39}"
Label,{'recommend': 1.0},{'priority': 1.0},{'not_recom': 1.0},{'very_recom': 1.0},{'spec_prior': 1.0}


Accuracy: 90.47 %
