## Naive Bayes Classifier from scratch

This notebook contains 5 functions that build the Naive Bayes Classifier from scratch that is without the use of Sklearn Library.

- load_data() - loads the student.csv dataset into a more usable format
- split_data() - splits the dataset using hold-out strategy
- train() - trains the model
- predict() - predicts the model
- evaluate() - calculates the accuracy without the use of metrics
- Last cell calculates precision, recall and f-1 scores without the use of metrics

In [2]:
# This function should open a data file in csv, and transform it into a usable format

import pandas as pd
import numpy as np

def load_data(file):
    dataset = pd.read_csv(file)
    return dataset 

dataset = load_data('student.csv')
# storing all grades(class labels) separately
grades = dataset['Grade'].unique()


In [3]:
# This function should split a data set into a training set and hold-out test set

from sklearn.model_selection import train_test_split

def split_data(dataset):
    # splitting dataset into training and test data of 70% and 30% respectively
    train_data, test_data = train_test_split(dataset, test_size = 0.3)    
    return train_data, test_data

trains, test = split_data(dataset)

In [56]:
# This function should build a supervised NB model

def train(grades,trains):
# calculating PRIORS
    
    prior_count={}
    prior_prob={}
    # prior_count consists of occurences of class labels
    for grade in grades:
        prior_count[grade]=np.sum(trains['Grade']==grade)
    # prior_prob consists of probabilities of class labels
    for value in prior_count:
        prior_prob[value] = round(prior_count[value]/len(trains),2)

# calculating LIKELIHOODS
    attribute_count = {}
    attribute_len = {}
    for attribute in  trains:
        attribute_count[attribute] = trains[attribute].unique()
    for i in attribute_count:
        attribute_len[i] = len(attribute_count[i])
    #print(attribute_len)
    
    # conditional count consists of occurences of attribute values given class 
    conditional_count={}
    #iterating through every attribute(column)
    for attribute in trains:
    #get unique values in column
        unique_values = trains[attribute].unique()
        value_dict={}
    #iterating through every attribute value
        for value in unique_values:
            grade_dict={}
     #iterating through all class labels 
            for grade in grades:
    #counting the value associated with that grade and summing it up
                grade_dict[grade]=((trains[attribute]==value) & (trains['Grade']==grade)).sum() 
            value_dict[value]=grade_dict
        conditional_count[attribute]=value_dict

    #conditional_count now consists of probabilities of each attribute value given class
    for attr in conditional_count:
        for val in conditional_count[attr]:
            for grade in conditional_count[attr][val]:
     # dividing each occurence of attribute value with total number of occurences of every class label
                conditional_count[attr][val][grade] = round(((conditional_count[attr][val][grade]+1)/(prior_count[grade]+(attribute_len[attr]*1))),3)
       
     
    return prior_prob,conditional_count

prior,conditional =train(grades,trains)




In [58]:
# This function should predict the class for an instance or a set of instances, based on a trained model 

def predict(test,prior,conditional):
    
    # predicted_grades consists of preidcted class labels
    predicted_grades=[]
    # actual_grades consists of actual class labels
    actual_grades=[]
    
    # iterating through every instance row by row in test data
    for index,row in test.iterrows():
    # setting max probability as -1 such that it's always true
        maxscore=-1
        maxgrade=''
        scores={}
    # iterating through every class label
        for grade in grades:
    # stored probability of prior
            score=prior[grade]
    # iterating through every attribute value
            for attr in test:
    # calculating using bayes rules i.e prior x liklihoods
    # using conditional_count developed in previous function to calculate conditional probabilities
    # attr = column name, row[attr]=value of row for that column
                score*=conditional[attr][row[attr]][grade]  
        
    # scores consists of all probabilities calculated above
            scores[grade]=score
    # find grade with maximum probability
            if score>maxscore:
                maxscore=score
                maxgrade=grade
    # inserting predicted grades one by one
        predicted_grades.append(maxgrade)
    # inserting actual grades one by one
        actual_grades.append(row['Grade'])
    
    return actual_grades,predicted_grades
    
            
actual, predicted = predict(test, prior, conditional)
#print(test)
#print(conditional)
#print(predicted)
#print(actual)

In [53]:
# This function should evaluate a set of predictions in terms of accuracy

def evaluate(actual,predicted,test):
    
    # initialising a counter
    acc = 0;
    
    # iterating through preidcted grades and actual grades
    for i in range(0,len(predicted)):
    # if both are same, increment counter
        if(actual[i]==predicted[i]):
            acc+=1
            
    # divide counter by total number of instances in test data
    return acc/len(test)*100

accuracy = evaluate(actual,predicted,test)
print(accuracy)

94.35897435897435


In [49]:
#calculating precision, recall and F-1 on test data
import numpy as np

matrix = {}

# creating a confusion matrix initialised with 0
for g1 in grades:
    matrix[g1] = {}
    for g2 in grades:
        matrix[g1][g2] = 0
# adding the occurences of grades in confusion matrix
for i in range(len(test)):
    matrix[predicted[i]][actual[i]] += 1

# precision, recall and f-1  for every class
precision = {}
recall = {}
f1 = {}
for g1 in grades:
    num = matrix[g1][g1]
    den1 = 0
    den2 = 0
    for g2 in grades:
        den1+=matrix[g1][g2]
        den2+=matrix[g2][g1]
    if(den1 == 0 & den2 == 0):
        precision[g1] = 'NA'
        recall[g1] = 'NA'
    else:
        precision[g1] = num/den1
        recall[g1] = num/den2
for g in grades:
    if precision[g] == 'NA':
        f1[g] = 'NA'
    else:
        f1[g] = ((2*precision[g]*recall[g])/(precision[g]+recall[g]))
        
# overall precison, recall and f-1
overall_precision = 0
overall_recall = 0
c1 = 0
c2 = 0
for g1 in grades:
    if(precision[g1] != 'NA'):
        overall_precision += precision[g1]
        c1+=1
    if(recall[g1] != 'NA'):
        overall_recall += recall[g1]
        c2+=1

overall_precision /= c1
overall_recall /= c2
overall_f1 = (2*overall_precision*overall_recall)/(overall_precision+overall_recall)
        


print(precision)
print(recall)
print(f1)
print(overall_precision)
print(overall_recall)
print(overall_f1)
        
    

{'B': 0.9473684210526315, 'D': 0.9666666666666667, 'F': 0.9565217391304348, 'C': 0.9795918367346939, 'A+': 0.75, 'A': 0.9047619047619048}
{'B': 0.9230769230769231, 'D': 0.9830508474576272, 'F': 0.9166666666666666, 'C': 1.0, 'A+': 0.5, 'A': 1.0}
{'B': 0.935064935064935, 'D': 0.9747899159663865, 'F': 0.9361702127659574, 'C': 0.9896907216494846, 'A+': 0.6, 'A': 0.9500000000000001}
0.9174850947243888
0.887132406200203
0.9020534925751884
