# Project 1 - Naive Baye's Classifier

Upasana Garg - 
Anurag Singh

In [3]:
import math 
import random 
import pandas as pd
import numpy as np

In [4]:
''' method to read training and test data from file'''
def loadTestTrainData():
    testData = pd.read_csv('test.csv',delimiter=',')
    testData = pd.DataFrame(data=testData, dtype=np.float64)
    testData = testData.values.tolist()
    '''tData is training data and testData is testing data'''
    tData = pd.read_csv('train.csv',delimiter=',')
    tData = pd.DataFrame(data=tData, dtype=np.float64)
    tData = tData.values.tolist()
    return [tData,testData]

In [5]:
''' to calculate mean and standard deviation need to categorize data according to the class i.e. 0/1 (last index of the data)''' 
def categorizeClassData(dataObj): 
    dataCategory={} 
    for i in range(len(dataObj)): 
        dataRow = dataObj[i] 
        rowLength=len(dataRow)-1 
        if(dataRow[rowLength] not in dataCategory): 
            dataCategory[dataRow[rowLength]]=[] 
        dataCategory[dataRow[rowLength]].append(dataRow) 
    '''print("data categorized on the basis of the class --- ",dataCategory) '''
    return dataCategory

In [6]:
''' method to calculate standard mean and standard deviation of numbers''' 
def calculateMeanDeviation(values): 
    mean=sum(values)/float(len(values))
    totalNumerate=0.0 
    for i in range(len(values)): 
        indTerm = pow((values[i]-mean),2) 
        totalNumerate+=indTerm 
    variance=totalNumerate/float(len(values)) 
    deviation=math.sqrt(variance)
    return mean,deviation

In [7]:
''' method to all mean and standard deviation on class basis''' 
def getClassListMeanDev(dataCategory):  
    classMeanDevDiff={} 
    for className,classData in dataCategory.items(): 
        meanDevList=[] 
        ''' zip method collects one-one attributes from all data rows for the calculation till end of the row'''
        for column in zip(*classData):
            dataMean,dataDeviation = calculateMeanDeviation(column) 
            meanDevList.append((dataMean,dataDeviation)) 
        ''' to remove last occurance of mean and standard deviation because of existence of class factor''' 
        meanDevList.pop() 
        classMeanDevDiff[className] = meanDevList 
    '''print("Data categorization of mean and standard deviation on the basis of className --- ",classMeanDevDiff) '''
    return classMeanDevDiff

In [8]:
''' method to calculate probability according to the gaussian formula - univariate''' 
def calGaussianFormula(factor,mean,deviation): 
    expNumerate = math.pow(factor-mean,2) 
    expDenome= 2*math.pow(deviation,2)  
    exponent = math.exp(-(expNumerate/expDenome))  
    probab= (1/(math.sqrt(2*math.pi)*deviation))*exponent 
    '''print("Final probability of particular input row according to gaussian formula --- ",probab) '''
    return probab

In [16]:
''' method to calculate the class probabilities'''  
def getClassProbability(classMeanDevDiff,inputRow): 
    classProbabilities={}
    for className,classMeanDev in classMeanDevDiff.items(): 
        classProbabilities[className] = 1
        for j in range(len(classMeanDev)): 
            mean,deviation=classMeanDev[j] 
            factor=inputRow[j] 
            classProbabilities[className]=calGaussianFormula(factor,mean,deviation)
    '''print("Overall class probabilities are --- ",classProbabilities) '''
    return classProbabilities

In [10]:
''' method to check prior class identification on the test data prepared earlier''' 
def checkPriorClass(classMeanDevDiff,testData): 
    predictArr=[] 
    for i in range(len(testData)):
        classProbabilities = getClassProbability(classMeanDevDiff,testData[i]) 
        selectedClass,maxProbab=None,-1 
        for className,classProbability in classProbabilities.items(): 
            if selectedClass is None or classProbability > maxProbab: 
                maxProbab=classProbability 
                selectedClass=className
        predictArr.append(selectedClass)
    '''print("Cross check prior identification method on some test data sets --- ",predictArr) '''
    return predictArr

In [11]:
''' method to check the accuracy of the prior prediction of test data set ''' 
def checkAccuracy(testData,predictArr): 
    count=0 
    for j in range(len(testData)): 
        tDataClass = testData[j][len(testData[j]) -1]
        if tDataClass == predictArr[j]: 
            count+=1 
    accuracyPercent = (count/float(len(testData)))*100 
    print("Accuracy percent of the test data set --- ",accuracyPercent) 
    return accuracyPercent

In [12]:
''' method to calculate error percentage from accuracy'''
def getErrorPercent(accuracyPercent):
    return (100.0 - accuracyPercent);

In [13]:
''' method to get actual class array from testdata'''
def getActualClasses(testData):
    actualClass=[]
    for j in range(len(testData)):
        actualClass.append(testData[j][-1])
    return actualClass

''' method to get confusion matrix on the basis of actual class data and prediction data'''
def getConfusionMatrix(actualArr,predictArr):
    actualSeries = pd.Series(actualArr, name='Actual')
    predictSeries = pd.Series(predictArr, name='Predicted')
    confusionMatrix = pd.crosstab(actualSeries, predictSeries, rownames=['Actual'], colnames=['Predicted'], margins=True)
    print("Confusion Matrix-------")
    print(confusionMatrix)
    return np.matrix(confusionMatrix)

In [14]:
'''method to get accuracy using confusion matrix   (TP+TN)/(TP+FP+TN+FN)  '''
def getAccuracyConfusion(confMatrix):
    numer =confMatrix.item(0,0)+confMatrix.item(1,1)
    confAccu=numer/confMatrix.item(2,2)
    return confAccu

'''method to get error using confusion matrix   (FP+FN)/(TP+FP+TN+FN)  '''
def getErrorConfusion(confMatrix):
    numer =confMatrix.item(0,1)+confMatrix.item(1,0)
    confErr=numer/confMatrix.item(2,2)
    return confErr

'''method to calculate sensitivity    TP / (FN + TP)  '''
def getSensitivity(confMatrix):
    return confMatrix.item(1,1)/confMatrix.item(1,2)

'''method to calculate specificity    TN / (TN/FP)  '''
def getSpecificity(confMatrix):
    return confMatrix.item(0,0)/confMatrix.item(0,2)

In [18]:
''' main method to call other methods'''
def runClassifier(): 
    tData,testData=loadTestTrainData()
    '''return dictionary of key-classname and value-dataset'''
    dataCategory=categorizeClassData(tData) 
    
    classMeanDevDiff = getClassListMeanDev(dataCategory)
    
    testPredictArr=checkPriorClass(classMeanDevDiff,testData)
    testAccuracy = checkAccuracy(testData,testPredictArr) 
    
    errors = getErrorPercent(testAccuracy)
    print("Error percent --- ",errors)
    
    confusionMatrix = getConfusionMatrix(getActualClasses(tData),testPredictArr);
    
    confAcc=getAccuracyConfusion(confusionMatrix)
    print(" Classification Error --- ",confAcc)
    
    confErr=getErrorConfusion(confusionMatrix)
    print(" Classifier Error --- ",confErr)
    
    confSense=getSensitivity(confusionMatrix)
    print(" Classifier Sensitivity --- ",confSense)
    
    confSpec=getSpecificity(confusionMatrix)
    print(" Classifier Specificity --- ",confSpec)
    
runClassifier()

Accuracy percent of the test data set ---  64.42687747035573
Error percent ---  35.573122529644266
Confusion Matrix-------
Predicted  0.0  1.0  All
Actual                  
0.0        107   52  159
1.0         38   56   94
All        145  108  253
 Classification Error ---  0.6442687747035574
 Classifier Error ---  0.3557312252964427
 Classifier Sensitivity ---  0.5957446808510638
 Classifier Specificity ---  0.6729559748427673
