# Naive Bayes Classifier

ASSUMPTION:
All the features in the given dataset are conditionally independent of each other given the class label

Importing libraries

In [3]:
import numpy as np
import pandas as pd

Function to summarize the class distribution of the data for better understanding

In [4]:
def dataClassification(dataset):
    """Summarize how many diagnoses in the dataset are benign and malignant.

    The class label is read from the column at position 10. A label of 2 is
    counted as benign ("B") and a label of 4 as malignant ("M").

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with at least 11 columns; the column at position 10 holds the
        class labels.

    Returns
    -------
    str
        A sentence reporting the total number of rows and the benign and
        malignant counts.
    """
    labels = dataset.iloc[:, 10]

    # Count by comparison instead of looping over ``labels[i]``: the counts are
    # then well defined (zero) when a class is absent, and the lookup no longer
    # depends on the frame having a default 0..n-1 index.
    benign_count = int((labels == 2).sum())
    malignant_count = int((labels == 4).sum())
    test_patients = dataset.shape[0]

    # To print our basic understanding of the dataset
    return ("The data consists of " + str(test_patients)
            + " different diagnosis in which " + str(benign_count)
            + " are Benign and " + str(malignant_count) + " are Malignant")

Functions to split the dataset into training and testing data

In [5]:
def trainingDataset(dataset, splitRatio):
    """Return the leading portion of the dataset to be used for training.

    The first ``int(splitRatio * number_of_rows)`` rows are kept, restricted
    to the columns at positions 1 through 10. Column 0 is dropped (the
    original notes describe it as the sample code number, which has no effect
    on the class labels).
    """
    row_count = dataset.shape[0]
    cutoff = int(splitRatio * row_count)

    # Rows before the cutoff form the training portion
    training_rows = dataset.iloc[:cutoff, 1:11]
    return pd.DataFrame(training_rows)

In [6]:
def testingData(dataset, splitRatio):
    
    total_data = dataset.shape[0]
    
    # This data is for testing
    # Also removing the sample code number feature as it has no affect on class labels
    test_data = dataset.iloc[int(total_data - total_data*(1 - splitRatio)):total_data, 1:11]
    
    # Converting the data into dataframe
    d1 = pd.DataFrame(test_data)
    return d1

Function to report how the data was split

In [7]:
def DataDivision(data1, data2):
    """Describe how many samples ended up in the training and testing sets.

    ``data1`` is reported as the training data and ``data2`` as the testing
    data; only their lengths are used.
    """
    train_count, test_count = len(data1), len(data2)

    # Human-readable summary of the split
    return (f"The data is divided into {train_count} training data samples "
            f"and {test_count} testing data samples")

In [8]:
def testdataclassification(test_data):
    
    Bdata, Mdata =  (test_data['Class'].value_counts())
    testshape = test_data.shape[0]
    
    l1 = str("The testing data consists of " + str(testshape) + " different diagnosis in which " + str(Bdata) + 
          " are Benign and " + str(Mdata) + " are Malignant")
    return l1

Function to group data based on class label to compute the individual means and standard deviations of the independent features 

In [9]:
def grouping(dataset):
    """Split the dataset by class label and keep only the feature columns.

    Rows are grouped on the ``'Class'`` column. The group labelled 2 (benign)
    and the group labelled 4 (malignant) are each reduced to their first nine
    columns, so that per-class feature statistics can be computed from them.

    Returns a ``(benign_features, malignant_features)`` pair of DataFrames.
    """
    by_class = dataset.groupby('Class')

    # Label 2 is fetched first, then label 4; each keeps columns 0..8 only
    benign_features, malignant_features = (
        by_class.get_group(label).iloc[:, :9] for label in (2, 4)
    )
    return benign_features, malignant_features

The mean function

In [10]:
def mean(points):
    """Return the arithmetic mean of ``points``.

    Computed as the sum of all observations divided by how many there are.
    """
    total = np.sum(points)
    count = np.size(points)
    return total / count

In [11]:
def MeanSet(data1,data2):
    """Compute the per-feature means for both classes.

    Every column of ``data1`` and of ``data2`` is passed, in column order, to
    ``mean``. Returns a pair of lists: the column means of ``data1`` followed
    by the column means of ``data2``.
    """
    # One mean per column, first class
    B_mean_set = [mean(data1.iloc[:, col]) for col in range(len(data1.columns))]

    # One mean per column, second class
    M_mean_set = [mean(data2.iloc[:, col]) for col in range(len(data2.columns))]

    return B_mean_set, M_mean_set

The standard deviation function

In [12]:
def st_dev(points, mean):
    """Return the sample standard deviation of ``points`` about ``mean``.

    The squared differences between each observation and ``mean`` are summed,
    divided by one less than the number of observations (n - 1), and the
    square root of that variance is returned.
    """
    squared_deviations = [(value - mean) ** 2 for value in points]
    degrees_of_freedom = np.size(points) - 1
    return np.sqrt(np.sum(squared_deviations) / degrees_of_freedom)

In [13]:
def SDset(data1, data2, mean1, mean2):
    """Compute the per-feature standard deviations for both classes.

    Column ``i`` of ``data1`` is passed to ``st_dev`` together with
    ``mean1[i]``, and likewise for ``data2`` and ``mean2``. Returns a pair of
    lists: the column standard deviations of ``data1`` followed by those of
    ``data2``.
    """
    # One standard deviation per column, first class
    B_sd_set = [
        st_dev(data1.iloc[:, col], mean1[col])
        for col in range(len(data1.columns))
    ]

    # One standard deviation per column, second class
    M_sd_set = [
        st_dev(data2.iloc[:, col], mean2[col])
        for col in range(len(data2.columns))
    ]

    return B_sd_set, M_sd_set

The Gaussian probability density function

In [14]:
from math import pi

def probability(x, mean, sd):
    """Evaluate the Gaussian probability density at ``x``.

    Implements
        f(x) = 1 / (sqrt(2 * pi) * sd) * exp(-(x - mean)^2 / (2 * sd^2))
    for a normal distribution with the given ``mean`` and standard deviation
    ``sd``.
    """
    exponent = -((x - mean)**2 / (2 * sd**2))
    normaliser = 1 / (np.sqrt(2 * pi) * sd)
    return normaliser * np.exp(exponent)

Function to predict class labels for the given testing data

In [15]:
def prediction(testData, mean1, mean2, sd1, sd2):
    """Score every test sample against both classes.

    For each row of ``testData``, every column except the last is treated as a
    feature. The Gaussian density of each feature value is evaluated with
    ``probability`` under the first class (``mean1``/``sd1``) and under the
    second class (``mean2``/``sd2``), and the per-feature densities are
    multiplied together to give one likelihood per class.

    Parameters
    ----------
    testData : pandas.DataFrame
        Test samples; the last column is skipped (it is not used as a feature).
    mean1, sd1 : sequence of float
        Per-feature means and standard deviations of the first class.
    mean2, sd2 : sequence of float
        Per-feature means and standard deviations of the second class.

    Returns
    -------
    tuple of list
        ``(p1, p2)`` where ``p1[i]`` and ``p2[i]`` are the likelihoods of row
        ``i`` under the first and second class respectively.
    """
    p1 = []
    p2 = []
    feature_count = len(testData.columns) - 1

    for i in range(len(testData.index)):
        # Start from fresh lists for every row. Reusing one list across rows
        # would fold all earlier rows' densities into this row's product,
        # giving wrong likelihoods that quickly underflow to zero.
        row_probs1 = [
            probability(testData.iloc[i, j], mean1[j], sd1[j])
            for j in range(feature_count)
        ]
        row_probs2 = [
            probability(testData.iloc[i, j], mean2[j], sd2[j])
            for j in range(feature_count)
        ]

        # Multiply the independent probabilities to get the likelihood of this
        # row under each class
        p1.append(np.prod(row_probs1))
        p2.append(np.prod(row_probs2))

    return p1, p2

The classification report consists of the Accuracy, Precision, Recall, F1-score and Confusion matrix metrics, which are used to evaluate the performance of the model

In [16]:
def _safe_ratio(numerator, denominator):
    # Plain division, except that an empty denominator yields 0.0 instead of raising.
    return numerator / denominator if denominator else 0.0


def Classification_report(testData, p, q):
    """Evaluate the predictions against the actual class labels.

    A sample is predicted as class 2 when ``p[i]`` is at least ``q[i]`` and as
    class 4 when ``q[i]`` is larger. The actual label is read from the column
    at position 9 of ``testData``. Class 2 is counted as the "positive" class
    and class 4 as the "negative" class.

    Parameters
    ----------
    testData : pandas.DataFrame
        Test samples with the actual class label (2 or 4) at column position 9.
    p, q : sequence of float
        Per-sample scores for class 2 and class 4 respectively.

    Returns
    -------
    tuple
        ``(A, P1, P2, R1, R2, F1, F2, CM)``: accuracy, precision for class 2
        and class 4, recall for class 2 and class 4, F1 for class 2 and
        class 4 (all as percentages), and the confusion matrix
        ``[[TP, FP], [FN, TN]]``. A metric whose denominator is zero (for
        example the precision of a class that was never predicted) is
        reported as 0.0 rather than raising ZeroDivisionError.
    """
    # Initialize the true positives, false positives, true negatives and the false negatives
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    # Compare the predicted labels with the actual ones
    for i in range(len(testData.index)):
        if p[i] >= q[i]:
            predicted = 2
        elif q[i] > p[i]:
            predicted = 4
        else:
            # Scores that cannot be ordered (NaN) are left uncounted
            continue

        actual = testData.iloc[i, 9]
        if predicted == 2 and actual == 2:
            TP += 1
        elif predicted == 4 and actual == 4:
            TN += 1
        elif predicted == 4 and actual == 2:
            FN += 1
        elif predicted == 2 and actual == 4:
            FP += 1

    # Accuracy is the share of predictions the model got right
    A = _safe_ratio(TP + TN, TP + TN + FP + FN) * 100.0

    # Precision: of the samples predicted as a class, how many really belong to it
    P1 = _safe_ratio(TP, TP + FP) * 100.0
    P2 = _safe_ratio(TN, TN + FN) * 100.0

    # Recall: of the samples that belong to a class, how many were found
    R1 = _safe_ratio(TP, TP + FN) * 100.0
    R2 = _safe_ratio(TN, TN + FP) * 100.0

    # Harmonic mean of recall and precision
    F1 = _safe_ratio(2 * P1 * R1, R1 + P1)
    F2 = _safe_ratio(2 * P2 * R2, P2 + R2)

    # A matrix to represent all types of predictions
    CM = np.array([[TP, FP], [FN, TN]])

    return A, P1, P2, R1, R2, F1, F2, CM

A main() function to simplify the code

In [20]:
def main(data_path=r"C:\Users\sriva\OneDrive\Desktop\Python_for_ML_project_dataset\breast-cancer-wisconsin.xlsx",
         splitRatio=0.75):
    """Run the whole Naive Bayes workflow and print the evaluation.

    The Excel file is loaded, split into training and testing portions, the
    per-class feature means and standard deviations are estimated from the
    training portion, the testing portion is scored, and the summary sentences
    and metrics are printed.

    Parameters
    ----------
    data_path : str, optional
        Location of the Excel dataset. Defaults to the path originally
        hard-coded here, so calling ``main()`` behaves as before.
    splitRatio : float, optional
        Fraction of rows used for training (default 0.75).
    """
    # Reading the data file (read_excel already returns a dataframe)
    data = pd.read_excel(data_path)

    # Processing the data
    m = dataClassification(data)
    df1 = trainingDataset(data, splitRatio)
    df2 = testingData(data, splitRatio)
    l = DataDivision(df1, df2)
    t = testdataclassification(df2)

    # Parameter calculation
    B_data, M_data = grouping(df1)
    B_mean_set, M_mean_set = MeanSet(B_data, M_data)
    B_sd_set, M_sd_set = SDset(B_data, M_data, B_mean_set, M_mean_set)

    # Predictions
    p1, p2 = prediction(df2, B_mean_set, M_mean_set, B_sd_set, M_sd_set)

    # Model evaluation
    Acc, PrecisionB, PrecisionM, RecallB, RecallM, F1_scoreB, F1_scoreM, cm = Classification_report(df2, p1, p2)

    print(m)
    print(l)
    print(t)
    # Precision, recall and F1 are reported as the unweighted average of the two classes
    print("\n" + "The Accuracy of the NaiveBayes model is: " + str(Acc) + "%")
    print("\n" + "The Precision of the NaiveBayes model is: " + str((PrecisionB + PrecisionM)/2) + "%")
    print("\n" + "The Recall of the NaiveBayes model is: " + str((RecallB + RecallM)/2) + "%")
    print("\n" + "The F1 score of the NaiveBayes model is: " + str((F1_scoreB + F1_scoreM)/2) + "%")
    print("\n" + "The Confusion matrix of the NaiveBayes model is: " + "\n" + str(cm))