In [10]:
##Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import math
import gzip
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB


In [11]:
class NaiveBayesClassifier():
    '''
    Bayes Theorem form
    P(y|X) = P(X|y) * P(y) / P(X)
    '''
    def calc_prior(self, features, target):
        '''
        prior probability P(y)
        calculate prior probabilities
        '''
        self.prior = (features.groupby(target).apply(lambda x: len(x)) / self.rows).to_numpy()

        return self.prior
    
    def calc_statistics(self, features, target):
        '''
        calculate mean, variance for each column and convert to numpy array
        ''' 
        self.mean = features.groupby(target).apply(np.mean).to_numpy()
        self.var = features.groupby(target).apply(np.var).to_numpy()
              
        return self.mean, self.var
    
    def gaussian_density(self, class_idx, x):     
        '''
        calculate probability from gaussian density function (normally distributed)
        we will assume that probability of specific target value given specific class is normally distributed 
        
        probability density function derived from wikipedia:
        (1/√2pi*σ) * exp((-1/2)*((x-μ)^2)/(2*σ²)), where μ is mean, σ² is variance, σ is quare root of variance (standard deviation)
        '''
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
#         numerator = np.exp(-((x-mean)**2 / (2 * var)))
        denominator = np.sqrt(2 * np.pi * var)
        prob = numerator / denominator
        return prob
    
    def calc_posterior(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for i in range(self.count):
            prior = np.log(self.prior[i]) ## use the log to make it more numerically stable
            conditional = np.sum(np.log(self.gaussian_density(i, x))) # use the log to make it more numerically stable
            posterior = prior + conditional
            posteriors.append(posterior)
        # return class with highest posterior probability
        return self.classes[np.argmax(posteriors)]
     

    def fit(self, features, target):
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        
        self.calc_statistics(features, target)
        self.calc_prior(features, target)
        
    def predict(self, features):
        preds = [self.calc_posterior(f) for f in features.to_numpy()]
        return preds

    def accuracy(self, y_test, y_pred):
        accuracy = np.sum(y_test == y_pred) / len(y_test)
        return accuracy


In [77]:
##Importing dataset
urlfile = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(urlfile)
#print(data.shape)
#print(data)

##Remove unwanted features
data.drop(['PassengerId','Name','SibSp','Parch','Ticket', 'Fare','Cabin','Embarked'],axis='columns',inplace=True)
##Drop NaN values from the train set
data.dropna(axis=0, inplace=True)

##Convert categorical variable to numeric
features = ['Pclass','Age', 'Sex']
target = 'Survived'

data = data[features + [target]]
data['Sex'] = data['Sex'].replace(["female", "male"], [0, 1])
data['Age'] = pd.qcut(data['Age'], 10, labels=False)
print('data.shape = ', data.shape)
#print(data)


labels = data[target].values
#print(labels)
print('labels.shape = ', labels.shape)

##Split dataset into training set (80%) and validatation set (20%)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, random_state=1)
#X_train.head()
#y_train.head()

##Split training set (80%) in two parts - for training (70%) and testing (30%)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, test_size=0.3, random_state=12)
#print(X_train1)
#print(y_train1)

data.shape =  (714, 4)
labels.shape =  (714,)


In [76]:
##Train and test the model
x = NaiveBayesClassifier()
x.fit(X_train1, y_train1)
predictions_test = x.predict(X_test1)
accuracy_test = x.accuracy(y_test1, predictions_test)

##Validate the model
predictions_val = x.predict(X_val)
accuracy_val = x.accuracy(y_val, predictions_val)

print('Model metrics')
print('Accuracy of Naive Bayes = ', accuracy_test)
print('Validity of Naive Bayes = ', accuracy_val)

##Determining probability of survival for each class
#print('Probability of each class')
#print('Survive = 0: %.2f' % classifier.class_prior_[0])
#print('Survive = 1: %.2f' % classifier.class_prior_[1])

Model metrics
Accuracy of Naive Bayes =  0.6337209302325582
Validity of Naive Bayes =  0.6013986013986014


  numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
  numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
  prob = numerator / denominator


In [75]:
##Comparison with sklearn Naive Bayes Classifier

##Initialize the classifier
classifier = GaussianNB()

##Fitting the first part of data
classifier.fit(X_train1, y_train1)

predictions_test = classifier.predict(X_test1)
accuracy_test = accuracy_score(y_test1, predictions_test)

predictions_val = classifier.predict(X_val)
accuracy_val = accuracy_score(y_val, predictions_val)

print('Model metrics with sklearn Naive Bayes Classifier')
print('Accuracy of Naive Bayes = ', accuracy_test)
print('Validity of Naive Bayes = ', accuracy_val)

##Determining probability of survival for each class
#print('Probability of each class')
#print('Survive = 0: %.2f' % classifier.class_prior_[0])
#print('Survive = 1: %.2f' % classifier.class_prior_[1])

Model metrics with sklearn Naive Bayes Classifier
Accuracy of Naive Bayes =  1.0
Validity of Naive Bayes =  1.0
