In [57]:
import numpy as np
import pandas as pd

### Loading Dataset

In [58]:
# Read the training features (X) and the labels (y) from their CSV files,
# in that order, using paths relative to the notebook's working directory.
X, y = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic_X_train.csv', 'titanic_y_train.csv')
)
# Preview the first ten feature rows.
X.head(10)

Unnamed: 0,ID,Pclass,Sex,Age,SibSp,Parch,Fare,Embarced
0,0,3,0,22.0,1,0,7.25,1
1,1,1,1,38.0,1,0,71.2833,0
2,2,3,1,26.0,0,0,7.925,1
3,3,1,1,35.0,1,0,53.1,1
4,4,3,0,35.0,0,0,8.05,1
5,5,3,0,20.0,0,0,8.4583,2
6,6,1,0,54.0,0,0,51.8625,1
7,7,3,0,2.0,3,1,21.075,1
8,8,3,1,27.0,0,2,11.1333,1
9,9,2,1,14.0,1,0,30.0708,0


### Data preparation 

In [59]:
### Dropping unnecessary columns

# Build the working feature table `df` without the columns that are not used
# as model inputs; `X` itself is left untouched by this call.
df = X.drop(columns=['ID', 'SibSp', 'Parch', 'Embarced'])
# Reduce the label frame to its last column as a plain NumPy array.
# NOTE(review): this rebinds `y` from a DataFrame to an ndarray, so running
# this cell a second time without reloading the data fails on `.iloc`.
y = y.iloc[:, -1].values
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,0,22.0,7.25
1,1,1,38.0,71.2833
2,3,1,26.0,7.925
3,1,1,35.0,53.1
4,3,0,35.0,8.05


In [60]:
### One-hot encoding of the 'Sex' feature column

# One indicator column is produced per distinct value of 'Sex'; each new
# column is named after the value it flags.
dummies = pd.get_dummies(df['Sex'])
dummies.head(3)

Unnamed: 0,0,1
0,1,0
1,0,1
2,0,1


In [61]:
### Appending the one-hot columns to the feature table

# Column-wise concatenation; rows are aligned on the shared index.
# NOTE(review): the original 'Sex' column is kept next to its two indicator
# columns, so the same information enters the model three times - confirm
# that this is intended.
X = pd.concat((df, dummies), axis=1)
X.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,0,1
0,3,0,22.0,7.25,1,0
1,1,1,38.0,71.2833,0,1
2,3,1,26.0,7.925,0,1


In [62]:
# Convert the feature table and the labels to NumPy arrays for the classifier.
# The dtype is forced to float64 because pd.get_dummies may return boolean
# indicator columns (the default in recent pandas releases); mixing those with
# the numeric columns would otherwise produce an object-dtype array, on which
# the mean/variance computations of the model are slow or fail.
X = np.array(X, dtype=np.float64)
y = np.array(y)

In [64]:

class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Every feature is modelled as an independent normal distribution per
    class. ``fit`` estimates the per-class mean, variance and prior from the
    training data; ``predict`` returns, for each sample, the class with the
    largest log-posterior.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest feature variance in the training data that is
        added to every per-class variance. This keeps the variance strictly
        positive, so a feature that is constant within a class no longer
        causes a division by zero (which made every posterior NaN).
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features; converted to a float64 array.
        y : array-like of shape (n_samples,)
            Class label of every training sample.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, is empty, or its number of rows
            differs from the length of ``y``.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        # Smoothing term proportional to the scale of the data. If every
        # feature is constant over the whole dataset the scale would be 0,
        # so fall back to 1.0 to keep the variances strictly positive.
        scale = X.var(axis=0).max()
        if scale == 0.0:
            scale = 1.0
        epsilon = self.var_smoothing * scale

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0) + epsilon
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        """Return the predicted class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted labels, taken from the classes seen in ``fit``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        X = np.asarray(X, dtype=np.float64)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        scores = self._joint_log_likelihood(X)
        return self._classes[np.argmax(scores, axis=1)]

    def _predict(self, x):
        """Return the predicted class label for a single sample ``x``."""
        row = np.asarray(x, dtype=np.float64)[np.newaxis, :]
        scores = self._joint_log_likelihood(row)[0]
        # return class with highest posterior probability
        return self._classes[np.argmax(scores)]

    def _joint_log_likelihood(self, X):
        """Return log prior + summed log-density, shape (n_samples, n_classes)."""
        n_classes = len(self._classes)
        scores = np.empty((X.shape[0], n_classes), dtype=np.float64)
        for idx in range(n_classes):
            log_prior = np.log(self._priors[idx])
            scores[:, idx] = log_prior + self._log_pdf(idx, X).sum(axis=1)
        return scores

    def _log_pdf(self, class_idx, x):
        """Return the per-feature Gaussian log-density of ``x`` for one class.

        The logarithm is evaluated analytically rather than as
        ``log(_pdf(...))``: the plain density underflows to 0 for samples far
        from the class mean, which would turn the score into ``-inf`` for
        every class and make the argmax meaningless.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        return -0.5 * (np.log(2.0 * np.pi * var) + (x - mean) ** 2 / var)

    def _pdf(self, class_idx, x):
        """Return the per-feature Gaussian density of ``x`` for one class.

        Kept for callers that want actual densities; prediction relies on
        ``_log_pdf`` because this value can underflow to 0.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator


# Hold-out evaluation of the classifier
if __name__ == "__main__":
    # Imports
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    def accuracy(y_true, y_pred):
        """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
        n_correct = np.sum(y_true == y_pred)
        return n_correct / len(y_true)

    # 20% of the samples are held out; the fixed random_state keeps the split
    # (and therefore the printed score) the same from run to run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=123
    )

    nb = NaiveBayes()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)

    print("Naive Bayes classification accuracy", accuracy(y_test, predictions))


Naive Bayes classification accuracy 0.8044692737430168
