In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [54]:
df=pd.read_csv("./TitanicDataset.csv")
df=df[["Pclass","Age","SibSp","Parch","Fare","Survived"]]
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Fare'].fillna(df['Fare'].median(), inplace=True)
df[:6]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
0,3,22.0,1,0,7.25,0
1,1,38.0,1,0,71.2833,1
2,3,26.0,0,0,7.925,1
3,1,35.0,1,0,53.1,1
4,3,35.0,0,0,8.05,0
5,3,28.0,0,0,8.4583,0


In [55]:
def calculate_prior(df, Y):
    classes = sorted(list(df[Y].unique()))
    prior = []
    for i in classes:
        prior.append(len(df[df[Y]==i])/len(df))
    return prior

In [56]:
def calculate_likelihood_gaussian(df, feat_name, feat_val, Y, label):
  feat = list(df.columns)
  df = df[df[Y]==label]
  mean, std = df[feat_name].mean(), df[feat_name].std()
  p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-
  ((feat_val-mean)**2 / (2 * std**2 )))
  return p_x_given_y


In [57]:
def naive_bayes_gaussian(df, X, Y):
# get feature names
  features = list(df.columns)[:-1]

  # calculate prior
  prior = calculate_prior(df, Y)

  Y_pred = []
  # loop over every data sample
  for x in X:
  # calculate likelihood
    labels = sorted(list(df[Y].unique()))
    likelihood = [1]*len(labels)
    for j in range(len(labels)):
      for i in range(len(features)):
        likelihood[j] *= calculate_likelihood_gaussian(df, features[i], x[i], Y, labels[j])

    # calculate posterior probability (numerator only)
    post_prob = [1]*len(labels)
    for j in range(len(labels)):
      post_prob[j] = likelihood[j] * prior[0]
    Y_pred.append(np.argmax(post_prob))

  return np.array(Y_pred)

In [58]:
from sklearn.model_selection import train_test_split
train,test=train_test_split(df,test_size=0.2,random_state=40)
X_test=test.iloc[:,:-1].values
Y_test=test.iloc[:,-1].values
Y_pred=naive_bayes_gaussian(train,X_test,Y="Survived")
print(Y_pred)


[0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 1 1 0 0 1 0 1 1 1 1 0 0 0 0 1 0 0
 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0
 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0
 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 1]


In [59]:
from sklearn.metrics import confusion_matrix, f1_score
print(confusion_matrix(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred,average="micro"))

[[84 19]
 [40 36]]
0.6703910614525139
