In [77]:
## 5.2 code

In [10]:
import numpy as np
import pandas as pd

In [11]:
data = pd.read_csv('titanic_data.csv')

data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


There are 3 values in **Pclass**, I assume them as multinomial distribution.

There are binary digits in **Sex**, I assume them as Bernoulli distribution.

The data in **age** range from 0.42 to 80, I assume them as normal distribution.

There are 9 values (0-8) in **Siblings/Spouses Aboard**, I assume them as poisson distribution.

There are 7 values (0-6) in **Parents/Children Aboard**, I assume them as poisson distribution.

The data in **Fare** range from 0 to 512.3, I assume them as normal distribution.

In [161]:
class NaiveBayes:
    def __init__(self):
        return None
        
    def fit(self, data):
        self.data = data
        self.data0 = data[data.Survived == 0]
        self.data1 = data[data.Survived == 1]
        
        self.py0 = self.data0.shape[0]/self.data.shape[0]
        self.py1 = 1 - self.py0
        
        
        self.y0 = self.data0.Survived
        self.X0 = self.data0.drop(['Survived'],axis = 1)
        
        self.y1 = self.data1.Survived
        self.X1 = self.data1.drop(['Survived'],axis = 1)
        
        n0 = self.data0.shape[0]  ## sample size
        n1 = self.data1.shape[0]  ## sample size
        
        
        # Pclass [1,2,3]
        self.y0_Pclass = [(self.X0.Pclass == 1).sum()/n0]
        self.y0_Pclass.append((self.X0.Pclass == 2).sum()/n0)
        self.y0_Pclass.append((self.X0.Pclass == 3).sum()/n0)
        
        self.y1_Pclass = [(self.X1.Pclass == 1).sum()/n1]
        self.y1_Pclass.append((self.X1.Pclass == 2).sum()/n1)
        self.y1_Pclass.append((self.X1.Pclass == 3).sum()/n1)
        
        # Sex  [0,1]
        self.y0_Sex = [(self.X0.Sex == 0).sum()/n0]
        self.y0_Sex.append((self.X0.Sex == 1).sum()/n0)
        
        self.y1_Sex = [(self.X1.Sex == 1).sum()/n1]
        self.y1_Sex.append((self.X1.Sex == 0).sum()/n1)
        
        # Age [mu,sigma]
        self.y0_Age = [self.X0.Age.mean()]  # mu
        self.y0_Age.append(self.X0.Age.std(ddof = 0)) # sigma
        
        self.y1_Age = [self.X1.Age.mean()]
        self.y1_Age.append(self.X1.Age.std(ddof = 0))
        
        
        # Siblings/Spouses Aboard
        self.y0_Sib = (self.X0['Siblings/Spouses Aboard']).mean()  # lambda
        
        self.y1_Sib = (self.X1['Siblings/Spouses Aboard']).mean()  # lambda
        
        # Parents/Children Aboard
        self.y0_Par = (self.X0['Parents/Children Aboard']).mean()  # lambda
        
        self.y1_Par = (self.X1['Parents/Children Aboard']).mean()  # lambda
        
        # Fare [mu,sigma]
        self.y0_Fare = [self.X0.Fare.mean()]  # mu
        self.y0_Fare.append(self.X0.Fare.std(ddof = 0)) # sigma
        
        self.y1_Fare = [self.X1.Fare.mean()]  # mu
        self.y1_Fare.append(self.X1.Fare.std(ddof = 0)) # sigma
        
    def _predict(self, new = None):
        # compute prob of y0(just numerator)
        y0_prob = [self.py0]
        y0_prob.append(self.y0_Pclass[new.Pclass.astype(int) - 1])  # prob of Pclass
        y0_prob.append(self.y0_Sex[new.Sex.astype(int) - 1])  # prob of Sex
        y0_prob.append(1/self.y0_Age[1] * np.exp( - np.square(new.Age - self.y0_Age[0])/2/np.square(self.y0_Age[1])))  # prob of Age
        y0_prob.append(np.power(self.y0_Sib , new['Siblings/Spouses Aboard']) * np.exp(-self.y0_Sib) / \
                       np.math.factorial(new['Siblings/Spouses Aboard']))  # Siblings/Spouses Aboard
        
        y0_prob.append(np.power(self.y0_Par , new['Parents/Children Aboard']) * np.exp(-self.y0_Par) / \
                       np.math.factorial(new['Parents/Children Aboard']))  # Parents/Children Aboard
        
        y0_prob.append(1/self.y0_Fare[1] * np.exp( - np.square(new.Fare - self.y0_Fare[0])/2/np.square(self.y0_Fare[1])))  # prob of Fare
        
        # compute prob of y1(just numerator)
        y1_prob = [self.py1]
        y1_prob.append(self.y1_Pclass[new.Pclass.astype(int) - 1])  # prob of Pclass
        y1_prob.append(self.y1_Sex[new.Sex.astype(int) - 1])  # prob of Sex
        y1_prob.append(1/self.y1_Age[1] * np.exp( - np.square(new.Age - self.y1_Age[0])/2/np.square(self.y1_Age[1])))  # prob of Age
        y1_prob.append(np.power(self.y1_Sib , new['Siblings/Spouses Aboard']) * np.exp(-self.y1_Sib) / \
                       np.math.factorial(new['Siblings/Spouses Aboard']))  # Siblings/Spouses Aboard
        
        y1_prob.append(np.power(self.y1_Par , new['Parents/Children Aboard']) * np.exp(-self.y1_Par) / \
                       np.math.factorial(new['Parents/Children Aboard']))  # Parents/Children Aboard
        
        y1_prob.append(1/self.y1_Fare[1] * np.exp( - np.square(new.Fare - self.y1_Fare[0])/2/np.square(self.y1_Fare[1])))  # prob of Fare
        
        
        return  0 if np.prod(y0_prob) >= np.prod(y1_prob) else 1
        
    def predict(self, new = None):
        if new.ndim == 1:
            return self._predict(new)
        
        pre = np.ones(new.shape[0])
        for i in range(new.shape[0]):
            pre[i] = self._predict(new.iloc[i])
        return pre 
        

In [148]:
NB = NaiveBayes()
NB.fit(data)

In [149]:
NB.y0_Sex

[0.8513761467889909, 0.14862385321100918]

In [150]:
new = pd.Series({'Pclass':3, 'Sex':0, 'Age': 23, 'Siblings/Spouses Aboard': 1, 'Parents/Children Aboard':0, 'Fare': 0})
new

Pclass                      3
Sex                         0
Age                        23
Siblings/Spouses Aboard     1
Parents/Children Aboard     0
Fare                        0
dtype: int64

In [152]:
NB.predict(new)

0

In [153]:
NB.predict(toy.drop(['Survived'],axis = 1))

array([0., 1., 0., 0., 0.])

In [154]:
toy

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


In [162]:
def NB_cross_val(data, fold = 10):
        indices = np.random.permutation(list(data.index))
        testsize = data.shape[0]//fold
        acc = []
        NB = NaiveBayes()
        for i in range(fold):
            if i == fold - 1:
                test = data.loc[indices[i*testsize:]]
                train = data.loc[data.index.isin(indices[i*testsize:]) == False]
            else:
                test = data.loc[indices[i*testsize : (i+1)*testsize]]
                train = data.loc[data.index.isin(indices[i*testsize : (i+1)*testsize]) == False]
            
            #print(train.shape)
            
            NB.fit(train)
            
            pre = NB.predict(test.drop(['Survived'],axis = 1))
            
            acc.append((pre == test.Survived).sum()/len(test.Survived))
            
        #return np.array(acc).mean()
        return acc

In [164]:
acc = NB_cross_val(data)
acc

[0.6477272727272727,
 0.6704545454545454,
 0.6363636363636364,
 0.6590909090909091,
 0.6363636363636364,
 0.75,
 0.6590909090909091,
 0.6590909090909091,
 0.5795454545454546,
 0.6736842105263158]

In [165]:
np.array(acc).mean()

0.6571411483253587

In [127]:
np.prod(a)

2.2085857931854898e-05

In [128]:
np.prod(b)

5.914126637139044e-06

In [63]:
data[data.Survived == 0].Fare.std(ddof = 0)

31.455255530833597

In [107]:
np.power(3,2)

9

In [13]:
toy = data.head()
toy

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


In [146]:
toy.loc[1]

Survived                    1.0000
Pclass                      1.0000
Sex                         1.0000
Age                        38.0000
Siblings/Spouses Aboard     1.0000
Parents/Children Aboard     0.0000
Fare                       71.2833
Name: 1, dtype: float64

In [141]:
new

Pclass                      3
Sex                         0
Age                        23
Siblings/Spouses Aboard     1
Parents/Children Aboard     0
Fare                        0
dtype: int64

In [103]:
new = pd.Series({'Pclass':3, 'Sex':0, 'Age': 23, 'Siblings/Spouses Aboard': 1, 'Parents/Children Aboard':0, 'Fare': 0})
new

Pclass                      3
Sex                         0
Age                        23
Siblings/Spouses Aboard     1
Parents/Children Aboard     0
Fare                        0
dtype: int64

In [115]:
new['Parents/Children Aboard']

0

In [43]:
a = [1]
a

[1]

In [45]:
a.append(1-a[0])

In [46]:
a

[1, 0]

In [108]:
np.prod(3)

3

In [109]:
np.math.factorial(3)

6

In [114]:
1 / \
  2

0.5