In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import copy
import random
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the pre-cleaned training table from the working directory and preview
# its first two rows.
# NOTE(review): the preview shows 'Unnamed: 0' and 'Unnamed: 0.1' columns; these
# look like row indexes written out by earlier to_csv() calls -- confirm, and
# consider dropping them or saving with index=False upstream.
data = pd.read_csv('train_clean.csv')
data.head(2)

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [31]:
def prepareBernoulliData(data):
    """Derive the boolean feature table used by the Bernoulli naive Bayes model.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Survived', 'Sex', 'Pclass', 'Age',
        'SibSp', 'Parch' and 'Cabin'.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` has one boolean column per derived flag
        (indexed like ``data``) and ``y`` is a copy of ``data['Survived']``.
    """
    y = data['Survived'].copy()

    # Column name -> boolean mask, listed in the order the columns are created.
    # An Embarked-based flag is intentionally not part of this feature set.
    flags = {
        'isFemale': data['Sex'].eq('female'),
        'is3class': data['Pclass'].eq(3),
        # Strictly between 0 and 11, so an age of exactly 0 is not a child.
        'isChild': data['Age'].gt(0) & data['Age'].lt(11),
        'has SibSp': data['SibSp'].gt(0),
        'has ParCh': data['Parch'].gt(0),
        # True for every cabin value other than the literal 'Unknown'.
        'Cabin knowon': data['Cabin'].ne('Unknown'),
    }

    X = pd.DataFrame()
    for column, mask in flags.items():
        X[column] = mask
    return (X, y)

In [32]:
# Build the boolean features, then hold out 30% of the rows for evaluation.
# The split is stratified on the label and seeded, so it is repeatable.
X, y = prepareBernoulliData(data)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=111,
)
(X_train.shape, y_train.shape)

((623, 6), (623,))

In [33]:
# Fit Bernoulli naive Bayes (smoothing alpha=1) on the training split; fit()
# returns the estimator, so construction and fitting are chained.
clf = BernoulliNB(alpha=1).fit(X_train, y_train)
# Mean accuracy on the held-out rows.
clf.score(X_test, y_test)

0.7798507462686567

In [34]:
# Per-row class probabilities for the training rows; one column per class, in
# the order given by clf.classes_.
clf.predict_proba(X_train)

array([[0.59062024, 0.40937976],
       [0.95242073, 0.04757927],
       [0.59062024, 0.40937976],
       ...,
       [0.95242073, 0.04757927],
       [0.04645616, 0.95354384],
       [0.04645616, 0.95354384]])

In [38]:
# Features of the last three training rows, to read against the last three
# probability rows displayed for the training set.
X_train.tail(3)

Unnamed: 0,isFemale,is3class,isChild,has SibSp,has ParCh,Cabin knowon
692,False,True,False,False,False,False
215,True,False,False,True,False,True
809,True,False,False,True,False,True


In [37]:
# True labels of the same last three training rows.
y_train.tail(3)

692    1
215    1
809    1
Name: Survived, dtype: int64

In [60]:
def getAgeClass(age):
    """Map an age to an ordinal age-range code.

    Codes
    -----
    0 : age is exactly 0, or missing (NaN / None)
    1 : age below 11
    2 : age from 11 up to, but not including, 55
    3 : age 55 or above

    Parameters
    ----------
    age : int or float
        A single age value. Negative values are not validated and land in
        code 1, as before.

    Returns
    -------
    int
        The age-range code.

    Notes
    -----
    Presumably 0 is the "age unknown" placeholder in the cleaned data
    (TODO confirm); missing values are mapped to the same code.
    """
    # NaN compares False against every bound, so without this guard a missing
    # age would fall through all branches and be labelled 3 (the oldest group).
    if pd.isna(age) or age == 0:
        return 0
    if age < 11:
        return 1
    if age < 55:
        return 2
    return 3
    

In [61]:
# Scratch check of ord() on a single-letter code; relates to the commented-out
# ordinal encoding of 'Embarked' in prepareMultinomialData.
ord('S')

83

In [106]:
def prepareMultinomialData(data):
    """Derive the count/ordinal feature table used by the multinomial model.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Survived', 'Sex', 'Pclass', 'Age',
        'SibSp' and 'Parch'.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds two boolean flags, the age-range code
        produced by ``getAgeClass`` and the raw 'SibSp' / 'Parch' values
        (indexed like ``data``), and ``y`` is a copy of ``data['Survived']``.
    """
    y = data['Survived'].copy()

    # Column name -> values, listed in the order the columns are created.
    # An ordinal 'Embarked' code and a cabin-known flag are left out here.
    features = {
        'isFemale': data['Sex'].eq('female'),
        'is3class': data['Pclass'].eq(3),
        'Age Range': data['Age'].apply(getAgeClass),
        'SibSp': data['SibSp'],
        'ParCh': data['Parch'],
    }

    X = pd.DataFrame()
    for column, values in features.items():
        X[column] = values
    return (X, y)

In [107]:
# Multinomial feature set for all passengers, split with the same 70/30
# stratified, seeded scheme used for the Bernoulli features.
Xm, ym = prepareMultinomialData(data)
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    Xm,
    ym,
    test_size=0.3,
    stratify=ym,
    random_state=111,
)
(Xm_train.shape, ym_train.shape)

((623, 5), (623,))

In [108]:
# Fit multinomial naive Bayes with default settings on the all-passenger
# training split; fit() returns the estimator, so the calls are chained.
multNB = MultinomialNB().fit(Xm_train, ym_train)
# Mean accuracy on the held-out rows.
multNB.score(Xm_test, ym_test)

0.7723880597014925

In [109]:
# Same feature set, restricted to rows whose 'Sex' is 'male'.
# NOTE: on this subset the 'isFemale' column is False for every row, so it
# cannot separate the classes.
Xmf, ymf = prepareMultinomialData(data.loc[data['Sex'] == 'male'])
Xmf_train, Xmf_test, ymf_train, ymf_test = train_test_split(
    Xmf,
    ymf,
    test_size=0.3,
    stratify=ymf,
    random_state=111,
)
(Xmf_train.shape, ymf_train.shape)

((403, 5), (403,))

In [110]:
# Fit a fresh multinomial model on the male-only training split.
# NOTE: this rebinds `multNB`, so the model fitted on all passengers in the
# earlier cell is no longer reachable under that name.
multNB = MultinomialNB().fit(Xmf_train, ymf_train)
# Mean accuracy on the male-only held-out rows. This is a different test set
# from the all-passenger score, so the two numbers are not directly comparable.
multNB.score(Xmf_test, ymf_test)

0.8045977011494253