In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
trainData = pd.read_csv("./titanikData.csv")
testData = pd.read_csv("./titanikTest.csv", names=["pclass", "age", "gender", "survived"])

In [2]:
number = LabelEncoder()
trainData['survived'] = number.fit_transform(trainData['survived'])
trainData['gender'] = number.fit_transform(trainData['gender'])
trainData['age'] = number.fit_transform(trainData['age'])
trainData['pclass'] = number.fit_transform(trainData['pclass'])
trainData.head()

Unnamed: 0,pclass,age,gender,survived
0,0,0,1,1
1,0,0,1,1
2,0,0,1,1
3,0,0,1,1
4,0,0,1,1


In [3]:
testData['survived'] = number.fit_transform(testData['survived'])
testData['gender'] = number.fit_transform(testData['gender'])
testData['age'] = number.fit_transform(testData['age'])
testData['pclass'] = number.fit_transform(testData['pclass'])
testData.head()

Unnamed: 0,pclass,age,gender,survived
0,0,0,1,1
1,0,0,1,1
2,0,0,1,1
3,0,0,1,1
4,0,0,1,1


In [7]:
rounds = 3 #Initialize
models_algorithm = []
εt_algorithm = []
predictions_algorithm = []

In [6]:
weights_data = pd.DataFrame(trainData.survived.copy())
weights_data['weight'] = 1/len(trainData)
weights_data.drop('survived', axis=1)

Unnamed: 0,weight
0,0.000465
1,0.000465
2,0.000465
3,0.000465
4,0.000465
...,...
2145,0.000465
2146,0.000465
2147,0.000465
2148,0.000465


In [11]:
from sklearn.tree import DecisionTreeClassifier
for temp in range(0, rounds):
    features_train = trainData.drop(['survived'], axis = 1)
    target_train = trainData['survived'].where(trainData['survived'] == 1, 0)
    tree_model = DecisionTreeClassifier(criterion = "entropy", max_depth = 2)
    model = tree_model.fit(features_train, target_train, sample_weight=np.array(weights_data['weight']))
    models_algorithm.append(model)
    weights_data['hypothesis'] = model.predict(features_train)
    weights_data['is_hypothesis_incorrect'] = np.where(weights_data['hypothesis'] != trainData['survived'], 1, 0)
        
    epsilonT = np.sum(weights_data['weight'] * weights_data['is_hypothesis_incorrect'])
    if epsilonT > 0.5:
        continue

    betaT = np.log(1 / (epsilonT / (1 - epsilonT)))/2
    εt_algorithm.append(betaT)

    # recalculate weights 
    weights_data['weight'] *= np.exp(betaT * weights_data['is_hypothesis_incorrect'])

    # rescale weights
    weights_data['weight'].div(weights_data['weight'].sum())

In [12]:
X_test = testData.drop(['survived'], axis=1).reindex(range(len(testData)))
temp = []

for error, model in zip(εt_algorithm, models_algorithm):
    temp.append(error * model.predict(X_test))
predictions_algorithm = np.sign(np.sum(np.array(temp), axis=0))

In [15]:
testData['prediction'] = predictions_algorithm
testData['corrects_rows'] = (predictions_algorithm == testData.survived)
corrects_rows = np.sum(testData['corrects_rows'].astype(int))
print("Success: ",corrects_rows * 100 / len(testData.prediction),"%")

Success:  75.75757575757575 %


In [17]:
new_test_data = pd.read_csv("./titanikTest.csv", names=["pclass", "age", "gender", "survived"])
new_test_data['prediction'] = np.where(testData['prediction'] == 1, 'yes', 'no')
print(new_test_data)

   pclass    age gender survived prediction
0     1st  adult   male      yes        yes
1     1st  adult   male      yes        yes
2     1st  adult   male      yes        yes
3     1st  adult   male      yes        yes
4     1st  adult   male      yes        yes
..    ...    ...    ...      ...        ...
61    3rd  adult   male      yes         no
62    3rd  adult   male      yes         no
63    3rd  adult   male      yes         no
64    3rd  adult   male      yes         no
65    3rd  adult   male      yes         no

[66 rows x 5 columns]
