In [4]:
import csv
import sys
import numpy as np
import pandas as pd
import pprint

# NOTE(review): N is not referenced by any cell visible here; presumably it is the
# number of passenger rows in titanic_data.csv — TODO confirm, or remove if unused.
N = 887

In [119]:
class KNN:
    """k-nearest-neighbours classifier over rows shaped ``[label, feature, ...]``.

    Each training row carries its class label at index 0 (a non-negative
    integer, given either as a number or as a numeric string) followed by the
    feature values, each of which must be convertible to ``float``.
    """

    # Multiplier applied to the feature at index 1 (in both the query and the
    # training row) before the Euclidean distance is taken.
    # NOTE(review): presumably this up-weights a 0/1 flag so it is not dwarfed
    # by larger-valued features — TODO confirm the intent.
    _FEATURE_1_WEIGHT = 100

    def __init__(self, data):
        """Store the training rows; nothing is precomputed."""
        self.data = data

    def classify(self, example, k):
        """Predict the label of ``example`` by majority vote of its ``k`` nearest rows.

        Args:
            example: feature values in the same order as a training row's
                ``d[1:]``; must contain at least two features.
            k: number of neighbours that vote (positive integer). When the
                training set has fewer than ``k`` rows, only the rows that
                exist vote.

        Returns:
            ``'Survived'`` if the winning label is non-zero, else ``'Perished'``.
            A tied vote resolves to the smallest label.

        Raises:
            ValueError: if ``k`` is smaller than 1 or the training data is empty.

        Side effects:
            Sets ``self.k``, ``self.dists`` and ``self.labels`` to the
            neighbourhood used for this prediction.
        """
        if k < 1:
            raise ValueError('k must be a positive integer')
        self.k = k
        self.dists = []
        self.labels = []
        for d in self.data:
            dist = self._calc_distance(example, d)
            # Labels read from csv arrive as strings; np.bincount needs ints.
            label = int(float(d[0]))
            if len(self.dists) < self.k:
                # Neighbourhood not full yet: every row seen so far belongs in it.
                self.dists.append(dist)
                self.labels.append(label)
                continue
            # Neighbourhood full: evict the current farthest neighbour if this row is closer.
            index = int(np.argmax(self.dists))
            if dist < self.dists[index]:
                self.dists[index] = dist
                self.labels[index] = label
        if not self.labels:
            raise ValueError('cannot classify: training data is empty')
        prediction = np.bincount(self.labels).argmax()
        return 'Survived' if prediction else 'Perished'

    def _calc_distance(self, example, d):
        """Return the Euclidean distance between ``example`` and the features of row ``d``.

        ``d[0]`` (the label) is skipped. The feature at index 1 of both vectors
        is multiplied by ``_FEATURE_1_WEIGHT`` first. ``example`` is copied, so
        the caller's sequence is never modified.
        """
        ex = np.array(example, dtype=float)
        ex[1] *= self._FEATURE_1_WEIGHT
        dd = np.array([float(value) for value in d[1:]])
        dd[1] *= self._FEATURE_1_WEIGHT
        return np.linalg.norm(ex - dd)

In [6]:
# Load the raw passenger rows (header dropped) as lists of strings.
# The context manager closes the file even if parsing raises, and newline=''
# leaves newline handling to the csv module, as its documentation requires.
with open('titanic_data.csv', 'r', newline='') as f:
    r = csv.reader(f)
    next(r, None)  # skip the header row; the default avoids StopIteration on an empty file
    rows = list(r)

In [126]:
# Classify one fixed passenger with every odd neighbourhood size from 1 to 11.
# NOTE(review): `knn` is not created in any cell visible here — it must already
# exist in the kernel for this cell to run.
for k in range(1, 12, 2):
    prediction = knn.classify([2, 0, 25, 1, 0, 30], k=k)
    print(dict(k=k, prediction=prediction))

{'k': 1, 'prediction': 'Survived'}
{'k': 3, 'prediction': 'Survived'}
{'k': 5, 'prediction': 'Perished'}
{'k': 7, 'prediction': 'Perished'}
{'k': 9, 'prediction': 'Perished'}
{'k': 11, 'prediction': 'Perished'}


In [30]:
class NB:
    """Naive Bayes classifier for survival, mixing categorical and Gaussian features.

    ``data`` is a DataFrame with the columns ``Survived``, ``Pclass``, ``Sex``,
    ``Age``, ``Siblings/Spouses Aboard``, ``Parents/Children Aboard`` and
    ``Fare``. Categorical columns are compared against the integer values
    listed in ``_NOMINAL_VALUES``; ``Age`` and ``Fare`` are modelled with one
    Gaussian per class.

    After construction:
        prob_one / prob_zero: class priors for Survived == 1 / == 0.
        avgs_vars: per-class mean ('avg') and sample variance ('var') of Age
            and Fare, indexed by 'age_1', 'fare_1', 'age_0', 'fare_0'.
        nomials: Laplace-smoothed P(category | class), one row per
            '<column>_<value>', columns 'survive' and 'perish'.
    """

    # Categorical columns and the values each may take. The number of values
    # is also the Laplace-smoothing term added to the denominator.
    _NOMINAL_VALUES = {
        'Pclass': range(1, 4),
        'Sex': range(2),
        'Siblings/Spouses Aboard': range(9),
        'Parents/Children Aboard': range(7),
    }

    def __init__(self, data):
        self.data = data
        self.prob_one = 0
        self.prob_zero = 0
        self.avgs_vars = pd.DataFrame(0.0, index=['age_1', 'fare_1', 'age_0', 'fare_0'], columns=['avg', 'var'])
        self.nomials = self._init_nomials()

        self._calc_avgs()
        self._calc_vars()
        self._calc_nomials()

    def _init_nomials(self):
        """Return a zero-filled likelihood table with one row per category value."""
        index = [
            f'{column}_{value}'
            for column, values in self._NOMINAL_VALUES.items()
            for value in values
        ]
        return pd.DataFrame(0.0, index=index, columns=['survive', 'perish'])

    def _class_rows(self, survived):
        """Return the rows of ``self.data`` whose ``Survived`` equals ``survived``."""
        return self.data.loc[self.data['Survived'] == survived]

    def _calc_avgs(self):
        """Fill the 'avg' column of ``avgs_vars`` with per-class means of Age and Fare."""
        for survived in (1, 0):
            subset = self._class_rows(survived)
            self.avgs_vars.at[f'age_{survived}', 'avg'] = subset['Age'].mean()
            self.avgs_vars.at[f'fare_{survived}', 'avg'] = subset['Fare'].mean()

    def _calc_vars(self):
        """Fill the 'var' column of ``avgs_vars`` with per-class variances of Age and Fare.

        Uses pandas' default ``Series.var()`` (sample variance).
        """
        for survived in (1, 0):
            subset = self._class_rows(survived)
            self.avgs_vars.at[f'age_{survived}', 'var'] = subset['Age'].var()
            self.avgs_vars.at[f'fare_{survived}', 'var'] = subset['Fare'].var()

    def _calc_nomials(self):
        """Compute the class priors and the Laplace-smoothed categorical likelihoods."""
        ones = self._class_rows(1)
        zeros = self._class_rows(0)
        ones_size = len(ones)
        zeros_size = len(zeros)
        self.prob_one = ones_size / len(self.data)
        self.prob_zero = 1 - self.prob_one

        for column, values in self._NOMINAL_VALUES.items():
            n_values = len(values)
            for value in values:
                row = f'{column}_{value}'
                # Add-one smoothing: (count + 1) / (class size + number of categories).
                self.nomials.at[row, 'survive'] = ((ones[column] == value).sum() + 1) / (ones_size + n_values)
                self.nomials.at[row, 'perish'] = ((zeros[column] == value).sum() + 1) / (zeros_size + n_values)

    def _calc_gauss(self, index, val):
        """Return the Gaussian density at ``val`` for the ``avgs_vars`` row ``index``."""
        mean = self.avgs_vars.at[index, 'avg']
        var = self.avgs_vars.at[index, 'var']
        numer = - ((val - mean) ** 2) / (2 * var)
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(numer)

    def prediction(self, sample):
        """Classify one passenger.

        Args:
            sample: indexable as ``[Pclass, Sex, Age, Siblings/Spouses Aboard,
                Parents/Children Aboard, Fare]``. Categorical entries are
                truncated with ``int``; Age and Fare are used as floats.

        Returns:
            ``'Survived'`` if the survive score is strictly greater than the
            perish score, else ``'Perished'``.

        Raises:
            KeyError: if a categorical value is outside ``_NOMINAL_VALUES``.

        Side effects:
            Stores the unnormalised class scores on ``self.survive`` and
            ``self.perish``.
        """
        # Each outcome pairs its own prior with its own class's Gaussians.
        classes = {'survive': (self.prob_one, 1), 'perish': (self.prob_zero, 0)}
        for label, (prior, survived) in classes.items():
            pclass = self.nomials.at[f'Pclass_{int(sample[0])}', label]
            sex = self.nomials.at[f'Sex_{int(sample[1])}', label]
            sib = self.nomials.at[f'Siblings/Spouses Aboard_{int(sample[3])}', label]
            child = self.nomials.at[f'Parents/Children Aboard_{int(sample[4])}', label]
            # Age lives at index 2 (index 1 is Sex); keep it continuous rather than truncating.
            age = self._calc_gauss(f'age_{survived}', float(sample[2]))
            fare = self._calc_gauss(f'fare_{survived}', float(sample[5]))
            setattr(self, label, prior * pclass * sex * age * sib * child * fare)

        return 'Survived' if self.survive > self.perish else 'Perished'
        
        
        

In [27]:
# Re-load the dataset as a pandas DataFrame. This rebinds `rows`, which the
# earlier csv.reader cell populated as a list of string lists.
rows = pd.read_csv('titanic_data.csv')

In [31]:
# Build the Naive Bayes model; NB.__init__ computes the per-class means,
# variances and categorical likelihoods from the DataFrame immediately.
nb = NB(rows)

In [44]:
# Sample layout as indexed by NB.prediction: [0] Pclass, [1] Sex,
# [3] Siblings/Spouses Aboard, [4] Parents/Children Aboard, [5] Fare;
# [2] is presumably Age — TODO confirm.
x = [2, 0, 25, 1, 0, 30]
nb.prediction(x)

'Perished'