In [75]:
import csv
import numpy as np
import copy
from scipy import stats
from scipy.stats import norm

In [2]:
def split_train_label(data):
    """Separate every row of ``data`` into its features and its label.

    The first element of a row is taken as the label; everything after it
    is taken as the feature vector.

    Args:
        data: iterable of indexable, sliceable rows (e.g. a list of lists).

    Returns:
        A ``(train_x, train_y)`` tuple. ``train_x`` holds ``row[1:]`` for each
        row and ``train_y`` holds the one-element list ``[row[0]]`` for each
        row, both in input order.
    """
    # Build (features, label) pairs in a single pass over the input.
    pairs = [(row[1:], [row[0]]) for row in data]
    train_x = [features for features, _ in pairs]
    train_y = [label for _, label in pairs]
    return train_x, train_y

In [4]:
# Load the Titanic CSV: the first row is the header, the rest are numeric rows.
# newline='' is the mode the csv module documents for files passed to csv.reader.
with open('titanic_data.csv', 'r', newline='') as file:
    rows = list(csv.reader(file))

if not rows:
    raise ValueError('titanic_data.csv is empty')

header = rows[0]
# Convert every field of every row using that row's own length (the previous
# version reused the first row's length for all rows). Blank lines, which
# csv.reader yields as empty lists, are skipped.
data = [[float(value) for value in row] for row in rows[1:] if row]

train_x, train_y = split_train_label(data)

# Convert all features into binary features

I used each feature's average as the threshold: values greater than or equal to the average become 1, and values below it become 0.

In [6]:
#binary conversion
# The thresholds are always computed from the raw float rows in `data`, never
# from an already-binarized `train_x`, so re-running this cell gives the same
# thresholds and the same binary features every time.
raw_x = [row[1:] for row in data]
if not raw_x:
    raise ValueError('no data rows to binarize')

# Per-feature mean, in column order (zip(*rows) transposes rows into columns).
binary_avg = [sum(column) / len(raw_x) for column in zip(*raw_x)]

# A feature becomes 1.0 when it is at or above its column mean, else 0.0.
train_x = [
    [1.0 if value >= avg else 0.0 for value, avg in zip(row, binary_avg)]
    for row in raw_x
]

print(binary_avg)

[2.305524239007892, 0.35400225479143177, 29.471443066516347, 0.5253664036076663, 0.3833145434047351, 32.30542018038328]


In [52]:
class KNN:
    """k-nearest-neighbours predictor using Manhattan (L1) distance.

    ``predict`` returns the mean label of the ``k_parameter`` training rows
    closest to the query. Ties in distance are broken by training-row order
    (earlier rows win).
    """

    def __init__(self,train_x,train_y,k_parameter = 10): # input can be k
        """Store the training set and the neighbourhood size.

        Args:
            train_x: 2-D array-like of shape (n_samples, n_features).
            train_y: array-like of labels with one entry (or one row) per sample.
            k_parameter: number of neighbours to average; must be >= 1.

        Raises:
            ValueError: if ``train_x`` is empty or not 2-D, if ``train_y`` has a
                different number of rows, or if ``k_parameter`` is below 1.
        """
        self.train_x = np.array(train_x)
        self.train_y = np.array(train_y)
        if self.train_x.ndim != 2 or len(self.train_x) == 0:
            raise ValueError("train_x must be a non-empty 2-D array-like (n_samples, n_features)")
        if len(self.train_y) != len(self.train_x):
            raise ValueError("train_x and train_y must contain the same number of rows")
        if k_parameter < 1:
            raise ValueError("k_parameter must be at least 1")
        self.k_parameter = k_parameter

    def predict(self,test_x):
        """Return the mean label of the nearest training rows to ``test_x``.

        Args:
            test_x: array-like feature vector with one value per feature.

        Returns:
            The sum of the selected labels divided by the number of neighbours
            actually used (an array with the shape of one ``train_y`` row).
        """
        # np.asarray already gives an independent array for list input, so no
        # deepcopy is needed; the query is never modified.
        test_x = np.asarray(test_x)
        # Manhattan distance from the query to every training row at once.
        distances = np.abs(self.train_x - test_x).sum(axis=1)
        # Never average over more neighbours than exist: dividing by a k larger
        # than the training set would shrink the mean towards zero.
        k = min(self.k_parameter, len(self.train_x))
        # A stable sort keeps equal-distance rows in training order.
        nearest = np.argsort(distances, kind='stable')[:k]
        return self.train_y[nearest].sum(axis=0) / k

    def get_difference(self,distance_element):
        """Return the distance part of an ``(index, distance)`` pair.

        Kept for backward compatibility; ``predict`` no longer needs it.
        """
        return distance_element[1]

In [53]:
# Build a KNN predictor on the binarized training set with the default k_parameter.
predictor = KNN(train_x,train_y)

In [66]:
# Predict for one binary feature vector; the result is the mean label of the nearest training rows.
predictor.predict([0,0,0,0,0,1])

array([0.3])

In [72]:
# Compare the prediction for one fixed query across several neighbourhood sizes.
query = [0, 0, 0, 0, 0, 1]
result = []
for i in (1, 2, 3, 4, 5, 10, 20, 50, 100, 200):
    predictor = KNN(train_x, train_y, k_parameter=i)
    result.append(predictor.predict(query))
print(result)

[array([1.]), array([0.5]), array([0.33333333]), array([0.25]), array([0.2]), array([0.3]), array([0.25]), array([0.26]), array([0.3]), array([0.29])]


In [69]:
# Scratch check: np.subtract works element-wise on two equal-length arrays.
np.subtract(np.array([0,1,3]),np.array([0,1,0]))

array([0, 0, 3])

# Naive Bayes

In [89]:
# Reload the raw (non-binarized) Titanic values for the Naive Bayes section.
# newline='' is the mode the csv module documents for files passed to csv.reader.
with open('titanic_data.csv', 'r', newline='') as file:
    rows = list(csv.reader(file))

if not rows:
    raise ValueError('titanic_data.csv is empty')

header = rows[0]
# Convert every field of every row using that row's own length (the previous
# version reused the first row's length for all rows). Blank lines, which
# csv.reader yields as empty lists, are skipped.
data = [[float(value) for value in row] for row in rows[1:] if row]

train_x, train_y = split_train_label(data)

In [125]:
class NaiveBayes:
    """Two-class (labels 0 and 1) naive Bayes with mixed feature types.

    Features listed in ``categorical_levels`` are scored with Laplace-smoothed
    frequency estimates; every other feature is scored with a Gaussian density
    fitted to that class's values. Rows with any other label are ignored.
    """

    # Floor applied to variances so a constant feature column does not cause a
    # division by zero (nan/inf likelihood) in the Gaussian density.
    MIN_VARIANCE = 1e-9
    # Feature index -> number of distinct levels used in the Laplace denominator.
    # Defaults match the original hard-coded layout: feature 0 (pclass) has 3
    # levels and feature 1 (gender) has 2.
    DEFAULT_CATEGORICAL_LEVELS = {0: 3, 1: 2}

    def __init__(self,train_x,train_y,categorical_levels = None):
        """Store the training data.

        Args:
            train_x: 2-D array-like of shape (n_samples, n_features).
            train_y: array-like of 0/1 labels, one entry (or one-element row)
                per sample.
            categorical_levels: optional mapping ``feature index -> number of
                levels`` for categorical features; defaults to
                ``DEFAULT_CATEGORICAL_LEVELS``.
        """
        self.train_x = np.array(train_x)
        self.train_y = np.array(train_y)
        if categorical_levels is None:
            categorical_levels = self.DEFAULT_CATEGORICAL_LEVELS
        self.categorical_levels = dict(categorical_levels)

    def normal_pdf(self,average,variance,test_value):
        """Gaussian density of ``test_value`` for the given mean and variance.

        Variances below ``MIN_VARIANCE`` are raised to that floor so the result
        stays finite.
        """
        variance = np.maximum(variance, self.MIN_VARIANCE)
        likelihood = 1/(np.sqrt(2 * np.pi * variance)) * np.exp( - (test_value - average)**2 / (2 * variance))
        return(likelihood)

    def predict(self,test_x,verbose = True):
        """Return the unnormalized posterior of each class for one sample.

        Args:
            test_x: array-like feature vector with one value per feature.
            verbose: when True (default, matching earlier behaviour) print each
                feature index with its per-class likelihood.

        Returns:
            ``[p0, p1]`` where ``p_c`` is the class prior multiplied by every
            feature likelihood for class ``c``. A class with no training rows
            gets 0.0.

        Raises:
            ValueError: if the training arrays are inconsistent or contain no
                rows labelled 0 or 1.
        """
        test_x = np.asarray(test_x)
        labels = np.ravel(self.train_y)
        if self.train_x.ndim != 2 or len(labels) != len(self.train_x):
            raise ValueError("train_x must be 2-D with exactly one label per row")

        # divide data into two class
        class_rows = [self.train_x[labels == label] for label in (0, 1)]
        total = sum(len(rows) for rows in class_rows)
        if total == 0:
            raise ValueError("training data contains no rows labelled 0 or 1")

        class_probability = [0.0, 0.0]
        for class_idx, rows in enumerate(class_rows): # go over all class
            if len(rows) == 0:
                # Prior is zero, so the product is zero; there is also nothing
                # to estimate a mean or variance from.
                continue
            p_y = len(rows) / total
            for feature in range(self.train_x.shape[1]):
                column = rows[:, feature]
                levels = self.categorical_levels.get(feature)
                if levels is not None:
                    # Laplace (add-one) smoothing over the feature's levels.
                    count = np.count_nonzero(column == test_x[feature])
                    likelihood = (count + 1) / (len(rows) + levels)
                else: # continuous
                    likelihood = self.normal_pdf(column.mean(), column.var(), test_x[feature])
                if verbose:
                    print(feature, likelihood)
                p_y = p_y * likelihood
            class_probability[class_idx] = float(p_y)
        return class_probability

In [126]:
# Build the Naive Bayes predictor on the freshly reloaded training data.
NB_predictor = NaiveBayes(train_x, train_y)

In [127]:
# Score one sample: prints each feature's likelihood per class and returns the
# prior-times-likelihood product for class 0 and class 1.
NB_predictor.predict([1.0, 1.0, 35.0, 1.0, 0.0, 53.1])

0 0.1478102189781022
1 0.14990859232175502
2 0.027022744380896704
3 0.29139931447447853
4 0.4459566065935878
5 0.007830412140269565
0 0.39710144927536234
1 0.6802325581395349
2 0.024939428202702174
3 0.4275361270460614
4 0.4315675017672968
5 0.005984177238311291


[3.743683030522327e-07, 2.867965121877786e-06]

In [94]:
# Scratch check: calling norm(0, 1) returns a frozen distribution object (see output), not a number.
norm(0,1)

<scipy.stats._distn_infrastructure.rv_frozen at 0x7fd90b7f5d30>

In [85]:
import math

In [128]:
# Inspect one label entry; each entry of train_y is a one-element list.
train_y[3]

[1.0]

In [99]:
# Add up the label column; with 0/1 labels this is the number of rows labelled 1.
count = 0
for label_row in train_y:
    count = count + label_row[0]

print(count)

342.0


In [101]:
# Scratch check: the variance of identical values is 0.0, the degenerate case for a Gaussian likelihood.
np.var([1,1])

0.0