In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import copy
import math
import random
import time
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
data = pd.read_csv('train_clean.csv', index_col=0)
y = data['Survived'].copy()
X = data.drop(['PassengerId', 'Survived', 'Name'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=111)
X_train.shape, X_test.shape

((623, 9), (268, 9))

In [3]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
680,3,female,0.0,0,0,330935,8.1375,Unknown,Q
810,3,male,26.0,0,0,3474,7.8875,Unknown,S
727,3,female,0.0,0,0,36866,7.7375,Unknown,Q
775,3,male,18.0,0,0,347078,7.75,Unknown,S
795,2,male,39.0,0,0,28213,13.0,Unknown,S


In [4]:
def safeLog(x):
    if x == 0:
        return math.log(0.1)
    else:
        return math.log(x)

In [5]:
X_train['Age'].max()

71.0

In [14]:
def metric1(x1, x2, params):
    distance = 0
    #Pclass
    distance += abs(x1[0] - x2[0])/ 2
    #Sex
    distance += (x1[1] != x2[1]) * params[0]
    #Age
    if (x1[2] * x2[2] == 0) & (x1[2] != x2[2]):
        distance += params[1]
    else:
        distance += (abs(x1[2] - x2[2])/71.0) 
    #SibSp
    if x1[3] == x2[3]:
        pass
    elif x1[3] * x2[3] == 0:
        distance += 1
    else:
        distance += 0.75
    #Parch
    if x1[4] == x2[4]:
        pass
    elif x1[4] * x2[4] == 0:
        distance += 1
    else:
        distance += abs(x1[4] - x2[4]) / params[2]
    #Ticket
    distance += (x1[5] != x2[5]) 
    #Fare
    distance += abs(safeLog(x1[6]) - safeLog(x2[6]))/params[3]
    #Cabin
    if (x1[7] != x2[7]):
        distance += 1
    elif x1[7] == 0:
        distance += 0.3
    #Embarked
    distance += (x1[5] != x2[5])/100.0
    return distance
    

In [7]:
X_train.tail(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
437,2,female,24.0,2,3,29106,18.75,Unknown,S
36,3,male,0.0,0,0,2677,7.2292,Unknown,C
692,3,male,0.0,0,0,1601,56.4958,Unknown,S
215,1,female,31.0,1,0,35273,113.275,D36,C
809,1,female,33.0,1,0,113806,53.1,E8,S


In [8]:
def cabinToNumber(cabin):
    if cabin == 'Unknown':
        return 0
    else:
        return hash(cabin)

def toNumbers(data):
    X = data.copy()
    X['Sex'] = X['Sex'] == 'female'
    X['Ticket'] = X['Ticket'].apply(hash)
    X['Cabin'] = X['Cabin'].apply(cabinToNumber)
    X['Embarked'] = X['Embarked'].apply(ord)
    return X
    

In [9]:
X1_train = toNumbers(X_train)
X1_test = toNumbers(X_test)
X1_train.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
437,2,True,24.0,2,3,2329843637095374628,18.75,0,83
36,3,False,0.0,0,0,7177476274700848360,7.2292,0,67
692,3,False,0.0,0,0,9021645250135126965,56.4958,0,83
215,1,True,31.0,1,0,-3551648163321888517,113.275,8790039787250679278,67
809,1,True,33.0,1,0,1080942435171150265,53.1,-7273539201955125537,83


In [15]:
clf = KNeighborsClassifier(n_neighbors=9, algorithm='brute', metric=lambda a,b: metric1(a,b,[5.0, 0.1, 10.0, 3.0]))
clf.fit(X1_train, y_train)
clf.score(X1_test, y_test)

0.8134328358208955

In [70]:
y_pred = clf.predict(X1_test)

In [11]:
y_proba = clf.predict_proba(X1_test)
y_proba[:5]

array([[1.        , 0.        ],
       [0.88888889, 0.11111111],
       [0.44444444, 0.55555556],
       [1.        , 0.        ],
       [0.        , 1.        ]])

In [33]:
metricParams = []
for a in range(5,6):
    for b in [0.1]:
        for c in range(10,12,5):
            for d in range(3, 4):
                metricParams.append(lambda x1,x2: metric1(x1, x2, [a, b, c, d]))

In [36]:
start = time.clock()
knn = KNeighborsClassifier(algorithm='brute')
parameters ={'n_neighbors': range(9,15), 'weights': ['uniform', 'distance'], 'metric': metricParams}
clf1 = GridSearchCV(knn, parameters, cv=5)
clf1.fit(X1_train, y_train)
means = clf1.cv_results_['mean_test_score']
stds = clf1.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf1.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print('Best parameters:', clf1.best_params_)
print(clf1.best_score_, accuracy_score(y_test, clf1.predict(X1_test)))
print('time spent: %.2f minutes' % ((time.clock() - start)/60))

0.791 (+/-0.065) for {'metric': <function <lambda> at 0x1a123e0f28>, 'n_neighbors': 9, 'weights': 'uniform'}
0.801 (+/-0.058) for {'metric': <function <lambda> at 0x1a123e0f28>, 'n_neighbors': 9, 'weights': 'distance'}
0.803 (+/-0.061) for {'metric': <function <lambda> at 0x1a123e0f28>, 'n_neighbors': 10, 'weights': 'uniform'}
0.801 (+/-0.072) for {'metric': <function <lambda> at 0x1a123e0f28>, 'n_neighbors': 10, 'weights': 'distance'}
0.798 (+/-0.084) for {'metric': <function <lambda> at 0x1a123e0f28>, 'n_neighbors': 11, 'weights': 'uniform'}
0.806 (+/-0.084) for {'metric': <function <lambda> at 0x1a123e0f28>, 'n_neighbors': 11, 'weights': 'distance'}
0.798 (+/-0.090) for {'metric': <function <lambda> at 0x1a123e0f28>, 'n_neighbors': 12, 'weights': 'uniform'}
0.803 (+/-0.078) for {'metric': <function <lambda> at 0x1a123e0f28>, 'n_neighbors': 12, 'weights': 'distance'}
0.796 (+/-0.092) for {'metric': <function <lambda> at 0x1a123e0f28>, 'n_neighbors': 13, 'weights': 'uniform'}
0.799 (+

In [37]:
knn_proba = clf1.predict_proba(X1_test)
type(knn_proba)

numpy.ndarray

In [38]:
knn_proba[:5]

array([[0.82210502, 0.17789498],
       [0.81726568, 0.18273432],
       [0.4472521 , 0.5527479 ],
       [1.        , 0.        ],
       [0.        , 1.        ]])