In [175]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, learning_curve, cross_val_score, validation_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, confusion_matrix, accuracy_score, classification_report

In [131]:
# import dataset
df = pd.read_csv('titanic_data/gender_submission.csv', sep=',', index_col='PassengerId')
test = pd.read_csv('titanic_data/test.csv', index_col='PassengerId')
train = pd.read_csv('titanic_data/train.csv', index_col='PassengerId')

In [132]:
df.sample(10)

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
1203,0
1193,0
1051,1
1211,0
1118,0
994,0
1132,1
1284,0
1296,0
1131,1


In [133]:
test.sample(5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1119,3,"McNeill, Miss. Bridget",female,,0,0,370368,7.75,,Q
968,3,"Miles, Mr. Frank",male,,0,0,359306,8.05,,S
1205,3,"Carr, Miss. Jeannie",female,37.0,0,0,368364,7.75,,Q
1006,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63.0,1,0,PC 17483,221.7792,C55 C57,S
1088,1,"Spedden, Master. Robert Douglas",male,6.0,0,2,16966,134.5,E34,C


In [134]:
train.sample(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
72,0,3,"Goodwin, Miss. Lillian Amy",female,16.0,5,2,CA 2144,46.9,,S
792,0,2,"Gaskell, Mr. Alfred",male,16.0,0,0,239865,26.0,,S
21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0,,S
825,0,3,"Panula, Master. Urho Abraham",male,2.0,4,1,3101295,39.6875,,S
566,0,3,"Davies, Mr. Alfred J",male,24.0,2,0,A/4 48871,24.15,,S


In [135]:
df.isna().sum()

Survived    0
dtype: int64

In [136]:
train.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [137]:
train.shape

(891, 11)

In [138]:
train['Age'].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [139]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


### Cabin and Age column

In [140]:
train['Age'] = train['Age'].fillna(round(np.mean(train['Age']), 2))

In [141]:
train['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [142]:
train['Cabin'] = train['Cabin'].fillna('T')

In [143]:
# changing column 
train['Cabin'] = train['Cabin'].astype(str).str[0]

In [144]:
dic = {'T':8, 'C':3, 'E':5, 'G':7, 'D':4, 'A':1, 'B':2, 'F':6}
train['Cabin'].replace(dic, inplace=True)

In [145]:
train['Cabin'].unique()

array([8, 3, 5, 7, 4, 1, 2, 6])

In [146]:
#apply to test
test['Age'] = test['Age'].fillna(round(np.mean(test['Age']), 2))
test['Cabin'] = test['Cabin'].fillna('T')
test['Cabin'] = test['Cabin'].astype(str).str[0]
test['Cabin'].unique()

array(['T', 'B', 'E', 'A', 'C', 'D', 'F', 'G'], dtype=object)

In [147]:
dic = {'T':8, 'C':3, 'E':5, 'G':7, 'D':4, 'A':1, 'B':2, 'F':6}
test['Cabin'].replace(dic, inplace=True)
test['Cabin'].unique()

array([8, 2, 5, 1, 3, 4, 6, 7])

In [148]:
test.isna().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        1
Cabin       0
Embarked    0
dtype: int64

In [149]:
test.loc[test['Fare'].isnull()]

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,8,S


In [150]:
test['Fare'] = test['Fare'].fillna(round(np.mean(test['Fare']), 2))
test.isna().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

### Sex column

In [151]:
train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [152]:
train['Sex'].replace(['male', 'female'], [1, 0], inplace=True)

In [153]:
test['Sex'].replace(['male', 'female'], [1, 0], inplace=True)
train['Sex'].unique()

array([1, 0])

In [154]:
# deleting columns
to_del = ['Name', 'SibSp', 'Parch', 'Ticket']
train.drop(to_del, axis=1, inplace = True)
test.drop(to_del, axis=1, inplace = True)

In [155]:
train.sample(5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
779,0,3,1,29.7,7.7375,8,Q
273,1,2,0,41.0,19.5,8,S
806,0,3,1,31.0,7.775,8,S
827,0,3,1,29.7,56.4958,8,S
513,1,1,1,36.0,26.2875,5,S


In [156]:
test.sample(5)

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1183,3,0,30.0,6.95,8,Q
1030,3,0,23.0,8.05,8,S
1172,3,0,23.0,8.6625,8,S
1223,1,1,39.0,29.7,1,C
1240,2,1,24.0,13.5,8,S


### Embarked 

In [157]:
train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [158]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [159]:
train['Embarked'].fillna('S', inplace=True)

In [160]:
dic = {'S':1, 'C':2, 'Q':3}
train['Embarked'].replace(dic, inplace=True)
train['Embarked'].unique()

array([1, 2, 3])

In [161]:
test['Embarked'].fillna('S', inplace=True)
dic = {'S':1, 'C':2, 'Q':3}
test['Embarked'].replace(dic, inplace=True)
test['Embarked'].unique()

array([3, 1, 2])

In [165]:
# train sets
y_train = train['Survived']
X_train = train.drop('Survived', axis=1)

# test sets 
y_test = df['Survived']
X_test = test

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((891, 6), (891,), (418, 6), (418,))

### Baseline

In [166]:
# normalizing 
mms = MinMaxScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.transform(X_test)

In [167]:
# instanciate model
model = KNeighborsClassifier(n_neighbors=10)

model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [168]:
y_pred = model.predict(X_test)

print(f'Train Score = {model.score(X_train, y_train):.3f}')
print(f'Test Score = {model.score(X_test, y_test):.3f}')

Train Score = 0.829
Test Score = 0.871


### Improving Baseline

In [192]:
k = np.arange(1, 25)

for i in k:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print(f'For k = {i}, Test score = {knn.score(X_train, y_train):.3f}, Train score = {knn.score(X_test, y_test):.3f}')


For k = 1, Test score = 0.983, Train score = 0.746
For k = 2, Test score = 0.883, Train score = 0.816
For k = 3, Test score = 0.881, Train score = 0.811
For k = 4, Test score = 0.862, Train score = 0.835
For k = 5, Test score = 0.873, Train score = 0.825
For k = 6, Test score = 0.854, Train score = 0.842
For k = 7, Test score = 0.847, Train score = 0.873
For k = 8, Test score = 0.835, Train score = 0.880
For k = 9, Test score = 0.837, Train score = 0.902
For k = 10, Test score = 0.829, Train score = 0.871
For k = 11, Test score = 0.833, Train score = 0.873
For k = 12, Test score = 0.831, Train score = 0.859
For k = 13, Test score = 0.826, Train score = 0.873
For k = 14, Test score = 0.825, Train score = 0.876
For k = 15, Test score = 0.826, Train score = 0.888
For k = 16, Test score = 0.822, Train score = 0.888
For k = 17, Test score = 0.825, Train score = 0.890
For k = 18, Test score = 0.817, Train score = 0.888
For k = 19, Test score = 0.816, Train score = 0.885
For k = 20, Test scor

In [196]:
# best model : k = 9
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(f'Test score = {knn.score(X_train, y_train):.3f}')
print(f'Train score = {knn.score(X_test, y_test):.3f}')

Test score = 0.837
Train score = 0.902


In [197]:

result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)

Confusion Matrix:
[[246  20]
 [ 21 131]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       266
           1       0.87      0.86      0.86       152

    accuracy                           0.90       418
   macro avg       0.89      0.89      0.89       418
weighted avg       0.90      0.90      0.90       418

