In [33]:
import seaborn as sns
import pandas as pd
from scipy.stats import ttest_ind
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix

In [34]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,


In [35]:
df_subset = df[['Pclass', 'Sex', 'SibSp', 'Parch', 'Age', 'Survived']]
df_subset = df.dropna() # Imputing is better, because na deletes the whole row
df_subset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,E46
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,G6
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,C103


In [36]:
dummies = pd.get_dummies(df_subset['Sex'])
dummies.head()

Unnamed: 0,female,male
1,1,0
3,1,0
6,0,1
10,1,0
11,1,0


In [37]:
df_subset = pd.concat([df_subset, dummies], axis=1)
df_subset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,female,male
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123,1,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,E46,0,1
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,G6,1,0
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,C103,1,0


In [38]:
X = df_subset[['Pclass', 'male', 'female', 'SibSp', 'Parch', 'Age', 'Survived']] 
X = normalize(X) 
y = df_subset['Survived'] 

In [39]:
X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(X, y, test_size=0.3, random_state=1)
# X_train_k.head()

In [40]:
knn = KNeighborsClassifier(n_neighbors=3) #create a KNN-classifier with 3 neighbors
knn = knn.fit(X_train_k, y_train_k) # Confusion matrix (Y, Y-p)
knn.score(X_test_k, y_test_k)

0.9464285714285714

In [23]:
y_test_p = knn.predict(X_test_k) # put predicted values
cm = confusion_matrix(y_test_k, y_test_p) # confunsion matrix, 
conf_matrix = pd.DataFrame(cm, index=['Not survived', 'survived'], columns = ['Not survived pre', 'survived pre']) # Not suvived is null and survived is 1
conf_matrix # the actual are always in the rows, and what is predicted alwaus in the colums

Unnamed: 0,Not survived pre,survived pre
Not survived,17,2
survived,1,36


In [24]:
# accuracy, precision and recall
surv_1 = conf_matrix.iloc[1,1]
surv_1

36

In [25]:
surv_2 = conf_matrix.iloc[1,0]
surv_2

1

In [26]:
not_surv_1 = conf_matrix.iloc[0,1]
not_surv_1

2

In [27]:
not_surv_2 = conf_matrix.iloc[0,0]
not_surv_2

17

In [28]:
# accuracy: how many of the cases do we get right?
accuracy = (not_surv_2 + surv_1) / (not_surv_2 + not_surv_1 + surv_2 + surv_1)
accuracy

0.9464285714285714

In [29]:
# precision: how many of the precited survivers, are actually survivers? 
# precision and recall area always connected to one of the outcomes, in this case we choose the icon survived: 
precision = surv_1 / (surv_1 + not_surv_1)
precision

0.9473684210526315

In [31]:
# recall: how many from the actual survivers do I actiually catch?
recall = surv_1 / (surv_2 + surv_1)
recall

0.972972972972973