In [21]:
import seaborn as sns
import sklearn as sk
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix


In [8]:
# Raw Titanic passenger table, read from a CSV in the working directory.
DATA_PATH = 'titanic.csv'
df = pd.read_csv(DATA_PATH)
df.head()  # preview the first rows to check that the file parsed as expected

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,


In [9]:
# Keep the target ('Survived') plus the four candidate predictors, and
# discard every passenger that has a missing value in any of them.
selected_columns = ['Survived', 'Sex', 'SibSp', 'Parch', 'Pclass']
df_subset = df[selected_columns].dropna()
df_subset.head()

Unnamed: 0,Survived,Sex,SibSp,Parch,Pclass
0,0,male,1,0,3
1,1,female,1,0,1
2,1,female,0,0,3
3,1,female,1,0,1
4,0,male,0,0,3


In [10]:
# One-hot encode 'Sex' into two indicator columns, 'female' and 'male'.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 would
# otherwise return True/False columns, which no longer matches the 0/1
# table this cell is meant to display.
dummies = pd.get_dummies(df_subset['Sex'], dtype=int)
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [11]:
# Append the indicator columns side by side with the selected features.
# axis="columns" (equivalently axis=1) aligns the two frames on their row
# index and adds new columns; axis=0 / "index" would stack them as extra rows.
df2 = pd.concat([df_subset, dummies], axis="columns")
df2.head()

Unnamed: 0,Survived,Sex,SibSp,Parch,Pclass,female,male
0,0,male,1,0,3,0,1
1,1,female,1,0,1,1,0
2,1,female,0,0,3,1,0
3,1,female,1,0,1,1,0
4,0,male,0,0,3,0,1


In [12]:
# The text column 'Sex' is now redundant with the female/male indicators.
# NOTE: re-running this cell on its own raises KeyError because 'Sex' is already gone.
df2 = df2.drop(columns="Sex")

In [14]:
# Preview the frame after dropping 'Sex': only numeric columns should remain.
df2.head()

Unnamed: 0,Survived,SibSp,Parch,Pclass,female,male
0,0,1,0,3,0,1
1,1,1,0,1,1,0
2,1,0,0,3,1,0
3,1,1,0,1,1,0
4,0,0,0,3,0,1


In [17]:
# Feature matrix: family-size counts, ticket class and the two sex indicators.
feature_columns = ['SibSp', 'Parch', 'Pclass', 'female', 'male']
# normalize() with its defaults rescales each ROW (one passenger) to unit L2
# length; it does not bring the individual columns onto a common scale.
# NOTE(review): if per-feature scaling was the intent, a scaler fitted on the
# training split would be the usual tool -- confirm before changing, since
# every score below depends on this step.
X = normalize(df2[feature_columns])
# Target vector: 1 = survived, 0 = did not survive.
y = df2['Survived']
# Hold out 30% of the passengers for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [39]:
# Fit a k-nearest-neighbours classifier with k = 3 on the training split.
# fit() returns the fitted estimator itself, so the call can be chained.
# NOTE(review): k = 3 is stated by the author to be the best-performing
# value; no tuning run is shown in this notebook -- confirm.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
# Mean accuracy on the held-out test split.
knn.score(X_test, y_test)

0.753731343283582

In [20]:
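# About 75% of the test-set passengers (survivors and non-survivors together) are classified correctly; this is overall accuracy, not the share of survivors that were found.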

In [28]:
# Predict survival for the held-out passengers, then cross-tabulate the
# predictions against the true labels.
y_test_pred = knn.predict(X_test)
# Rows are the actual classes and columns the predicted classes, both in
# sorted label order (0 = not survived, 1 = survived).
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
cm

array([[132,  21],
       [ 45,  70]], dtype=int64)

In [32]:
# Wrap the raw confusion matrix in a labelled table: the row labels are the
# actual outcomes, the column labels (suffix "_p") the predicted outcomes.
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Not Survived', 'Survived'],
    columns=['Not survived_p', 'Survived_p'],
)
conf_matrix

Unnamed: 0,Not survived_p,Survived_p
Not Survived,132,21
Survived,45,70


In [33]:
Recall = 132 / (132 + 21) 
Recall

0.8627450980392157

In [34]:
Precision = 132 / (132 + 45)
Precision

0.7457627118644068