In [None]:
#https://www.kaggle.com/code/dennisbakhuis/titanic-k-nearest-neighbor-knn-frmscratch-0-813/notebook

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
df_test = pd.read_csv('/Users/Simon/OneDrive - Sistemas Expertos S.A.S/PC SIMON/Programacion/python/datasets/titanic/test.csv')
df_train = pd.read_csv('/Users/Simon/OneDrive - Sistemas Expertos S.A.S/PC SIMON/Programacion/python/datasets/titanic/train.csv')

#### Missing values

In [3]:
df_train['set'], df_test['set'] = 'train', 'test'
combined = pd.concat([df_train, df_test])

In [4]:
combined.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
set               0
dtype: int64

In [5]:
combined = combined.drop('Cabin',  axis=1)

In [6]:
pclass = combined.loc[combined.Fare.isnull(), 'Pclass'].values[0]
median_fare_pclass = combined.loc[combined.Pclass == pclass, 'Fare'].median()
combined.loc[combined.Fare.isnull(), 'Fare'] = median_fare_pclass

In [7]:
# Select everything before the . as title
combined['Title'] = combined['Name'].str.extract('([A-Za-z]+)\.', expand=True)
combined['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [8]:
title_reduction = {'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 
                   'Master': 'Master', 'Don': 'Mr', 'Rev': 'Rev',
                   'Dr': 'Dr', 'Mme': 'Miss', 'Ms': 'Miss',
                   'Major': 'Mr', 'Lady': 'Mrs', 'Sir': 'Mr',
                   'Mlle': 'Miss', 'Col': 'Mr', 'Capt': 'Mr',
                   'Countess': 'Mrs','Jonkheer': 'Mr',
                   'Dona': 'Mrs'}
combined['Title'] = combined['Title'].map(title_reduction)
combined['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Rev', 'Dr'], dtype=object)

In [9]:
for title, age in combined.groupby('Title')['Age'].median().items():
    print(title, age)
    combined.loc[(combined['Title']==title) & (combined['Age'].isnull()), 'Age'] = age

Dr 49.0
Master 4.0
Miss 22.0
Mr 30.0
Mrs 36.0
Rev 41.5


#### working with str values

In [10]:
s = (combined.dtypes == 'object')
object_cols = list(s[s].index)
print(object_cols)

['Name', 'Sex', 'Ticket', 'Embarked', 'set', 'Title']


In [11]:
#men and women to 0 and 1
combined['Sex'] = LabelEncoder().fit_transform(combined['Sex'])

In [12]:
combined['Title'] = LabelEncoder().fit_transform(combined['Title'])

In [13]:
#put fares into equal-sized bins
combined.loc[:, 'Age'] = pd.qcut(combined['Age'], 4, labels=False)
combined.loc[:, 'Fare'] = pd.qcut(combined['Fare'], 5, labels=False)

  combined.loc[:, 'Age'] = pd.qcut(combined['Age'], 4, labels=False)
  combined.loc[:, 'Fare'] = pd.qcut(combined['Fare'], 5, labels=False)


In [14]:
combined['Accompanied'] = combined['Ticket'].duplicated(keep=False).astype(int)
combined.loc[(combined['Accompanied']==1)|(combined['SibSp']>0)|(combined['Parch']>0), 'Accompanied'] = 1
combined.loc[(combined['Accompanied']==0)&(combined['SibSp']==0)&(combined['Parch']==0), 'Accompanied'] = 0

In [80]:
#gets less noise the accompanied option than the size of the family
#combined['family_size'] = combined['Parch'] + combined['SibSp']

In [15]:
combined.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked', 'set', 'Title', 'Accompanied'],
      dtype='object')

In [16]:
#scale columns
selected = ['Pclass', 'Sex', 'Age', 'Fare', 'Accompanied', 'Title']
scaler = StandardScaler()
scaler.fit(combined[selected])
combined[selected] = scaler.transform(combined[selected])

In [17]:
train = combined.loc[combined['set']=='train'].drop('set', axis=1).reset_index(drop=True)
test = combined.loc[combined['set']=='test'].drop('set', axis=1).reset_index(drop=True)

In [18]:
y_train = train.Survived

In [19]:
train = train.drop(['PassengerId', 'Survived', 'Name', 'SibSp', 'Parch', 'Ticket', 'Embarked'], axis=1)
test = test.drop(['PassengerId', 'Survived', 'Name', 'SibSp', 'Parch', 'Ticket', 'Embarked'], axis=1)

In [89]:
#without Embarked it increase from 75 to 77, this column contained noise and unesful info
#train = train.drop('Embarked', axis=1)
#test = test.drop('Embarked', axis=1)

In [20]:
test

Unnamed: 0,Pclass,Sex,Age,Fare,Title,Accompanied
0,0.841916,0.743497,0.646019,-1.392358,0.192495,-0.987096
1,0.841916,-1.344995,1.541823,-1.392358,1.484682,1.013072
2,-0.352091,0.743497,1.541823,-0.690279,0.192495,-0.987096
3,0.841916,0.743497,-0.249785,-0.690279,0.192495,-0.987096
4,0.841916,-1.344995,-1.145589,0.011800,1.484682,1.013072
...,...,...,...,...,...,...
413,0.841916,0.743497,-0.249785,-0.690279,0.192495,-0.987096
414,-1.546098,-1.344995,1.541823,1.415958,1.484682,1.013072
415,0.841916,0.743497,1.541823,-1.392358,0.192495,-0.987096
416,0.841916,0.743497,-0.249785,-0.690279,0.192495,-0.987096


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [22]:
train_X, val_X, train_y, val_y = train_test_split(train, y_train, test_size=0.33, random_state=0)

In [23]:
# KNN
#0.75
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(train_X, train_y)
predictions = knn.predict(val_X)
acc_score = knn.score(train_X, train_y) * 100
print(acc_score)
mse = mean_absolute_error(val_y, predictions)
rmse = mse**.5
print(mse)
print(rmse)

85.57046979865773
0.17627118644067796
0.41984662251907895


In [24]:
# Random Forest
rfc = RandomForestClassifier()
rfc.fit(train_X, train_y)
predictions = rfc.predict(val_X)
acc_score = rfc.score(train_X, train_y) * 100
print(acc_score)
mse = mean_absolute_error(val_y, predictions)
rmse = mse**.5
print(mse)
print(rmse)

86.24161073825503
0.18983050847457628
0.435695430862634


In [25]:
dtc = DecisionTreeClassifier()
dtc.fit(train_X, train_y)
predictions = dtc.predict(val_X)
acc_score = dtc.score(train_X, train_y) * 100
print(acc_score)
mse = mean_absolute_error(val_y, predictions)
rmse = mse**.5
print(mse)
print(rmse)

86.24161073825503
0.2
0.4472135954999579


In [24]:
predictions_test = knn.predict(test)

In [25]:
output = df_test['PassengerId'].copy()
output = pd.DataFrame(output)
output['Survived'] = predictions_test.astype(int)
#output = output.reset_index(drop=True)
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [59]:
#this output gets 77 percent of the grade
output.to_csv('C:/Users/Simon/OneDrive - Sistemas Expertos S.A.S/PC SIMON/Programacion/python/datasets/titanic/outputKnn.csv', index=False)