In [556]:
#import all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold 
from sklearn.model_selection import train_test_split # to split the data
from sklearn.model_selection import cross_val_score # To score the data
k_fold = KFold(n_splits=10, shuffle=True, random_state=0) 




In [557]:
#Load Titanic data
df = pd.read_csv("titanic.csv")


In [558]:
df.head(20)
df.shape

(891, 12)

In [559]:
#Sepreate the data to two datasets
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0) # Did a 80/20 split for the train and test data

x_train.head(30)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
140,141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,
817,818,0,2,"Mallet, Mr. Albert",male,31.0,1,1,S.C./PARIS 2079,37.0042,
378,379,0,3,"Betros, Mr. Tannous",male,20.0,0,0,2648,4.0125,
491,492,0,3,"Windelov, Mr. Einar",male,21.0,0,0,SOTON/OQ 3101317,7.25,
331,332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124
588,589,0,3,"Gilinski, Mr. Eliezer",male,22.0,0,0,14973,8.05,
358,359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,
674,675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0.0,
162,163,0,3,"Bengtsson, Mr. John Viktor",male,26.0,0,0,347068,7.775,


In [560]:
x_train.shape 
x_test.shape

(179, 11)

In [561]:
#checking for null values
x_test.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             36
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          138
dtype: int64

In [562]:
#First making sure that the age category has no null values and grouping the ages groups from 1 -4
x_train["Age"] = x_train["Age"].fillna(x_train.groupby('Sex')['Age'].transform("median"))
x_test["Age"] = x_test["Age"].fillna(x_test.groupby('Sex')['Age'].transform("median"))


x_train['Age'] = x_train['Age'].apply(lambda x: 1 if x <= 18 else (2 if x <= 40 else (3 if x <= 60 else 4)))
x_test['Age'] = x_test['Age'].apply(lambda x: 1 if x <= 18 else (2 if x <= 40 else (3 if x <= 60 else 4)))



In [563]:
x_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
140,141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,2,0,2,2678,15.2458,
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,2,0,0,C.A. 18723,10.5,
817,818,0,2,"Mallet, Mr. Albert",male,2,1,1,S.C./PARIS 2079,37.0042,
378,379,0,3,"Betros, Mr. Tannous",male,2,0,0,2648,4.0125,
491,492,0,3,"Windelov, Mr. Einar",male,2,0,0,SOTON/OQ 3101317,7.25,
331,332,0,1,"Partner, Mr. Austen",male,3,0,0,113043,28.5,C124
588,589,0,3,"Gilinski, Mr. Eliezer",male,2,0,0,14973,8.05,
358,359,1,3,"McGovern, Miss. Mary",female,2,0,0,330931,7.8792,
674,675,0,2,"Watson, Mr. Ennis Hastings",male,2,0,0,239856,0.0,
162,163,0,3,"Bengtsson, Mr. John Viktor",male,2,0,0,347068,7.775,


In [564]:
x_train.isnull().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          549
dtype: int64

In [565]:
#Converting the age category to a useable numerical form
sex_mapping_values = {"male": 0, "female": 1}

x_train.replace({'Sex':sex_mapping_values}, inplace=True)
x_test.replace({'Sex':sex_mapping_values}, inplace=True)

  x_train.replace({'Sex':sex_mapping_values}, inplace=True)
  x_test.replace({'Sex':sex_mapping_values}, inplace=True)


In [566]:
features2drop = ['Name', 'Ticket', 'Fare', 'Cabin',"Survived", "Pclass"]  # After expereimenting, these were the features that need to be removed to give the best results

target = x_train["Survived"] #Target varibale to train and test on
x_train = x_train.drop(features2drop, axis=1)
x_train = x_train.drop(["PassengerId"], axis=1) # had to keep things sepete because it affect getting the passenger ID to keep track of the those who lived in the test



In [567]:
x_train.head()

Unnamed: 0,Sex,Age,SibSp,Parch
140,1,2,0,2
439,0,2,0,0
817,0,2,1,1
378,0,2,0,0
491,0,2,0,0


In [568]:
print(target)

140    0
439    0
817    0
378    0
491    0
      ..
835    1
192    1
629    0
559    1
684    0
Name: Survived, Length: 712, dtype: int64


In [569]:
#Creating the KNN alogorthim and getting the accuray by having a score
K_alogorthim = KNeighborsClassifier(n_neighbors =13)
scoring = "accuracy"
score = cross_val_score(K_alogorthim, x_train, target, cv = k_fold, n_jobs = 1, scoring = scoring)
print(score)

[0.70833333 0.80555556 0.73239437 0.81690141 0.83098592 0.83098592
 0.88732394 0.84507042 0.8028169  0.88732394]


In [570]:
#
useable_score = round(np.mean(score)*100, 2)
print(f'Our KNN classifier score is {useable_score} %')

Our KNN classifier score is 81.48 %


In [571]:
#Testing the alogrthim withthe test data
K_alogorthim.fit(x_train, target)

x_test = x_test.drop(features2drop, axis=1)
test_data = x_test.drop("PassengerId", axis=1).copy()

prediction = K_alogorthim.predict(test_data)


x_test.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch
495,496,0,2,0,0
648,649,0,2,0,0
278,279,0,1,4,1
31,32,1,2,1,0
255,256,1,2,0,2


In [572]:
#Chechking the results
prediction

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0])

In [573]:
#Creating a seperate file that has the results with the ID of the passengers that lived and died 
results = pd.DataFrame({
    'PassengerId': x_test['PassengerId'],
    'Survived': prediction
})
results.to_csv('titanic-submission1.csv', index=False)

In [574]:
#Checking the new file
submision = pd.read_csv('titanic-submission1.csv')
submision.head(30)


Unnamed: 0,PassengerId,Survived
0,496,0
1,649,0
2,279,0
3,32,1
4,256,1
5,299,0
6,610,1
7,319,1
8,485,0
9,368,1


In [575]:
number_of_Survivor = 0
for x in prediction:
    number_of_Survivor += x    
print(f"From the test data, the alogrthim predicts that there would be {number_of_Survivor} survivors from the test data ")

From the test data, the alogrthim predicts that there would be 64 survivors from the test data 
