In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import math

## Functions

In [225]:
def create_answer_df(y_col):
    answer_df = pd.DataFrame({'PassengerId':test_df['PassengerId'], 'Survived':y_col}, index=False)
    return answer_df

In [11]:
df = pd.read_csv("datasets/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Pre-Processing of data set

In [12]:
# check for number of na values in the dataset
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [14]:
# remove name and cabin column
df = df.drop(columns=["Name","Cabin", "Ticket"])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [15]:
# label encode gender column
genderEncoder = LabelEncoder()
genderEncoder.fit(df['Sex'])
sex_n = genderEncoder.transform(df['Sex'])

# replace col with label encoded column 
df['Sex'] = sex_n
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,S
1,2,1,1,0,38.0,1,0,71.2833,C
2,3,1,3,0,26.0,0,0,7.925,S
3,4,1,1,0,35.0,1,0,53.1,S
4,5,0,3,1,35.0,0,0,8.05,S


In [20]:
# label encode embarked column
embarkedEncoder = LabelEncoder()
embarkedEncoder.fit(df['Embarked'])
embarked_n = embarkedEncoder.transform(df['Embarked'])

# replace Embarked col with encoded col
df['Embarked'] = embarked_n
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [16]:
# fill all NAs for age with median 
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Age'].isna().sum()

np.int64(0)

In [17]:
# fill column 'embarked' NAs with mode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Embarked'].isna().sum()

np.int64(0)

In [39]:
# final column display after preprocessing 
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


### Model Selection 

In [26]:
# show correlation chart for survived
df.corr()['Survived'].sort_values()

Sex           -0.543351
Pclass        -0.338481
Embarked      -0.167675
Age           -0.064910
SibSp         -0.035322
PassengerId   -0.005007
Parch          0.081629
Fare           0.257307
Survived       1.000000
Name: Survived, dtype: float64

Based on the correlation chart, Sex and Pclass are chosen as the best two columns

### Logistic Regression Model 

##### Logistic regression is used as we need to classify passengers as survived or not survived 

In [117]:
lr = LogisticRegression(random_state=0)
lr.fit(df[['Sex', 'Pclass']].to_numpy(), df['Survived'].to_numpy()) 

#### Reports an accuracy of .76555

### Getting Test DF

In [110]:
test_df = pd.read_csv("datasets/test.csv")
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Testing Label Encoding on test df

In [119]:
# label encoding of sex
le = LabelEncoder()

In [120]:
sex_n = le.fit_transform(test_df['Sex'])

In [121]:
test_df['Sex'] = sex_n

In [122]:
x_col = test_df[['Sex', 'Pclass']]

In [123]:
y_col = lr.predict(x_col.to_numpy())

In [131]:
answer_df = pd.DataFrame({'PassengerId':test_df['PassengerId'], 'Survived':y_col})

In [132]:
answer_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [133]:
answer_df.shape

(418, 2)

In [134]:
answer_df.to_csv("datasets/answer.csv")

### K Nearest Neighbors

In [137]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


#### Best results from knn is observed when only 'Sex' and 'Pclass' columns are used 

In [191]:
# get train and test split
train_x, test_x, train_y, test_y = train_test_split(df[['Sex', 'Pclass']], df['Survived'])

In [192]:
train_x.head()

Unnamed: 0,Sex,Pclass
743,1,3
353,1,3
846,1,3
713,1,3
507,1,1


In [193]:
train_y.head()

743    0
353    0
846    0
713    0
507    1
Name: Survived, dtype: int64

In [194]:
df.shape

(891, 9)

In [195]:
math.sqrt(891)

29.8496231131986

In [196]:
# rule of thumb is to take k values as square root
k = 29

In [219]:
# evaluate all k possibilites
for i in range(3, k+1, 2):
    KNF = KNeighborsClassifier(n_neighbors=i)
    KNF.fit(train_x, train_y)
    y_pred = KNF.predict(test_x)
    print(i, ":",  accuracy_score(y_pred, test_y))

3 : 0.7937219730941704
5 : 0.8430493273542601
7 : 0.8430493273542601
9 : 0.8430493273542601
11 : 0.8161434977578476
13 : 0.8430493273542601
15 : 0.7937219730941704
17 : 0.7937219730941704
19 : 0.7937219730941704
21 : 0.7937219730941704
23 : 0.7937219730941704
25 : 0.7937219730941704
27 : 0.7937219730941704
29 : 0.7937219730941704


Best value of k is 9 with cols 'Sex' and 'Pclass'

### Prediction using K Nearest Neighbors

In [220]:
KNF = KNeighborsClassifier(n_neighbors=9)
KNF.fit(train_x, train_y)

In [221]:
y_pred = KNF.predict(test_df[['Sex', 'Pclass']])

In [222]:
answer_df = create_answer_df(y_pred)

In [223]:
answer_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [224]:
answer_df.to_csv("datasets/answer_knn.csv")