In [None]:
import pandas as pd

In [None]:
# Read the train and test CSV files into pandas DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./titanic_data/train.csv', './titanic_data/test.csv')
)

In [None]:
# Preview the first 20 rows of the training frame.
train.head(20)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Apply seaborn's default styling to the figures drawn below.
sns.set()

In [None]:
def bar_chart(feature):
    """Plot a stacked bar chart of survivor / non-survivor counts per value of `feature`.

    Reads the module-level `train` frame. The value counts of `feature` are
    computed separately for rows with Survived == 1 and Survived == 0, stacked
    into a two-row frame, printed, and then drawn as a stacked bar chart.

    Parameters
    ----------
    feature : str
        Name of the column in `train` whose values are counted.
    """
    is_survivor = train['Survived'] == 1
    is_casualty = train['Survived'] == 0
    counts = pd.DataFrame([
        train.loc[is_survivor, feature].value_counts(),
        train.loc[is_casualty, feature].value_counts(),
    ])
    # Printed before the rows are relabelled, so the raw frame is what gets shown.
    print(counts)
    counts.index = ['Survived', 'Dead']
    counts.plot(kind='bar', stacked=True, figsize=(10, 5))

In [None]:
# Survived / dead counts for each value of the 'Sex' column.
bar_chart('Sex')

In [None]:
# Survived / dead counts for each value of the 'Pclass' column.
bar_chart('Pclass')

In [None]:
# Survived / dead counts for each value of the 'SibSp' column.
bar_chart('SibSp')

In [None]:
# Survived / dead counts for each value of the 'Parch' column.
bar_chart('Parch')

In [None]:
# Survived / dead counts for each value of the 'Embarked' column.
bar_chart('Embarked')

In [None]:
# Feature engineering: fill in values that are missing (e.g. NaN values in Cabin)
# and derive new features from the existing columns.

In [None]:
len(train_test_data)

In [None]:
train_test_data = [train, test]

for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

In [None]:
# Frequency of each extracted title in the training frame.
train['Title'].value_counts()

In [None]:
# Encode titles numerically: Mr, Miss and Mrs each get their own code and all
# other listed titles share code 3.
# "Mlle" replaces the misspelled key "Mile", which never matched an extracted
# title and left those rows as NaN; the duplicated "Mme" key is listed once.
title_mapping = {
    "Mr": 0, "Miss": 1, "Mrs": 2,
    "Master": 3, "Dr": 3, "Rev": 3, "Mlle": 3, "Col": 3, "Major": 3,
    "Jonkheer": 3, "Ms": 3, "Lady": 3, "Don": 3, "Dona": 3, "Mme": 3,
    "Capt": 3, "Countess": 3, "Sir": 3,
}
for dataset in train_test_data:
    # Series.map turns any title that is not a key of the mapping into NaN.
    dataset["Title"] = dataset["Title"].map(title_mapping)

In [None]:
# Preview the training frame after Title has been mapped to numeric codes.
train.head()

In [None]:
# Preview the test frame after Title has been mapped to numeric codes.
test.head()

In [None]:
# Survived / dead counts for each numeric Title code.
bar_chart("Title")

In [None]:
# Remove the Name column from the test frame. The drop is done in place so the
# same object referenced by train_test_data is updated.
test.drop("Name", axis=1, inplace=True)

In [None]:
# Preview the test frame now that Name has been removed.
test.head()

In [None]:
# Encode Sex as an integer code in both frames: male -> 0, female -> 1.
sex_mapping = {
    "male": 0,
    "female": 1,
}
for dataset in train_test_data:
    encoded_sex = dataset["Sex"].map(sex_mapping)
    dataset["Sex"] = encoded_sex

In [None]:
# Preview the test frame after Sex has been encoded.
test.head()

In [None]:
# Survived / dead counts for Sex again, now encoded as 0 (male) / 1 (female).
bar_chart("Sex")

In [None]:
# Fill missing ages with the median age of the rows sharing the same Title,
# computed separately within each frame.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `frame["Age"]`: that form is chained assignment,
# which warns on pandas 2.x and does not update the frame under Copy-on-Write.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))

In [None]:
# Kernel density of Age in the training frame, one curve per Survived value,
# with the x-axis running from 0 to the largest age.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill` —
# confirm against the installed version.
age_max = train["Age"].max()
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, age_max))
facet.add_legend()

plt.show()

In [None]:
# Same Age density plot, then the x-axis is narrowed to the 0-20 range.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill` —
# confirm against the installed version.
age_max = train['Age'].max()
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, age_max))
facet.add_legend()
plt.xlim(0, 20)

In [None]:
# Bin Age into five ordinal codes (upper edges inclusive):
#   0: <= 16   1: (16, 26]   2: (26, 36]   3: (36, 62]   4: > 62
# pd.cut derives every code from the original ages in a single pass, replacing
# the previous chain of .loc assignments whose trailing commas assigned
# 1-tuples such as (0,) instead of scalars. Missing ages stay NaN.
# Run this cell once: re-running it would bin the codes themselves.
age_edges = [float("-inf"), 16, 26, 36, 62, float("inf")]
for dataset in train_test_data:
    # labels=False yields the integer bin position; keep the column as float.
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_edges, labels=False).astype(float)

In [None]:
# Survived / dead counts for each Age bin code.
bar_chart("Age")

In [None]:
# Pclass1, Pclass2 and Pclass3 are pandas Series holding the Embarked value
# counts of the training rows in passenger class 1, 2 and 3 respectively.
Pclass1, Pclass2, Pclass3 = (
    train.loc[train["Pclass"] == pclass, "Embarked"].value_counts()
    for pclass in (1, 2, 3)
)
# One row per class, one column per Embarked value, drawn as stacked bars.
PclassDf = pd.DataFrame([Pclass1, Pclass2, Pclass3])
PclassDf.index = ["1st class", "2nd class", "3rd class"]
PclassDf.plot(kind="bar", stacked=True, figsize=(10, 5))

In [None]:
# Replace missing Embarked values with "S" in both frames.
for dataset in train_test_data:
    missing_port = dataset["Embarked"].isna()
    dataset.loc[missing_port, "Embarked"] = "S"

In [None]:
# Preview the training frame after missing Embarked values were filled.
train.head()

In [None]:
# Encode Embarked as an integer code in both frames: S -> 0, C -> 1, Q -> 2.
embark_map = {
    "S": 0,
    "C": 1,
    "Q": 2,
}
for dataset in train_test_data:
    encoded_port = dataset["Embarked"].map(embark_map)
    dataset["Embarked"] = encoded_port

In [None]:
# Preview the training frame after Embarked has been encoded.
train.head()

In [None]:
# Fill missing fares with the median fare of the rows in the same Pclass,
# computed separately within each frame.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `frame["Fare"]`: that form is chained assignment,
# which warns on pandas 2.x and does not update the frame under Copy-on-Write.
train["Fare"] = train["Fare"].fillna(train.groupby("Pclass")["Fare"].transform("median"))
test["Fare"] = test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"))

In [None]:
# Kernel density of Fare in the training frame, one curve per Survived value,
# with the x-axis running from 0 to the largest fare.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill` —
# confirm against the installed version.
fare_max = train['Fare'].max()
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Fare', shade=True)
facet.set(xlim=(0, fare_max))
facet.add_legend()

plt.show()

In [None]:
# Bin Fare into four ordinal codes (upper edges inclusive):
#   0: <= 17   1: (17, 30]   2: (30, 100]   3: > 100
# pd.cut derives every code from the original fares in a single pass, replacing
# the previous chain of .loc assignments whose trailing commas assigned
# 1-tuples such as (0,) instead of scalars. Missing fares stay NaN.
# Run this cell once: re-running it would bin the codes themselves.
fare_edges = [float("-inf"), 17, 30, 100, float("inf")]
for dataset in train_test_data:
    # labels=False yields the integer bin position; keep the column as float.
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_edges, labels=False).astype(float)

In [None]:
# Survived / dead counts for each Fare bin code.
bar_chart("Fare")

In [None]:
# Frequency of each raw Cabin value in the training frame.
train.Cabin.value_counts()

In [None]:
# Keep only the first character of each Cabin value in both frames.
for dataset in train_test_data:
    cabin_letter = dataset["Cabin"].str.slice(stop=1)
    dataset["Cabin"] = cabin_letter

In [None]:
# Survived / dead counts for each Cabin first character.
bar_chart("Cabin")

In [None]:
# Cabin value counts of the training rows in passenger class 1, 2 and 3.
Pclass1, Pclass2, Pclass3 = (
    train.loc[train["Pclass"] == pclass, "Cabin"].value_counts()
    for pclass in (1, 2, 3)
)
# One row per class, one column per Cabin value, drawn as stacked bars.
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ["1st class", "2nd class", "3rd class"]
df.plot(kind='bar', stacked=True, figsize=(10, 5))

In [None]:
# Map each Cabin letter to a numeric value, spaced 0.4 apart from A to T.
cabin_mapping = {
    "A": 0,
    "B": 0.4,
    "C": 0.8,
    "D": 1.2,
    "E": 1.6,
    "F": 2,
    "G": 2.4,
    "T": 2.8,
}
for dataset in train_test_data:
    encoded_cabin = dataset['Cabin'].map(cabin_mapping)
    dataset['Cabin'] = encoded_cabin

In [None]:
# Fill missing Cabin values with the median Cabin value of the rows in the
# same Pclass, computed separately within each frame.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `frame["Cabin"]`: that form is chained assignment,
# which warns on pandas 2.x and does not update the frame under Copy-on-Write.
train["Cabin"] = train["Cabin"].fillna(train.groupby("Pclass")["Cabin"].transform("median"))
test["Cabin"] = test["Cabin"].fillna(test.groupby("Pclass")["Cabin"].transform("median"))

In [None]:
# FamilySize = SibSp + Parch + 1, added as a new column to both frames.
train["FamilySize"] = train["SibSp"].add(train["Parch"]).add(1)
test["FamilySize"] = test["SibSp"].add(test["Parch"]).add(1)

In [None]:
# Kernel density of FamilySize in the training frame, one curve per Survived
# value; the x-axis starts at 0 and the y-axis is limited to 0-1.2.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill` —
# confirm against the installed version.
family_size_max = train['FamilySize'].max()
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'FamilySize', shade=True)
facet.set(xlim=(0, family_size_max))
facet.add_legend()
plt.xlim(0)
plt.ylim(0, 1.2)

In [None]:
# Map family sizes 1..11 to numeric values spaced 0.4 apart, starting at 0.
family_mapping = {
    1: 0,
    2: 0.4,
    3: 0.8,
    4: 1.2,
    5: 1.6,
    6: 2,
    7: 2.4,
    8: 2.8,
    9: 3.2,
    10: 3.6,
    11: 4,
}
for dataset in train_test_data:
    scaled_family = dataset['FamilySize'].map(family_mapping)
    dataset['FamilySize'] = scaled_family

In [None]:
# Preview the training frame after FamilySize has been mapped.
train.head()

In [None]:
# Drop Ticket, SibSp and Parch from both frames (SibSp and Parch were already
# combined into FamilySize). PassengerId is dropped from train only; test keeps it.
# `train` and `test` are rebound to new frames here, so train_test_data still
# refers to the earlier objects.
features_drop = ["Ticket", "SibSp", "Parch"]
test = test.drop(columns=features_drop)
train = train.drop(columns=features_drop).drop(columns=['PassengerId'])

In [None]:
# Preview the training frame after the unused columns were dropped.
train.head()

In [None]:
# Remove the text column Name from the training frame so that only numeric
# columns remain for the classifier (Title was already extracted from it).
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# errors="ignore" keeps the cell safe to re-run once Name is gone.
train = train.drop("Name", axis=1, errors="ignore")
train.head()

In [None]:
# Separate the Survived label from the remaining feature columns.
target = train['Survived']
train_data = train.drop(columns=['Survived'])

In [None]:
# Importing Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import numpy as np

In [None]:
# Summarise the columns, dtypes and non-null counts of the training frame.
train.info()

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation splitter; rows are shuffled with a fixed seed so the
# fold assignment is the same on every run.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [None]:
# Score a 13-nearest-neighbours classifier with the k_fold splitter and print
# the accuracy obtained on each fold.
scoring = 'accuracy'
clf = KNeighborsClassifier(n_neighbors=13)
score = cross_val_score(
    clf,
    train_data,
    target,
    scoring=scoring,
    cv=k_fold,
    n_jobs=1,
)
print(score)