In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive (the google.colab module is only
# available when running inside Colab).
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: ignored

In [None]:
# Load the dataset into a DataFrame. The path is a placeholder and must be
# replaced with the real location of the CSV file before running.
titanic = pd.read_csv('.. PATH TO FILE ..')

In [None]:
# Preview the first rows to confirm the file loaded as expected.
titanic.head()

**Analysis: Continuous Features**

In [None]:
# Remove the identifier and text/categorical columns so that this part of the
# analysis only works with the remaining columns.
cat_feat = [
    'PassengerId',
    'Name',
    'Ticket',
    'Sex',
    'Cabin',
    'Embarked',
]
# inplace=True modifies `titanic` itself instead of returning a new DataFrame.
titanic.drop(columns=cat_feat, inplace=True)
titanic.head()

In [None]:
# Summary statistics for every remaining column. Assuming Survived is a 0/1
# flag (it is compared against 0 and 1 in the histogram cell), the mean of
# that column is the fraction of passengers who survived.
titanic.describe()

In [None]:
# Mean of every feature within each Survived group, to see which features
# differ between the groups.
titanic.groupby('Survived').mean()

In [None]:
# Compare feature means for rows where Age is missing (True) against rows
# where it is present (False), to check whether missingness looks related to
# the other columns.
titanic.groupby(titanic['Age'].isnull()).mean()

**PLOT CONTINUOUS FEATURES**

In [None]:
# Overlaid histograms of each continuous feature, split by outcome.
n_bins = 40
for i in ['Age', 'Fare']:
  # Non-missing values of this feature for each outcome group.
  died = list(titanic[titanic['Survived']==0][i].dropna())
  survived = list(titanic[titanic['Survived']==1][i].dropna())
  xmin = min(min(died), min(survived))
  xmax = max(max(died), max(survived))
  # Shared edges for n_bins equal-width bins spanning [xmin, xmax], so both
  # histograms are directly comparable and the largest value is included.
  # (The earlier `xmax - xmin / 40` divided only xmin by 40 because of
  # operator precedence, which made the step nearly as wide as the full range.)
  bins = np.linspace(xmin, xmax, n_bins + 1)
  # NOTE(review): sns.distplot is deprecated in recent seaborn releases in
  # favour of sns.histplot; kept here to match the rest of the notebook.
  sns.distplot(died, color='r', kde=False, bins=bins)
  sns.distplot(survived, color='g', kde=False, bins=bins)
  plt.legend(['Did not survive', 'Survived'])
  plt.title('Overlaid histogram for {}'.format(i))
  plt.show()

Each point on the plots below represents the fraction of people who survived for that category. The vertical bars represent the error of that estimate: categories with limited data show large vertical bars, while categories with plenty of data (a more confident estimate) show small vertical bars.

In [None]:
# Point plot of survival rate against each discrete numeric feature.
# sns.catplot is a figure-level function that creates its own figure, so no
# plt.figure() call is needed; calling it only produced extra empty figures.
for col in ['Pclass', 'SibSp', 'Parch']:
  sns.catplot(x=col, y='Survived', data=titanic, kind='point', aspect=2)

In [None]:
# SibSp and Parch both describe family members, so they are combined into a
# single family_cnt feature.
titanic['family_cnt'] = titanic['SibSp'].add(titanic['Parch'])
# Remove the two source columns to prevent multicollinearity (two or more
# features representing the same information).
titanic.drop(columns=['SibSp', 'Parch'], inplace=True)
sns.catplot(data=titanic, x='family_cnt', y='Survived', kind='point', aspect=2)

Fill the missing Age values with the mean age.

In [None]:
# Replace missing ages with the mean age. The result is assigned back to the
# column rather than calling fillna(inplace=True) on the selected column:
# that form is chained assignment, which emits a warning in pandas 2.x and
# leaves `titanic` unchanged when Copy-on-Write is enabled.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
# Number of missing values left in each column.
titanic.isnull().sum()

# **Analysis: Categorical Features**

In [None]:
# Load the dataset again so that the columns dropped during the
# continuous-feature analysis are available. The path is a placeholder;
# presumably it should point at the same raw CSV as the first load.
titanic = pd.read_csv('.. PATH TO FILE ..')

In [None]:
# Remove the continuous features for the categorical analysis. Name is
# removed as well because it has no meaning in this dataset.
cont_feat = [
    'PassengerId',
    'Pclass',
    'Name',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
titanic.drop(columns=cont_feat, inplace=True)
titanic.head()

**Explore Categorical Features**

In [None]:
# Column dtypes and non-null counts; used here to spot columns with missing
# values.
titanic.info()

We can see that Cabin and Embarked have missing values.

In [None]:
# Mean of the numeric columns for rows with a missing Cabin (True) versus a
# recorded Cabin (False). numeric_only=True is required because the frame
# still contains text columns (e.g. Sex, Cabin, Embarked), and mean() raises
# a TypeError on those in pandas >= 2.0.
titanic.groupby(titanic['Cabin'].isnull()).mean(numeric_only=True)

66% of people who had a non-missing Cabin value survived, whereas only 30% of people who had a missing Cabin value survived. Although about 80% of the Cabin values are null in the dataset, the feature has a strong correlation with the target variable (based on the above analysis). Hence it should not be deleted.

In [None]:
# Binary indicator: 1 when a Cabin value is recorded, 0 when it is missing.
titanic['Cabin_ind'] = np.where(titanic['Cabin'].notnull(), 1, 0)
titanic.head(n=10)

**Plotting Categorical Features**

In [None]:
# Point plot of survival rate for each categorical feature.
# sns.catplot is a figure-level function that creates its own figure, so no
# plt.figure() call is needed; calling it only produced extra empty figures.
for col in ['Cabin_ind', 'Sex', 'Embarked']:
  sns.catplot(x=col, y='Survived', data=titanic, kind='point', aspect=2)

In [None]:
# Pivot tables are used to explore relationships between multiple features.
# Here: number of non-null Survived entries for each Sex / Embarked pair.
titanic.pivot_table(
    values='Survived',
    index='Sex',
    columns='Embarked',
    aggfunc='count',
)

In [None]:
# Number of non-null Survived entries for each Cabin_ind / Embarked pair.
titanic.pivot_table(
    values='Survived',
    index='Cabin_ind',
    columns='Embarked',
    aggfunc='count',
)

Inference for the 3rd catplot (Embarked): Fewer people who boarded at Southampton (S) survived because most of them were men (see the Sex plot: men's survival rate is low), and most people who boarded at C survived because most of them had cabins (see the Cabin_ind plot: people without cabins had a low survival rate). Since Embarked is related to Sex and Cabin_ind, it does not contribute much additional information.

**Categorical Data Cleaning**

In [None]:
# Reload the dataset for cleaning and discard the Name and Ticket columns.
titanic = pd.read_csv('.. PATH TO FILE ..')
titanic.drop(columns=['Name', 'Ticket'], inplace=True)
titanic.head()

In [None]:
# Cabin indicator: 1 when a Cabin value is recorded, 0 when it is missing.
titanic['Cabin_ind'] = np.where(titanic['Cabin'].notnull(), 1, 0)
titanic.head()

In [None]:
# Encode Sex numerically using the lookup table below.
gender_num = {
    'male': 0,
    'female': 1,
}
titanic['Sex'] = titanic['Sex'].map(gender_num)
titanic.head()

In [None]:
# Cabin is now represented by Cabin_ind, and Embarked was judged to add
# little, so both columns are removed.
titanic.drop(columns=['Cabin', 'Embarked'], inplace=True)
titanic.head()

## **Evaluation**

**Train/Validation/Test Split**

In [None]:
# Load the dataset that will be split into train/validation/test sets.
# NOTE(review): this uses the same placeholder path as the raw loads above
# and rebinds `titanic`, so the cleaning done in the earlier cells is
# discarded unless this path points at a saved, cleaned copy -- confirm.
titanic = pd.read_csv('.. PATH TO FILE ..')

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
labels = titanic['Survived']
features = titanic.drop(columns=['Survived'])

# First hold out 40% of the entire data, then divide that hold-out in half
# to obtain the test and validation sets (60/20/20 overall).
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.4, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42
)

In [None]:
# Row counts in order: full dataset, train, validation, test.
print(*(len(part) for part in (labels, y_train, y_val, y_test)))