In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Read the Titanic CSV (path is relative to the current working directory).
dataset = pd.read_csv("titanic.csv")

In [3]:
# Preview the first five rows of the raw table.
dataset.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Column dtypes and non-null counts, to see which columns have missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   float64
 1   survived   1309 non-null   float64
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   float64
 6   parch      1309 non-null   float64
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(7), object(7)
memory usage: 143.4+ KB


# Some preprocessing and EDA

In [5]:
# Remove the name, boat, home.dest and body columns before modelling.
dataset = dataset.drop(columns=['name', 'boat', 'home.dest', 'body'])

In [6]:
# Count the missing values in each remaining column.
dataset.isna().sum()

pclass         1
survived       1
sex            1
age          264
sibsp          1
parch          1
ticket         1
fare           2
cabin       1015
embarked       3
dtype: int64

In [7]:
# Missing cabin entries become their own explicit category instead of NaN.
dataset = dataset.assign(cabin=dataset['cabin'].fillna('Not Specified'))

In [8]:
# Most frequent age among the non-missing values (NaNs are ignored).
dataset['age'].mode(dropna=True)

0    24.0
dtype: float64

In [9]:
# Impute missing ages with the most frequent observed age. The value is
# computed from the data rather than hard-coded, so the cell stays correct if
# the CSV changes. Series.mode() returns its values sorted, so .iloc[0] picks
# the smallest one when several ages tie for most frequent.
age_mode = dataset['age'].mode().iloc[0]
dataset['age'] = dataset['age'].fillna(age_mode)

In [10]:
# Re-check the per-column missing counts after filling cabin and age.
dataset.isna().sum()

pclass      1
survived    1
sex         1
age         0
sibsp       1
parch       1
ticket      1
fare        2
cabin       0
embarked    3
dtype: int64

In [11]:
# Drop every row that still has a missing value in any column.
dataset = dataset.dropna(axis=0, how='any')

In [12]:
# Confirm that no missing values remain.
dataset.isna().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
ticket      0
fare        0
cabin       0
embarked    0
dtype: int64

In [13]:
# Preview the cleaned table (first five rows).
dataset.head(n=5)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1.0,1.0,female,29.0,0.0,0.0,24160,211.3375,B5,S
1,1.0,1.0,male,0.9167,1.0,2.0,113781,151.55,C22 C26,S
2,1.0,0.0,female,2.0,1.0,2.0,113781,151.55,C22 C26,S
3,1.0,0.0,male,30.0,1.0,2.0,113781,151.55,C22 C26,S
4,1.0,0.0,female,25.0,1.0,2.0,113781,151.55,C22 C26,S


In [19]:
# One-hot encode the string-valued columns; numeric columns pass through as-is.
# NOTE(review): this also turns each distinct `ticket` and `cabin` string into
# its own indicator column — confirm that such a wide, sparse encoding is intended.
dataset = pd.get_dummies(data=dataset)

In [20]:
# Preview the encoded table (first five rows).
dataset.head(n=5)

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_female,sex_male,ticket_110152,ticket_110413,...,cabin_F2,cabin_F33,cabin_F38,cabin_F4,cabin_G6,cabin_Not Specified,cabin_T,embarked_C,embarked_Q,embarked_S
0,1.0,1.0,29.0,0.0,0.0,211.3375,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1.0,1.0,0.9167,1.0,2.0,151.55,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1.0,0.0,2.0,1.0,2.0,151.55,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1.0,0.0,30.0,1.0,2.0,151.55,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1.0,0.0,25.0,1.0,2.0,151.55,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
# Target vector: the `survived` column. Feature matrix: every other column.
y = dataset['survived']
X = dataset.drop(columns='survived')

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Decision Trees

In [29]:
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from mlxtend.evaluate import bias_variance_decomp

# Decision tree fitted on the full training split; `y_pred` holds its
# predictions for the held-out rows.
clf_dt = DecisionTreeClassifier(random_state=123)
clf_dt.fit(X_train.values, y_train.values)
y_pred = clf_dt.predict(X_test.values)

# bias_variance_decomp() re-fits the estimator it receives on every bootstrap
# round. Passing an unfitted clone (same hyper-parameters and seed) keeps
# `clf_dt` as the model trained on the full training split above, instead of
# leaving it fitted on the last bootstrap resample.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    clone(clf_dt),
    X_train.values,
    y_train.values,
    X_test.values,
    y_test.values,
    loss='0-1_loss',
    random_seed=123,
)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)


Average expected loss: 0.206
Average bias: 0.190
Average variance: 0.083


# Random Forest Classifier

In [30]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Random forest limited to depth-2 trees, fitted on the full training split;
# `y_pred` holds its predictions for the held-out rows.
clf_RF = RandomForestClassifier(max_depth=2, random_state=0)
clf_RF.fit(X_train.values, y_train.values)
y_pred = clf_RF.predict(X_test.values)

# bias_variance_decomp() re-fits the estimator it receives on every bootstrap
# round. Passing an unfitted clone (same hyper-parameters and seed) keeps
# `clf_RF` as the model trained on the full training split above, instead of
# leaving it fitted on the last bootstrap resample.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    clone(clf_RF),
    X_train.values,
    y_train.values,
    X_test.values,
    y_test.values,
    loss='0-1_loss',
    random_seed=123,
)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

Average expected loss: 0.379
Average bias: 0.387
Average variance: 0.009
