In [11]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

# Read data from .csv file.

In [30]:
# Load the passenger table from disk; the PassengerId column becomes the row index.
titanic_data = pd.read_csv(
    filepath_or_buffer='titanic.csv',
    index_col='PassengerId',
)
# Preview the first five rows.
titanic_data.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Use only 4 features: Pclass, Fare, Age, Sex

In [37]:
# Keep only the four model features (Pclass, Sex, Age, Fare) by dropping the
# target column and every feature that is not used.
data_to_process = titanic_data.drop(
    columns=["Survived", "Name", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"]
)

# Replace the categorical `Sex` column with its numerical representation.
le = preprocessing.LabelEncoder()
data_to_process["Sex"] = le.fit_transform(list(data_to_process["Sex"]))

# Delete all rows with a NaN Age.
# Collect the index *labels* (PassengerId values) of those rows directly instead of
# converting row positions to labels with "+ 1": that conversion is only correct
# while PassengerId is exactly 1..N in file order, and otherwise drops the wrong rows.
nan_idx = list(data_to_process.index[data_to_process["Age"].isna()])
data_to_process_not_nan = data_to_process.drop(index=nan_idx)

data_to_process_not_nan.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,1,22.0,7.25
2,1,0,38.0,71.2833
3,3,0,26.0,7.925
4,1,0,35.0,53.1
5,3,1,35.0,8.05


# Get survived column

In [36]:
# Target vector: take the Survived column first, then remove the passengers
# listed in nan_idx so it contains the same rows as the feature table.
data_survived = titanic_data["Survived"].drop(index=nan_idx)
print(data_survived)

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
886    0
887    0
888    1
890    1
891    0
Name: Survived, Length: 714, dtype: int64


# Convert the pandas data into the format used for fitting the model

In [42]:
# Feature matrix: one (Pclass, Sex, Age, Fare) tuple per passenger.
feature_columns = ["Pclass", "Sex", "Age", "Fare"]
X = list(data_to_process_not_nan[feature_columns].itertuples(index=False, name=None))
# Target vector: the Survived values as a plain Python list.
y = data_survived.tolist()

# Create and train decision tree

In [43]:
# Build the classifier with a fixed random_state so repeated runs give the same tree,
# then fit it on the feature tuples X and the survival labels y.
d_tree = DecisionTreeClassifier(random_state=241)
d_tree.fit(X=X, y=y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=241, splitter='best')

# Check importances

In [45]:
# Importances follow the column order used to build X: Pclass, Sex, Age, Fare.
feature_importances = d_tree.feature_importances_
print(feature_importances)

[0.14000522 0.30051221 0.2560461  0.30343647]


The two most important features are `Fare` (≈0.303) and `Sex` (≈0.301); the importances above are listed in the order `Pclass`, `Sex`, `Age`, `Fare`.