# Decision tree in general

In [None]:
import pandas as pd
import numpy as np
from sklearn import tree
%matplotlib inline

In [26]:
titanic_data = pd.read_csv('../Datasets/train.csv')
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Check for NaN

In [32]:
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [31]:
titanic_data.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

To calculate **Precision** use following formula $\Large\frac{TP}{TP + FP}$, and for **Recall** $\Large\frac{TN}{TN + FN}$. For example, there are 9 passengers of Titanic, 6 out of them survived and 3 didn't. Model marked 7 as survived and 2 dead. It means Precission $\Large\frac{6}{6 + 1} = 85%$, and Recall $\Large\frac{2}{2 + 0} = 100%$.


# Decision tree

I can predict probability of titanic passenger to survive. But some records have no info about age. In this notebook I will try to fill empty "age" values to improve accuracy of decision tree. In the end I will compare accuracy of following models:

1. DecisionTree with predicted (using DecisionTreeRegression) age values
2. DecisionTree where missing age values are filled with mean value
3. RandomForest with predicted (using DecisionTreeRegression) age values
4. RandomForest where missing age values are filled with mean value

In [18]:
#Libs

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

In [19]:
titanic_data = pd.read_csv("../Datasets/train.csv")
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Preprocessing

In [20]:
# Since type of Sex is object (string) and DecisionTree doesn't work with non number/boolean values i need to transform

titanic_data['Sex'] = np.where(titanic_data['Sex'] == 'female', 0, 1)

# Remove rows with NaN in age
rows_witn_unknown_age = titanic_data[np.isnan(titanic_data.Age)]
rows_witn_known_age = titanic_data[np.isnan(titanic_data.Age) == False]

## Train Decision Tree to predict age in the dataset

Here I use regressor rather than classifier to predict numeric value

In [21]:
# Split of data
X = rows_witn_known_age.get(['Survived', 'Sex', 'Pclass'])
Y = rows_witn_known_age.Age

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Learn ML model (DecisionTreeRegressor)
regressor_clf = DecisionTreeRegressor()
regressor_clf.fit(X_train, Y_train)

# Predcit age for missing rows
predicted_ages = regressor_clf.predict(rows_witn_unknown_age.get(['Survived', 'Sex', 'Pclass']))
# rows_witn_unknown_age.loc[:,'Age'] = predicted_ages
rows_witn_unknown_age.Age = predicted_ages
#
# # Create a titanic DataFrame with all data
titanic_data_with_age = rows_witn_known_age.merge(rows_witn_unknown_age, how='outer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


## Usage of DecisionTreeClassifier with predicted value in age column

In [22]:
X = titanic_data_with_age.get(['Pclass', 'Sex', 'Age'])
Y = titanic_data_with_age.Survived

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

grid_search_clf = GridSearchCV(DecisionTreeClassifier(), {'max_depth': [5, 7, 10, 13, 15]})
grid_search_clf.fit(X_train, Y_train)

predicted = grid_search_clf.best_estimator_.predict(X_test)

print(f'Precision {precision_score(Y_test, predicted)}')
print(f'Recall {recall_score(Y_test, predicted)}')
print(f'F1-score {f1_score(Y_test, predicted)}')
print(f'Accuracy {accuracy_score(Y_test, predicted)}')

Precision 0.8247422680412371
Recall 0.6896551724137931
F1-score 0.7511737089201878
Accuracy 0.8203389830508474


## Usage of DecisionTreeClassifier with median value in age column

In [23]:
titanic_data = pd.read_csv("../Datasets/train.csv")
titanic_data['Sex'] = np.where(titanic_data['Sex'] == 'female', 0, 1)

X = titanic_data.get(['Pclass', 'Sex', 'Age'])
Y = titanic_data.Survived

X = X.fillna({ 'Age': X.Age.median() })

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

grid_search_clf = GridSearchCV(DecisionTreeClassifier(), {'max_depth': [5, 7, 10, 13, 15]})
grid_search_clf.fit(X_train, Y_train)

predicted = grid_search_clf.best_estimator_.predict(X_test)

print(f'Precision {precision_score(Y_test, predicted)}')
print(f'Recall {recall_score(Y_test, predicted)}')
print(f'F1-score {f1_score(Y_test, predicted)}')
print(f'Accuracy {accuracy_score(Y_test, predicted)}')

Precision 0.8840579710144928
Recall 0.5083333333333333
F1-score 0.6455026455026456
Accuracy 0.7728813559322034


## Usage of RandomForest with median value in age column

In [24]:
titanic_data = pd.read_csv("../Datasets/train.csv")
titanic_data['Sex'] = np.where(titanic_data['Sex'] == 'female', 0, 1)

X = titanic_data.get(['Pclass', 'Sex', 'Age'])
Y = titanic_data.Survived

X = X.fillna({ 'Age': X.Age.median() })

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

grid_search_clf = GridSearchCV(RandomForestClassifier(), {'n_estimators': [10,20,30], 'max_depth': [5, 7, 10, 13, 15]})
grid_search_clf.fit(X_train, Y_train)

predicted = grid_search_clf.best_estimator_.predict(X_test)

print(f'Precision {precision_score(Y_test, predicted)}')
print(f'Recall {recall_score(Y_test, predicted)}')
print(f'F1-score {f1_score(Y_test, predicted)}')
print(f'Accuracy {accuracy_score(Y_test, predicted)}')

Precision 0.8478260869565217
Recall 0.65
F1-score 0.7358490566037735
Accuracy 0.8101694915254237


## Usage of RandomForestClassifier with predicted value in age column

In [25]:
titanic_data = pd.read_csv("../Datasets/train.csv")

X = titanic_data_with_age.get(['Pclass', 'Sex', 'Age'])
Y = titanic_data_with_age.Survived

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

grid_search_clf = GridSearchCV(RandomForestClassifier(), {'n_estimators': [10,20,30], 'max_depth': [5, 7, 10, 13, 15]})
grid_search_clf.fit(X_train, Y_train)

predicted = grid_search_clf.best_estimator_.predict(X_test)

print(f'Precision {precision_score(Y_test, predicted)}')
print(f'Recall {recall_score(Y_test, predicted)}')
print(f'F1-score {f1_score(Y_test, predicted)}')
print(f'Accuracy {accuracy_score(Y_test, predicted)}')

Precision 0.7614678899082569
Recall 0.7155172413793104
F1-score 0.7377777777777778
Accuracy 0.8
