## Import the relevant libraries

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import precision_score, recall_score, accuracy_score
import graphviz
from IPython.display import display, Image

## Read the data

In [2]:
# Load the Titanic passenger table from the hosted CSV and preview the first rows.
TITANIC_CSV_URL = 'https://sololearn.com/uploads/files/titanic.csv'
data = pd.read_csv(TITANIC_CSV_URL)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses,Parents/Children,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


## Create a checkpoint

In [3]:
# Work on a copy so the frame loaded from the CSV is not modified when
# columns are added to `df` in later cells.
df = data.copy()

## Create a new column for males

In [4]:
# Boolean flag column: True where the passenger's Sex is 'male', False otherwise.
is_male = df['Sex'].eq('male')
df['male'] = is_male
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses,Parents/Children,Fare,male
0,0,3,male,22.0,1,0,7.25,True
1,1,1,female,38.0,1,0,71.2833,False
2,1,3,female,26.0,0,0,7.925,False
3,1,1,female,35.0,1,0,53.1,False
4,0,3,male,35.0,0,0,8.05,True


## Create the target and predictor

In [5]:
# Predictors are every column except the label and the raw 'Sex' text column
# (already encoded by the boolean 'male' column); the label is 'Survived'.
target_column = 'Survived'
predictors = df.drop(columns=[target_column, 'Sex'])
x = predictors.values
y = df[target_column].values

## Split the data

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Set the hyperparameters for the param grid

In [7]:
# Candidate values for each decision-tree hyperparameter explored by the grid search.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 15, 25],
    min_samples_leaf=[1, 3],
    max_leaf_nodes=[10, 20, 35, 50],
    max_features=[None, 'sqrt', 'log2'],
)

## Instantiating the tree class for gridsearch

In [8]:
# Seed the base estimator. The grid includes max_features='sqrt'/'log2', which
# makes the tree sample candidate features at every split, so an unseeded
# search can select different best_params_ / best_score_ on each run. Using the
# same seed as the final model (random_state=42) also means the configuration
# being scored here is the one that is trained afterwards.
dt = DecisionTreeClassifier(random_state=42)

# Exhaustive search over param_grid, scored by F1 with 5-fold cross-validation.
dt_gs = GridSearchCV(dt, param_grid, scoring='f1', cv=5)
dt_gs.fit(x_train, y_train)

In [9]:
# Hyperparameter combination selected by the grid search.
best_params = dt_gs.best_params_
best_params

{'criterion': 'entropy',
 'max_depth': 15,
 'max_features': 'log2',
 'max_leaf_nodes': 20,
 'min_samples_leaf': 1}

In [10]:
# Cross-validated score of the selected combination; the search above was
# configured with scoring='f1' and cv=5, so this is an F1 value, not accuracy.
dt_gs.best_score_

0.7756562526986077

## Train the model

In [11]:
# Build a fresh tree from the tuned hyperparameters plus a fixed seed,
# then fit it on the training split.
tuned_settings = {**best_params, 'random_state': 42}
dt_best = DecisionTreeClassifier(**tuned_settings)
dt_best.fit(x_train, y_train)

## Render the tree to a PNG file

In [12]:
# List every column currently in the working frame (label and raw 'Sex' included).
list(df.columns)

['Survived',
 'Pclass',
 'Sex',
 'Age',
 'Siblings/Spouses',
 'Parents/Children',
 'Fare',
 'male']

In [13]:
# Derive the feature labels from the same column selection used to build `x`
# (every column except 'Survived' and 'Sex'), so the names on the tree nodes
# always line up with the columns the model was trained on instead of relying
# on a hand-maintained list.
feature_names = df.drop(['Survived', 'Sex'], axis=1).columns.tolist()

# Export the fitted tree as DOT source and render it to 'tree.png';
# cleanup=True removes the intermediate DOT file after rendering.
dot_file = export_graphviz(dt_best, out_file=None, feature_names=feature_names, filled=True, rounded=True)
graph = graphviz.Source(dot_file)
graph.render(filename='tree', format='png', cleanup=True)

'tree.png'

In [14]:
# Show the current working directory: graph.render() returned the relative
# path 'tree.png', which presumably resolves against this directory.
# `os` is imported in the notebook's top import cell rather than here, so all
# dependencies are declared in one place.
print(os.getcwd())

C:\Users\HP


## Evaluating the model

In [15]:
# Predict labels for the held-out test split with the tuned tree.
y_pred = dt_best.predict(x_test)

In [16]:
# Score the test-set predictions with each metric in turn; the unpacking order
# matches the order of the metric functions listed below.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [17]:
# Report each test-set metric as a percentage with two decimals. A single
# f-string replaces the previous mix of f-string and str.format calls; the
# printed text and the order of the lines are unchanged.
for label, score in (('Accuracy', accuracy), ('Recall', recall), ('Precision', precision)):
    print(f'{label} score : {score*100:.2f}%')

Accuracy score : 77.53%
Recall score : 55.22%
Precision score : 78.72%
