<a href="https://colab.research.google.com/github/kanacb/machinelearning/blob/main/DecisionTrees_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import software libraries and load the dataset #

In [2]:
import sys                             # Read system parameters.
import os                              # Interact with the operating system.
import numpy as np                     # Work with multi-dimensional arrays and matrices.
import pandas as pd                    # Manipulate and analyze data.
import matplotlib                      # Create 2D charts.
import matplotlib.pyplot as plt
import seaborn as sb                   # Perform data visualization.
import scipy as sp                     # Perform scientific computing and advanced mathematics.
import sklearn                         # Perform data mining and analysis.
from time import time                  # Calculate training time.

# Summarize software libraries used.
# The report lines are collected first and emitted with a single print call;
# the trailing '\n' on the last entry leaves a blank line after the report.
version_report = [
    'Libraries used in this project:',
    '- Python {}'.format(sys.version),
    '- NumPy {}'.format(np.__version__),
    '- pandas {}'.format(pd.__version__),
    '- Matplotlib {}'.format(matplotlib.__version__),
    '- Seaborn {}'.format(sb.__version__),
    '- SciPy {}'.format(sp.__version__),
    '- scikit-learn {}\n'.format(sklearn.__version__),
]
print('\n'.join(version_report))

# Load the dataset.
# Only the training file is read in this cell; file_test is defined but not loaded here.
file_train = "https://github.com/kanacb/machinelearning/blob/main/dt_titanic_train.csv?raw=true"
file_test = "https://github.com/kanacb/machinelearning/blob/main/dt_titanic_test.csv?raw=true"
data_raw = pd.read_csv(file_train, index_col=0)
record_total = len(data_raw)
print('Loaded {} records from {}.'.format(record_total, file_train))

Libraries used in this project:
- Python 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
- NumPy 1.19.5
- pandas 1.1.5
- Matplotlib 3.2.2
- Seaborn 0.11.2
- SciPy 1.4.1
- scikit-learn 0.22.2.post1

Loaded 891 records from https://github.com/kanacb/machinelearning/blob/main/dt_titanic_train.csv?raw=true.


# Split the datasets

In [None]:
# Separate training and test sets already exist.
# A validation set will be split off from the training set.

from sklearn.model_selection import train_test_split

# 'Survived' is the value to be predicted, so it is kept out of the feature
# columns and carried in a separate single-column DataFrame of labels.
label_columns = ['Survived']

# NOTE(review): data_raw is read with index_col=0 — confirm that 'PassengerId'
# is still a regular column of the file and not the column consumed as the index.
training_columns = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]

# Split the training and validation datasets and their labels.
# No test_size is given, so train_test_split applies its default proportion.
features_raw = data_raw[training_columns]
labels_raw = data_raw[label_columns]
X_train, X_val, y_train, y_val = train_test_split(features_raw,
                                                  labels_raw,
                                                  random_state = 1912)

print('The training and validation datasets and labels have been split.')

# Look for categorical features that need to be one-hot encoded

In [None]:
# Display the first rows of the training features as they were split from data_raw.
X_train.head()

# Perform common preparation on the training and validation sets

In [None]:
# Perform common cleaning and feature engineering tasks on datasets.
def prep_dataset(dataset):
    """Fill missing values and append one-hot columns for the categorical features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'Age', 'Fare', 'Embarked', 'Pclass' and 'Sex'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns (missing 'Age'/'Fare' replaced
        by that column's median, missing 'Embarked' by its most frequent
        value) followed by the one-hot columns for 'Pclass', 'Sex' and
        'Embarked'. The input frame is left unmodified.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # Work on a private copy so the caller's frame is never modified.
    dataset = dataset.copy()

    # PROVIDE MISSING VALUES

    # The filled column is assigned back rather than calling
    # fillna(inplace = True) on a column selection: that chained in-place form
    # operates on a temporary object under pandas Copy-on-Write and would
    # leave the frame unfilled.

    # Fill missing Age values with the median age.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # Fill missing Fare values with the median fare.
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

    # Fill missing Embarked values with the mode. mode() is empty when every
    # value is missing; in that case there is nothing to fill with.
    embarked_mode = dataset['Embarked'].mode()
    if not embarked_mode.empty:
        dataset['Embarked'] = dataset['Embarked'].fillna(embarked_mode[0])

    # ONE-HOT ENCODING

    cols = ['Pclass', 'Sex', 'Embarked']

    # Build every indicator block first and concatenate once, keeping the
    # column order: original columns, then Pclass_*, Sex_*, Embarked_*.
    dummies = [pd.get_dummies(dataset[col], prefix = col, drop_first = False)
               for col in cols]

    return pd.concat([dataset] + dummies, axis = 1)

# Each set is prepared independently from a copy of itself, so the medians and
# mode used for filling are computed per set.
# NOTE(review): this cell rebinds X_train / X_val; re-running it without
# re-running the split cell would append the one-hot columns a second time.
X_train = prep_dataset(X_train.copy())

X_val = prep_dataset(X_val.copy())

print('The dataset has been cleaned and prepared.')

# Drop columns that won't be used for training

In [None]:
# Drop unused columns from datasets.
def drop_unused(dataset):
    """Return `dataset` without the columns that are not used for training.

    The identifier/free-text columns are removed first, then the categorical
    columns that have been replaced with one-hot encoding. Every listed column
    must be present; a missing one raises KeyError from DataFrame.drop.
    """
    unused_columns = [
        'PassengerId', 'Cabin', 'Ticket', 'Name',
        # These have been replaced with one-hot encoding.
        'Pclass', 'Sex', 'Embarked',
    ]

    # Drop one column at a time, in the listed order.
    for column in unused_columns:
        dataset = dataset.drop(columns = [column])

    return dataset

# NOTE(review): this cell rebinds X_train / X_val; once the columns are gone,
# re-running it would raise KeyError from drop_unused.
X_train = drop_unused(X_train.copy())

X_val = drop_unused(X_val.copy())

print('Columns that will not be used for training have been dropped.')

# Preview current training data

In [None]:
# Display the first rows of the training features after preparation and column removal.
X_train.head()

# Create a basic decision tree model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters; only the random seed is fixed.
tree = DecisionTreeClassifier(random_state = 1912)
start = time()
tree.fit(X_train, np.ravel(y_train))  # np.ravel flattens the single-column label frame to 1-D.
end = time()
train_time = (end - start) * 1000  # time() differences are in seconds; convert to milliseconds.

# NOTE(review): `prediction` is not read again in this cell.
prediction = tree.predict(X_val)

# Score using the validation data.
score = tree.score(X_val, y_val)

print('Decision tree model took {:.2f} milliseconds to fit.'.format(train_time))
print('Accuracy: {:.0f}%'.format(score * 100))

# Visualize the decision tree structure

In [None]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image, display 
import pydotplus as pdotp

def plot_tree(model, image, feature_names = None, class_names = None):
    """Render a fitted decision tree to a PNG file and return it for display.

    Parameters
    ----------
    model : fitted decision tree estimator
        Passed to export_graphviz.
    image : str
        Path of the PNG file to write.
    feature_names : list of str, optional
        Labels for the features. Defaults to the column names of the
        notebook-level X_train.
    class_names : list of str, optional
        Labels for the target classes. Defaults to ['0', '1'].

    Returns
    -------
    IPython.display.Image
        The image loaded from the file that was just written.
    """
    if feature_names is None:
        feature_names = X_train.columns.values.tolist()
    if class_names is None:
        class_names = ['0', '1']

    # With out_file = None, export_graphviz returns the DOT source as a
    # string, so no intermediate StringIO buffer is needed.
    dot_data = export_graphviz(model, out_file = None,
                               filled = True,
                               rounded = True,
                               special_characters = True,
                               feature_names = feature_names,
                               class_names = class_names)

    graph = pdotp.graph_from_dot_data(dot_data)
    graph.write_png(image)

    # Return the image so a caller can display it; an Image that is only
    # constructed inside a function body is never rendered.
    return Image(filename = image)

print('A function to plot the decision tree structure has been defined.')

# Compute accuracy, precision, recall, and F<sub>1</sub> score

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def model_scores(y, prediction):
    """Print accuracy, precision, recall and F1 for `prediction` against `y`.

    Each metric is computed with the corresponding sklearn.metrics function
    and printed as a whole-number percentage, in that order. Nothing is returned.
    """
    report = (
        ('Accuracy: {:.0f}%', accuracy_score),
        ('Precision: {:.0f}%', precision_score),
        ('Recall: {:.0f}%', recall_score),
        ('F1: {:.0f}%', f1_score),
    )

    # Compute and print one metric at a time so earlier lines are still
    # printed if a later metric raises.
    for template, metric in report:
        value = metric(y, prediction)
        print(template.format(np.round(value * 100)))

print('A function to compute the model scores has been defined.')

# Generate a ROC curve and compute the AUC

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

def roc(y, prediction_proba):
    """Plot the ROC curve for the given scores and print the area under it.

    y : true labels, passed to roc_curve and roc_auc_score.
    prediction_proba : scores for the positive class, passed to the same functions.

    The curve is drawn on the current pyplot axes; nothing is returned.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y, prediction_proba)

    plt.plot(fpr, tpr)

    # Configure the axes that plt.plot just drew on.
    axes = plt.gca()
    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.0])
    axes.set_title('ROC Curve')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.grid(True)

    auc = roc_auc_score(y, prediction_proba)
    print('Area Under Curve: {:.2f}'.format(auc))

print('A function to generate the ROC curve and compute AUC has been defined.')

# Generate a precision–recall curve and compute the average precision

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

def prc(y, prediction_proba):
    """Plot the precision-recall curve for the given scores and print the average precision.

    y : true labels, passed to precision_recall_curve and average_precision_score.
    prediction_proba : scores for the positive class, passed to the same functions.

    The curve is drawn on the current pyplot axes; nothing is returned.
    """
    # The thresholds returned by precision_recall_curve are not needed for the plot.
    precision, recall, _ = precision_recall_curve(y, prediction_proba)

    # Recall goes on the x-axis, precision on the y-axis.
    plt.plot(recall, precision)

    # Configure the axes that plt.plot just drew on.
    axes = plt.gca()
    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.0])
    axes.set_title('Precision–Recall Curve')
    axes.set_xlabel('Recall')
    axes.set_ylabel('Precision')
    axes.grid(True)

    ap = average_precision_score(y, prediction_proba)
    print('Average Precision: {:.2f}'.format(ap))

print('A function to generate the PRC and compute average precision has been defined.')

# Evaluate the initial decision tree model

In [None]:
# Write the initial tree to 'titanic.png', then load that file and display it.
plot_tree(tree, 'titanic.png')
display(Image('titanic.png'))

In [None]:
# Class predictions of the initial tree on the validation features.
initial_predict = tree.predict(X_val)

# Print accuracy, precision, recall and F1 against the validation labels.
model_scores(y_val, initial_predict)

In [None]:
# Per-class probabilities of the initial tree on the validation features.
initial_predict_proba = tree.predict_proba(X_val)

# Column 1 is presumably P(Survived = 1) — verify against tree.classes_.
roc(y_val, initial_predict_proba[:, 1])

In [None]:
# Precision-recall curve from the same column-1 probabilities used for the ROC curve.
prc(y_val, initial_predict_proba[:, 1])

# Perform pre-pruning on the decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Pre-pruning: cap the tree depth at 4; same random seed as the initial tree.
pruned_tree = DecisionTreeClassifier(max_depth = 4, random_state = 1912)
start = time()
pruned_tree.fit(X_train, np.ravel(y_train))  # np.ravel flattens the single-column label frame to 1-D.
end = time()
train_time = (end - start) * 1000  # time() differences are in seconds; convert to milliseconds.

# NOTE(review): `prediction` is not read again in this cell.
prediction = pruned_tree.predict(X_val)

# Score using the validation data.
score = pruned_tree.score(X_val, y_val)

print('Decision tree model took {:.2f} milliseconds to fit.'.format(train_time))
print('Accuracy: {:.0f}%'.format(score * 100))

# Evaluate the pruned decision tree model

In [None]:
# Write the pruned tree to 'titanic_pruned.png', then load that file and display it.
plot_tree(pruned_tree, 'titanic_pruned.png')
display(Image('titanic_pruned.png'))

In [None]:
# Class predictions of the pruned tree on the validation features.
pruned_predict = pruned_tree.predict(X_val)

# Print accuracy, precision, recall and F1 against the validation labels.
model_scores(y_val, pruned_predict)

In [None]:
# Per-class probabilities of the pruned tree on the validation features.
pruned_predict_proba = pruned_tree.predict_proba(X_val)

# Column 1 is presumably P(Survived = 1) — verify against pruned_tree.classes_.
roc(y_val, pruned_predict_proba[:, 1])

In [None]:
# Precision-recall curve from the same column-1 probabilities used for the ROC curve.
prc(y_val, pruned_predict_proba[:, 1])

# Fit a decision tree model using randomized search with cross-validation

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Candidate values and distributions sampled by the randomized search.
# scipy's randint draws integers from [low, high), so the upper bounds are exclusive.
dist = {'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': sp_randint(2, 10),
        'min_samples_split': sp_randint(5, 100),
        'min_samples_leaf': sp_randint(5, 100)}

# 500 sampled candidates, each scored by F1 with 5-fold cross-validation on the
# training data only. The 'iid' argument is intentionally not passed: it was
# deprecated in scikit-learn 0.22 (where omitting it is equivalent to
# iid = False) and removed in 0.24, where passing it raises a TypeError.
search = RandomizedSearchCV(tree, param_distributions = dist, n_iter = 500,
                            scoring = 'f1', cv = 5, random_state = 1912)
search.fit(X_train, np.ravel(y_train))

# The estimator refit with the best-scoring parameter set.
optimized_tree = search.best_estimator_

print(search.best_params_)

# Evaluate the optimized model

In [None]:
# Write the optimized tree to 'titanic_optimized.png', then load that file and display it.
plot_tree(optimized_tree, 'titanic_optimized.png')
display(Image('titanic_optimized.png'))

In [None]:
# Class predictions of the optimized tree on the validation features.
optimized_predict = optimized_tree.predict(X_val)

# Print accuracy, precision, recall and F1 against the validation labels.
model_scores(y_val, optimized_predict)

In [None]:
# Per-class probabilities of the optimized tree on the validation features.
optimized_predict_proba = optimized_tree.predict_proba(X_val)

# Column 1 is presumably P(Survived = 1) — verify against optimized_tree.classes_.
roc(y_val, optimized_predict_proba[:, 1])

In [None]:
# Precision-recall curve from the same column-1 probabilities used for the ROC curve.
prc(y_val, optimized_predict_proba[:, 1])