# Sales Opps - Decision Tree Model

This notebook builds a Decision Tree model from training data in CSV format. It relies on a CSV file output from the Sales data preparation notebook.

In [None]:
# Define some exclusions for PEP8 that don't apply when the Jupyter Notebook
#   is exported to .py file
# pylint: disable=pointless-statement
# pylint: disable=fixme
# pylint: disable=expression-not-assigned
# pylint: disable=missing-module-docstring
# pylint: disable=invalid-name

# %pip install scikit-learn

import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics as sklm

# Setting custom parameters for this model

In [None]:
# Fraction (0-1) of the rows held out as the test set by train_test_split
TEST_SIZE = 0.33

# Seed passed to train_test_split so the train/test partition is repeatable
RANDOM_STATE = 1234

# Name of the column that holds the label/target.
# The training CSV file must have a header row containing this name.
LABEL_COLUMN_NAME = "Won"

# Earlier, coarse search over max tree depth (powers of two, 1 to 2^5), kept
# for reference:
# max_depth_range = [2**i for i in range(6)]

# Finer search range around the broad maximum found by the coarse search.
# Candidate depths: 3, 5, 7, ..., 19
MAX_TREE_DEPTH_RANGE = range(3, 20, 2)

# Earlier, coarse search over the minimum samples per leaf, kept for reference:
# leaf_range = [2**i for i in range(8)]
# Finer search range used now. Candidate values: 2, 4, 6, ..., 18
MIN_SAMPLES_TO_MAKE_LEAF_RANGE = range(2, 20, 2)

# Metric used to pick the best hyperparameter value in the one-dimensional
# sweeps. Acceptable values: accuracy, f1, precision, recall
METRIC_TO_MEASURE = "accuracy"

# Warning filter action; one of
# "error", "ignore", "always", "default", "module" or "once".
# "always" shows every matching warning each time it is triggered.
warnings.filterwarnings('always')


def get_best_hyperparameter_value(hyperparameter_list, metric_list):
    """Return the hyperparameter value that produced the highest metric.

    Parameters:
        hyperparameter_list := indexable sequence of candidate values (a list
            or a range), in the order they were evaluated
        metric_list := sequence of scores, where metric_list[i] is the score
            obtained with hyperparameter_list[i]

    Returns:
        The element of hyperparameter_list at the position of the largest
        score. When several scores tie for the maximum, the earliest one wins.

    Raises:
        ValueError: if the inputs are empty or their lengths differ.
    """
    if len(hyperparameter_list) != len(metric_list):
        raise ValueError(
            "hyperparameter_list and metric_list must have the same length "
            f"(got {len(hyperparameter_list)} and {len(metric_list)})."
        )
    if len(metric_list) == 0:
        raise ValueError("Cannot pick a best hyperparameter from empty inputs.")

    # max() returns the first maximal index, so ties favor the earlier value
    index_max = max(range(len(metric_list)), key=lambda i: metric_list[i])
    return hyperparameter_list[index_max]

In [None]:
# The training CSV is expected under ./data relative to the working directory
filename = os.path.join(os.getcwd(), "data", "dummy_sfdc_data_train.csv")
# filename = "sales_data_train.csv"

# header=0: the first row of the file supplies the column names
df = pd.read_csv(filename, header=0)

In [None]:
# Every column must already be float64, int64 or bool at this point; any
# other dtype (e.g. strings stored as object) stops the notebook here.
for index, value in df.dtypes.items():
    dtype_is_supported = value in ("float64", "bool", "int64")
    assert dtype_is_supported, (
        f"Column name {index} is not numeric or boolean- found {value}. All features at this point should be numeric or boolean. Exiting."
    )

print("Feature datatype check passed.")

# Check that the data does not contain any missing values.
The absence of missing values is necessary for training a Decision Tree model.

In [None]:
# Count the missing values (NaN/None) in each column. The result is a pandas
# Series indexed by column name.
nan_count = df.isnull().sum()

# Stop at the first column that has any missing value.
# items() yields (column name, count) pairs, so the name is reported directly;
# indexing df.columns with a column name would raise IndexError instead of
# showing the intended message.
for index, value in nan_count.items():
    assert (
        value == 0
    ), f"Column name {index} has {value} missing values (NaN). Decision trees cannot have any missing values. Exiting."

# Split up the data

In [None]:
# Features are every column except the label; the label column is the target
X = df.drop(columns=[LABEL_COLUMN_NAME])
y = df[LABEL_COLUMN_NAME]

# Hold out TEST_SIZE of the rows for evaluation; the seed keeps the split
# identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Define a function that builds a model given hyperparameters

In [None]:
def train_test_DT(
    X_train1,
    X_test1,
    y_train1,
    y_test1,
    leaf1,
    depth1,
    crit="entropy",
    random_state=None,
):
    """
    Fit a Decision Tree classifier to the training data and score its
    predictions on the test data.

    Parameters:
        X_train1 := Training features
        X_test1 := Test features
        y_train1 := Training labels
        y_test1 := Test labels (the scorers are called with their default
                   binary averaging, so a binary label is assumed)
        leaf1 := The minimum number of samples required to be at a leaf node
        depth1 := The maximum depth of the tree
        crit := The function to be used to measure the quality of a split.
                Default: entropy.
        random_state := Seed for the classifier. When None (the default) the
                notebook-level RANDOM_STATE is used so that repeated runs
                build the same tree.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall", each
        computed on the test set.
    """
    # Fall back to the notebook-wide seed; scikit-learn breaks ties between
    # equally good splits randomly unless random_state is fixed
    seed = RANDOM_STATE if random_state is None else random_state

    # Instantiate the Scikit-learn DecisionTreeClassifier model object
    # with specific hyperparameters
    model = DecisionTreeClassifier(
        max_depth=depth1,
        min_samples_leaf=leaf1,
        criterion=crit,
        random_state=seed,
    )

    # Fit the model to the training data
    model.fit(X_train1, y_train1)

    # Make predictions on the test data and store the results
    class_label_predictions = model.predict(X_test1)

    # Precision is undefined when nothing was predicted positive; report 0
    # explicitly instead of calling the scorer
    if sum(class_label_predictions) == 0:
        prec = 0
        print(f'WARNING: No True predictions for model with max_depth={depth1} min_samples_leaf={leaf1}. Force setting precision = {prec}')
    else:
        prec = sklm.precision_score(y_test1, class_label_predictions)

    metrics = {
        "accuracy": sklm.accuracy_score(y_test1, class_label_predictions),
        "f1": sklm.f1_score(y_test1, class_label_predictions),
        "precision": prec,
        "recall": sklm.recall_score(y_test1, class_label_predictions),
    }

    return metrics

# Train on different hyperparameter values

### Hyperparameter value: Maximum Depth of Tree

In [None]:
# Keep the minimum number of samples per leaf fixed at this value while the
# maximum depth is swept
min_samples_to_create_leaf = 5

# One score per candidate depth, in the same order as MAX_TREE_DEPTH_RANGE
metrics_list = []

for iter_max_depth_value in MAX_TREE_DEPTH_RANGE:
    # Train with this depth and keep only the metric being optimized
    resulting_metrics = train_test_DT(
        X_train,
        X_test,
        y_train,
        y_test,
        leaf1=min_samples_to_create_leaf,
        depth1=iter_max_depth_value,
    )
    metrics_list.append(float(resulting_metrics[METRIC_TO_MEASURE]))

# Plot the chosen metric against the maximum depth
fig, ax = plt.subplots()
p = sns.lineplot(
    x=MAX_TREE_DEPTH_RANGE,
    y=metrics_list,
    marker="o",
    label="Full training set",
    ax=ax,
)

ax.set_title(f"Test set {METRIC_TO_MEASURE} of the Decision Tree predictions")
ax.set_xlabel("Max depth of Decision Tree")
ax.set_ylabel(METRIC_TO_MEASURE)
plt.show()

# Storing the best value for hyperparameter: Maximum Tree Depth

In [None]:
# Keep the depth whose score was highest in the depth sweep, and show it
OPTIMUM_MAX_DEPTH = get_best_hyperparameter_value(
    hyperparameter_list=MAX_TREE_DEPTH_RANGE,
    metric_list=metrics_list,
)
print(OPTIMUM_MAX_DEPTH)

# best max depth value found:
# OPTIMUM_MAX_DEPTH = 6

In [None]:
# One score per candidate leaf size, in the same order as
# MIN_SAMPLES_TO_MAKE_LEAF_RANGE
metrics_list_leaf = []

# Sweep the minimum samples per leaf with the depth fixed at its best value
for leaf in MIN_SAMPLES_TO_MAKE_LEAF_RANGE:
    resulting_metrics = train_test_DT(
        X_train,
        X_test,
        y_train,
        y_test,
        leaf1=leaf,
        depth1=OPTIMUM_MAX_DEPTH,
    )
    metrics_list_leaf.append(float(resulting_metrics[METRIC_TO_MEASURE]))

# print(metrics_list_leaf)

# Plot the chosen metric against the minimum samples per leaf
fig, ax = plt.subplots()
p = sns.lineplot(
    x=MIN_SAMPLES_TO_MAKE_LEAF_RANGE,
    y=metrics_list_leaf,
    marker="o",
    label="Full training set",
    ax=ax,
)

ax.set_title(f"Test set {METRIC_TO_MEASURE} of the DT predictions, Leaf")
ax.set_xlabel("minimum number of samples required to be at a leaf node")
ax.set_ylabel(METRIC_TO_MEASURE)
plt.show()

# Storing the best value for hyperparameter: Leaf

In [None]:
# Keep the leaf size whose score was highest in the leaf sweep, and show it
OPTIMUM_LEAF = get_best_hyperparameter_value(
    hyperparameter_list=MIN_SAMPLES_TO_MAKE_LEAF_RANGE,
    metric_list=metrics_list_leaf,
)
print(OPTIMUM_LEAF)
# OPTIMUM_LEAF = 5

# Run the final model

In [None]:
# Train once more with both tuned hyperparameters and display all metrics
final_metrics = train_test_DT(
    X_train,
    X_test,
    y_train,
    y_test,
    leaf1=OPTIMUM_LEAF,
    depth1=OPTIMUM_MAX_DEPTH,
)
final_metrics

# Optimizing hyperparameters

In [None]:
# Exhaustive search over every (min samples per leaf, max depth) pair.
# Each model is trained once; train_test_DT returns all four metrics per run,
# so there is no need to retrain the grid for each metric.
grid_results = []
for iter_leaf in MIN_SAMPLES_TO_MAKE_LEAF_RANGE:
    for iter_max_depth_value in MAX_TREE_DEPTH_RANGE:
        resulting_metrics = train_test_DT(
            X_train, X_test, y_train, y_test, iter_leaf, iter_max_depth_value
        )
        grid_results.append((iter_leaf, iter_max_depth_value, resulting_metrics))

# Report the best combination separately for each metric
for iter_metric in ["accuracy", "f1", "precision", "recall"]:
    # max() keeps the first maximal entry, so ties go to the combination that
    # was evaluated earliest (leaf outer loop, depth inner loop)
    best_leaf, best_depth, best_result = max(
        grid_results, key=lambda result: result[2][iter_metric]
    )

    # Cast to float: the ".5" format below is not valid for a plain int, and
    # precision can be the integer 0 when no positives were predicted
    best_metric = float(best_result[iter_metric])

    print(
        f"Best leaf value = {best_leaf:4}, Best max depth = {best_depth:4}, resulted in highest value of {iter_metric:10} = {best_metric:.5}"
    )
