In [None]:
# Originally based on the tutorial 'Machine Learning with Python: Decision Trees' by Frederick Nwanganga

In [None]:
# Required Dependency Importation
import pandas as pd

from matplotlib import pyplot as plt

from sklearn import tree
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Classification Decision Tree

In [None]:
# Data Preparation

In [None]:
# Loads the raw dataset from disk into a DataFrame
data = pd.read_csv('data.csv')

# Features: the independent variable(s) the model learns from
x = data[['independent_variable']]

# Target: the dependent variable the model is trained to predict
y = data[['dependent_variable']]

# Divides the features and the target into a training portion (80%) and a testing portion (the rest).
#    - stratify: the split uses stratified random sampling driven by the values of the target column.
#      Stratified random sampling divides a population into one or more distinct units, called strata,
#      based on shared behavior or characteristics.
#    - random_state: fixed seed so that the same split is reproduced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size = 0.8,
    stratify = y,
    random_state = 1234,
)

In [None]:
# Training

In [None]:
# Generates the classifier on which the classification oriented Decision Tree model will be based upon
classifier = DecisionTreeClassifier(random_state = 1234)

# Generates the classification oriented Decision Tree
model = classifier.fit(x_train, y_train)

In [None]:
# Evaluation

In [None]:
# Evaluates the Decision Tree upon the training portion of the original raw data
print(f'Training Accuracy: {model.score(x_train, y_train)}')

# Evaluates the Decision Tree upon the testing portion of the original raw data
print(f'Testing Accuracy: {model.score(x_test, y_test)}')

In [None]:
# Visualization

In [None]:
# Visualizes the classification oriented Decision Tree
plt.figure(figsize = (15, 15))
tree.plot_tree(model, 
               feature_names = list(x.columns), 
               class_names = ['No', 'Yes'], 
               filled = True)

In [None]:
# Isolates the importance of the classification oriented Decision Tree's features
importance = model.feature_importances_
feature_importance = pd.Series(importance, index = x.columns)

# Visualizes the importance of the classification oriented Decision Tree's features
feature_importance.plot(kind = 'bar')
plt.ylabel('Importance')

In [None]:
# PrePruning

In [None]:
# A selection of hyperparameter values utlized for pre-prunning of the classification oriented Decision Tree
grid = {
    # How many levels deep a Decision Tree is allowed to be
    'max_depth': [2, 3, 4, 5],
    
    # The minimum number of items a Decision Tree must have available to itself before it is allowed to parition 
    #    and split thereby creating new branches; studies suggest values between 1 and 40 are best
    'min_samples_split': [2, 3, 4],
    
    # The minimum number of items allowed to be represented by a Decision Tree's singular leaf node;
    #    studies suggest values between 1 and 20 are best
    'min_samples_leaf': [1, 2, 3, 4, 5, 6]
}

In [None]:
# Grid search utilized to test via brute force various hyperparameter combinations
gcv = GridSearchCV(estimator = classifier, param_grid = grid)
gcv.fit(x_train, y_train)

# Returns the best hyperparameter combination that maximizes the classification oriented 
#    decision tree model's accuracy
model_ = gcv.best_estimator_

model_.fit(x_train, y_train)

In [None]:
# Regression Decision Tree

In [None]:
# Only those steps or phases different from the above example are included below

In [None]:
# Data Preparation

In [None]:
# Splits the data into its training (80%) and testing portions using stratified random sampling
#    based on the values of the given feature column; the seed makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    train_size = 0.8, 
                                                    stratify = x['variable'], 
                                                    random_state = 1234)

# Dummy variables generated from column variables that are qualitative in nature.
# The training features must be encoded from the training rows themselves so that they stay
#    aligned, row for row, with y_train
x_train = pd.get_dummies(x_train)

# The testing features receive the same encoding. They are then reindexed to the training columns
#    so both portions expose an identical set of features: a category absent from the testing rows
#    becomes an all-zero column, and a category unseen during training is dropped
x_test = pd.get_dummies(x_test).reindex(columns = x_train.columns, fill_value = 0)

In [None]:
# Training

In [None]:
# Generates the classifier on which the regression oriented Decision Tree model will be based upon
regressor = DecisionTreeRegressor(random_state = 1234)

# Generates the regression oriented Decision Tree
model = regressor.fit(x_train, y_train)

In [None]:
# Evaluation

In [None]:
# In order to evaluation the regression oriented Decision Tree, the test predictions must be saved in order
#    to then compare them with the actual test results via the mean absolute error
# Predictions for the testing portion are saved so that they can be compared with the actual
#    test results via the mean absolute error
y_test_pred = model.predict(x_test)

# Going forward, results' values should be expected to be off the mark by + or - the MAE.
# The metric computed here is the mean absolute error, so it is labelled MAE (not MSE)
print(f'Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_test_pred)}')

In [None]:
# Visualization

In [None]:
# The higher a datanode is placed within the decision tree, the greater importance said associated variable
#    is to the overall model's predictive abilities
# Draws only the top of the fitted regression tree (max_depth = 1).
# The higher a node sits within the decision tree, the more important its associated variable
#    is to the overall model's predictive abilities
tree.plot_tree(
    model,
    feature_names = x_train.columns.tolist(),
    filled = True,
    max_depth = 1,
)

In [None]:
# Isolates the importance of the regression oriented Decision Tree's features
# Pulls the per-feature importances out of the fitted regression tree and
#    labels each one with its training column name
importance = model.feature_importances_
feature_importance = pd.Series(data = importance, index = x_train.columns)

# Bar chart of the importance of each feature of the regression tree, in ascending order
feature_importance.sort_values().plot.bar()
plt.ylabel('Importance')

In [None]:
# PostPruning

In [None]:
# Cost Complexity Prunning involves finding the best Alpha parameter that performs the best with the testing data
# Cost Complexity Pruning searches for the Alpha parameter that performs best on the testing data;
#    the first step is to compute the pruning path on the training portion
path = regressor.cost_complexity_pruning_path(x_train, y_train)

# Takes the effective alphas from the pruning path, leaving out the last one.
# The larger the effective Alpha, the smaller and shallower the Decision Tree will be; the largest
#    Alpha is removed because it results in a Decision Tree of only 1 node
ccp_alphas = path.ccp_alphas[:-1]

In [None]:
# Scores collected for every candidate Alpha, in the same order as ccp_alphas
train_scores = []
test_scores = []

# In order to determine the best Alpha, similar to hyperparameter tuning, one model is fitted per
#    candidate Alpha via a brute force methodology and scored on both portions of the data
for alpha in ccp_alphas:
    # A separate name (regressor_) is used so the unpruned base regressor is not overwritten,
    #    and so the estimator that is fitted is the one created on this line
    regressor_ = DecisionTreeRegressor(random_state = 1234, ccp_alpha = alpha)
    model_ = regressor_.fit(x_train, y_train)
    train_scores.append(model_.score(x_train, y_train))
    test_scores.append(model_.score(x_test, y_test))

In [None]:
# Visualizes the effects of various Alpha values on the training and testing datasets
plt.plot(ccp_alphas, 
         train_scores,
         marker = 'o', 
         label = 'train_score', 
         drawstyle = 'steps_post')

plt.plot(ccp_alphas, 
         test_scores,
         marker = 'o', 
         label = 'test_score', 
         drawstyle = 'steps_post')

plt.legend()
plt.title('R-squared by Alpha')

In [None]:
# Determine the index of the highest test score to then determine the best Alpha value
index = test_scores.index(max(test_scores))
best_alpha = ccp_alphas[index]

In [None]:
# Fits a regression oriented Decision Tree post pruned utilizing the now known best alpha
regressor = DecisionTreeRegressor(random_state = 1234, ccp_aplha = best_alpha)

model_ = regressor_.fit(x_train, y_train)