In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def tree_train_validate(X_train, y_train, X_validate, y_validate, tree_model, min_samples_leaf):
    """
    Fit a tree-based model at max depths 1 through 10 and compare train and validate scores.

    For every depth, ``tree_model`` is instantiated with that ``max_depth``, the
    given ``min_samples_leaf`` and a fixed ``random_state`` of 421, fit on the
    train set, and scored on both the train and validate sets. The function then
    shows a scatter plot of validate score against depth and prints a table
    (indexed by ``max_depth``) holding the train score, the validate score, and
    their difference (train minus validate).

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit each model and to compute the
        in-sample score.
    X_validate, y_validate
        Features and target used to compute the out-of-sample score.
    tree_model
        Callable that accepts ``max_depth``, ``min_samples_leaf`` and
        ``random_state`` keyword arguments and returns an object exposing
        ``fit(X, y)`` and ``score(X, y)``. Presumably a scikit-learn decision
        tree or random forest classifier class, in which case ``score`` is
        accuracy -- confirm against callers.
    min_samples_leaf
        Passed unchanged to ``tree_model`` for every depth.

    Returns
    -------
    None
        Results are only plotted and printed.
    """

    depth_range = range(1, 11)    # max depths to explore (1 through 10)

    metrics = []    # one dictionary of results per depth

    for depth in depth_range:

        model = tree_model(max_depth=depth, min_samples_leaf=min_samples_leaf, random_state=421)

        model.fit(X_train, y_train)

        # Score each set exactly once per depth; the validate score is reused
        # for both the plot and the table instead of being recomputed.
        metrics.append({
            'max_depth': depth,
            'train_accuracy': model.score(X_train, y_train),
            'validate_accuracy': model.score(X_validate, y_validate),
        })

    validate_scores = [row['validate_accuracy'] for row in metrics]

    plt.figure()

    plt.xlabel('depth')

    plt.ylabel('accuracy')

    # plot relationship between depth and validate accuracy
    plt.scatter(depth_range, validate_scores, color='indianred')

    plt.xticks([0, 2, 4, 6, 8, 10])    # even ticks spanning the explored depths

    plt.title('Validate Accuracy')

    plt.show()

    metrics_df = pd.DataFrame(metrics).set_index('max_depth')

    # A large positive difference means the model fits train better than validate.
    metrics_df['difference'] = metrics_df.train_accuracy - metrics_df.validate_accuracy

    print(metrics_df)