# Decision Trees in Practice

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the loan records from a CSV file expected in the working directory.
# NOTE(review): the truncated text recorded under this cell looks like the tail
# of a warning emitted while parsing the file - confirm, and pin column dtypes
# if it is a mixed-type warning.
loans = pd.read_csv('lending-club-data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Recode the label: bad_loans == 0 becomes +1 (safe), every other value -1 (risky).
loans['safe_loans'] = loans['bad_loans'].apply(lambda flag: -1 if flag != 0 else +1)
# The original indicator is no longer needed once the +1/-1 label exists.
loans.drop(columns=['bad_loans'], inplace=True)

In [4]:
# Categorical inputs kept for the model:
#   grade          - grade of the loan
#   term           - the term of the loan
#   home_ownership - home_ownership status: own, mortgage or rent
#   emp_length     - number of years of employment
features = ['grade', 'term', 'home_ownership', 'emp_length']
target = 'safe_loans'

# Keep only the selected inputs plus the label column.
loans = loans[[*features, target]]

# Subsample dataset to make sure classes are balanced

In [5]:
safe_loans_raw = loans[loans[target] == 1]
risky_loans_raw = loans[loans[target] == -1]

# Since there are fewer risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(frac=percentage, random_state = 1)
risky_loans = risky_loans_raw

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# in the same order (risky first, then safe) and keeps the original index.
loans_data = pd.concat([risky_loans, safe_loans])

print("Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data)))
print("Percentage of risky loans                :", len(risky_loans) / float(len(loans_data)))
print("Total number of loans in our new dataset :", len(loans_data))

Percentage of safe loans                 : 0.5
Percentage of risky loans                : 0.5
Total number of loans in our new dataset : 46300


In [6]:
# Rebuild the balanced frame before encoding so this cell can be re-run safely
# (the last line of the cell overwrites loans_data with its one-hot version).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
loans_data = pd.concat([risky_loans, safe_loans])
def onehot_transform(X, names=None,prefix_sep='.'):
    """One-hot encode X with pandas.get_dummies.

    Parameters
    ----------
    X : DataFrame to encode.
    names : optional sequence of column names. When given, the encoded frame
        is laid out with exactly these columns and missing values are
        filled with 0.
    prefix_sep : separator placed between a column name and a category value.

    Returns
    -------
    If names is None: a tuple (encoded_frame, array_of_encoded_column_names).
    Otherwise: the encoded frame restricted/extended to `names`.
    """
    encoded = pd.get_dummies(X, prefix_sep=prefix_sep)
    if names is not None:
        # Align to a previously learned column layout.
        aligned = pd.DataFrame(encoded, columns=names)
        return aligned.fillna(0)
    return encoded, encoded.columns.values
# Encode the balanced data and keep the resulting column names so that other
# frames can later be encoded into the same column layout.
loans_data,names = onehot_transform(loans_data)

In [7]:
# Every encoded column is a candidate feature except the response variable.
features = list(loans_data.columns)
features.remove('safe_loans')
features

['grade.A',
 'grade.B',
 'grade.C',
 'grade.D',
 'grade.E',
 'grade.F',
 'grade.G',
 'term. 36 months',
 'term. 60 months',
 'home_ownership.MORTGAGE',
 'home_ownership.OTHER',
 'home_ownership.OWN',
 'home_ownership.RENT',
 'emp_length.1 year',
 'emp_length.10+ years',
 'emp_length.2 years',
 'emp_length.3 years',
 'emp_length.4 years',
 'emp_length.5 years',
 'emp_length.6 years',
 'emp_length.7 years',
 'emp_length.8 years',
 'emp_length.9 years',
 'emp_length.< 1 year']

In [9]:
# Preview a handful of the sampled safe loans instead of rendering the whole frame.
safe_loans.head(10)

Unnamed: 0,grade,term,home_ownership,emp_length,safe_loans
14937,B,60 months,RENT,7 years,1
104761,A,36 months,RENT,10+ years,1
77248,B,36 months,RENT,< 1 year,1
120821,A,36 months,RENT,< 1 year,1
104521,D,36 months,RENT,10+ years,1
89340,B,36 months,MORTGAGE,10+ years,1
97993,A,36 months,MORTGAGE,10+ years,1
51510,A,60 months,MORTGAGE,4 years,1
90723,B,36 months,MORTGAGE,10+ years,1
19435,C,36 months,RENT,5 years,1


## Train-Validation split

In [14]:
# Row positions of the train / validation split are provided as JSON files.
train_idx = pd.read_json('module-6-assignment-train-idx.json', typ='series').values
validation_idx = pd.read_json('module-6-assignment-validation-idx.json', typ='series').values

# Select the rows by position, then encode both subsets into the column
# layout captured in `names`.
validation_data = loans.iloc[validation_idx]
train_data = onehot_transform(loans.iloc[train_idx], names)
validation_set = onehot_transform(validation_data, names)

## Early stopping methods for decision trees

### Early stopping condition 2: Minimum node size

In [15]:
def reached_minimum_node_size(data, min_node_size):
    """Return True when `data` holds at most `min_node_size` data points."""
    num_points = len(data)
    return num_points <= min_node_size

### Early stopping condition 3: Minimum gain in error reduction

In [16]:
def error_reduction(error_before_split, error_after_split):
    """Return how much the error dropped: error before the split minus error after it."""
    reduction = error_before_split - error_after_split
    return reduction

### Grabbing binary decision tree helper functions from past assignment

In [17]:
def intermediate_node_num_mistakes(labels_in_node):
    """Count the mistakes a majority-class classifier makes on a node.

    Parameters
    ----------
    labels_in_node : array-like of labels (+1 for safe loans, -1 for risky
        loans). Lists, NumPy arrays and pandas Series are all accepted.

    Returns
    -------
    int : the size of the minority class, i.e. min(#(+1), #(-1)).
        An empty node yields 0.
    """
    # Normalising to an ndarray makes the comparisons below elementwise for
    # every input type (a plain list compared with `== 1` would be a single bool).
    labels = np.asarray(labels_in_node)

    # Corner case: an empty node has no mistakes.
    if labels.size == 0:
        return 0

    # Vectorised counts of safe (+1) and risky (-1) loans.
    num_safe = int(np.count_nonzero(labels == 1))
    num_risky = int(np.count_nonzero(labels == -1))

    # The majority classifier gets every minority-class point wrong.
    return min(num_safe, num_risky)

In [18]:
def best_splitting_feature(data, features, target):
    """Pick the 0/1 feature whose split gives the lowest classification error.

    Parameters
    ----------
    data : frame indexable by column name and by boolean mask.
    features : iterable of candidate feature (column) names.
    target : name of the label column.

    Returns
    -------
    The first feature achieving the smallest error, or None when `features`
    is empty.
    """
    # Float so the error is a true fraction.
    total_points = float(len(data))

    def split_error(feature):
        # Left branch: feature value 0; right branch: feature value 1.
        zeros = data[data[feature] == 0]
        ones = data[data[feature] == 1]
        mistakes = (intermediate_node_num_mistakes(zeros[target])
                    + intermediate_node_num_mistakes(ones[target]))
        # Error = (# of mistakes (left) + # of mistakes (right)) / (# of data points)
        return mistakes / total_points

    # A classification error never exceeds 1, so 10 is a safe starting bound.
    winner, lowest_error = None, 10
    for candidate in features:
        candidate_error = split_error(candidate)
        # Strict comparison: ties keep the earlier feature.
        if candidate_error < lowest_error:
            winner, lowest_error = candidate, candidate_error
    return winner

In [19]:
def create_leaf(target_values):
    """Build a leaf node that predicts the majority class of `target_values`.

    `target_values` must support elementwise comparison and boolean-mask
    indexing (e.g. a NumPy array or pandas Series). The prediction is +1 only
    when +1 labels strictly outnumber -1 labels; ties and empty input give -1.
    """
    # Count the data points labelled +1 and -1 in this node.
    positives = len(target_values[target_values == +1])
    negatives = len(target_values[target_values == -1])

    majority_class = 1 if positives > negatives else -1

    return {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf': True,
            'prediction': majority_class}

### Incorporating new early stopping conditions in binary decision tree implementation

In [20]:
def decision_tree_create(data, features, target, current_depth = 0,
                         max_depth = 10, min_node_size=1,
                         min_error_reduction=0.0):
    """Recursively grow a binary decision tree over 0/1 features.

    A node becomes a majority-class leaf as soon as one of these checks,
    evaluated in this order, succeeds:
      1. the majority classifier makes no mistakes on the node,
      2. there are no features left to split on,
      3. current_depth has reached max_depth,
      4. the node holds at most min_node_size data points,
      5. the best split reduces the classification error by no more than
         min_error_reduction.

    Parameters
    ----------
    data : frame with the feature columns and the target column.
    features : list of feature names still available for splitting.
    target : name of the label column.
    current_depth : depth of this node (0 at the root).
    max_depth, min_node_size, min_error_reduction : early-stopping thresholds.

    Returns
    -------
    dict with keys 'is_leaf', 'prediction', 'splitting_feature', 'left' and
    'right'; 'left' holds the subtree for feature value 0, 'right' for 1.
    Progress is reported with print().
    """
    remaining_features = features[:] # Private copy; the caller's list is never modified.
    node_labels = data[target]

    print("--------------------------------------------------------------------")
    print("Subtree, depth = %s (%s data points)." % (current_depth, len(node_labels)))

    def make_leaf(reason):
        # Report why growth stops here, then emit the majority-class leaf.
        print(reason)
        return create_leaf(node_labels)

    # Stopping condition 1: All nodes are of the same type.
    if intermediate_node_num_mistakes(node_labels) == 0:
        return make_leaf("Stopping condition 1 reached. All data points have the same target value.")

    # Stopping condition 2: No more features to split on.
    if remaining_features == []:
        return make_leaf("Stopping condition 2 reached. No remaining features.")

    # Early stopping condition 1: Reached max depth limit.
    if current_depth >= max_depth:
        return make_leaf("Early stopping condition 1 reached. Reached maximum depth.")

    # Early stopping condition 2: Reached the minimum node size.
    if reached_minimum_node_size(data, min_node_size):
        return make_leaf("Early stopping condition 2 reached. Reached minimum node size.")

    # Find the best feature and partition the node on its 0/1 value.
    splitting_feature = best_splitting_feature(data, features, target)
    zero_branch = data[data[splitting_feature] == 0]
    one_branch = data[data[splitting_feature] == 1]

    # Early stopping condition 3: Minimum error reduction.
    # Both errors are mistake counts divided by the size of this node.
    num_points = float(len(data))
    error_before_split = intermediate_node_num_mistakes(node_labels) / num_points
    mistakes_after_split = (intermediate_node_num_mistakes(zero_branch[target])
                            + intermediate_node_num_mistakes(one_branch[target]))
    error_after_split = mistakes_after_split / num_points

    if error_reduction(error_before_split, error_after_split) <= min_error_reduction:
        return make_leaf("Early stopping condition 3 reached. Minimum error reduction.")

    remaining_features.remove(splitting_feature)
    print("Split on feature %s. (%s, %s)" % (
        splitting_feature, len(zero_branch), len(one_branch)))

    def grow(subset):
        # Recurse one level deeper with the same thresholds.
        return decision_tree_create(subset, remaining_features, target,
                                    current_depth + 1, max_depth,
                                    min_node_size, min_error_reduction)

    # The left subtree is built (and logged) before the right one.
    left_tree = grow(zero_branch)
    right_tree = grow(one_branch)

    return {'is_leaf'          : False,
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree,
            'right'            : right_tree}

In [21]:
def count_nodes(tree):
    """Return the total number of nodes (internal nodes plus leaves) in `tree`."""
    total = 0
    pending = [tree]  # nodes still to be visited
    while pending:
        node = pending.pop()
        total += 1
        if not node['is_leaf']:
            pending.append(node['left'])
            pending.append(node['right'])
    return total

In [22]:
# Sanity check: the test below expects this depth-2 tree to have exactly 7 nodes.
small_decision_tree = decision_tree_create(
    train_data, features, 'safe_loans',
    max_depth=2,
    min_node_size=10,
    min_error_reduction=0.0,
)
if count_nodes(small_decision_tree) != 7:
    print('Test failed... try again!')
    print('Number of nodes found                :', count_nodes(small_decision_tree))
    print('Number of nodes that should be there : 7')
else:
    print('Test passed!')

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 1 (28001 data points).
Split on feature grade.D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Early stopping condition 1 reached. Reached maximum depth.
-----------------------------------------------

In [23]:
# Tree with all early stopping conditions active: depth limit 6,
# minimum node size 100 and minimum error reduction 0.0.
my_decision_tree_new = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6, 
                                min_node_size = 100, min_error_reduction=0.0)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 3 reached. Minimum error reduction.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Split on feature emp_length.< 1 year. (90, 11)
--------------------------------------------------------------------
Subtree, depth = 3 (90 data points).
Early stopping condition 2 reached. Reached minimum node size.
--------------------------------------------------------------------
Subtree, depth = 3 (11 data points).
Early stopping condition 2 reached. Reached minimum node size.
------------------------------------

In [24]:

# Baseline tree limited only by depth: min_node_size=0 and
# min_error_reduction=-1 effectively switch off the node-size and
# error-reduction checks (pure or empty nodes are already caught by
# stopping condition 1 before those checks run).
my_decision_tree_old = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6, 
                                min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade.C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade.D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade.E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
E

Split on feature emp_length.8 years. (347, 11)
--------------------------------------------------------------------
Subtree, depth = 5 (347 data points).
Split on feature grade.A. (347, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (347 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 5 (11 data points).
Split on feature home_ownership.OWN. (9, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (9 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (2 data points).
Stopping condition 1 reached. All data points h

## Make predictions

In [26]:
def classify(tree, x, annotate = False):
    """Return the prediction of `tree` for a single data point `x`.

    Starting at the root, follow 'left' when the value of the node's
    splitting feature in `x` equals 0 and 'right' otherwise, until a leaf is
    reached. With annotate=True every visited split and the final leaf
    prediction are printed.
    """
    node = tree
    while not node['is_leaf']:
        # Look up the value of the feature this node splits on.
        feature = node['splitting_feature']
        feature_value = x[feature]
        if annotate:
            print("Split on %s = %s" % (feature, feature_value))
        node = node['left'] if feature_value == 0 else node['right']

    if annotate:
        print("At leaf, predicting %s" % node['prediction'])
    return node['prediction']

In [27]:
# Inspect the first row of the encoded validation set.
validation_set.iloc[0]

safe_loans                -1
grade.A                    0
grade.B                    0
grade.C                    0
grade.D                    1
grade.E                    0
grade.F                    0
grade.G                    0
term. 36 months            0
term. 60 months            1
home_ownership.MORTGAGE    0
home_ownership.OTHER       0
home_ownership.OWN         0
home_ownership.RENT        1
emp_length.1 year          0
emp_length.10+ years       0
emp_length.2 years         1
emp_length.3 years         0
emp_length.4 years         0
emp_length.5 years         0
emp_length.6 years         0
emp_length.7 years         0
emp_length.8 years         0
emp_length.9 years         0
emp_length.< 1 year        0
Name: 24, dtype: int64

In [28]:
# Predict the label of the first validation row with my_decision_tree_new.
print('Predicted class: %s ' % classify(my_decision_tree_new, validation_set.iloc[0]))

Predicted class: -1 


In [34]:

# Same prediction, this time printing each split visited on the way to the leaf.
classify(my_decision_tree_new, validation_set.iloc[0], annotate = True)

Split on term. 36 months = 0
Split on grade.A = 0
At leaf, predicting -1


-1

In [35]:
# Trace the same row through my_decision_tree_old for comparison.
classify(my_decision_tree_old, validation_set.iloc[0], annotate = True)

Split on term. 36 months = 0
Split on grade.A = 0
Split on grade.B = 0
Split on grade.C = 0
Split on grade.D = 1
Split on grade.E = 0
At leaf, predicting -1


-1

In [36]:
def evaluate_classification_error(tree, data, target='safe_loans'):
    """Return the fraction of rows in `data` that `tree` misclassifies.

    Parameters
    ----------
    tree : decision tree dict accepted by classify().
    data : DataFrame holding the feature columns and the label column.
    target : name of the label column. Defaults to 'safe_loans', the column
        this function always used before the parameter existed.

    Returns
    -------
    float : number of wrong predictions divided by len(data).

    Raises
    ------
    ZeroDivisionError : if `data` has no rows.
    """
    # Apply classify(tree, x) to each row, addressed by position.
    predictions = np.array([classify(tree, data.iloc[i]) for i in range(len(data))])

    # Compare positionally against the raw label values so the result cannot
    # depend on the frame's index.
    actual = data[target].values
    num_mistakes = int(np.count_nonzero(predictions != actual))

    return float(num_mistakes) / len(data)

In [37]:
# Classification error of my_decision_tree_new on the validation set.
evaluate_classification_error(my_decision_tree_new, validation_set)

0.37774666092201636

In [38]:
# Classification error of my_decision_tree_old on the validation set.
evaluate_classification_error(my_decision_tree_old, validation_set)

0.37774666092201636

In [39]:
# Three trees that differ only in max_depth (2, 6 and 14). min_node_size=0 and
# min_error_reduction=-1 leave the other early stopping checks effectively inactive.
model_1 = decision_tree_create(
    train_data, features, 'safe_loans',
    max_depth=2, min_node_size=0, min_error_reduction=-1,
)
model_2 = decision_tree_create(
    train_data, features, 'safe_loans',
    max_depth=6, min_node_size=0, min_error_reduction=-1,
)
model_3 = decision_tree_create(
    train_data, features, 'safe_loans',
    max_depth=14, min_node_size=0, min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 1 (28001 data points).
Split on feature grade.D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Early stopping condition 1 reached. Reached maximum depth.
-----------------------------------------------

Split on feature grade.D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Split on feature grade.E. (22024, 1276)
--------------------------------------------------------------------
Subtree, depth = 3 (22024 data points).
Split on feature grade.F. (21666, 358)
--------------------------------------------------------------------
Subtree, depth = 4 (21666 data points).
Split on feature grade.C. (14444, 7222)
--------------------------------------------------------------------
Subtree, depth = 5 (14444 data points).
Split on feature grade.G. (14347, 97)
--------------------------------------------------------------------
Subtree, depth = 6 (14347 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (97 data points).
Early stopping condition 1 reached. Reached maximum depth.
----------------------------------

Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 13 (1 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 11 (164 data points).
Split on feature emp_length.2 years. (159, 5)
--------------------------------------------------------------------
Subtree, depth = 12 (159 data points).
Split on feature emp_length.3 years. (148, 11)
--------------------------------------------------------------------
Subtree, depth = 13 (148 data points).
Split on feature home_ownership.OWN. (148, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (148 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping c

Split on feature home_ownership.RENT. (8, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (8 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 11 (1 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 10 (1088 data points).
Split on feature home_ownership.OTHER. (1088, 0)
--------------------------------------------------------------------
Subtree, depth = 11 (1088 data points).
Split on feature home_ownership.OWN. (1088, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (1088 data points).
Split o

Split on feature emp_length.4 years. (746, 57)
--------------------------------------------------------------------
Subtree, depth = 11 (746 data points).
Split on feature home_ownership.OTHER. (746, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (746 data points).
Split on feature home_ownership.OWN. (598, 148)
--------------------------------------------------------------------
Subtree, depth = 13 (598 data points).
Split on feature home_ownership.RENT. (0, 598)
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 14 (598 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 13 (148 data points).
Split on feature emp_length.< 1

Split on feature emp_length.9 years. (71, 5)
--------------------------------------------------------------------
Subtree, depth = 14 (71 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (5 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 12 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 11 (602 data points).
Split on feature emp_length.9 years. (580, 22)
--------------------------------------------------------------------
Subtree, depth = 12 (580 data points).
Split on feature emp_length.3 years. (545, 35)
--------------------------------------------------------------------
Subtree, depth = 13 (545 data points).
Split on feature emp_lengt

Split on feature grade.C. (90, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (90 data points).
Split on feature grade.D. (90, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (90 data points).
Split on feature grade.E. (90, 0)
--------------------------------------------------------------------
Subtree, depth = 7 (90 data points).
Split on feature grade.F. (90, 0)
--------------------------------------------------------------------
Subtree, depth = 8 (90 data points).
Split on feature grade.G. (90, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (90 data points).
Split on feature term. 60 months. (0, 90)
--------------------------------------------------------------------
Subtree, depth = 10 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, 

Split on feature grade.D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Split on feature grade.E. (22024, 1276)
--------------------------------------------------------------------
Subtree, depth = 3 (22024 data points).
Split on feature grade.F. (21666, 358)
--------------------------------------------------------------------
Subtree, depth = 4 (21666 data points).
Split on feature grade.C. (14444, 7222)
--------------------------------------------------------------------
Subtree, depth = 5 (14444 data points).
Split on feature grade.G. (14347, 97)
--------------------------------------------------------------------
Subtree, depth = 6 (14347 data points).
Split on feature grade.A. (9318, 5029)
--------------------------------------------------------------------
Subtree, depth = 7 (9318 data points).
Split on feature home_ownership.OTHER. (9301, 17)
-------------------------------------------------------------

Split on feature home_ownership.RENT. (449, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (449 data points).
Split on feature emp_length.1 year. (431, 18)
--------------------------------------------------------------------
Subtree, depth = 14 (431 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (18 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 11 (9 data points).
Split on feature emp_length.3 years. (8, 1)
--------------------------------------------------------------------
Subtree, depth = 12 (8 data points).
Stopping condition 1 reached. 

Split on feature home_ownership.MORTGAGE. (4303, 2919)
--------------------------------------------------------------------
Subtree, depth = 6 (4303 data points).
Split on feature emp_length.4 years. (3969, 334)
--------------------------------------------------------------------
Subtree, depth = 7 (3969 data points).
Split on feature home_ownership.OTHER. (3957, 12)
--------------------------------------------------------------------
Subtree, depth = 8 (3957 data points).
Split on feature emp_length.9 years. (3828, 129)
--------------------------------------------------------------------
Subtree, depth = 9 (3828 data points).
Split on feature emp_length.2 years. (3312, 516)
--------------------------------------------------------------------
Subtree, depth = 10 (3312 data points).
Split on feature grade.A. (3312, 0)
--------------------------------------------------------------------
Subtree, depth = 11 (3312 data points).
Split on feature grade.B. (3312, 0)
--------------------------

Split on feature home_ownership.OTHER. (334, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (334 data points).
Split on feature home_ownership.OWN. (286, 48)
--------------------------------------------------------------------
Subtree, depth = 13 (286 data points).
Split on feature home_ownership.RENT. (0, 286)
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 14 (286 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 13 (48 data points).
Split on feature home_ownership.RENT. (48, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (48 data points).
Early stopping condition 1 reached.

Split on feature grade.B. (347, 0)
--------------------------------------------------------------------
Subtree, depth = 7 (347 data points).
Split on feature grade.C. (347, 0)
--------------------------------------------------------------------
Subtree, depth = 8 (347 data points).
Split on feature grade.G. (347, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (347 data points).
Split on feature term. 60 months. (347, 0)
--------------------------------------------------------------------
Subtree, depth = 10 (347 data points).
Split on feature home_ownership.MORTGAGE. (237, 110)
--------------------------------------------------------------------
Subtree, depth = 11 (237 data points).
Split on feature home_ownership.OTHER. (235, 2)
--------------------------------------------------------------------
Subtree, depth = 12 (235 data points).
Split on feature home_ownership.OWN. (203, 32)
-----------------------------------------------------------

Split on feature home_ownership.MORTGAGE. (855, 421)
--------------------------------------------------------------------
Subtree, depth = 10 (855 data points).
Split on feature home_ownership.OTHER. (849, 6)
--------------------------------------------------------------------
Subtree, depth = 11 (849 data points).
Split on feature home_ownership.OWN. (737, 112)
--------------------------------------------------------------------
Subtree, depth = 12 (737 data points).
Split on feature home_ownership.RENT. (0, 737)
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 13 (737 data points).
Split on feature emp_length.1 year. (670, 67)
--------------------------------------------------------------------
Subtree, depth = 14 (670 data points).
Early stopping condition 1 reached. Re

Split on feature emp_length.1 year. (2392, 241)
--------------------------------------------------------------------
Subtree, depth = 14 (2392 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (241 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 12 (404 data points).
Split on feature home_ownership.RENT. (404, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (404 data points).
Split on feature emp_length.1 year. (374, 30)
--------------------------------------------------------------------
Subtree, depth = 14 (374 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (30 data points).
Early stopping condition 1 reached

In [40]:
# Classification error of each depth-varied model on the training data.
print("Training data, classification error (model 1): %s" % evaluate_classification_error(model_1, train_data))
print("Training data, classification error (model 2): %s" % evaluate_classification_error(model_2, train_data))
print("Training data, classification error (model 3): %s" % evaluate_classification_error(model_3, train_data))

Training data, classification error (model 1): 0.40003761014399314
Training data, classification error (model 2): 0.3804266064904363
Training data, classification error (model 3): 0.3772566086395874


In [41]:
# Classification error of each depth-varied model on the validation data.
print("Validation data, classification error (model 1): %s" % evaluate_classification_error(model_1, validation_set))
print("Validation data, classification error (model 2): %s" % evaluate_classification_error(model_2, validation_set))
print("Validation data, classification error (model 3): %s" % evaluate_classification_error(model_3, validation_set))

Validation data, classification error (model 1): 0.3981042654028436
Validation data, classification error (model 2): 0.37774666092201636
Validation data, classification error (model 3): 0.38140887548470487


In [42]:
def count_leaves(tree):
    """Return the number of leaf nodes reachable from ``tree``.

    Each node is indexed with the keys ``'is_leaf'``, ``'left'`` and
    ``'right'``; a node whose ``'is_leaf'`` entry is truthy counts as one
    leaf, otherwise both of its children are visited.
    """
    leaf_total = 0
    pending = [tree]  # explicit stack of nodes still to be inspected
    while pending:
        node = pending.pop()
        if node['is_leaf']:
            leaf_total += 1
        else:
            pending.append(node['left'])
            pending.append(node['right'])
    return leaf_total

In [43]:
# count_leaves returns the number of leaf nodes only (internal nodes are not
# counted), so each line is labelled "leaves" rather than "nodes".
for report_label, fitted_tree in (
    ("number of leaves in (model 1):", model_1),
    ("number of leaves in (model 2):", model_2),
    ("number of leaves in (model 3):", model_3),
):
    print(report_label, count_leaves(fitted_tree))

number of nodes in (model 1): 4
number of nodes in (model 2): 39
number of nodes in (model 3): 341


## Exploring the effect of min_error_reduction

In [44]:
# Train three trees with max_depth=6 and min_node_size=0 that differ only in
# min_error_reduction (-1, 0 and 5), in that order.
model_4, model_5, model_6 = [
    decision_tree_create(
        train_data,
        features,
        'safe_loans',
        max_depth=6,
        min_node_size=0,
        min_error_reduction=error_threshold,
    )
    for error_threshold in (-1, 0, 5)
]

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade.C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade.D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade.E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
E

Split on feature home_ownership.MORTGAGE. (4303, 2919)
--------------------------------------------------------------------
Subtree, depth = 6 (4303 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (2919 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 4 (358 data points).
Split on feature emp_length.8 years. (347, 11)
--------------------------------------------------------------------
Subtree, depth = 5 (347 data points).
Split on feature grade.A. (347, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (347 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached. All data point

In [46]:
# Report evaluate_classification_error on validation_set for models 4-6,
# one printed line per model, in model order.
for report_label, fitted_tree in (
    ("Validation data, classification error (model 4):", model_4),
    ("Validation data, classification error (model 5):", model_5),
    ("Validation data, classification error (model 6):", model_6),
):
    print(report_label, evaluate_classification_error(fitted_tree, validation_set))

Validation data, classification error (model 4): 0.37774666092201636
Validation data, classification error (model 5): 0.37774666092201636
Validation data, classification error (model 6): 0.503446790176648


In [48]:
# count_leaves returns the number of leaf nodes only (internal nodes are not
# counted), so each line is labelled "leaves" rather than "nodes".
for report_label, fitted_tree in (
    ("number of leaves in (model 4):", model_4),
    ("number of leaves in (model 5):", model_5),
    ("number of leaves in (model 6):", model_6),
):
    print(report_label, count_leaves(fitted_tree))

number of nodes in (model 4): 39
number of nodes in (model 5): 12
number of nodes in (model 6): 1


# Exploring the effect of min_node_size

In [49]:
# Train three trees with max_depth=6 and min_error_reduction=-1 that differ
# only in min_node_size (0, 2000 and 50000), in that order.
model_7, model_8, model_9 = [
    decision_tree_create(
        train_data,
        features,
        'safe_loans',
        max_depth=6,
        min_node_size=node_size_floor,
        min_error_reduction=-1,
    )
    for node_size_floor in (0, 2000, 50000)
]

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade.C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade.D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade.E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
E

Split on feature home_ownership.MORTGAGE. (4303, 2919)
--------------------------------------------------------------------
Subtree, depth = 6 (4303 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (2919 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 4 (358 data points).
Split on feature emp_length.8 years. (347, 11)
--------------------------------------------------------------------
Subtree, depth = 5 (347 data points).
Split on feature grade.A. (347, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (347 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached. All data point

Split on feature grade.A. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 3 (4701 data points).
Split on feature grade.B. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 4 (4701 data points).
Split on feature grade.C. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (4701 data points).
Split on feature grade.E. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (4701 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
-------

In [50]:
# Report evaluate_classification_error on validation_set for models 7-9,
# one printed line per model, in model order.
for report_label, fitted_tree in (
    ("Validation data, classification error (model 7):", model_7),
    ("Validation data, classification error (model 8):", model_8),
    ("Validation data, classification error (model 9):", model_9),
):
    print(report_label, evaluate_classification_error(fitted_tree, validation_set))

Validation data, classification error (model 7): 0.37774666092201636
Validation data, classification error (model 8): 0.3774235243429556
Validation data, classification error (model 9): 0.503446790176648


In [52]:
# count_leaves returns the number of leaf nodes only (internal nodes are not
# counted), so each line is labelled "leaves" rather than "nodes".
for report_label, fitted_tree in (
    ("number of leaves in (model 7):", model_7),
    ("number of leaves in (model 8):", model_8),
    ("number of leaves in (model 9):", model_9),
):
    print(report_label, count_leaves(fitted_tree))

number of nodes in (model 7): 39
number of nodes in (model 8): 20
number of nodes in (model 9): 1
