# Implementing binary decision trees
[Programming assignment 2](https://www.coursera.org/learn/ml-classification/supplement/seWHJ/implementing-binary-decision-trees) of *Machine Learning: Classification* by University of Washington on Coursera.

# 1. Prepare the data

In [2]:
import pandas as pd
import numpy as np
loans = pd.read_csv('../Data/lending-club-data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## Load data 

In [3]:
#  reassign the labels to have +1 for a safe loan, and -1 for a risky (bad) loan
loans['safe_loans'] = loans['bad_loans'].map({0: +1, 1: -1})
loans.drop('bad_loans', axis=1)
# consider four features
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'
# extract these columns from the dataset and discard others
loans = loans[features + [target]]

## One-hot encoding
By one-hot encoding, we have only numeric features. In this case, each encoded feature is either 1 or 0.

In [6]:
loans = pd.get_dummies(loans)
loans.head(5)
print(loans.shape)

(122607, 26)


## balance the two classes in the dataset

In [9]:
# the train and test set index
import json
train_idx_file = '../data/module-5-assignment-2-train-idx.json'
test_idx_file = '../data/module-5-assignment-2-test-idx.json'
with open(train_idx_file) as f:
    train_idx = json.load(f)
with open(test_idx_file) as f:
    test_idx = json.load(f)
train_data = loans.iloc[train_idx, :]
test_data = loans.iloc[test_idx, :]
print('train shape: ', train_data.shape, '. \nValue counts: \n', train_data['safe_loans'].value_counts())
print('test shape: ', test_data.shape, test_data['safe_loans'].value_counts())

train shape:  (37224, 26) . 
Value counts: 
  1    18748
-1    18476
Name: safe_loans, dtype: int64
test shape:  (9284, 26) -1    4674
 1    4610
Name: safe_loans, dtype: int64


In [12]:
train_data.columns.shape

(26,)

# 2。Implement a binary decision tree

## Function to count number of mistakes while predicting majority class
In each intermediate node, we label it with the majority class. Then, the misclassification rate can be calculated. This is used to determine the best feature for splitting.

**Note:** Keep in mind that in order to compute the number of mistakes for a majority classifier, we only need the label (y values) of the data points in the node.

In [13]:
def intermediate_node_num_mistakes(labels_in_node):
    num_safe_loans = np.count_nonzero(labels_in_node == 1)
    num_risky_loans = np.count_nonzero(labels_in_node == -1)
    return num_risky_loans if num_safe_loans > num_risky_loans else num_safe_loans

In [17]:
# test case 1
example_labels = np.array([-1, -1, 1, -1, -1])
if intermediate_node_num_mistakes(example_labels) == 1:
    print('Test passed!')
else:
    print('Test 1 failed... try again!')

Test passed!


In [18]:
# test case 2
example_labels = np.array([-1, -1, 1, 1, 1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print('Test passed!')
else:
    print('Test 1 failed... try again!')

Test passed!


In [19]:
example_labels = np.array([-1, -1, -1, -1, -1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print('Test passed!')
else:
    print('Test 1 failed... try again!')

Test passed!


## Function to pick best feature to split on
The function will loop through the list of possible features, and consider splitting on each of them. It will calculate the classification error of each split and return the feature that had the smallest classification error when split on.

In [22]:
def best_splitting_feature(data, features, target):
    best_feature = None
    min_mistakes = float('Inf')
    for feature in features:
        # split into two subsets: left for 0 and right for 1
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]
        # number of misclassifications
        left_mistakes = intermediate_node_num_mistakes(left_split[target])
        right_mistakes = intermediate_node_num_mistakes(right_split[target])
        # error rate is: (left_mistakes + right_mistakes) / number of records in data
        # since number of records in data remains the same for this splitting, no need to compute
        mistakes = left_mistakes + right_mistakes
        if mistakes < min_mistakes:
            min_mistakes = mistakes
            best_feature = feature
    return best_feature

## Build the tree
Each node in the tree is represented as following

In [23]:
class Node:
    def __init__(self):
        self.is_leaf = False
        self.predication = None
        self.left = None
        self.right = None
        self.splitting_feature = None

In [24]:
# Create a leaf node given a set of target values: majority 
def create_leaf(labels_in_node):
    leaf = Node()
    leaf.is_leaf = True
    if np.count_nonzero(labels_in_node == 1) > np.count_nonzero(labels_in_node == -1):
        leaf.predication = 1
    else:
        leaf.predication = -1
    return leaf

Recursive tree building stop criteria:
+ Condition 1: all data points in the node are from the same class
+ Condition 2: no more features available (each feature can be used for splitting once along a path in a tree)
+ Condition 3: max_depth

In [47]:
def decision_tree_create(data, features, target, current_depth=0, max_depth=10):
    target_values = data[target];
    # stop
    # condition1: in the same class, that is, misclassification error will be zero
    if intermediate_node_num_mistakes(target_values) == 0:
        print('Stopping condition 1 reached.')
        return create_leaf(target_values)
    if len(features) == 0:
        print('Stopping condition 2 reached.')
        return create_leaf(target_values)
    if current_depth >= max_depth:
        print('Stopping condition 3 reached: max depth.')
        return create_leaf(target_values)
    
    # find the best feature to split
    best_feature = best_splitting_feature(data, features, target)
    # split
    left_split = data[data[best_feature] == 0]
    right_split = data[data[best_feature] == 1]
    # remove this feature from current recursion path
    # in Python, generally do NOT change the arguments due to the reference semantics
    remaining_features = features[:]
    remaining_features.remove(best_feature)
    print('Split on feature {0} into two subsets of size {1} and {2}.'.format(best_feature, len(left_split), len(right_split)))
    
    # if the selected feature has only one value in this dataset, then either left or right_split will be empty. In this case,
    # we will build a leaf node for this dataset.
    if len(left_split) == 0 or len(right_split) == 0:
        print('The chosen splitting feature has only one value in the dataset.')
        return create_leaf(target_values)
    
    # recursion
    node = Node()
    node.is_leaf = False
    node.splitting_feature = best_feature
    node.left = decision_tree_create(left_split, features, target, current_depth + 1, max_depth)
    node.right = decision_tree_create(right_split, features, target, current_depth + 1, max_depth)
    return node

+ In the above building process, when the chosen feature has only one value in the intermediate subset D, the procedure labels such an intermediate node D as a leaf. 

+ However, a better way is to continue the tree building: for the child whose data is empty, assign it as a leaf and its class is the majority of its parent D. For the other child whose data is not empty (actually also D), continue the building process for it.

In [48]:
# build the tree
features = list(train_data) # equivalent to my_dataframe.columns.values.tolist()
features.remove(target)
tree = decision_tree_create(train_data, features, target, 0, max_depth=6)

> [1;32m<ipython-input-47-e71541d13fb1>[0m(7)[0;36mdecision_tree_create[1;34m()[0m
[1;32m      5 [1;33m    [1;31m# stop[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      6 [1;33m    [1;31m# condition1: in the same class, that is, misclassification error will be zero[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m----> 7 [1;33m    [1;32mif[0m [0mintermediate_node_num_mistakes[0m[1;33m([0m[0mtarget_values[0m[1;33m)[0m [1;33m==[0m [1;36m0[0m[1;33m:[0m[1;33m[0m[0m
[0m[1;32m      8 [1;33m        [0mprint[0m[1;33m([0m[1;34m'Stopping condition 1 reached.'[0m[1;33m)[0m[1;33m[0m[0m
[0m[1;32m      9 [1;33m        [1;32mreturn[0m [0mcreate_leaf[0m[1;33m([0m[0mtarget_values[0m[1;33m)[0m[1;33m[0m[0m
[0m
ipdb> n
> [1;32m<ipython-input-47-e71541d13fb1>[0m(10)[0;36mdecision_tree_create[1;34m()[0m
[1;32m      8 [1;33m        [0mprint[0m[1;33m([0m[1;34m'Stopping condition 1 reached.'[0m[1;33m)[0m[1;33m[0m[0m
[0m[1;32m      9 [1;33m

## Predication
Just preorder traversal of a binary tree

In [26]:
def classify(tree, x, annotate=False):
    if tree.is_leaf:
        if annotate:
            print('At leaf, predicting {}'.format(tree.predication))
        return tree.predication
    # goto the left or right subtree depending on the feature value
    split_feature_value = x[tree.splitting_feature]
    if annotate:
        print('Split on {} = {}'.format(tree.splitting_feature, split_feature_value))
    if split_feature_value == 0:
        return classify(tree.left, x, annotate)
    else:
        return classify(tree.right, x, annotate)

In [27]:
# a simple test
x = test_data.iloc[0, :]
print(x)

safe_loans                -1
grade_A                    0
grade_B                    0
grade_C                    0
grade_D                    1
grade_E                    0
grade_F                    0
grade_G                    0
term_ 36 months            0
term_ 60 months            1
home_ownership_MORTGAGE    0
home_ownership_OTHER       0
home_ownership_OWN         0
home_ownership_RENT        1
emp_length_1 year          0
emp_length_10+ years       0
emp_length_2 years         1
emp_length_3 years         0
emp_length_4 years         0
emp_length_5 years         0
emp_length_6 years         0
emp_length_7 years         0
emp_length_8 years         0
emp_length_9 years         0
emp_length_< 1 year        0
emp_length_n/a             0
Name: 24, dtype: int64


In [34]:
classify(tree, x, annotate=True)

Split on term_ 36 months = 0
Split on grade_A = 0
At leaf, predicting -1


-1

In [35]:
def evaluate_classification_error(tree, data):
    predications = data.apply(lambda record: classify(tree, record), axis=1) # apply to each row
    return (predications != data[target]).sum() / len(predications)

In [36]:
evaluate_classification_error(tree, test_data)

0.38377854373115039

In [38]:
tree.splitting_feature

'term_ 36 months'