# Identifying safe loans with decision trees

In [21]:
import pandas as pd
import numpy as np
from sklearn import tree
from IPython.display import Image
import pydotplus
%matplotlib inline

In [2]:
loans = pd.read_csv('lending-club-data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
loans.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1.0,1.0,1.0,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1.0,1.0,1.0,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1.0,1.0,1.0,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1.0,1.0,1.0,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1.0,1.0,1.0,0,5.21533,20141201T000000,1,1,1


In [4]:
loans.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d',
       'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans', 'bad_loans',
       'emp_length_num', 'grade_num', 'sub_gra

## Features for the classification algorithm

In [5]:
# safe_loans =  1 => safe
# safe_loans = -1 => risky
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans.drop('bad_loans',axis=1,inplace=True)

In [6]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                   # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]

In [7]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print("Number of safe loans  : %s" % len(safe_loans_raw))
print("Number of risky loans : %s" % len(risky_loans_raw))

Number of safe loans  : 99457
Number of risky loans : 23150


In [8]:
print("Percentage of safe loans  :", len(safe_loans_raw)*1.0/len(loans))
print("Percentage of risky loans :", len(risky_loans_raw)*1.0/len(loans))

Percentage of safe loans  : 0.8111853319957262
Percentage of risky loans : 0.18881466800427382


In [9]:
# Since there are fewer risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))

risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(frac=percentage, random_state=1)

# Append the risky_loans with the downsampled version of safe_loans
loans_data = risky_loans.append(safe_loans)

In [10]:
print("Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data)))
print("Percentage of risky loans                :", len(risky_loans) / float(len(loans_data)))
print("Total number of loans in our new dataset :", len(loans_data))

Percentage of safe loans                 : 0.5
Percentage of risky loans                : 0.5
Total number of loans in our new dataset : 46300


## Split data into training and validation sets

In [11]:
train_idx = pd.read_json('module-5-assignment-1-train-idx.json',typ='series').values
validation_idx = pd.read_json('module-5-assignment-1-validation-idx.json',typ='series').values
train_data, validation_data = loans.iloc[train_idx],loans.iloc[validation_idx]

## Use decision tree to build a classifier

In [27]:
loans_data = risky_loans.append(safe_loans)
loans_label = loans_data[target]
loans_data = loans_data[features]
def onehot_transform(X,names=None):
    dummies_X = pd.get_dummies(X)
    if names is None:
        return dummies_X,dummies_X.columns.values
    else:
        return pd.DataFrame(dummies_X,columns=names).fillna(0)

#loans_data = one_hot_transfrom(loans_data)
loans_data,onehot_columns = onehot_transform(loans_data)

In [14]:
loans_data.head()

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,grade_A,grade_B,grade_C,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
1,1,1,1.0,1,1,9.4,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
6,0,5,5.55,1,1,32.6,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
7,1,1,18.08,1,1,36.5,0.0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
10,1,1,10.08,1,1,91.7,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
12,0,4,7.06,1,1,55.5,0.0,0,1,0,...,0,0,0,0,1,0,0,0,1,0


In [15]:
loans_data.columns

Index(['short_emp', 'emp_length_num', 'dti', 'last_delinq_none',
       'last_major_derog_none', 'revol_util', 'total_rec_late_fee', 'grade_A',
       'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F', 'grade_G',
       'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3', 'sub_grade_A4',
       'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2', 'sub_grade_B3',
       'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1', 'sub_grade_C2',
       'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5', 'sub_grade_D1',
       'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4', 'sub_grade_D5',
       'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3', 'sub_grade_E4',
       'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2', 'sub_grade_F3',
       'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1', 'sub_grade_G2',
       'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5',
       'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN',
       'home_ownership_RENT', 'purpose_car', 'purpose_credit_card',
       'purp

In [16]:
risky_loans.columns

Index(['grade', 'sub_grade', 'short_emp', 'emp_length_num', 'home_ownership',
       'dti', 'purpose', 'term', 'last_delinq_none', 'last_major_derog_none',
       'revol_util', 'total_rec_late_fee', 'safe_loans'],
      dtype='object')

In [17]:
from sklearn.tree import DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier(max_depth=6)
decision_tree_model.fit(loans_data, loans_label)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Visualizing a learned model

In [18]:
small_model = DecisionTreeClassifier(max_depth=2).fit(loans_data,loans_label)

In [22]:

def plot_tree(decision_tree, out_file=None, max_depth=None,
              feature_names=None, class_names=None, label='all',
              filled=True, leaves_parallel=False, impurity=True,
              node_ids=False, proportion=False, rotate=False,
              rounded=True, special_characters=True):
    
    dot_data = tree.export_graphviz(decision_tree, out_file=out_file,
                                    max_depth=max_depth, feature_names=feature_names,
                                    class_names=class_names, label=label, filled=filled,
                                    leaves_parallel=leaves_parallel, impurity=impurity, node_ids=node_ids,
                                    proportion=proportion, rotate=rotate, rounded=rounded,
                                    special_characters=special_characters)
    graph = pydotplus.graph_from_dot_data(dot_data)
    return Image(graph.create_png())

## Make Predictions

In [24]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
19,B,B3,0,11,OWN,11.18,credit_card,36 months,1,1,82.4,0.0,1
79,D,D1,0,10,RENT,16.85,debt_consolidation,36 months,1,1,96.4,0.0,1
24,D,D2,0,3,RENT,13.97,other,60 months,0,1,59.5,0.0,-1
41,A,A5,0,11,MORTGAGE,16.33,debt_consolidation,36 months,1,1,62.1,0.0,-1


In [28]:
decision_tree_model.predict(onehot_transform(sample_validation_data,onehot_columns))

array([ 1, -1, -1,  1], dtype=int64)

In [30]:
decision_tree_model.predict_proba(onehot_transform(sample_validation_data,onehot_columns))[:,1]

array([0.64002368, 0.44303303, 0.26418504, 0.77623991])

## Tricky predictions

In [31]:
small_model.predict_proba(onehot_transform(sample_validation_data,onehot_columns))[:,1]

array([0.5740155 , 0.40414794, 0.40414794, 0.77193538])

## Evaluating accuracy of the decision tree model

In [32]:
from sklearn.metrics import accuracy_score
print(small_model.score(onehot_transform(train_data[features],onehot_columns),train_data[target]))
print(decision_tree_model.score(onehot_transform(train_data[features],onehot_columns),train_data[target]))

0.613448312916398
0.6368203309692672


In [33]:
print(small_model.score(onehot_transform(validation_data[features],onehot_columns),validation_data[target]))
print(decision_tree_model.score(onehot_transform(validation_data[features],onehot_columns),validation_data[target]))

0.6193451098664369
0.6370099095217578


## Evaluating accuracy of a complex decision tree model

In [35]:
big_model = DecisionTreeClassifier(max_depth=10)
big_model.fit(onehot_transform(train_data[features],onehot_columns),
             train_data[target])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [36]:
print(big_model.score(onehot_transform(train_data[features],onehot_columns),train_data[target]))
print(big_model.score(onehot_transform(validation_data[features],onehot_columns),validation_data[target]))

0.6637921770900495
0.6264541146057734


In [37]:
predictions = decision_tree_model.predict(onehot_transform(validation_data,onehot_columns))

In [38]:
false_positives = sum((predictions ==1)&(validation_data[target]==-1))

In [39]:
false_negatives = sum((predictions ==-1)&(validation_data[target]==1))

In [40]:

false_positives*20000+false_negatives*10000

49270000