Programming assignment 1 for week3 (decision tree) of [classification course](https://www.coursera.org/learn/ml-classification/supplement/ssPTc/identifying-safe-loans-with-decision-trees) on Coursera.

# Load the dataset and explore the features as well as the target column

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
loans = pd.read_csv('../Data/lending-club-data.csv')
loans.head(10)
loans.columns

  interactivity=interactivity, compiler=compiler, result=result)


Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d',
       'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans', 'bad_loans',
       'emp_length_num', 'grade_num', 'sub_gra

In [3]:
# to be consistent with the lecture, turn the column "bad_loans" into a new column "safe_loans". In "safe_loans", +1 as a safe loan and -1 as a bad loan.
loans['safe_loans'] = loans['bad_loans'].map({0: +1, 1: -1})
loans = loans.drop('bad_loans', axis=1)

In [4]:
# Distribution of safe_loans
t = loans['safe_loans'].value_counts()
# t has two indices, 1 and -1
print('safe: {0:.3f}'.format(t[1] / loans.shape[0]))

safe: 0.811


# Features for the classification
We'll only use a subset of the available features.

In [5]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                    # prediction target (y) (+1 means safe, -1 is risky)

In [7]:
# Extract the feature and target column
loans = loans[features + [target]]

In [8]:
loans.head(10)

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,B,B2,0,11,RENT,27.65,credit_card,36 months,1,1,83.7,0.0,1
1,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0.0,-1
2,C,C5,0,11,RENT,8.72,small_business,36 months,1,1,98.5,0.0,1
3,C,C1,0,11,RENT,20.0,other,36 months,0,1,21.0,16.97,1
4,A,A4,0,4,RENT,11.2,wedding,36 months,1,1,28.3,0.0,1
5,E,E1,0,10,RENT,5.35,car,36 months,1,1,87.5,0.0,1
6,F,F2,0,5,OWN,5.55,small_business,60 months,1,1,32.6,0.0,-1
7,B,B5,1,1,RENT,18.08,other,60 months,1,1,36.5,0.0,-1
8,C,C3,0,6,OWN,16.12,debt_consolidation,60 months,1,1,20.6,0.0,1
9,B,B5,0,11,OWN,10.78,debt_consolidation,36 months,1,1,67.1,0.0,1


## Sample data to balance classes
One way to combat class imbalance is to undersample the larger class until the class distribution is approximately half and half. Here, we will undersample the larger class (safe loans) in order to balance out our dataset. This means we are throwing away many data points.

In [9]:
# Let's first check the distribution of the two classes
loans['safe_loans'].value_counts()

 1    99457
-1    23150
Name: safe_loans, dtype: int64

In [14]:
import json
# the two classes are imbalanced. We need to undersample the +1 class. Here we use the provided indices.
train_idx_file = '../Data/module-5-assignment-1-train-idx.json'
validation_idx_file = '../Data/module-5-assignment-1-validation-idx.json'
with open(train_idx_file) as f:
    train_idx = json.load(f)
with open(validation_idx_file) as f:
    validation_idx = json.load(f)

In [17]:
train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]

In [20]:
print(train_data['safe_loans'].value_counts(), validation_data['safe_loans'].value_counts(), sep='\n')

 1    18748
-1    18476
Name: safe_loans, dtype: int64
-1    4674
 1    4610
Name: safe_loans, dtype: int64


## One-hot encoding
The decision tree in scikit-learn can only accept numerical values. For the non-numerical columns, we need to transform them into numerical values.

In [22]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37224 entries, 1 to 122603
Data columns (total 13 columns):
grade                    37224 non-null object
sub_grade                37224 non-null object
short_emp                37224 non-null int64
emp_length_num           37224 non-null int64
home_ownership           37224 non-null object
dti                      37224 non-null float64
purpose                  37224 non-null object
term                     37224 non-null object
last_delinq_none         37224 non-null int64
last_major_derog_none    37224 non-null int64
revol_util               37224 non-null float64
total_rec_late_fee       37224 non-null float64
safe_loans               37224 non-null int64
dtypes: float64(3), int64(5), object(5)
memory usage: 4.0+ MB


In [24]:
train_data.describe(include='all')

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
count,37224,37224,37224.0,37224.0,37224,37224.0,37224,37224,37224.0,37224.0,37224.0,37224.0,37224.0
unique,7,35,,,4,,12,2,,,,,
top,B,B3,,,MORTGAGE,,debt_consolidation,36 months,,,,,
freq,10366,2498,,,17378,,20970,28001,,,,,
mean,,,0.128868,6.342682,,16.102869,,,0.584005,0.873791,55.818457,1.282583,0.007307
std,,,0.335059,3.754569,,7.603801,,,0.492899,0.332089,25.481621,7.081641,0.999987
min,,,0.0,0.0,,0.0,,,0.0,0.0,0.0,0.0,-1.0
25%,,,0.0,3.0,,10.42,,,0.0,1.0,37.6,0.0,-1.0
50%,,,0.0,6.0,,15.99,,,1.0,1.0,58.2,0.0,1.0
75%,,,0.0,11.0,,21.58,,,1.0,1.0,76.3,0.0,1.0


In [30]:
#  If columns is None then all the columns with object or category dtype will be converted.
train_data = pd.get_dummies(train_data)
validation_data = pd.get_dummies(validation_data)

# Decision Tree

In [36]:
from sklearn import tree
decision_tree_model = tree.DecisionTreeClassifier(max_depth=6)
small_model = tree.DecisionTreeClassifier(max_depth=2)
train_X = train_data.loc[:, train_data.columns != 'safe_loans']
train_y = train_data['safe_loans']
decision_tree_model.fit(train_X, train_y)
small_model.fit(train_X, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Predication

In [41]:
# pick two samples from the validation set
validation_safe_loans = validation_data[validation_data['safe_loans'] == 1]
validation_risky_loans = validation_data[validation_data['safe_loans'] == -1]
sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]
sample_validation_data = pd.concat([sample_validation_data_risky, sample_validation_data_safe])

In [43]:
# make predications
sample_validation_X = sample_validation_data.loc[:, sample_validation_data.columns != 'safe_loans']
decision_tree_model.predict(sample_validation_X)

array([-1,  1,  1, -1], dtype=int64)

In [44]:
decision_tree_model.predict_proba(sample_validation_X)

array([[ 0.64750958,  0.35249042],
       [ 0.20789474,  0.79210526],
       [ 0.34156543,  0.65843457],
       [ 0.53630646,  0.46369354]])

In [47]:
# mean accuracy on the training set
for tree_model in [decision_tree_model, small_model]:
    print(tree_model.score(train_X, train_y))

0.640527616591
0.613502041694


In [48]:
# mean accuracy on the validation set
for tree_model in [decision_tree_model, small_model]:
    print(tree_model.score(validation_data.loc[:, validation_data.columns != 'safe_loans'], validation_data['safe_loans']))

0.636148211978
0.619345109866
