# Credit Card Default Study

## 1. Data Preparation

In [1]:
import os
import pandas as pd
pd.set_option('display.max_columns', 500)

In [2]:
dir = os.getcwd()
os.listdir(dir)

['QuestionBook.html',
 '.ipynb_checkpoints',
 'Workbook.ipynb',
 'machine_learning_procedures',
 'DefaultRecord_History.csv',
 'DefaultRecord_Person.csv',
 'partial_code.py']

In [3]:
history_path = os.path.join(dir, 'DefaultRecord_History.csv')
person_path = os.path.join(dir, 'DefaultRecord_Person.csv')

In [4]:
history = pd.read_csv(history_path)
person = pd.read_csv(person_path)

In [5]:
history.set_index(history['ID'])
del history['ID']

In [6]:
history.head(5)

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [7]:
person.set_index(person['ID'])
del person['ID']

In [8]:
result = pd.concat([history, person], axis = 1, join = 'inner')

In [9]:
print(history.shape)
print(person.shape)
print(result.shape)

(30000, 13)
(30000, 11)
(30000, 24)


## 2. Data Cleansing and Feature Engineering

In [10]:
result.describe(include = 'all')

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,51223.3309,49179.075167,47013.15,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911
std,73635.860576,71173.768783,69349.39,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988
min,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
25%,3558.75,2984.75,2666.25,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,22381.5,21200.0,20088.5,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,67091.0,64006.25,60164.75,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0
max,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0


In [11]:
result['PAY_SUM'] = result['PAY_0'] + result['PAY_2'] + result['PAY_3'] + result['PAY_4'] + result['PAY_5'] + result['PAY_6']

In [12]:
result['BILL_SUM'] = result['BILL_AMT1'] + result['BILL_AMT2'] + result['BILL_AMT3'] + result['BILL_AMT4'] + result['BILL_AMT5'] + result['BILL_AMT6']

In [13]:
result['PAY_AMT_SUM'] = result['PAY_AMT1'] + result['PAY_AMT2'] + result['PAY_AMT3'] + result['PAY_AMT4'] + result['PAY_AMT5'] + result['PAY_AMT6']

In [14]:
result['PAY_AMT_SUM'].describe()

count    3.000000e+04
mean     3.165139e+04
std      6.082768e+04
min      0.000000e+00
25%      6.679750e+03
50%      1.438300e+04
75%      3.350350e+04
max      3.764066e+06
Name: PAY_AMT_SUM, dtype: float64

In [15]:
result.describe(include='all')

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,PAY_SUM,BILL_SUM,PAY_AMT_SUM
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,51223.3309,49179.075167,47013.15,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,-1.094633,269861.7,31651.39
std,73635.860576,71173.768783,69349.39,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,5.893055,379564.3,60827.68
min,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-12.0,-336259.0,0.0
25%,3558.75,2984.75,2666.25,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-5.0,28688.0,6679.75
50%,22381.5,21200.0,20088.5,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,126311.0,14383.0
75%,67091.0,64006.25,60164.75,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,342626.5,33503.5
max,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,36.0,5263883.0,3764066.0


### As we can see from result.describe(), there are values of 'EDUCATION' which are higher than 4. However, according to the specification given, possible values can only be 1 2 3 4.

In [16]:
result['EDUCATION'].value_counts()

2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: EDUCATION, dtype: int64

### Here I decided to assign all values other than 1, 2, 3, 4 to 4, which is the category for 'Others'

In [17]:
result['EDUCATION'][(result['EDUCATION']!= 1) & (result['EDUCATION']!= 2) & 
                    (result['EDUCATION']!= 3)] = 4

In [18]:
result['EDUCATION'].value_counts()

2    14030
1    10585
3     4917
4      468
Name: EDUCATION, dtype: int64

### Same for 'MARRIAGE', 0 is not possible values, I assign all 0 values to 3, which means 'Others'

In [19]:
result['MARRIAGE'].value_counts()

2    15964
1    13659
3      323
0       54
Name: MARRIAGE, dtype: int64

In [20]:
result['MARRIAGE'][result['MARRIAGE'] == 0] = 0

In [21]:
result['MARRIAGE'].value_counts()

2    15964
1    13659
3      323
0       54
Name: MARRIAGE, dtype: int64

## 3. Model Building 

### Random Forest Classifier

In [22]:
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [55]:
rf_one = RandomForestClassifier(n_estimators = 200)

In [56]:
train_features = result.copy()
del train_features['default payment next month']
X_train, X_test, y_train, y_test = train_test_split(
                                   train_features, result['default payment next month'],
                                   test_size = 0.8, random_state = 40)
rf_one.fit(X_train.values, y_train.values)
rf_one.score(X_test.values, y_test.values)

0.81304166666666666

### Print out feature importances for every feature in descending order

In [57]:
features = list(X_train.columns)
importances = map(lambda x: round(x, 4), rf_one.feature_importances_)
feature_importance_tup = zip(importances, features)
print sorted(feature_importance_tup, reverse = True)

[(0.0753, 'PAY_0'), (0.0634, 'PAY_SUM'), (0.0593, 'AGE'), (0.0543, 'LIMIT_BAL'), (0.0539, 'PAY_AMT_SUM'), (0.0482, 'BILL_SUM'), (0.0482, 'BILL_AMT1'), (0.0438, 'BILL_AMT2'), (0.0432, 'PAY_AMT1'), (0.042, 'BILL_AMT4'), (0.0413, 'BILL_AMT6'), (0.041, 'BILL_AMT5'), (0.0408, 'BILL_AMT3'), (0.0403, 'PAY_AMT3'), (0.04, 'PAY_AMT2'), (0.0392, 'PAY_AMT6'), (0.0375, 'PAY_2'), (0.0368, 'PAY_AMT4'), (0.0356, 'PAY_AMT5'), (0.0221, 'PAY_5'), (0.0182, 'EDUCATION'), (0.0179, 'PAY_3'), (0.0175, 'PAY_6'), (0.0167, 'PAY_4'), (0.012, 'MARRIAGE'), (0.0114, 'SEX')]


### Find the appropriate number of estimators, by plotting OOB error for different number of estimators

In [None]:
plot_label = "RandomForestClassifier, max_features=None"
clf_for_itr = RandomForestClassifier(
                               warm_start=True, max_features=None, 
                               oob_score=True)

error_rate = []
min_estimators = 10
max_estimators = 200

for i in range(min_estimators, max_estimators + 1):
    clf_for_itr.set_params(n_estimators=i)
    clf_for_itr.fit(X_train, y_train)
    oob_error = 1 - clf_for_itr.oob_score_
    error_rate.append((i, oob_error))

xs, ys = zip(*error_rate)
plt.plot(xs, ys, label=plot_label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()

### Second Random Forest Attempt with mannually picked features

In [67]:
rf_two = RandomForestClassifier(n_estimators = 200)

In [74]:
train_features = result.loc[:,['PAY_0', 'PAY_SUM', 'AGE', 'LIMIT_BAL', 
                               'PAY_AMT_SUM', 'BILL_SUM']]
X_train, X_test, y_train, y_test = train_test_split(
                                   train_features, result['default payment next month'],
                                   test_size = 0.8, random_state = 40)

In [75]:
rf_two.fit(X_train.values, y_train.values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [76]:
rf_two.score(X_test, y_test)

0.80787500000000001

In [77]:
features = list(X_train.columns)
importances = map(lambda x: round(x, 4), rf_two.feature_importances_)
feature_importance_tup = zip(importances, features)
print sorted(feature_importance_tup, reverse = True)

[(0.2151, 'BILL_SUM'), (0.2131, 'PAY_AMT_SUM'), (0.1592, 'AGE'), (0.1479, 'PAY_SUM'), (0.1329, 'PAY_0'), (0.1318, 'LIMIT_BAL')]
