# Credit Card Default: Model Tuning and Improving Performance

#### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline

#### Loading and preparing the dataset

In [2]:
# Read the CSV (using its ID column as the index) and normalise the column
# names: everything lower-case, and the long target name shortened to 'default'.
DATA_DIR = '../data'
FILE_NAME = 'credit_card_default.csv'
data_path = os.path.join(DATA_DIR, FILE_NAME)
ccd = (
    pd.read_csv(data_path, index_col="ID")
    .rename(columns=str.lower)
    .rename(columns={'default payment next month': 'default'})
)

# Groups of feature names; each numbered family has suffixes 1 through 6.
suffixes = range(1, 7)
bill_amt_features = [f'bill_amt{k}' for k in suffixes]
pay_amt_features = [f'pay_amt{k}' for k in suffixes]
numerical_features = ['limit_bal', 'age', *bill_amt_features, *pay_amt_features]

# 0/1 indicator columns: new column -> (source column, code that maps to 1).
indicator_rules = {
    'male': ('sex', 1),
    'grad_school': ('education', 1),
    'university': ('education', 2),
    'married': ('marriage', 1),
}
for flag_name, (source_col, code) in indicator_rules.items():
    ccd[flag_name] = ccd[source_col].eq(code).astype('int')

# Simplify the pay_* columns: every value at or below zero becomes 0,
# positive values are kept unchanged.
pay_features = [f'pay_{k}' for k in suffixes]
ccd[pay_features] = ccd[pay_features].clip(lower=0)

# One 0/1 'delayed_k' flag per pay_k column: 1 when pay_k is positive.
delayed_features = [f'delayed_{k}' for k in suffixes]
delayed_flags = {
    flag_name: ccd[source_col].gt(0).astype(int)
    for source_col, flag_name in zip(pay_features, delayed_features)
}
ccd = ccd.assign(**delayed_flags)

# New feature: row-wise count of the delayed_* flags that are set.
ccd['months_delayed'] = ccd[delayed_features].sum(axis='columns')

#### Splitting and standardizing the dataset

In [3]:
# Build the feature matrix X and the target y, hold out a test split, then
# standardize the numerical columns.

# Guarded append: re-running this cell must not add 'months_delayed' a second
# time (that would duplicate the column in X).
if 'months_delayed' not in numerical_features:
    numerical_features = numerical_features + ['months_delayed']
binary_features = ['male','married','grad_school','university']
# Numerical columns are cast to float up front because the scaled values
# written back below are floats; assigning them into integer columns through
# .loc triggers an incompatible-dtype warning in recent pandas versions.
X = ccd[numerical_features + binary_features].astype(
    {col: 'float64' for col in numerical_features}
)
y = ccd['default'].astype(int)

## Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=25)
# Independent copies, so the in-place writes below can never target a
# slice/view of X (the warning for that is disabled at the top of the notebook).
X_train = X_train.copy()
X_test = X_test.copy()

## Standardize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on the training split ONLY: fitting on the full X would let the test
# rows influence the means/variances used for preprocessing (data leakage).
scaler.fit(X_train[numerical_features])
X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])
X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])

## Optimizing more than one parameter

#### Reference model

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Reference (untuned) random forest; its settings are fixed by hand and the
# seed is pinned so the baseline is repeatable.
ref_rf_settings = dict(n_estimators=25, max_features=4, max_depth=4, random_state=61)
ref_rf = RandomForestClassifier(**ref_rf_settings)

# ROC-AUC of the reference model, one score per fold of a 10-fold
# cross-validation on the training split.
ref_rf_scores = cross_val_score(
    estimator=ref_rf,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='roc_auc',
)

In [5]:
# Report the cross-validated AUC averaged over the folds, to 4 decimals.
print(f"Mean AUC for reference model: {ref_rf_scores.mean():0.4f}")

Mean AUC for reference model: 0.7589


#### Grid Search CV

In [6]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter search space: 4 x 3 x 4 = 48 combinations.
# The largest max_features value (19) equals the number of columns selected
# for X earlier in this notebook (15 numerical + 4 binary).
param_grid = dict(
    n_estimators=[25, 100, 200, 400],
    max_features=[4, 10, 19],
    max_depth=[4, 8, 16, 20],
)

In [None]:
# Exhaustive search over param_grid, scoring each candidate by ROC-AUC with
# 5-fold cross-validation on the training split, using 4 parallel workers.
rf = RandomForestClassifier(random_state=17)
search_settings = {
    'scoring': 'roc_auc',
    'cv': 5,
    'verbose': 1,   # print a short progress summary while fitting
    'n_jobs': 4,
}
grid_search = GridSearchCV(rf, param_grid, **search_settings)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
# Mean cross-validated score of every candidate, labelled by its parameter
# dict and displayed from best to worst.
cv_results = grid_search.cv_results_
gs_results = pd.Series(data=cv_results['mean_test_score'], index=cv_results['params'])
gs_results.sort_values(ascending=False)

In [None]:
from sklearn.metrics import precision_recall_curve
## The reference (not tuned) model has only been cross-validated so far,
## so it is fitted on the training split here before predicting.
ref_rf.fit(X_train, y_train)

## Test-set probabilities, taking the second column of predict_proba
## (the class labelled 1 when the target is coded 0/1).
positive_col = 1
y_prob_not_tunned = ref_rf.predict_proba(X_test)[:, positive_col]
y_prob_tunned = grid_search.predict_proba(X_test)[:, positive_col]

## Precision / recall pairs for each model; the thresholds are not needed
(prec_not_tuned,
 recall_not_tuned,
 _) = precision_recall_curve(y_test, y_prob_not_tunned)
(prec_tuned,
 recall_tuned,
 _) = precision_recall_curve(y_test, y_prob_tunned)

In [None]:
# Overlay both models' curves on one axes; precision is on the x-axis and
# recall on the y-axis, zoomed to the window set by the limits below.
fig, ax = plt.subplots(figsize=(8, 5))
curves = [
    ('Tuned Model', prec_tuned, recall_tuned),
    ('Not Tuned Model', prec_not_tuned, recall_not_tuned),
]
for curve_label, precision_vals, recall_vals in curves:
    ax.plot(precision_vals, recall_vals, label=curve_label)
ax.set_title('Precision-recall curves', fontsize=16)
ax.set_xlabel('Precision', fontsize=14)
ax.set_ylabel('Recall', fontsize=14)
ax.set_xlim(0.3, 0.7)
ax.set_ylim(0.1, 0.9)
ax.legend()
ax.grid();