# **Credit Card Risk Assessment**
In this code I have used XGBoost Model to Predict the Risk Assessment for Credit Card,

In [2]:
#Importing Required Libraries
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost

import warnings
warnings.filterwarnings('ignore')

In [3]:
#Importing the Dataset
df = pd.read_csv("/content/Credit_default_dataset.csv")
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [4]:
#We don't need Id column so we drop it
df = df.drop(["ID"], axis = 1)

In [6]:
#We Rename column Pay_0 as Pay_1 since it makes more sense
df.rename(columns = {'PAY_0' : 'PAY_1'}, inplace = True)
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [11]:
df['EDUCATION'].value_counts()

2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: EDUCATION, dtype: int64

In [12]:
df['MARRIAGE'].value_counts()

2    15964
1    13659
3      323
0       54
Name: MARRIAGE, dtype: int64

Columns such as Education and Marriage has some unwated categorical levels so we remove them.

In [13]:
#Data Processing
df["EDUCATION"] = df["EDUCATION"].map({0:4, 1:1, 2:2, 3:3, 4:4, 5:4, 6:4})
df["MARRIAGE"] = df["MARRIAGE"].map({0:3, 1:1, 2:2, 3:3})

In [16]:
#Standardizing the Data
#Defining Dependent and Independent Variables i.e. 'Y' and 'x' resp.
scaling=StandardScaler()
X = df.drop(['default.payment.next.month'], axis = 1)
X = scaling.fit_transform(X)
y = df['default.payment.next.month']

In [17]:
#Defining Hyper Parameters
params = {"learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ],
          "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
          "min_child_weight" : [ 1, 3, 5, 7 ],
          "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
          "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]}

In [18]:
def timer(start_time = None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [19]:
#Creating Model
classifier = xgboost.XGBClassifier()
Random_search = RandomizedSearchCV(classifier, param_distributions = params, n_iter = 5, 
                                   scoring = 'roc_auc', n_jobs = -1, cv = 5, verbose = 3)

In [20]:
#Hyper Parameter Optimization
start_time = timer(None) #Timing starts from this point for "start_time" variable
Random_search.fit(X, y)
timer(start_time) #Timing ends here for "start_time" variable

Fitting 5 folds for each of 5 candidates, totalling 25 fits

 Time taken: 0 hours 1 minutes and 28.13 seconds.


In [22]:
#Choosing Best Parameters
Random_search.best_estimator_

XGBClassifier(colsample_bytree=0.5, gamma=0.4, learning_rate=0.2, max_depth=5,
              min_child_weight=3)

In [23]:
Random_search.best_params_

{'min_child_weight': 3,
 'max_depth': 5,
 'learning_rate': 0.2,
 'gamma': 0.4,
 'colsample_bytree': 0.5}

In [24]:
#Applying Model
classifier = xgboost.XGBClassifier(base_score = 0.5, booster = 'gbtree', colsample_bylevel = 1,
                                   colsample_bytree = 0.4, gamma = 0.1, learning_rate = 0.25,
                                   max_delta_step = 0, max_depth = 3, min_child_weight = 7, 
                                   missing = None, n_estimators = 100, n_jobs = 1, nthread = None,
                                   objective = 'binary:logistic', random_state = 0, reg_alpha = 0, 
                                   reg_lambda = 1, scale_pos_weight = 1, seed = None, silent = True, subsample = 1)

In [25]:
#Cross Validation
score = cross_val_score(classifier, X, y, cv = 10)
score

array([0.80666667, 0.80866667, 0.81566667, 0.807     , 0.817     ,
       0.827     , 0.836     , 0.83133333, 0.82833333, 0.825     ])

In [26]:
#Checking Accuracy
score.mean()

0.8202666666666666