## Set up environment

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE

%matplotlib inline
np.random.seed(2021)

## Load Default of Credit Card Clients data

In [5]:
df_credit_raw = pd.read_excel ('/Users/yyyyushuqi/Downloads/default of credit card clients.xls')
df_credit_raw.head()

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0


In [6]:
df_credit = df_credit_raw.iloc[1:,1:]
print(df_credit.shape)
df_credit.head(10)
df_credit = pd.DataFrame(df_credit)

(30000, 24)


# deal with the imbalance data

In [7]:
df_credit['Y'] = df_credit['Y'].astype('category')
ovs = RandomOverSampler(random_state = 42)
x1,y1 = ovs.fit_resample(df_credit.iloc[:, 0:-1],df_credit.Y)
df_credit_oversampling=x1
df_credit_oversampling["Y"]=y1
print("DATA_oversampling \ngood class:", len(df_credit_oversampling[df_credit_oversampling.Y == 0]),
      "\nBad Class:", len(df_credit_oversampling[df_credit_oversampling.Y == 1]))

DATA_oversampling 
good class: 23364 
Bad Class: 23364


In [9]:
from collections import Counter

X_smote, y_smote = SMOTE().fit_resample(df_credit.iloc[:, 0:-1], df_credit.Y)
Counter(y_smote).items()

#print("DATA_SMOTE \ngood class:", len(df_credit_smote[df_credit_smote.Y == 0]),
 #     "\nBad Class:", len(df_credit_smote[df_credit_smote.Y == 1]))

dict_items([(1, 23364), (0, 23364)])

In [11]:
# split the data for training purpose

from sklearn.model_selection import train_test_split


X_train,X_test,y_train,y_test = train_test_split(X_smote, y_smote, stratify=y_smote, test_size=0.2, random_state=0)
X_train.columns.values

array(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11',
       'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20',
       'X21', 'X22', 'X23'], dtype=object)

In [12]:
X_train.head(3)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
14957,150000,2,1,2,24,-1,-1,-1,-1,-1,...,291,658,441,1197,291,291,658,441,1197,8007
21974,70000,2,2,2,25,2,0,0,0,0,...,48750,49936,51102,52276,2070,2000,2000,2000,2000,2100
26414,160000,2,2,1,41,0,0,0,0,0,...,38988,19825,22654,16158,10000,10000,5000,5000,5000,10000


In [13]:
y_train

14957    0
21974    1
26414    0
42054    1
29252    0
        ..
4878     0
38342    1
7447     1
33646    1
26398    0
Name: Y, Length: 37382, dtype: category
Categories (2, int64): [0, 1]

## Standardize Features

In [7]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_train = pd.DataFrame(ss.fit_transform(X_train),columns=X_train.columns)
X_test = ss.transform(X_test)
X_train.agg(['mean','std']).T.sort_values('mean',ascending=False)

Unnamed: 0,mean,std
X3,5.787963000000001e-17,1.000021
X15,5.2698590000000006e-17,1.000021
X9,3.5527140000000005e-17,1.000021
X8,3.4194870000000005e-17,1.000021
X7,2.5461110000000002e-17,1.000021
X1,1.539509e-17,1.000021
X18,1.0658140000000001e-17,1.000021
X14,9.473903e-18,1.000021
X10,9.473903e-18,1.000021
X21,1.776357e-18,1.000021


## Feature Selection

In [8]:
from sklearn.linear_model import LogisticRegression

# First, without regularization

logr = LogisticRegression(C=100, penalty="l1", solver="liblinear", random_state=123)
logr.fit(X_train, y_train)
logr.coef_[0,:5]
sorted_tuples = sorted(list(zip(X_train.columns.values,logr.coef_[0])),key=lambda x:x[1],reverse=True)
for feature,coef in sorted_tuples:
    print(f'{feature:30s} : {coef: 0.3f}')

# Now with LASSO

logr = LogisticRegression(C=0.1, penalty="l1", solver="liblinear", random_state=123)
logr.fit(X_train, y_train)
sorted_tuples = sorted(list(zip(X_train.columns.values,logr.coef_[0])),key=lambda x:x[1],reverse=True)
for feature,coef in sorted_tuples:
    print(f'{feature:30s} : {coef: 0.3f}')

X_train.columns[logr.coef_[0] != 0]

ValueError: Unknown label type: 'unknown'

AttributeError: 'tuple' object has no attribute 'typeof'

# Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score



In [41]:
rf = RandomForestClassifier()
param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40,60,100,300],
                 'max_depth': [2, 5, 7, 9, 11, 15, 20, 100],
                 #'min_samples_splits' : [0.1,0.25,0.5,0.75,1],
               #  'min_samples_leafs' : [0.1,0.2,0.3,0.4,0.5],
                # 'max_features' : [1,5,10,15,20,40,70,100,500],
             }

grid_clf = GridSearchCV(rf, param_grid, cv=10)

In [42]:
model = grid_clf.fit(X_train, y_train)

In [46]:
print("Random Forest  \ntrain accuracy:", model.score(X_train,y_train),
     "\ntest accuracy:", model.score(X_test,y_test))


Random Forest  
train accuracy: 0.9996522390455299 
test accuracy: 0.8804836293601541


In [44]:
roc_auc_score(y_test, model.predict(X_test))

0.880483629360154

In [26]:
f = RandomForestClassifier()
m = f.fit(X_train, y_train)
print(m.score(X_train,y_train))
print(m.score(X_test,y_test))
print(roc_auc_score(y_test, m.predict(X_test)))

0.9996522390455299
0.8798416434838433
0.8798416434838433
