### Imports

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, os, sys, seaborn as sns
from colorsetup import colors, palette
sns.set_palette(palette)

### Read the Data

In [2]:
gcr_data = pd.read_csv("german_credit_data.csv", index_col=0)

### Categorical Missing Values Imputation

In [3]:
from sklearn.impute import SimpleImputer
values = gcr_data.values 
imputer = SimpleImputer(missing_values= np.nan, strategy='constant', fill_value='missing') 
transformed_values = imputer.fit_transform(values)

In [4]:
gcr_data_imputed = pd.DataFrame(transformed_values, columns=gcr_data.columns)

In [5]:
gcr_data_imputed["Credit amount"] = gcr_data_imputed["Credit amount"].astype(int)
gcr_data_imputed["Duration"] = gcr_data_imputed["Duration"].astype(int)
gcr_data_imputed["Job"] = gcr_data_imputed["Job"].astype(int)
gcr_data_imputed["Age"] = gcr_data_imputed["Age"].astype(int)

### Separating feature and target

In [6]:
X = gcr_data_imputed.drop(columns='Risk', axis=1)
y = gcr_data_imputed['Risk']

### Feature Engineering

In [7]:
cat_cols = X.columns[X.dtypes == 'O']
num_cols = X.columns[X.dtypes == 'int']

#### Log transformation of numeric variables

In [8]:
skew_vals = X[num_cols].skew()

In [9]:
skew_limit = 0.75
skew_cols = (skew_vals.
            sort_values(ascending=False)
            .to_frame()
            .rename(columns={0:'Skew'})
            .query('abs(Skew) > {}'.format(skew_limit)))
skew_cols

Unnamed: 0,Skew
Credit amount,1.949628
Duration,1.094184
Age,1.020739


In [10]:
for col in skew_cols.index.values:
    X[col] = X[col].apply(np.log1p)

#### Label Encode Categorical Variables

In [11]:
for col in cat_cols:
    print('{}: {} {}'.format(col, '\n', X[col].unique()))

Sex: 
 ['male' 'female']
Housing: 
 ['own' 'free' 'rent']
Saving accounts: 
 ['missing' 'little' 'quite rich' 'rich' 'moderate']
Checking account: 
 ['little' 'moderate' 'missing' 'rich']
Purpose: 
 ['radio/TV' 'education' 'furniture/equipment' 'car' 'business'
 'domestic appliances' 'repairs' 'vacation/others']


In [12]:
from sklearn.preprocessing import LabelEncoder
le_cat = LabelEncoder()

In [13]:
for column in cat_cols:
    X[column] = le_cat.fit_transform(X[column])

#### Label Encode Target Variable

In [14]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

gcr_data_imputed['Risk'] = le.fit_transform(gcr_data_imputed['Risk'])

le.classes_

array(['bad', 'good'], dtype=object)

In [15]:
gcr_data_imputed.Risk.unique()

array([1, 0])

In [16]:
gcr_data_imputed.Risk.value_counts(normalize=True)

1    0.7
0    0.3
Name: Risk, dtype: float64

In [17]:
X.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,4.219508,1,2,1,1,0,7.064759,1.94591,5
1,3.135494,0,2,1,0,2,8.691483,3.89182,5
2,3.912023,1,1,1,0,1,7.648263,2.564949,3
3,3.828641,1,2,0,0,0,8.972464,3.7612,4
4,3.988984,1,2,0,0,0,8.491055,3.218876,1


In [19]:
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1000.0,3.554569,0.291418,2.995732,3.332205,3.526361,3.7612,4.330733
Sex,1000.0,0.69,0.462725,0.0,0.0,1.0,1.0,1.0
Job,1000.0,1.904,0.653614,0.0,2.0,2.0,2.0,3.0
Housing,1000.0,1.071,0.531264,0.0,1.0,1.0,1.0,2.0
Saving accounts,1000.0,0.77,1.156909,0.0,0.0,0.0,1.0,4.0
Checking account,1000.0,1.121,0.883821,0.0,0.0,1.0,2.0,3.0
Credit amount,1000.0,7.789244,0.776074,5.525453,7.220008,7.749538,8.28734,9.821464
Duration,1000.0,2.940942,0.546377,1.609438,2.564949,2.944439,3.218876,4.290459
Purpose,1000.0,2.878,1.978138,0.0,1.0,3.0,5.0,7.0


### Split Train_Test

In [20]:
from sklearn.model_selection import train_test_split

feature_columns = [x for x in X.columns if x != 'Risk']

X_train, X_test, y_train, y_test = train_test_split(X[feature_columns], gcr_data_imputed['Risk'],
                 test_size=0.3, random_state=42)

### 1. XGBoost

In [29]:
from xgboost import XGBClassifier

xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [30]:
from sklearn.model_selection import cross_val_score

XGB_accuracies = cross_val_score(estimator = xgbc, X = X_train, y = y_train, cv = 10)
print("Mean_XGB_Acc : ", XGB_accuracies.mean())

Mean_XGB_Acc :  0.7085714285714285


### 2. Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(oob_score=True, n_estimators=307, random_state=1, warm_start=True,
                            n_jobs=-1)
rf.fit(X_train, y_train)

from sklearn.model_selection import cross_val_score

RF_accuracies = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 10)
print("Mean_RF_Acc : ", RF_accuracies.mean())

Mean_RF_Acc :  0.7342857142857143


### 3.  Gradient Boosting Classifier

**loss: {‘deviance’, ‘exponential’}, default=’deviance’**

loss function to be optimized. ‘deviance’ refers to deviance (= logistic regression) for classification with probabilistic outputs. For loss ‘exponential’ gradient boosting recovers the AdaBoost algorithm.

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

gbc=GradientBoostingClassifier(loss = 'deviance',learning_rate=0.07,n_estimators=350, max_depth=6,subsample=1,verbose=False)
gbc.fit(X_train, y_train)

from sklearn.model_selection import cross_val_score

GB_accuracies = cross_val_score(estimator = gbc, X = X_train, y = y_train, cv = 10)
print("Mean_GB_Acc : ", GB_accuracies.mean())

Mean_GB_Acc :  0.7
