In [1]:
# IMPORTING NECESSARY LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import roc_curve, roc_auc_score

%matplotlib inline
sns.set_style("darkgrid")

In [2]:
# IMPORTING THE DATASET
# The first CSV column is used as the row index rather than as a feature.
DATA_FILE = 'german_credit_data.csv'
df_loan = pd.read_csv(DATA_FILE, index_col=0)
df_loan.head(n=5)


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


# Pre-processing

In [3]:
## Sex

# Number of rows per category of the "Sex" column.
sex_counts = df_loan["Sex"].value_counts()
sex_counts

male      690
female    310
Name: Sex, dtype: int64

In [4]:
# Cross table for the 'Sex' feature: share (in %) of each risk class within
# each sex. normalize='columns' divides every Sex column by its own total,
# which is what the manual `x / x.sum()` did; the unused `decimals` Series
# (whose 'Male'/'Female' index matched no column) has been removed.
cross_sex = (
    pd.crosstab(df_loan['Risk'], df_loan['Sex'], normalize='columns')
    .mul(100)
    .round(2)
)
# Transpose so that each row is one sex and the columns are the risk classes.
cross_sex_transposed = cross_sex.T
cross_sex_transposed

Risk,bad,good
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,35.16,64.84
male,27.68,72.32


In [5]:
## Binary-encoding Sex (a 0/1 label encoding, not a one-hot encoding)

# 1 where the value is exactly "male", 0 for every other value.
is_male = df_loan["Sex"].eq("male")
df_loan["Sex"] = is_male.astype("int64")
df_loan["Sex"].head()

0    1
1    0
2    1
3    1
4    1
Name: Sex, dtype: int64

In [6]:
## job

# Job is already numeric in the raw data, so no encoding is applied here.
# Where; 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled
job_column = df_loan["Job"]
job_column.head(n=5)

0    2
1    2
2    1
3    2
4    2
Name: Job, dtype: int64

In [7]:
## Housing

# Number of rows per housing category.
housing_counts = df_loan["Housing"].value_counts()
housing_counts

own     713
rent    179
free    108
Name: Housing, dtype: int64

In [8]:
# Cross table for the 'Housing' feature: share (in %) of each risk class
# within each housing category (every Housing column sums to 100).
cross_housing = (
    pd.crosstab(df_loan['Risk'], df_loan['Housing'], normalize='columns')
    .mul(100)
    .round(2)
)
# One row per housing category, one column per risk class.
cross_housing_transposed = cross_housing.T
cross_housing_transposed

Risk,bad,good
Housing,Unnamed: 1_level_1,Unnamed: 2_level_1
free,40.74,59.26
own,26.09,73.91
rent,39.11,60.89


In [9]:
## Integer-encoding Housing (a label encoding, not a one-hot encoding)
# 1- Own, 2- Rent, 0- Free

# The result is assigned back to the column instead of calling an inplace
# method on `df_loan["Housing"]`: that chained-inplace form warns on recent
# pandas and does not modify df_loan once Copy-on-Write is enabled.
# Any label missing from the mapping becomes NaN rather than staying a string.
housing_codes = {"own": 1, "rent": 2, "free": 0}
df_loan["Housing"] = df_loan["Housing"].map(housing_codes)
df_loan["Housing"].head()

0    1
1    1
2    1
3    0
4    0
Name: Housing, dtype: int64

In [10]:
## Purpose

# Relative frequency of every loan purpose.
purpose_share = df_loan["Purpose"].value_counts(normalize = True)
purpose_share

car                    0.337
radio/TV               0.280
furniture/equipment    0.181
business               0.097
education              0.059
repairs                0.022
domestic appliances    0.012
vacation/others        0.012
Name: Purpose, dtype: float64

In [11]:
# Collapse several purposes into two broader groups before encoding.
# The result is assigned back to the column instead of using
# `df_loan["Purpose"].replace(..., inplace=True)`: an inplace call on a column
# selection warns on recent pandas and is a no-op under Copy-on-Write.
purpose_groups = {
    "repairs": "others",
    "radio/TV": "others",
    "vacation/others": "others",
    "furniture/equipment": "domestic equipments",
    "domestic appliances": "domestic equipments",
}
df_loan["Purpose"] = df_loan["Purpose"].replace(purpose_groups)

In [12]:
# Relative frequency of the loan purposes as currently stored in df_loan.
purpose_share_grouped = df_loan["Purpose"].value_counts(normalize= True)
purpose_share_grouped

car                    0.337
others                 0.314
domestic equipments    0.193
business               0.097
education              0.059
Name: Purpose, dtype: float64

In [13]:
# Cross table for the 'Purpose' feature: share (in %) of each risk class
# within each purpose. Stored under its own name (matching `cross_housing`)
# instead of overwriting the `cross_sex` table with unrelated data.
cross_purpose = (
    pd.crosstab(df_loan['Risk'], df_loan['Purpose'], normalize='columns')
    .mul(100)
    .round(2)
)
# One row per purpose, one column per risk class.
cross_purpose_transposed = cross_purpose.T
cross_purpose_transposed

Risk,bad,good
Purpose,Unnamed: 1_level_1,Unnamed: 2_level_1
business,35.05,64.95
car,31.45,68.55
domestic equipments,32.12,67.88
education,38.98,61.02
others,23.89,76.11


In [14]:
## Integer-encoding Purpose (a label encoding, not a one-hot encoding)

# The result is assigned back to the column: calling an inplace method on
# `df_loan["Purpose"]` warns on recent pandas and does not modify df_loan
# under Copy-on-Write. Labels missing from the mapping become NaN.
# NOTE(review): the codes impose an arbitrary order on what looks like a
# nominal feature — confirm this is acceptable for the models used later.
purpose_codes = {
    "others": 0,
    "business": 1,
    "car": 2,
    "domestic equipments": 3,
    "education": 4,
}
df_loan["Purpose"] = df_loan["Purpose"].map(purpose_codes)
df_loan.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,2,1,,little,1169,6,0,good
1,22,0,2,1,little,moderate,5951,48,0,bad
2,49,1,1,1,little,,2096,12,4,good
3,45,1,2,0,little,little,7882,42,3,good
4,53,1,2,0,little,little,4870,24,2,bad


In [15]:
## Saving Accounts

# Relative frequency of each savings category (value_counts skips missing values by default).
saving_share = df_loan["Saving accounts"].value_counts(normalize= True)
saving_share

little        0.738066
moderate      0.126071
quite rich    0.077111
rich          0.058752
Name: Saving accounts, dtype: float64

In [16]:
# Treat a missing savings entry as its own category, "None".
# The filled Series is assigned back to the column: `fillna(..., inplace=True)`
# on a column selection warns on recent pandas and leaves df_loan unchanged
# under Copy-on-Write.
df_loan["Saving accounts"] = df_loan["Saving accounts"].fillna("None")
df_loan["Saving accounts"].value_counts(normalize= True)

little        0.603
None          0.183
moderate      0.103
quite rich    0.063
rich          0.048
Name: Saving accounts, dtype: float64

In [17]:
# Cross table for the 'Saving accounts' feature: share (in %) of each risk
# class within each savings category. Stored under its own name (matching
# `cross_housing`) instead of overwriting the `cross_sex` table.
cross_saving = (
    pd.crosstab(df_loan['Risk'], df_loan['Saving accounts'], normalize='columns')
    .mul(100)
    .round(2)
)
# One row per savings category, one column per risk class.
cross_saving_transposed = cross_saving.T
cross_saving_transposed

Risk,bad,good
Saving accounts,Unnamed: 1_level_1,Unnamed: 2_level_1
,17.49,82.51
little,35.99,64.01
moderate,33.01,66.99
quite rich,17.46,82.54
rich,12.5,87.5


In [18]:
## Integer-encoding Saving accounts (a label encoding, not a one-hot encoding)

# The result is assigned back to the column: calling an inplace method on
# `df_loan["Saving accounts"]` warns on recent pandas and does not modify
# df_loan under Copy-on-Write. Labels missing from the mapping become NaN.
# NOTE(review): "quite rich" is coded 4 and "rich" 3, so the codes do not
# increase monotonically with the label names — confirm this is intended.
saving_codes = {
    "little": 1,
    "None": 0,
    "moderate": 2,
    "quite rich": 4,
    "rich": 3,
}
df_loan["Saving accounts"] = df_loan["Saving accounts"].map(saving_codes)
df_loan["Saving accounts"].head()

0    0
1    1
2    1
3    1
4    1
Name: Saving accounts, dtype: int64

In [19]:
## Checking account

# Relative frequency of each checking-account category (value_counts skips missing values by default).
checking_share = df_loan["Checking account"].value_counts(normalize= True)
checking_share

little      0.452145
moderate    0.443894
rich        0.103960
Name: Checking account, dtype: float64

In [20]:
# Treat a missing checking-account entry as its own category, "None".
# The filled Series is assigned back to the column: `fillna(..., inplace=True)`
# on a column selection warns on recent pandas and leaves df_loan unchanged
# under Copy-on-Write.
df_loan["Checking account"] = df_loan["Checking account"].fillna("None")
df_loan["Checking account"].value_counts(normalize= True)

None        0.394
little      0.274
moderate    0.269
rich        0.063
Name: Checking account, dtype: float64

In [21]:
# Cross table for the 'Checking account' feature: share (in %) of each risk
# class within each checking-account category. Stored under its own name
# (matching `cross_housing`) instead of overwriting the `cross_sex` table.
cross_checking = (
    pd.crosstab(df_loan['Risk'], df_loan['Checking account'], normalize='columns')
    .mul(100)
    .round(2)
)
# One row per checking-account category, one column per risk class.
cross_checking_transposed = cross_checking.T
cross_checking_transposed

Risk,bad,good
Checking account,Unnamed: 1_level_1,Unnamed: 2_level_1
,11.68,88.32
little,49.27,50.73
moderate,39.03,60.97
rich,22.22,77.78


In [22]:
## Integer-encoding Checking account (a label encoding, not a one-hot encoding)

# The result is assigned back to the column: calling an inplace method on
# `df_loan["Checking account"]` warns on recent pandas and does not modify
# df_loan under Copy-on-Write. Labels missing from the mapping become NaN.
checking_codes = {"little": 1, "None": 0, "moderate": 2, "rich": 3}
df_loan["Checking account"] = df_loan["Checking account"].map(checking_codes)
df_loan["Checking account"].head()

0    1
1    2
2    0
3    1
4    1
Name: Checking account, dtype: int64

In [23]:
# Preview the first five rows of df_loan at this point in the notebook.
df_loan.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,2,1,0,1,1169,6,0,good
1,22,0,2,1,1,2,5951,48,0,bad
2,49,1,1,1,1,0,2096,12,4,good
3,45,1,2,0,1,1,7882,42,3,good
4,53,1,2,0,1,1,4870,24,2,bad


In [24]:
## Preprocessing the dependent variable - Risk

# Class balance of the target column, as fractions of all rows.
risk_share = df_loan["Risk"].value_counts(normalize= True)
risk_share

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [25]:
## Encoding the dependent variable

# Risk_Status is 1 where Risk is exactly "bad" and 0 for every other value.
is_bad_risk = df_loan["Risk"].eq("bad")
df_loan["Risk_Status"] = is_bad_risk.astype("int64")
df_loan["Risk_Status"].head()

0    0
1    1
2    0
3    0
4    1
Name: Risk_Status, dtype: int64

In [26]:
# Preview the first five rows, now including the Risk_Status column.
df_loan.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Risk_Status
0,67,1,2,1,0,1,1169,6,0,good,0
1,22,0,2,1,1,2,5951,48,0,bad,1
2,49,1,1,1,1,0,2096,12,4,good,0
3,45,1,2,0,1,1,7882,42,3,good,0
4,53,1,2,0,1,1,4870,24,2,bad,1


In [27]:
# Remove the text "Risk" column; "Risk_Status" holds its numeric encoding.
df_loan = df_loan.drop(columns="Risk")
df_loan.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk_Status
0,67,1,2,1,0,1,1169,6,0,0
1,22,0,2,1,1,2,5951,48,0,1
2,49,1,1,1,1,0,2096,12,4,0
3,45,1,2,0,1,1,7882,42,3,0
4,53,1,2,0,1,1,4870,24,2,1


## Scaling the dataset

In [28]:
from sklearn.preprocessing import StandardScaler

# Standardise every predictor column (everything except the target).
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set rows contribute to the fitted statistics — consider fitting on
# the training split only.
predictors = df_loan.drop(columns="Risk_Status")

scaler = StandardScaler()
scaled_features = scaler.fit_transform(predictors)

scaled_features

array([[ 2.76645648,  0.67028006,  0.14694918, ..., -0.74513141,
        -1.23647786, -1.24970871],
       [-1.19140394, -1.49191369,  0.14694918, ...,  0.94981679,
         2.24819436, -1.24970871],
       [ 1.18331231,  0.67028006, -1.38377145, ..., -0.41656241,
        -0.73866754,  1.90214175],
       ...,
       [ 0.21583532,  0.67028006,  0.14694918, ..., -0.87450324,
        -0.73866754, -1.24970871],
       [-1.10345149,  0.67028006,  0.14694918, ..., -0.50552769,
         1.9992892 , -1.24970871],
       [-0.75164167,  0.67028006,  0.14694918, ...,  0.46245715,
         1.9992892 ,  0.32621652]])

In [29]:
# Wrap the scaled array in a labelled DataFrame.
# Column names are taken by dropping the target by name, not by assuming it
# is the last column (`columns[:-1]`). The original row index is kept so that
# df_feat stays label-aligned with df_loan["Risk_Status"].
feature_columns = df_loan.columns.drop("Risk_Status")
df_feat = pd.DataFrame(scaled_features, columns=feature_columns, index=df_loan.index)

In [30]:
# Preview the first five rows of the scaled feature frame.
df_feat.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,2.766456,0.67028,0.146949,-0.13371,-1.203212,-0.001045,-0.745131,-1.236478,-1.249709
1,-1.191404,-1.491914,0.146949,-0.13371,-0.204696,1.044372,0.949817,2.248194,-1.249709
2,1.183312,0.67028,-1.383771,-0.13371,-0.204696,-1.046463,-0.416562,-0.738668,1.902142
3,0.831502,0.67028,0.146949,-2.016956,-0.204696,-0.001045,1.634247,1.750384,1.114179
4,1.535122,0.67028,0.146949,-2.016956,-0.204696,-0.001045,0.566664,0.256953,0.326217


# Splitting the data

In [31]:
# X: scaled predictor frame; y: the Risk_Status target column.
X, y = df_feat, df_loan["Risk_Status"]

In [32]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,2.766456,0.67028,0.146949,-0.13371,-1.203212,-0.001045,-0.745131,-1.236478,-1.249709
1,-1.191404,-1.491914,0.146949,-0.13371,-0.204696,1.044372,0.949817,2.248194,-1.249709
2,1.183312,0.67028,-1.383771,-0.13371,-0.204696,-1.046463,-0.416562,-0.738668,1.902142
3,0.831502,0.67028,0.146949,-2.016956,-0.204696,-0.001045,1.634247,1.750384,1.114179
4,1.535122,0.67028,0.146949,-2.016956,-0.204696,-0.001045,0.566664,0.256953,0.326217


In [33]:
from sklearn.model_selection import train_test_split

In [34]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [35]:
# Display the (rows, columns) shape of the training feature matrix.
X_train.shape

(700, 9)

In [36]:
# Display the shape of the training target vector.
y_train.shape

(700,)