In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import arff
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
import scikitplot as skplt
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline, FeatureUnion, make_union
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelBinarizer # one hot encoding

## Dealing with categorical features
 - Label encoding
 - One-hot encoding

In [2]:
# Read the raw German credit data set; the relative path is resolved
# against the notebook's current working directory.
raw_csv_path = 'german_credit_raw.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,default,account_check_status,duration_in_month,credit_history,purpose,credit_amount,savings,present_emp_since,installment_as_income_perc,personal_status_sex,...,present_res_since,property,age,other_installment_plans,housing,credits_this_bank,job,people_under_maintenance,telephone,foreign_worker
0,0,< 0 DM,6,critical account/ other credits existing (not ...,domestic appliances,1169,unknown/ no savings account,.. >= 7 years,4,male : single,...,4,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customers name",yes
1,1,0 <= ... < 200 DM,48,existing credits paid back duly till now,domestic appliances,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,...,2,real estate,22,none,own,1,skilled employee / official,1,none,yes
2,0,no checking account,12,critical account/ other credits existing (not ...,(vacation - does not exist?),2096,... < 100 DM,4 <= ... < 7 years,2,male : single,...,3,real estate,49,none,own,1,unskilled - resident,2,none,yes
3,0,< 0 DM,42,existing credits paid back duly till now,radio/television,7882,... < 100 DM,4 <= ... < 7 years,2,male : single,...,4,if not A121 : building society savings agreeme...,45,none,for free,1,skilled employee / official,2,none,yes
4,1,< 0 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : single,...,4,unknown / no property,53,none,for free,2,skilled employee / official,2,none,yes


## Let's look at the different types of account status

In [4]:
# Count how many rows fall under each distinct checking-account status.
status_groups = df.groupby('account_check_status')
status_groups.size()

account_check_status
0 <= ... < 200 DM                                     269
< 0 DM                                                274
>= 200 DM / salary assignments for at least 1 year     63
no checking account                                   394
dtype: int64

In [6]:
# Re-type the account status column as a pandas categorical so that the
# `.cat` accessor (categories, integer codes) becomes available on it.
status_as_category = df['account_check_status'].astype('category')
df['account_check_status'] = status_as_category

## Label encoding

In [7]:
# Label encoding: store each status as the integer code of its category.
# NOTE: the codes follow the categories' sort order, not any meaningful
# ranking of the statuses, so they should not be read as an ordinal scale.
status_column = df['account_check_status']
df['account_status_cat'] = status_column.cat.codes

In [8]:
# Cross-check the label encoding: each status should pair with exactly one code.
# `account_check_status` is categorical at this point, so `observed=True` is
# passed explicitly to keep only the (status, code) pairs that actually occur.
# Without it, recent pandas releases warn that the default is changing and
# expand the result to every category/code combination with zero-count rows.
df.groupby(['account_check_status', 'account_status_cat'], observed=True).size()

account_check_status                                account_status_cat
0 <= ... < 200 DM                                   0                     269
< 0 DM                                              1                     274
>= 200 DM / salary assignments for at least 1 year  2                      63
no checking account                                 3                     394
dtype: int64

## One-hot encoding

In [11]:
# Work on a copy so the one-hot step starts from its own frame.
df_one_hot = df.copy()

# Fit the binarizer on the status column; with more than two classes it
# returns one 0/1 indicator column per class, ordered like `lb.classes_`.
# NOTE: for a two-class column LabelBinarizer returns a single column, so
# `columns=lb.classes_` below would no longer match the array's width.
lb = LabelBinarizer()
lb_results = lb.fit_transform(df_one_hot['account_check_status'])

# Carry over the source frame's index: a later column-wise `pd.concat`
# aligns rows by index label, so a fresh 0..n-1 index would silently
# misalign the indicators whenever `df` does not have the default index.
lb_results_df = pd.DataFrame(
    lb_results,
    columns=lb.classes_,
    index=df_one_hot.index,
)

lb_results_df.head()

Unnamed: 0,0 <= ... < 200 DM,< 0 DM,>= 200 DM / salary assignments for at least 1 year,no checking account
0,0,1,0,0
1,1,0,0,0
2,0,0,0,1
3,0,1,0,0
4,0,1,0,0


In [12]:
# Append the indicator columns to the right of the existing columns;
# pandas matches the rows of the two frames by index label.
frames_to_join = [df_one_hot, lb_results_df]
final_df = pd.concat(frames_to_join, axis='columns')

In [15]:
# Report (rows, columns) before and after adding the one-hot columns,
# then preview the widened frame.
shape_report = (
    ('original df dimensions:', df),
    ('one hot encoded df dimensions:', final_df),
)
for label, frame in shape_report:
    print(label, frame.shape)
final_df.head()

original df dimensions: (1000, 22)
one hot encoded df dimensions: (1000, 26)


Unnamed: 0,default,account_check_status,duration_in_month,credit_history,purpose,credit_amount,savings,present_emp_since,installment_as_income_perc,personal_status_sex,...,credits_this_bank,job,people_under_maintenance,telephone,foreign_worker,account_status_cat,0 <= ... < 200 DM,< 0 DM,>= 200 DM / salary assignments for at least 1 year,no checking account
0,0,< 0 DM,6,critical account/ other credits existing (not ...,domestic appliances,1169,unknown/ no savings account,.. >= 7 years,4,male : single,...,2,skilled employee / official,1,"yes, registered under the customers name",yes,1,0,1,0,0
1,1,0 <= ... < 200 DM,48,existing credits paid back duly till now,domestic appliances,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,...,1,skilled employee / official,1,none,yes,0,1,0,0,0
2,0,no checking account,12,critical account/ other credits existing (not ...,(vacation - does not exist?),2096,... < 100 DM,4 <= ... < 7 years,2,male : single,...,1,unskilled - resident,2,none,yes,3,0,0,0,1
3,0,< 0 DM,42,existing credits paid back duly till now,radio/television,7882,... < 100 DM,4 <= ... < 7 years,2,male : single,...,1,skilled employee / official,2,none,yes,1,0,1,0,0
4,1,< 0 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : single,...,2,skilled employee / official,2,none,yes,1,0,1,0,0
