In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Data Cleaning

In [2]:
df = pd.read_csv('credit_scoring_data.csv')

df.columns = df.columns.str.lower()
df.columns

Index(['status', 'seniority', 'home', 'time', 'age', 'marital', 'records',
       'job', 'expenses', 'income', 'assets', 'debt', 'amount', 'price'],
      dtype='object')

In [3]:
df.status.value_counts()

status
1    3200
2    1254
0       1
Name: count, dtype: int64

In [4]:
#help(df.replace)

In [5]:
to_replace = [1,2,0]
values = ['ok', 'default', 'unk']

df['status'] = df['status'].replace(to_replace, values) 

In [6]:
df.status.value_counts()

status
ok         3200
default    1254
unk           1
Name: count, dtype: int64

In [7]:
# Similarly with the dictionary
status_values = {
    1:'ok',
    2:'default',
    0:'unk'
}
df['status'] = df['status'].replace(status_values)

In [8]:
df.status.value_counts()

status
ok         3200
default    1254
unk           1
Name: count, dtype: int64

In [9]:
df.status.value_counts()

status
ok         3200
default    1254
unk           1
Name: count, dtype: int64

In [10]:
#help(df.value_counts)

In [11]:
# using nested dictionaries:

nest_dict = {'status':
             {1:'ok',2:'default',0:'unk'},
            'home':
             {1: 'rent',2: 'owner',3: 'private',4: 'ignore',5: 'parents',6: 'other',0: 'unk'},
            'marital'
             :{1: 'single',2: 'married',3: 'widow',4: 'separated',5: 'divorced',0: 'unk'},
            'records':
             {1: 'no',2: 'yes',0: 'unk'},
            'job':
             {1: 'fixed',2: 'partime',3: 'freelance', 4: 'others', 0: 'unk'} 
}
df = df.replace(nest_dict)

In [12]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [13]:
#help(df.describe)

In [14]:
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace = 99999999.0, value = np.nan )

In [15]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [16]:
df = df[df.status != 'unk'].reset_index(drop = True)

In [20]:
#help(df.reset_index)
#help(train_test_split)

In [21]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state = 42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state = 42)

In [22]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = (df_train.status == 'default').astype('int').values
y_val = (df_val.status == 'default').astype('int').values
y_test = (df_test.status == 'default').astype('int').values

del df_train['status']
del df_val['status']
del df_test['status']

In [23]:
df_train.shape

(2672, 13)

In [24]:
df_val.shape, df_test.shape

((891, 13), (891, 13))

# Decision Tree
* How a decision tree looks like
* Training a decision tree
* Overfitting
* Controlling the size of a tree

In [25]:
def assess_risk(client):
    if client['records'] == 'yes':
        if client['job'] == 'parttime':
            return 'default'
        else:
            return 'ok'
    else:
        if client['assets'] >= 6000:
            return 'ok'
        else:
            return 'default'

In [26]:
xi = df_train.iloc[0]
assess_risk(xi)

'default'

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

In [28]:
train_dicts = df_train.fillna(0).to_dict(orient = 'records')

In [31]:
#help(df.to_dict)
#help(df.fillna)
#help(dv.fit_transform)

In [38]:


dv = DictVectorizer(sparse = False)
X_train = dv.fit_transform(train_dicts)

'dv = DictVectorizer(sparse = False)\nX_train = dv.fit_transform(train_dicts)'

In [42]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

val_dicts = df_val.fillna(0).to_dict(orient = 'records')
X_val = dv.fit_transform(val_dicts)
y_pred = dt.predict_proba(X_val)

ValueError: X has 28 features, but DecisionTreeClassifier is expecting 2147 features as input.

In [41]:
X_train[0].shape[0], X_val[0].shape[0]

(1, 28)

In [36]:
#help(dv.fit_transform)

In [37]:
"""from sklearn.preprocessing import OneHotEncoder

oneh = OneHotEncoder(handle_unknown="ignore")
oneh.fit(df_train)

X_train = oneh.transform(df_train.fillna(0))

oneh.fit(df_val)
X_val = oneh.transform(df_val.fillna(0))


from sklearn.preprocessing import OneHotEncoder
oneh = OneHotEncoder(handle_unknown="ignore")
oneh.fit(train_data[features])
X_test = oneh.transform(test_data[features])"""

'from sklearn.preprocessing import OneHotEncoder\n\noneh = OneHotEncoder(handle_unknown="ignore")\noneh.fit(df_train)\n\nX_train = oneh.transform(df_train)\n\noneh.fit(df_val)\nX_val = oneh.transform(df_val)\n\n\nfrom sklearn.preprocessing import OneHotEncoder\noneh = OneHotEncoder(handle_unknown="ignore")\noneh.fit(train_data[features])\nX_test = oneh.transform(test_data[features])'