import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv'

In [3]:
# Uncomment the line below if the file 'CreditScoring.csv' is not already present in the local directory
# !wget $data

In [4]:
!head CreditScoring.csv

"Status","Seniority","Home","Time","Age","Marital","Records","Job","Expenses","Income","Assets","Debt","Amount","Price"
1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
1,0,1,36,26,1,1,1,46,107,0,0,310,910
1,1,2,60,36,2,1,1,75,214,3500,0,650,1645
1,29,2,60,44,2,1,1,75,125,10000,0,1600,1800
1,9,5,12,27,1,1,1,35,80,0,0,200,1093
1,0,2,60,32,2,1,3,90,107,15000,0,1200,1957


In [5]:
df = pd.read_csv(data)

In [6]:
df.columns = df.columns.str.lower()

In [7]:
df.status.value_counts()

status
1    3200
2    1254
0       1
Name: count, dtype: int64

In [8]:
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}

df.status = df.status.map(status_values)

In [9]:
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

df.home = df.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.job = df.job.map(job_values)

In [10]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [11]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [12]:
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)

In [13]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [14]:
df = df[df.status != 'unk'].reset_index(drop=True)

In [15]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_train = (df_train.status == 'default').astype('int').values
y_val = (df_val.status == 'default').astype('int').values
y_test = (df_test.status == 'default').astype('int').values

In [18]:
del df_train['status']
del df_val['status']
del df_test['status']

In [19]:
df_train

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,10,owner,36,36,married,no,freelance,75,0.0,10000.0,0.0,1000,1400
1,6,parents,48,32,single,yes,fixed,35,85.0,0.0,0.0,1100,1330
2,1,parents,48,40,married,no,fixed,75,121.0,0.0,0.0,1320,1600
3,1,parents,48,23,single,no,partime,35,72.0,0.0,0.0,1078,1079
4,5,owner,36,46,married,no,freelance,60,100.0,4000.0,0.0,1100,1897
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2667,18,private,36,45,married,no,fixed,45,220.0,20000.0,0.0,800,1600
2668,7,private,60,29,married,no,fixed,60,51.0,3500.0,500.0,1000,1290
2669,1,parents,24,19,single,no,fixed,35,28.0,0.0,0.0,400,600
2670,15,owner,48,43,married,no,freelance,60,100.0,18000.0,0.0,2500,2976


In [20]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

In [21]:
train_dicts = df_train.fillna(0).to_dict(orient='records')

In [22]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [23]:
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [24]:
dicts_test = df_test.to_dict(orient='records')
X_test = dv.transform(dicts_test)

# Hyperparameter Tuning with GridSearchCV

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [26]:
model_params = {
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10,50,100,150,200],
            'max_depth' : [10,50,100,150,200],
            'min_samples_leaf':[1, 3, 5, 10, 50]
        }
    },
    'xgboost': {
        'model': XGBClassifier(),
        'params': {    
            'max_depth' : [3, 6, 10],
            'eta' : [0.01,0.1,0.3],
            'min_child_weight' : [None , 1 , 2 , 3]
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth' : [1, 3, 5, 7, 10, 15, 20, None],
            'min_samples_leaf' :[1, 5, 10, 15, 20, 500, 100, 200],
        }
    }     
}

In [27]:
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False, scoring = 'roc_auc')
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
gridsearch_results = pd.DataFrame(scores,columns=['model','best_score','best_params'])
gridsearch_results

Unnamed: 0,model,best_score,best_params
0,random_forest,0.84415,"{'max_depth': 10, 'min_samples_leaf': 3, 'n_es..."
1,xgboost,0.853449,"{'eta': 0.1, 'max_depth': 3, 'min_child_weight..."
2,decision_tree,0.79494,"{'max_depth': 7, 'min_samples_leaf': 100}"


In [28]:
gridsearch_results

Unnamed: 0,model,best_score,best_params
0,random_forest,0.84415,"{'max_depth': 10, 'min_samples_leaf': 3, 'n_es..."
1,xgboost,0.853449,"{'eta': 0.1, 'max_depth': 3, 'min_child_weight..."
2,decision_tree,0.79494,"{'max_depth': 7, 'min_samples_leaf': 100}"


In [29]:
gridsearch_results['best_params'][0]

{'max_depth': 10, 'min_samples_leaf': 3, 'n_estimators': 100}

In [30]:
rf_best_grid = RandomForestClassifier(criterion='gini', max_depth= 10,
                                     min_samples_leaf= 5,
                                     n_estimators=150)

In [31]:
rf_best_grid.fit(X_train, y_train)

In [32]:
y_pred = rf_best_grid.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.8246379356276188

In [33]:
gridsearch_results['best_params'][1]

{'eta': 0.1, 'max_depth': 3, 'min_child_weight': 2}

In [34]:
xgbc_best_grid = XGBClassifier(max_depth=3, min_child_weight=2,eta=0.1)
xgbc_best_grid.fit(X_train, y_train)

In [35]:
y_pred = xgbc_best_grid.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.8286218546414473

### Let's define the Decision Tree with best parameters & evaluate on validation data:

In [36]:
gridsearch_results['best_params'][2]

{'max_depth': 7, 'min_samples_leaf': 100}

In [37]:
dt_best_grid = DecisionTreeClassifier(max_depth=7, min_samples_leaf=100,criterion='gini')
dt_best_grid.fit(X_train, y_train)

In [38]:
y_pred = dt_best_grid.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.7795827177835364

### xgboost is clearly the best model, let's evaluate it on test data:

In [39]:
y_pred = xgbc_best_grid.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred)

0.8304029617320756

### Finally, let's train xgboost on the full training data & then run it on test data:

In [40]:
df_full_train = df_full_train.reset_index(drop=True)
y_full_train = (df_full_train.status == 'default').astype(int).values
del df_full_train['status']

dicts_full_train = df_full_train.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

In [41]:
xgbc_best_grid.fit(X_full_train, y_full_train)

In [42]:
y_pred = xgbc_best_grid.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred)

0.8334331166609648

### We see that the xgbclassifier trained on the full dataset achieves an accuracy of 83.34% on the test dataset, which is very close to (& slightly higher than) the 83.27% achieved in the official Week 6 notebook by xgboost, which is a more highly optimized version of the same algorithm.

### In summary, GridsearchCV allows us to systematically traverse the grid representing all hyperparameter choices that we specify, guaranteeing that we get the combination yielding the best performance according to robust cross-validation based estimates of the performance metric we choose. However, combinatorial growth means that GridSearchCV is computationally expensive - we next consider RandomizedSearchCV, which trades performance for speed, and often yields performance that is acceptably close to that provided by GridSearchCV.
</br> </br>


# <ins> Faster, not better - but probably good enough : Hyperparameter Tuning with RandomizedSearchCV </ins> </br> </br>

## RandomizedSearchCV is similar to GridsearchCV, but it only searches a random subspace of the hyperparameter space, which makes it faster than GridSearchCV at the cost of a drop in performance that is often acceptable in practice. We can define the parameter grid exactly like we did for GridSearchCV. 

In [43]:
model_params = {
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10,50,100,150,200],
            'max_depth' : [10,50,100,150,200],
            'min_samples_leaf':[1, 3, 5, 10, 50]
        }
    },
    'xgboost': {
        'model': XGBClassifier(),
        'params': {    
            'max_depth' : [3, 6, 10],
            'eta' : [0.01,0.1,0.3],
            'min_child_weight' : [None , 1 , 2 , 3]
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth' : [1, 3, 5, 7, 10, 15, 20, None],
            'min_samples_leaf' :[1, 5, 10, 15, 20, 500, 100, 200],
        }
    }     
}

In [44]:
scores_rand = []

for model_name, mp in model_params.items():
    clf =  RandomizedSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False, scoring = 'roc_auc')
    clf.fit(X_train, y_train)
    scores_rand.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
randsearch_results = pd.DataFrame(scores_rand,columns=['model','best_score','best_params'])
randsearch_results

Unnamed: 0,model,best_score,best_params
0,random_forest,0.842455,"{'n_estimators': 200, 'min_samples_leaf': 10, ..."
1,xgboost,0.852028,"{'min_child_weight': 3, 'max_depth': 3, 'eta':..."
2,decision_tree,0.794712,"{'min_samples_leaf': 100, 'max_depth': None}"


In [45]:
randsearch_results['best_params'][0]

{'n_estimators': 200, 'min_samples_leaf': 10, 'max_depth': 100}

In [46]:
rf_best_rand = RandomForestClassifier(max_depth= 50,
                                     min_samples_leaf= 5,
                                     n_estimators=150)

In [47]:
rf_best_rand.fit(X_train, y_train)

In [48]:
y_pred = rf_best_rand.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.8243594245719406

In [49]:
randsearch_results['best_params'][1]

{'min_child_weight': 3, 'max_depth': 3, 'eta': 0.1}

In [50]:
xgbc_best_rand = XGBClassifier(max_depth=3, min_child_weight=1,eta=0.1)
xgbc_best_rand.fit(X_train, y_train)

In [51]:
y_pred = xgbc_best_rand.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.8319155506042479

In [52]:
randsearch_results['best_params'][2]

{'min_samples_leaf': 100, 'max_depth': None}

In [53]:
dt_best_rand = DecisionTreeClassifier(max_depth=15, min_samples_leaf=100)
dt_best_rand.fit(X_train, y_train)

In [54]:
y_pred = dt_best_rand.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.7797250006054588

In [55]:
y_pred = xgbc_best_rand.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred)

0.8293460730169591

In [56]:
xgbc_best_rand.fit(X_full_train, y_full_train)

In [57]:
y_pred = xgbc_best_rand.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred)

0.8346916084257856