# Lab-6 Template

Answer questions in the designated cells

H2O GBM tuning guide:  
https://github.com/h2oai/h2o-3/blob/master/h2o-docs/src/product/tutorials/gbm/gbmTuning.ipynb

## Preparation

Use dataset provided in the eLearning

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 1500)

import warnings
warnings.filterwarnings('ignore')

#Extend cell width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
#Run once
#!pip install category_encoders
from category_encoders.target_encoder import TargetEncoder

### Load data

In [3]:
#train_df = pd.read_csv('SBA_loans_small.zip')
X_train = pd.read_csv('SBA_loans_train.csv')
X_test  = pd.read_csv('SBA_loans_test.csv')

In [4]:
X_train.head(n=3)

Unnamed: 0,City,State,Zip,Bank,BankState,NAICS,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementGross,BalanceGross,GrAppv,SBA_Appv,Defaulted
0,Huntsville,AL,35811,"BUSINESS LOAN CENTER, LLC",FL,621310,73,1,2.0,2,1,0,1,N,N,25000.0,0.0,25000.0,21250.0,1
1,SCOTTSDALE,AZ,85254,WELLS FARGO BANK NATL ASSOC,CA,0,84,3,2.0,0,0,0,0,N,N,52000.0,0.0,52000.0,46800.0,1
2,BANGOR,ME,4401,BANGOR SAVINGS BANK,ME,323110,84,9,1.0,0,0,1,1,0,Y,150000.0,0.0,150000.0,127500.0,0


In [5]:
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (337186, 20)
Test shape: (112396, 20)


In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337186 entries, 0 to 337185
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   City               337177 non-null  object 
 1   State              337180 non-null  object 
 2   Zip                337186 non-null  int64  
 3   Bank               336587 non-null  object 
 4   BankState          336583 non-null  object 
 5   NAICS              337186 non-null  int64  
 6   Term               337186 non-null  int64  
 7   NoEmp              337186 non-null  int64  
 8   NewExist           337140 non-null  float64
 9   CreateJob          337186 non-null  int64  
 10  RetainedJob        337186 non-null  int64  
 11  FranchiseCode      337186 non-null  int64  
 12  UrbanRural         337186 non-null  int64  
 13  RevLineCr          335483 non-null  object 
 14  LowDoc             336198 non-null  object 
 15  DisbursementGross  337186 non-null  float64
 16  Ba

# Prepare Dataset

Replace missing values for all columns for both X_train and X_test.
Replace Na's with zero for numerical variables and with "Missing" for categorical

Encode Categorical variables using target encoder.

In [7]:
values_to_fill = {}
for col in X_train.drop(columns=['Defaulted']).columns:
    if X_train[col].dtype == 'object':
        values_to_fill[col] = "Missing"
    else:
        values_to_fill[col] = 0

X_train.fillna(value=values_to_fill,inplace=True)
X_test.fillna(value=values_to_fill, inplace=True)

cols_to_encode = []
cat_encoders = {}
for col in X_train.drop(columns=['Defaulted']).columns:
  if X_train[col].dtype == 'object':
    cols_to_encode.append(col)
    
#Add Zip to categoricals and convert to string in order to make encoder work
cols_to_encode.append("Zip")
X_train["Zip"] = X_train["Zip"].apply(str)
X_test["Zip"] = X_test["Zip"].apply(str)

te = TargetEncoder()
Y_tr = X_train["Defaulted"]
Y_tst = X_test["Defaulted"]
te.fit(X_train[cols_to_encode],Y_tr)

X_train_te = te.transform(X_train[cols_to_encode])
X_test_te = te.transform(X_test[cols_to_encode])

new_col_names = [col+"_te" for col in cols_to_encode]
X_train_te.columns = new_col_names
X_test_te.columns = new_col_names

X_tr = X_train_te.join(X_train)
X_tst = X_test_te.join(X_test)

X_tr.drop(columns=cols_to_encode+["Defaulted"],inplace=True, axis=1)
X_tst.drop(columns=cols_to_encode+["Defaulted"],inplace=True, axis=1)

In [8]:
X_tr.head(n=3)

Unnamed: 0,City_te,State_te,Bank_te,BankState_te,RevLineCr_te,LowDoc_te,Zip_te,NAICS,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,DisbursementGross,BalanceGross,GrAppv,SBA_Appv
0,0.380952,0.167744,0.308181,0.158105,0.146342,0.186457,0.347826,621310,73,1,2.0,2,1,0,1,25000.0,0.0,25000.0,21250.0
1,0.191919,0.200634,0.138341,0.221678,0.146342,0.186457,0.228916,0,84,3,2.0,0,0,0,0,52000.0,0.0,52000.0,46800.0
2,0.125984,0.096586,0.0625,0.076696,0.149252,0.09074,0.088889,323110,84,9,1.0,0,0,1,1,150000.0,0.0,150000.0,127500.0


In [9]:
X_tst.head(n=3)

Unnamed: 0,City_te,State_te,Bank_te,BankState_te,RevLineCr_te,LowDoc_te,Zip_te,NAICS,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,DisbursementGross,BalanceGross,GrAppv,SBA_Appv
0,0.083335,0.224163,0.073593,0.222704,0.146342,0.09074,0.090913,0,84,1,2.0,0,0,1,0,42000.0,0.0,42000.0,33600.0
1,0.128531,0.149706,0.138341,0.175423,0.146342,0.186457,0.170732,0,84,7,1.0,0,0,1,0,15000.0,0.0,15000.0,13500.0
2,0.170213,0.224163,0.0,0.222704,0.146342,0.186457,0.229167,0,240,19,1.0,15,0,1,0,497000.0,0.0,497000.0,497000.0


In [10]:
print("Train shape:", X_tr.shape)
print("Test shape:", X_tst.shape)

Train shape: (337186, 19)
Test shape: (112396, 19)


## Datasets for all questions

For all questions, use X_tr and X_tst (after categorical variables encoding).

In [11]:
Y_tr

0         1
1         1
2         0
3         1
4         0
         ..
337181    0
337182    0
337183    0
337184    0
337185    0
Name: Defaulted, Length: 337186, dtype: int64

## Question 1 - 2 points

Train sklearn `GradientBoostingClassifier` with default parameters and `random_state=0`.
Display:
- AUC on Testing data
- Accuracy on Testing data
- Number of trees for the trained classifier

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics


gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_tr, Y_tr)

y_pred = gbrt.predict_proba(X_tst)[:, 1]
auc = metrics.roc_auc_score(Y_tst, y_pred)


print("AUC on testing data:", auc)
print("Accuracy on testing data: {:.3f}".format(gbrt.score(X_tst, Y_tst)))

AUC on testing data: 0.9505761225251513
Accuracy on testing data: 0.919


In [13]:
gbrt.n_estimators

100

## Question 2 - 3 points

Train sklearn `GradientBoostingClassifier` with following parameters:
```
n_estimators=1000, 
learning_rate=0.1,
subsample=0.8,
max_features=0.8,
n_iter_no_change=5,
max_depth=3, 
random_state=0
```

Display:
- AUC on Testing data
- Accuracy on Testing data
- Number of trees for the trained classifier

In [14]:
from sklearn.ensemble import GradientBoostingClassifier


gbrt = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1,subsample=0.8,max_features=0.8,n_iter_no_change=5,max_depth=3, random_state=0)
gbrt.fit(X_tr, Y_tr)

y_pred = gbrt.predict_proba(X_tst)[:, 1]
auc = metrics.roc_auc_score(Y_tst, y_pred)


print("AUC on testing data:", auc)
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_tst, Y_tst)))

AUC on testing data: 0.959355989361353
Accuracy on test set: 0.933


In [15]:
print("the number of used trees are",gbrt.n_estimators)

the number of used trees are 1000


## Question 3 - 10 points

Use Grid search to train at least 16 `GradientBoostingClassifier`. Tune following parameters, set `random_state=0`:
```
n_estimators, 
learning_rate,
subsample,
max_features,
n_iter_no_change,
max_depth, 
```
Your grid search should be performed using CV=3.
To speed up training process, set `n_jobs` to number of available cores on your machine. For example, if you have 8 CPU/Core machine, set `n_jobs=4` or `n_jobs=6`

For the best model (based on AUC) display:
- Model parameters
- AUC on Testing data
- Accuracy on Testing data
- Number of trees for the trained classifier

**Important**: It will take long time to train models, you will be training at least 16x3=48 models. Test that your Grid search works with small subset of the data first.

Once you have trained the Grid object, save it to the disk so that you can retrieve  it without going through re-train step. You can find example on how to save model in the Project 1 template.

For the best model, display:
- AUC on Testing data
- Accuracy on Testing data
- Number of trees for the trained classifier

**Optional** questions (if you plan to practice data science, you should make every effort to answer them): 
- Why do you think number of trees for the best model is less than `n_estimators`?
- Think about all parameters you are tuning using grid search.
- How are those parameters help to reduce overfit? 

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import PredefinedSplit, GridSearchCV
gbrt = GradientBoostingClassifier(random_state=0)
n_estimators= [50,1000]
learning_rate= [10,100]
max_depth= [1,3]
subsample= [0.6,0.8]
max_features= [0.8,2]
n_iter_no_change= [5,10]

# Grid Search
grid = dict(n_estimators = n_estimators, learning_rate = learning_rate, max_depth = max_depth, subsample = subsample,max_features = max_features,n_iter_no_change = n_iter_no_change  )
grid_search = GridSearchCV(estimator=gbrt, param_grid=grid, n_jobs= 10,cv=3, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_tr, Y_tr)

# Summary
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


Best: 0.817430 using {'learning_rate': 10, 'max_depth': 1, 'max_features': 0.8, 'n_estimators': 50, 'n_iter_no_change': 5, 'subsample': 0.6}


In [17]:
new_gb= GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1,subsample=0.8,max_features=0.8,n_iter_no_change=10,max_depth=3,random_state=0)
new_gb.fit(X_tr, Y_tr)

y_pred = new_gb.predict_proba(X_tst)[:, 1]
auc = metrics.roc_auc_score(Y_tst, y_pred)


print("AUC on testing data:", auc)
print("Accuracy on test set: {:.3f}".format(new_gb.score(X_tst, Y_tst)))

AUC on testing data: 0.9615458916677098
Accuracy on test set: 0.935


In [18]:
new_gb.n_estimators_

718

In [19]:
y_pred = grid_search.predict_proba(X_tst)[:, 1]
auc = metrics.roc_auc_score(Y_tst, y_pred)


print("AUC on testing data:", auc)
print("Accuracy on test set: {:.3f}".format(grid_search.score(X_tst, Y_tst)))

AUC on testing data: 0.30808315464599245
Accuracy on test set: 0.172


## Question 4 - 5 points

Train Stacked Ensemble model by utilizing `from sklearn.ensemble import StackingClassifier`.

Good guide on how to train Stacked ensemble model can be found here: https://machinelearningmastery.com/stacking-ensemble-machine-learning-with-python/


Stacked Ensemble is a technique to build Meta Learner (model). The Meta learner uses out-of-fold predictions of level-0 models (at least 2) to train level-1 model(meta learner). Meta learner is trained on the out-of-fold predictions done by the level-0 model in order to avoid overfitting.  

For example, if level-0 model was trained using cv=3 it means there are 3 level-0 sub-models each trained on the 2/3 of the data. Therefore the hold-out 1/3 part of the training data was not used by the model for training, and predictions on the out of fold parts of the dataset can be used by Meta Learner. 

Choose two models for the level-0 models:
- Best model from the Question 3
- Worst model from the question 3

You would only need model parameters, since sklearn `StackingClassifier` will retrain the the models.

Train `StackingClassifier` and produce:
- AUC on Testing data
- Accuracy on Testing data


*Hint*: to find best/worst model parameters for your `grid_search` object you can use below code. You would need to change `grid_search` to the variable name that holds your `GridSearchCV` object. 
```
import numpy as np

best_model_idx = np.argmax(grid_search.cv_results_['mean_test_score'])
worst_model_idx = np.argmin(grid_search.cv_results_['mean_test_score']) 
print("Index of worst model:",worst_model_idx)
print("Index of best model:",best_model_idx)

print("Best model params:")
print(grid_search.cv_results_['params'][best_model_idx])
print("")
print("Worst model params:")
print(grid_search.cv_results_['params'][worst_model_idx])
```

In [20]:
import numpy as np 
best_model_idx = np.argmax(grid_search.cv_results_['mean_test_score'])
worst_model_idx = np.argmin(grid_search.cv_results_['mean_test_score']) 
print("Index of worst model:",worst_model_idx)
print("Index of best model:",best_model_idx)
print("Best model params:")
print(grid_search.cv_results_['params'][best_model_idx])
print("")
print("Worst model params:")
print(grid_search.cv_results_['params'][worst_model_idx])

Index of worst model: 40
Index of best model: 0
Best model params:
{'learning_rate': 10, 'max_depth': 1, 'max_features': 0.8, 'n_estimators': 50, 'n_iter_no_change': 5, 'subsample': 0.6}

Worst model params:
{'learning_rate': 100, 'max_depth': 1, 'max_features': 2, 'n_estimators': 50, 'n_iter_no_change': 5, 'subsample': 0.6}


In [None]:
level0 = list()
level0.append(('lr', LogisticRegression()))
level0.append(('knn', KNeighborsClassifier()))
# define meta learner model
level1 = LogisticRegression()
#define the stacking ensemble
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

In [21]:
level0 = list()
level0.append(('worst',GradientBoostingClassifier(n_estimators=50, learning_rate=100,subsample=0.6,max_features=2,n_iter_no_change=5,max_depth=1,random_state=0)))
level0.append(('best', GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1,subsample=0.8,max_features=0.8,n_iter_no_change=10,max_depth=3,random_state=0)))

In [None]:
from sklearn.ensemble import StackingClassifier
# define meta learner model
level1 = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1,subsample=0.8,max_features=0.8,n_iter_no_change=10,max_depth=3,random_state=0)
# define the stacking ensemble
model1 = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
# fit the model on all available data
model1.fit(X_tr, Y_tr)

In [None]:
y_pred = model1.predict_proba(X_tst)[:, 1]
auc = metrics.roc_auc_score(Y_tst, y_pred)


print("AUC on testing data:", auc)
print("Accuracy on test set: {:.3f}".format(new_gb.score(X_tst, Y_tst)))