In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Machine Learning with Tree-Based Models in Python

https://www.datacamp.com/courses/machine-learning-with-tree-based-models-in-python



## part 1. classification tree Learning

>  terms
 - Decision region: region in the feature space where all instances are assigned to one class label.
 - Decision boundary: surfaces separating different decision regions.
 

> building blocks of a decision-tree: binary tree
   - node: question or prediction.
     - root: no parent node.
     - internal node: one parent node and two children nodes
     - leaf: no children - final prediction.
     

> <font color='blue'>**Information Gain (IG)**</font>
   - which feature to pick for splitting: by maximizing information gain.
   - two criteria to measure impurity of a node
       - **gini index**
       - **entropy**
       - Most of the time, the gini index and entropy lead to the same results. The gini index is slightly faster to compute and is the default criterion used in scikit-learn's `DecisionTreeClassifier`.





> <font color='blue'>advantages of CARTs:</font>
  - easy to understand and interpret
  - flexibility: able to do non-linear models
  - little preprocessing needed: no feature standardization/scaling required
   

> limitations of CARTs
  - classification: can only produce orthogonal decision boundaries
  - sensitive to small variations in training sets
  - high variances: unconstrained CARTs can overfit the training set
  - solution: ensemble learning: more robust and less prone to errors
   
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.tree import DecisionTreeRegressor

In [3]:
%%html
<h3>information gain</h3>
<img src="ig.png" width="400" height="500">

In [4]:
%%html
<h3>information gain for regression</h3>
<img src="ig_regression.png" width="400" height="500">

## Part 2. The Bias-Variance Tradeoff
    - Overfitting v.s. underfitting


> 1) Generalization error of function f = bias$^2$ + variance + irreducible error
   - Bias: high bias leads to underfitting. 
         - Not complicated enough to capture insights. 
         - both large training and test errors.
   - Variance: high variance leads to overfitting. 



> 2) <font color='blue'>**Diagnose variance problem**</font>
   - if f function suffers from high variance: CV error of f > training set error of f
     - f is said to overfit the training set. To remedy this:
        - decrease model complexity
        - for ex.: decrease max depth, increase min samples per leaf, ...
        - gather more data...

   
> 3) Diagnose bias problem
   - if f function suffers from high bias: CV error of f ≈ training set error of f >> desired error
     - f is said to underfit the training set. To remedy this:
        - increase model complexity
        - for ex.: increase max depth, decrease min samples per leaf, ...
        - gather more relevant features..

 
         
        

In [6]:
%%html
<h3>bias and variance</h3>
<img src="bias_var.png" width="400" height="500">

- example: diagnosing overfitting vs. underfitting

In [None]:
# Diagnose bias vs. variance: compare the 10-fold CV RMSE with the training RMSE.
# Import everything this cell uses (splitter, CV helper, model, metric)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Set SEED for reproducibility
SEED = 1

# Split the data into 70% train and 30% test
# NOTE: X (features) and y (target) must already be defined by an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Instantiate a DecisionTreeRegressor dt
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.26, random_state=SEED)

# Compute the array containing the 10-folds CV MSEs.
# The 'neg_mean_squared_error' scorer returns negated MSEs, so flip the sign back.
MSE_CV_scores = -cross_val_score(dt, X_train, y_train, cv=10,
                                 scoring='neg_mean_squared_error',
                                 n_jobs=-1)

# Compute the 10-folds CV RMSE
RMSE_CV = (MSE_CV_scores.mean())**(0.5)

# Fit on the whole training set and compute the training RMSE
dt.fit(X_train, y_train)
y_pred_train = dt.predict(X_train)

RMSE_train = (mean_squared_error(y_train, y_pred_train))**(0.5)

# Print RMSE_CV
print('CV RMSE: {:.2f}'.format(RMSE_CV))

# Print RMSE_train (labelled as the training error so the two lines can be compared)
print('Train RMSE: {:.2f}'.format(RMSE_train))


> **Ensemble example: voting classifier**

In [None]:
# Fit three different classifiers on the same training set and report each
# one's test-set accuracy; the list built here is reused by the voting ensemble.
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# KNN is used below as a short alias for KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier

# Set seed for reproducibility
SEED = 1

# Instantiate lr
lr = LogisticRegression(random_state=SEED)

# Instantiate knn
knn = KNN(n_neighbors=27)

# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)

# Define the list of (name, estimator) pairs
classifiers = [('Logistic Regression', lr),
               ('K Nearest Neighbours', knn),
               ('Classification Tree', dt)]

# Evaluate each classifier individually.
# NOTE: X_train / X_test / y_train / y_test must come from an earlier cell
# that split a classification dataset.
for clf_name, clf in classifiers:

    # Fit clf to the training set
    clf.fit(X_train, y_train)

    # Predict y_pred
    y_pred = clf.predict(X_test)

    # Calculate accuracy (y_true first, then predictions)
    accuracy = accuracy_score(y_test, y_pred)

    # Report clf's accuracy on the test set
    print('{:s} : {:.3f}'.format(clf_name, accuracy))

In [None]:
# Combine the individual classifiers into a voting ensemble and score it.
# Import VotingClassifier from sklearn.ensemble and the accuracy metric
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Instantiate a VotingClassifier vc
# NOTE: `classifiers` is the list of (name, estimator) pairs built in the previous cell.
vc = VotingClassifier(estimators=classifiers)

# Fit vc to the training set
vc.fit(X_train, y_train)

# Evaluate the test set predictions
y_pred = vc.predict(X_test)

# Calculate accuracy score (signature is accuracy_score(y_true, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.3f}'.format(accuracy))

## part 3. Bagging
**Bagging is an ensemble method that trains the same algorithm many times on different subsets sampled from the training data.**
   - Base estimator can be tree, logistic regression, knn, etc.
   - estimators use all features for training and prediction
 

   - bagging: bootstrap aggregation
       - bootstrap: sample with replacement
   - reduce variance of individual models in ensembles
   - do a bootstrap of samples (subset of training set)
   - In bagging, some samples are used multiple times and others are not sampled at all.
   
   
> 1) Bagging in classification and regression
   - Classification:
      - aggregates prediction by **majority voting**.
      - BaggingClassifier in sklearn 
   - Regression:
      - aggregates prediction by **averaging**.
      - BaggingRegressor in sklearn 

 ###### bagging vs. voting classifier
 - bagging uses majority voting/averaging over many copies of the same algorithm to get the prediction -- sklearn: BaggingClassifier
 - voting classifier: `from sklearn.ensemble import VotingClassifier` performs majority-rule voting (if `voting='hard'`)




simple example of bagging

In [None]:
# Simple bagging example: 50 bagged decision trees, scored on a held-out test set.
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier

# Import the splitter and the accuracy metric used below
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Split the data into 70% train and 30% test, keeping class proportions equal
# in both parts (the keyword is `stratify`).
# NOTE: X, y and SEED must already be defined by earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=SEED)

# Instantiate dt
dt = DecisionTreeClassifier(random_state=1)

# Instantiate bc
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, while the
# first positional slot works with both names.
bc = BaggingClassifier(dt, n_estimators=50, random_state=1)

# Fit bc to the training set
bc.fit(X_train, y_train)

# Predict test set labels
y_pred = bc.predict(X_test)

# Evaluate acc_test (y_true first, then predictions)
acc_test = accuracy_score(y_test, y_pred)
print('Test set accuracy of bc: {:.2f}'.format(acc_test))

> 2) **Out of bag (OOB) evaluation**
: We don't need to do cross-validation if we enable OOB in ensemble modeling, because the OOB samples of each model were never seen during that model's training.
   - On average, 63% of the training samples are sampled for each model. -- bagging
   - The remaining 37% constitute the OOB instances
   - each model is evaluated on its own OOB samples.


In [10]:
%%html
<h3>Out of bag evaluation</h3>
<img src="oob.png" width="400" height="500">

- example of OOB evaluation

In [None]:
# OOB evaluation example: compare test-set accuracy with the out-of-bag accuracy.
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier

# Import the accuracy metric used below
from sklearn.metrics import accuracy_score

# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=8, random_state=1)

# Instantiate bc
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases.
# oob_score=True makes the fitted ensemble expose `oob_score_`.
bc = BaggingClassifier(dt,
                       n_estimators=50,
                       oob_score=True,
                       random_state=1)

# Fit bc to the training set
# NOTE: X_train / X_test / y_train / y_test come from an earlier cell.
bc.fit(X_train, y_train)

# Predict test set labels
y_pred = bc.predict(X_test)

# Evaluate test set accuracy (y_true first, then predictions)
acc_test = accuracy_score(y_test, y_pred)

# Evaluate OOB accuracy
acc_oob = bc.oob_score_

# Print acc_test and acc_oob
print('Test set accuracy: {:.3f}, OOB accuracy: {:.3f}'.format(acc_test, acc_oob))

### Part 4. Random Forest

  - base estimator: decision tree
  - each estimator is trained on a different bootstrap sample having the same size as the training set
  - RF introduces further randomization: only use subset of d features(d < total features) without replacement
  
> 1) RF in classification and regression
   - Classification:
      - aggregates prediction by **majority voting**.
      - RandomForestClassifier in sklearn 
   - Regression:
      - aggregates prediction by **averaging**.
      - RandomForestRegressor in sklearn 


> 2) Feature importance
   - how much the tree nodes use a particular feature(weighted average) to reduce impurity
   
- For RandomForestRegressor, the default metric is R². To evaluate with MSE instead, set scoring = 'neg_mean_squared_error'; the MSE is negated on purpose so that a higher score is always better and the search can maximize it.

grid_rf = GridSearchCV(estimator = rf, param_grid = params_rf, scoring = 'neg_mean_squared_error', cv = 3)



- a simple random forest model

In [None]:
# Simple random forest regression: fit, report test RMSE, plot feature importances.
# Import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# Import the splitter used below
from sklearn.model_selection import train_test_split

# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE

# Split the data into 70% train and 30% test.
# No `stratify` here: this is a regression task, and stratifying on a
# continuous target is not meaningful (train_test_split rejects classes
# that have a single member).
# NOTE: X, y and SEED must already be defined by earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)


# Instantiate rf
rf = RandomForestRegressor(n_estimators=25,
                           random_state=2)

# Fit rf to the training set
rf.fit(X_train, y_train)

# Predict the test set labels
y_pred = rf.predict(X_test)

# Evaluate the test set RMSE (y_true first, then predictions)
rmse_test = (MSE(y_test, y_pred))**(0.5)

# Print rmse_test
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))


# --- Plot feature importance ---
# Create a pd.Series of features importances
# (X_train.columns is used as the index, so X must be a DataFrame)
importances = pd.Series(data=rf.feature_importances_,
                        index=X_train.columns)

# Sort importances
importances_sorted = importances.sort_values()

# Draw a horizontal barplot of importances_sorted ('barh', not the vertical 'bar')
importances_sorted.plot(kind='barh', color='lightgreen')
plt.title('Features Importances')
plt.show()

### Part 5. Boosting

  - base estimator can be anything: decision tree is popular
  - boosting combines many weaker learners to form a strong learner.
  - Train an ensemble of predictors sequentially; each predictor tries to correct its predecessor.
  
> 1) Ada boosting (adaptive boosting)
   - each predictor pays more attention to the instances wrongly predicted by its predecessor.
   - achieved by **changing the weights of training instances** (more weight on wrongly predicted instances)
   - each predictor is assigned to a coefficient alpha and alpha depends on the predictor's training error.
   - <font color='blue'>a tradeoff between number of estimators and learning rate.</font>

    - Classification:
      - aggregates prediction by **Weighted majority voting**.
      - AdaBoostClassifier in sklearn 
    - Regression:
      - aggregates prediction by **weighted averaging**.
      - AdaBoostRegressor in sklearn 


In [None]:
# Example: using AdaBoost

In [None]:
# AdaBoost example: boost shallow decision trees and score with ROC AUC.
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

# Import roc_auc_score
from sklearn.metrics import roc_auc_score

# Instantiate dt (a shallow tree as the weak learner)
dt = DecisionTreeClassifier(max_depth=2, random_state=1)

# Instantiate ada
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases.
ada = AdaBoostClassifier(dt, n_estimators=180, random_state=1)

# Fit ada to the training set
# NOTE: X_train / X_test / y_train / y_test come from an earlier cell.
ada.fit(X_train, y_train)

# Compute the probabilities of obtaining the positive class
# (column 1 of predict_proba)
y_pred_proba = ada.predict_proba(X_test)[:, 1]

# Evaluate test-set roc_auc_score
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print roc_auc_score
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))


> **2) Gradient boosting**
   - sequential correction of predecessor's errors.
   - does NOT tweak the weights of training instances.
   - each predictor is trained using its predecessor's residual errors as labels.
   - Gradient boosting trees: a CART tree as base estimator
   - <font color='blue'>a tradeoff between shrinkage and num of estimators.</font>
   
GB cons: 
- involves an exhaustive search process
- each tree is trained to find its best split points and features, which may lead to CARTs using the same split points and the same features. To mitigate this:


> **3) Stochastic Gradient boosting**
   - each tree is trained on a random subset rows of training data.
   - the sampled instances (40% - 80% of the training set) are sampled without replacement.
   - features are sampled(without replacement) when choosing split points.
   - result: create further diversity of ensemble.
   - effect: add more variance to the ensemble model.

In [18]:
%%html
<h3>Gradient boosting</h3>
<img src="grad_boost.png" width="500" height="600">

<img src="gra_boost_cl.png" width="500" height="600">

##### example of gradient boosting

In [None]:
# Gradient boosting regression: train on the training split, predict the test split.
from sklearn.ensemble import GradientBoostingRegressor

# Build the regressor: 200 boosting stages of depth-4 trees, fixed seed
gb = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=4,
    random_state=2,
)

# Train on the training split (X_train / y_train come from an earlier cell)
gb.fit(X_train, y_train)

# Predictions for the held-out test split
y_pred = gb.predict(X_test)

##### example of stochastic gradient boosting

In [None]:
# Stochastic gradient boosting: like gradient boosting, but with subsample
# and max_features set below 1 for the rows and features each tree uses.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as MSE

# Build the stochastic gradient boosting regressor
sgbr = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=4,
    subsample=0.9,       # rows: each tree is fit on 90% of the samples
    max_features=0.85,   # columns: at most 85% of the features per split
    random_state=2,
)

# Train on the training split (X_train / y_train come from an earlier cell)
sgbr.fit(X_train, y_train)

# Predictions for the held-out test split
y_pred = sgbr.predict(X_test)

# Test-set MSE, then its square root (RMSE)
mse_test = MSE(y_test, y_pred)
rmse_test = mse_test ** 0.5

# Report the RMSE
print('Test set RMSE of sgbr: {:.3f}'.format(rmse_test))

### Part 6. Model Tuning

 - hyperparameters: not learnt from data, set prior to training
 - approaches to hyperparameter tuning: Grid Search, Random Search, Bayesian optimization, Genetic algorithms

In [21]:
# Inspect the hyperparameters of a decision tree classifier.
from sklearn.tree import DecisionTreeClassifier

seed = 1  # fixed seed so the tree is reproducible

dt = DecisionTreeClassifier(random_state=seed)

# Last expression of the cell: displays the hyperparameter dict
dt.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': 1,
 'splitter': 'best'}

- a simple gridsearch example using decision tree

In [None]:
# Grid search over decision-tree hyperparameters, scored by ROC AUC.
from sklearn.tree import DecisionTreeClassifier

# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Import roc_auc_score from sklearn.metrics
from sklearn.metrics import roc_auc_score

seed = 1
dt = DecisionTreeClassifier(random_state=seed)

# Define params_dt: the grid of hyperparameter values to try
params_dt = {
    'max_depth': [2, 3, 4],
    'min_samples_leaf': [0.12, 0.14, 0.16, 0.18],
}

# Instantiate grid_dt
grid_dt = GridSearchCV(estimator=dt,
                       param_grid=params_dt,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)

# Run the search. Without this call `best_estimator_` does not exist.
# NOTE: X_train / X_test / y_train / y_test come from an earlier cell.
grid_dt.fit(X_train, y_train)

# Extract the best estimator
best_model = grid_dt.best_estimator_

# Predict the test set probabilities of the positive class
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Compute test_roc_auc
test_roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print test_roc_auc
print('Test set ROC AUC score: {:.3f}'.format(test_roc_auc))

In [None]:
# Example: grid search with a random forest regressor

In [None]:
# Grid search over random-forest hyperparameters, scored by (negated) MSE.
from sklearn.ensemble import RandomForestRegressor

# Import the splitter and GridSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV

# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE

# Split the data into 70% train and 30% test.
# No `stratify` here: this is a regression task, and stratifying on a
# continuous target is not meaningful.
# NOTE: X, y and SEED must already be defined by earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)


# Instantiate rf (n_estimators here is overridden by the grid below)
rf = RandomForestRegressor(n_estimators=25,
                           random_state=2)

# Define the dictionary 'params_rf'
# 1.0 means "use all features"; it replaces the string 'auto', which meant
# the same thing for regressors but is rejected by newer scikit-learn releases.
params_rf = {
    'n_estimators': [100, 350, 500],
    'max_features': ['log2', 1.0, 'sqrt'],
    'min_samples_leaf': [2, 10, 30],
}

# Instantiate grid_rf
grid_rf = GridSearchCV(estimator=rf,
                       param_grid=params_rf,
                       scoring='neg_mean_squared_error',
                       cv=3,
                       verbose=1,
                       n_jobs=-1)

# Run the search
grid_rf.fit(X_train, y_train)

# Extract the best estimator
best_model = grid_rf.best_estimator_

# Predict test set labels
y_pred = best_model.predict(X_test)

# Compute rmse_test (y_true first, then predictions)
rmse_test = MSE(y_test, y_pred)**(0.5)

# Print rmse_test
print('Test RMSE of best model: {:.3f}'.format(rmse_test))