<a href="https://colab.research.google.com/github/RiemanBall/Machine-Learning/blob/master/Tree_and_Ensemble/Tree_and_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
np.random.seed(0)

from math import ceil

from scipy.stats import randint as sp_randint
from scipy.stats import uniform

from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.inspection import permutation_importance

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

import xgboost as xgb

import matplotlib.pyplot as plt



%matplotlib inline

## Dataset
The dataset used here is from https://www.kaggle.com/c/ml2020spring-hw2/data, which is modified from [**Census-Income (KDD) Data Set**](https://archive.ics.uci.edu/ml/datasets/Census-Income+(KDD)) and can be found in [**UCI Machine Learning Repository**](https://archive.ics.uci.edu/ml/index.php).

In [None]:
# Get the data from colab server
# Fetches the archive by file id with the `gdown` command-line tool and unpacks
# it with tar; later cells read ./data/X_train, ./data/Y_train and ./data/X_test.
# NOTE(review): the recorded output of this cell shows `gdown: command not found`,
# so the tool has to be available in the environment before this cell can succeed.
!gdown --id '1KSFIRh0-_Vr7SdiSCZP1ItV7bXPxMD92' --output data.tar.gz
!tar -zxvf data.tar.gz

/bin/bash: gdown: command not found
tar (child): data.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


### Preprocessing
- Read data:

    > Note that `X_train` and `X_test` are already preprocessed with one-hot encoding.

- Normalize data (not performed in this notebook; tree-based models are insensitive to monotonic feature scaling)
- Split to training set and validation set

In [None]:
# Read data from csv files
X_train_fpath = './data/X_train'
Y_train_fpath = './data/Y_train'
X_test_fpath = './data/X_test'
output_fpath = './output_{}.csv'


def _read_csv_columns(fpath, columns):
    '''
    Parse a comma-separated text file with one header row into a float ndarray.
    Inputs:
        fpath: path of the file to read
        columns: int or slice applied to the comma-split fields of every row
    Outputs:
        ndarray of dtype float with one entry (int index) or one row (slice)
        per non-blank data line
    '''
    with open(fpath) as f:
        next(f)  # skip the header row
        # Blank lines (e.g. a trailing newline at end of file) are ignored so
        # they cannot produce ragged rows or an IndexError.
        rows = [line.rstrip('\r\n').split(',')[columns] for line in f if line.strip()]
    return np.array(rows, dtype=float)


# Parse csv files to numpy array.
# Column 0 is dropped everywhere (presumably the row id - confirm against the
# data files); the label file contributes only column 1.
X_train = _read_csv_columns(X_train_fpath, slice(1, None))
Y_train = _read_csv_columns(Y_train_fpath, 1)
X_test = _read_csv_columns(X_test_fpath, slice(1, None))

### Some helper functions

In [None]:
# Helping function
def shuffle(X, Y):
    '''
    Reorder the rows of X and Y with one shared random permutation.
    Inputs:
        X: Nxm ndarray. N data, m features
        Y: ndarray with N entries along axis 0. A 1-D Y is turned into Nx1
    Outputs:
        (X, Y): the permuted Nxm X and the permuted Nx1 Y.
        None is returned (after printing a message) when X and Y disagree on N.
    '''
    n_rows = X.shape[0]
    if n_rows != Y.shape[0]:
        print("Shape of X doesn't match with shape of Y")
        return None

    # Promote a flat label vector to a column so both outputs are 2-D
    Y_col = Y.reshape(-1, 1) if Y.ndim < 2 else Y

    order = np.arange(n_rows)
    np.random.shuffle(order)  # draws from NumPy's global random state
    return X[order, :], Y_col[order, :]

def accuracy(Y_pred_class, Y_truth):
    '''
    Fraction of matching labels for 0/1 class predictions.

    Both inputs are flattened before they are compared. Without that, scoring an
    (N,) prediction vector against an (N, 1) label column makes NumPy broadcast
    the difference to an N x N matrix and silently return a wrong score.

    Inputs:
        Y_pred_class: array-like of predicted labels (0 or 1)
        Y_truth: array-like of true labels (0 or 1)
    Outputs:
        acc: float, 1 - mean(|Y_pred_class - Y_truth|)
    Raises:
        ValueError: if the inputs hold different numbers of labels and neither
                    of them is a single value
    '''
    pred = np.asarray(Y_pred_class, dtype=float).ravel()
    truth = np.asarray(Y_truth, dtype=float).ravel()
    # A single value is still allowed to broadcast against a full vector
    if pred.size != truth.size and 1 not in (pred.size, truth.size):
        raise ValueError(
            "Y_pred_class has {} labels but Y_truth has {}".format(pred.size, truth.size))
    acc = 1 - np.mean(np.abs(pred - truth))
    return acc

def report(results, n_top=5):
    '''
    Print the candidates ranked 1..n_top in a cross-validation result table.
    Inputs:
        results: mapping with 'rank_test_score', 'mean_test_score',
                 'std_test_score' and 'params' entries (e.g. `cv_results_`)
        n_top: number of ranks to print; every candidate tied at a rank is shown
    '''
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            lines = [
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][idx],
                    results['std_test_score'][idx]),
                "Parameters: {0}".format(results['params'][idx]),
                "",  # blank separator line after each candidate
            ]
            print("\n".join(lines))



In [None]:
# Zero-model accuracy: the baseline score obtained by predicting class 0 for
# every training sample (an all-zeros array shaped like Y_train).
print(f"The accuracy of zero-model: {accuracy(np.zeros_like(Y_train), Y_train)}")

The accuracy of zero-model: 0.7944743438513713


### Split the data
Before splitting the data, we shuffle it into a random order. Then we split it into a training set and a validation set.

In [None]:
def split_train_val(X, Y, train_ratio=0.8):
    '''
    Cut (X, Y) into a leading training part and a trailing validation part.
    Inputs:
        X, Y: arrays sliced along axis 0; the number of rows is taken from X
        train_ratio: fraction of rows assigned to the training part (rounded up)
    Outputs:
        X_train, Y_train, X_val, Y_val (in that order)
    '''
    cut = ceil(X.shape[0] * train_ratio)
    X_head, X_tail = X[:cut], X[cut:]
    Y_head, Y_tail = Y[:cut], Y[cut:]
    return X_head, Y_head, X_tail, Y_tail

# Randomise the sample order, then carve the validation rows off the end
X_train, Y_train = shuffle(X_train, Y_train)
X_train_set, Y_train_set, X_val_set, Y_val_set = split_train_val(X_train, Y_train)

# Dataset dimensions, printed as a quick sanity check
train_size, data_dim = X_train_set.shape
val_size = X_val_set.shape[0]
test_size = X_test.shape[0]
for template, value in (('Size of training set: {}', train_size),
                        ('Size of validation set: {}', val_size),
                        ('Size of testing set: {}', test_size),
                        ('Dimension of data: {}', data_dim)):
    print(template.format(value))

Size of training set: 43405
Size of validation set: 10851
Size of testing set: 27622
Dimension of data: 510


In [None]:
# Read the feature names from the header row of the test file; the first
# column name is dropped to line up with the columns kept in X_test.
with open(X_test_fpath) as f:
    header_line = f.readline()
content = header_line.strip('\n').split(',')
features = np.array(content)[1:]

## Decision Tree
We will use `DecisionTreeClassifier` from sklearn `tree` module. There are some main hyperparameters for the classifier. Here we only list the parameters that we will modify.

- `criterion`: By default the Gini impurity is used. We search over both `gini` and `entropy` (information gain).
- `max_depth` : We will use the hyperparameter search below to find the best maximum depth.
- `min_samples_split` : We will use the hyperparameter search below to find the best minimum number of samples required to split a node.
- `min_samples_leaf`: We will use the hyperparameter search below to find the best minimum number of samples required at a leaf.

### Simple Decision Tree classifier

#### Train Decision Tree


In [None]:
# Baseline: a tree with sklearn's default hyperparameters (seeded so the fit is
# repeatable), trained on the training split. `fit` returns the estimator itself.
DTC = DecisionTreeClassifier(random_state=0).fit(X_train_set, Y_train_set)

#### Compute the accuracy from validation set

In [None]:
# Validation accuracy of the default tree, expressed as a percentage
val_accuracy = DTC.score(X_val_set, Y_val_set)
score = 100.0 * val_accuracy
print(f"Decision Tree prediction accuracy ={score:5.1f}%\n")

# Size of the fitted tree
depth, n_leaves = DTC.get_depth(), DTC.get_n_leaves()
print(f"Depth of the tree: {depth}")
print(f"Number of leaves of the tree: {n_leaves}")

Decision Tree prediction accuracy = 83.8%

Depth of the tree: 62
Number of leaves of the tree: 5189


#### Show the feature importance from the trained Decision Tree

In [None]:
# Ten most important features of the default tree, largest first
vals = DTC.feature_importances_
ind = np.argsort(vals)[::-1]

for i in ind[:10]:
    print(f"{features[i]:<36}{100.0 * vals[i]:5.2f}%")

capital gains                       12.31%
dividends from stocks               11.30%
age                                  9.86%
weeks worked in year                 9.61%
num persons worked for employer      2.73%
 Male                                2.68%
 Professional specialty              2.60%
capital losses                       2.13%
 Executive admin and managerial      2.05%
 Female                              1.67%


#### Plot the decision surface

### Search for good hyperparameters
We first execute `RandomizedSearchCV` to quickly spot a promising region of the hyperparameter space, and then use `GridSearchCV` to search that region thoroughly.

#### Random Search
Run a Random Search using 10 fold cross validation

In [None]:
# Random search
# scipy's randint takes integer bounds (low inclusive, high exclusive), so the
# sqrt-based upper limits are rounded up to integers; rounding up keeps the same
# largest value that the float bound allowed. min_samples_split starts at 2
# because sklearn rejects an integer value below 2.
sqrt_n_train = np.sqrt(X_train_set.shape[0])
rand_search_params = {"criterion": ["gini", "entropy"],
                      "max_depth": sp_randint(10, 50),
                      "min_samples_split": sp_randint(2, max(3, ceil(sqrt_n_train))),
                      "min_samples_leaf": sp_randint(2, max(3, ceil(sqrt_n_train / 2)))}
candidates = 50
DTC = DecisionTreeClassifier(random_state=0)


In [None]:
# Run a random search cross validation.
# n_iter is passed by keyword (it is keyword-only in current scikit-learn), and
# random_state makes the sampled candidates repeatable.
random_search = RandomizedSearchCV(DTC, rand_search_params, n_iter=candidates,
                                   cv=10, n_jobs=-1, verbose=5, random_state=0)

random_search.fit(X_train_set, Y_train_set)


Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 15.7min finished


RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=0,
            

In [None]:
# Top-ranked candidates of the random search
report(random_search.cv_results_)

# Accuracy of the refit best candidate on the held-out validation split
val_accuracy = random_search.score(X_val_set, Y_val_set)
score = 100.0 * val_accuracy
print(f"Best Decision Tree from random search has prediction accuracy{score:5.1f}% in validation set\n")

Model with rank: 1
Mean validation score: 0.876 (std: 0.004)
Parameters: {'criterion': 'gini', 'max_depth': 13, 'min_samples_leaf': 7, 'min_samples_split': 118}

Model with rank: 2
Mean validation score: 0.876 (std: 0.003)
Parameters: {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 163}

Model with rank: 3
Mean validation score: 0.875 (std: 0.003)
Parameters: {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 5, 'min_samples_split': 121}

Model with rank: 4
Mean validation score: 0.875 (std: 0.004)
Parameters: {'criterion': 'gini', 'max_depth': 35, 'min_samples_leaf': 8, 'min_samples_split': 118}

Model with rank: 5
Mean validation score: 0.875 (std: 0.005)
Parameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 10, 'min_samples_split': 132}

Best Decision Tree from random search has prediction accuracy 87.0% in validation set



#### Grid Search
Run a Grid Search using 10 fold cross validation based on the random search result.

In [None]:
# Centre of the follow-up grid: the candidate the random search itself selected.
# best_index_ / best_params_ are used instead of argsort on the ranks, whose
# default sort is not stable and may return a different candidate on ties.
best_ind = random_search.best_index_
best_params = random_search.best_params_
best_depth = best_params["max_depth"]
best_min_samples_split = best_params["min_samples_split"]
best_min_samples_leaf = best_params["min_samples_leaf"]

In [None]:
# Grid search
# Probe the neighbours of the random-search optimum. Neighbours below the
# smallest value sklearn accepts (integer min_samples_split < 2,
# min_samples_leaf < 1) are dropped instead of producing failed fits.
split_candidates = (best_min_samples_split - 10, best_min_samples_split, best_min_samples_split + 10)
leaf_candidates = (best_min_samples_leaf - 1, best_min_samples_leaf, best_min_samples_leaf + 1)
grid_params = {"criterion": ["gini", "entropy"],
               "max_depth": [best_depth - 1, best_depth, best_depth + 1],
               "min_samples_split": [s for s in split_candidates if s >= 2],
               "min_samples_leaf": [m for m in leaf_candidates if m >= 1] }

# Run a grid search CV.
grid_search = GridSearchCV(DTC, grid_params, cv=10, n_jobs=-1, verbose=5)

grid_search.fit(X_train_set, Y_train_set)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 13.6min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=0, splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [12, 13, 14],
                 

In [None]:
# Top-ranked candidates of the grid search
report(grid_search.cv_results_)

# Accuracy of the refit best candidate on the held-out validation split
val_accuracy = grid_search.score(X_val_set, Y_val_set)
score = 100.0 * val_accuracy
print(f"Best Decision Tree from grid search has prediction accuracy ={score:5.1f}%\n")

Model with rank: 1
Mean validation score: 0.877 (std: 0.004)
Parameters: {'criterion': 'gini', 'max_depth': 14, 'min_samples_leaf': 8, 'min_samples_split': 128}

Model with rank: 2
Mean validation score: 0.876 (std: 0.004)
Parameters: {'criterion': 'gini', 'max_depth': 14, 'min_samples_leaf': 8, 'min_samples_split': 118}

Model with rank: 3
Mean validation score: 0.876 (std: 0.003)
Parameters: {'criterion': 'gini', 'max_depth': 14, 'min_samples_leaf': 7, 'min_samples_split': 128}

Model with rank: 4
Mean validation score: 0.876 (std: 0.004)
Parameters: {'criterion': 'gini', 'max_depth': 14, 'min_samples_leaf': 7, 'min_samples_split': 118}

Model with rank: 5
Mean validation score: 0.876 (std: 0.004)
Parameters: {'criterion': 'gini', 'max_depth': 14, 'min_samples_leaf': 7, 'min_samples_split': 108}

Best Decision Tree from grid search has prediction accuracy = 87.2%



### Train the best Decision Tree classifier with whole training set

In [None]:
# Build a fresh tree from the winning hyperparameters. Refitting
# grid_search.best_estimator_ in place would train the search object's model on
# the validation rows, so re-running grid_search.score(X_val_set, ...) afterwards
# would report an inflated validation accuracy.
best_DTC = DecisionTreeClassifier(random_state=0, **grid_search.best_params_)
best_DTC.fit(X_train, Y_train)
# The score is computed on the data the tree was fitted on, so it is a training
# accuracy and not a held-out estimate.
score = 100.0 * best_DTC.score(X_train, Y_train)
print(f"Best Decision Tree from grid search has training accuracy ={score:5.1f}%\n")

Best Decision Tree from grid search has prediction accuracy = 88.7%



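We can see that the performance improves after the Decision Tree is fine-tuned (87.2% vs. 83.8% on the validation set). Note that the 88.7% above is measured on the same data the tree was fitted on, so it is an optimistic estimate rather than a held-out score.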

### Show the feature importance

In [None]:
# Ten most important features of the tuned tree, largest first
vals = best_DTC.feature_importances_
ind = np.argsort(vals)[::-1]

for i in ind[:10]:
    print(f"{features[i]:<36}{100.0 * vals[i]:5.2f}%")

capital gains                       23.47%
dividends from stocks               17.62%
weeks worked in year                17.40%
age                                  6.20%
 Male                                5.07%
 Professional specialty              4.92%
 Executive admin and managerial      3.99%
 Female                              3.38%
capital losses                       2.53%
num persons worked for employer      1.62%


#### Compare with permutation importance

In [None]:
# Permutation importance of the tuned tree, evaluated on the full training data
# with 30 shuffles per feature and a fixed seed
perm_kwargs = dict(n_repeats=30, n_jobs=-1, random_state=0)
r = permutation_importance(best_DTC, X_train, Y_train, **perm_kwargs)

In [None]:
# Rank features by mean permutation importance, largest first
mean_imp, std_imp = r.importances_mean, r.importances_std
ind = mean_imp.argsort()[::-1]

for i in ind[:10]:
    # Skip features whose importance is not more than two std above zero
    if mean_imp[i] - 2 * std_imp[i] <= 0:
        continue
    print(f"{features[i]:<50}{mean_imp[i]:.3f} +/- {std_imp[i]:.3f}")

capital gains                                     0.032 +/- 0.001
weeks worked in year                              0.031 +/- 0.001
dividends from stocks                             0.022 +/- 0.001
age                                               0.021 +/- 0.001
 Professional specialty                           0.015 +/- 0.000
 Male                                             0.014 +/- 0.000
 Female                                           0.012 +/- 0.001
 Executive admin and managerial                   0.012 +/- 0.000
capital losses                                    0.007 +/- 0.000
num persons worked for employer                   0.006 +/- 0.000


### Predict testing data

In [None]:
# Final tree with fixed hyperparameters, fitted on the whole training data
final_tree_params = dict(criterion='gini',
                         max_depth=14,
                         min_samples_leaf=8,
                         min_samples_split=128,
                         random_state=0)
best_DTC = DecisionTreeClassifier(**final_tree_params)
best_DTC.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=14, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=128,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [None]:
# Write the test-set predictions as an `id,label` csv, one row per test sample
predictions = best_DTC.predict(X_test)
with open(output_fpath.format('decision_tree'), 'w') as f:
    f.write('id,label\n')
    f.writelines('{},{}\n'.format(i, int(label)) for i, label in enumerate(predictions))

The testing result in https://www.kaggle.com/c/ml2020spring-hw2/data shows the accuracy = 0.88675

## Random Forest
We further extend to Random Forest.


### Search for good hyperparameters
We first execute `RandomizedSearchCV` to quickly spot a promising region of the hyperparameter space, and then use `GridSearchCV` to search that region thoroughly.

#### Random Search
Run a Random Search using 5 fold cross validation

In [None]:
# Random search
# scipy's randint takes integer bounds (low inclusive, high exclusive), so the
# bounds derived from the data shape are computed as integers. Rounding the
# sqrt-based limit up keeps the same largest value the float bound allowed.
n_train, n_features = X_train_set.shape
rand_search_params = {"max_depth": sp_randint(50, 100),
                      "max_features": sp_randint(n_features // 2, n_features),
                      "min_samples_split": sp_randint(2, max(3, ceil(np.sqrt(n_train) / 2))),
                      "n_estimators": sp_randint(20, 100)}
candidates = 50
RF = RandomForestClassifier(criterion = "gini", n_jobs = -1, random_state=0)


In [None]:
# Run a random search cross validation.
# n_iter is passed by keyword (it is keyword-only in current scikit-learn), and
# random_state makes the sampled candidates repeatable.
random_search = RandomizedSearchCV(RF, rand_search_params, n_iter=candidates,
                                   cv=5, n_jobs=-1, verbose=5, random_state=0)

random_search.fit(X_train_set, Y_train_set.reshape(-1, ))


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 36.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 77.7min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 130.0min finished


RandomizedSearchCV(cv=5,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f89c547cb38>,
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f89c54a7518>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f89c54830b8>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f89c5483b00>},
                   verbose=5)

In [None]:
# Top-ranked candidates of the random search
report(random_search.cv_results_)

# Accuracy of the refit best candidate on the held-out validation split
val_accuracy = random_search.score(X_val_set, Y_val_set.reshape(-1,))
score = 100.0 * val_accuracy
print(f"Best Random Forest from random search has prediction accuracy{score:5.1f}% in validation set\n")

Model with rank: 1
Mean validation score: 0.889 (std: 0.001)
Parameters: {'max_depth': 59, 'max_features': 266, 'min_samples_split': 18, 'n_estimators': 80}

Model with rank: 2
Mean validation score: 0.888 (std: 0.002)
Parameters: {'max_depth': 79, 'max_features': 357, 'min_samples_split': 13, 'n_estimators': 98}

Model with rank: 3
Mean validation score: 0.888 (std: 0.002)
Parameters: {'max_depth': 61, 'max_features': 280, 'min_samples_split': 13, 'n_estimators': 54}

Model with rank: 4
Mean validation score: 0.887 (std: 0.001)
Parameters: {'max_depth': 71, 'max_features': 333, 'min_samples_split': 67, 'n_estimators': 96}

Model with rank: 5
Mean validation score: 0.887 (std: 0.001)
Parameters: {'max_depth': 92, 'max_features': 329, 'min_samples_split': 69, 'n_estimators': 80}

Best Random Forest from random search has prediction accuracy 88.6% in validation set



#### Grid Search
Run a Grid Search using 5 fold cross validation

##### Using the parameters found in the random search to do grid search

In [None]:
# Centre of the follow-up grid: the candidate the random search itself selected.
# best_index_ / best_params_ are used instead of argsort on the ranks, whose
# default sort is not stable and may return a different candidate on ties.
best_ind = random_search.best_index_
best_params = random_search.best_params_
best_depth = best_params["max_depth"]
best_min_samples_split = best_params["min_samples_split"]
best_max_features = best_params["max_features"]
best_n_estimators = best_params["n_estimators"]

In [None]:
# Grid search
# Probe a slightly smaller value of each hyperparameter next to the
# random-search optimum. A lowered min_samples_split below 2 is dropped because
# sklearn rejects integer values under 2.
split_candidates = (best_min_samples_split - 10, best_min_samples_split)
grid_params = {"max_depth": [best_depth - 5, best_depth],
               "min_samples_split": [s for s in split_candidates if s >= 2],
               "n_estimators": [best_n_estimators - 5, best_n_estimators],
               "max_features": [best_max_features - 10, best_max_features] }

RF = RandomForestClassifier(criterion = "gini", n_jobs = -1, random_state=0)

# Run a grid search CV.
grid_search_RF = GridSearchCV(RF, grid_params, cv=5, n_jobs=-1, verbose=10)

# Labels are flattened to 1-D, as in the random search above; passing the
# (N, 1) column makes the forest emit a data-conversion warning.
grid_search_RF.fit(X_train_set, Y_train_set.reshape(-1, ))

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 23.6min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 27.9min
[Parallel(n_jobs=-1)]: Done  74 out of  80 | elapsed: 39.1min remaining:  3.2min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 39.5min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
             n_jobs=-1,
             param_grid={'max_depth': [54, 59], 'max_features': [256, 266],
                         'min_samples_split': [8, 18],
                         'n_estimators': [75, 80]},
             verbose=10)

In [None]:
# Show the grid search result
report(grid_search_RF.cv_results_)

# Accuracy of the refit best forest on the held-out validation split.
# The message names the Random Forest (it previously said "Decision Tree").
score = 100.0 * grid_search_RF.score(X_val_set, Y_val_set.reshape(-1, ))
print(f"Best Random Forest from grid search has prediction accuracy ={score:5.1f}%\n")

Model with rank: 1
Mean validation score: 0.889 (std: 0.002)
Parameters: {'max_depth': 59, 'max_features': 256, 'min_samples_split': 18, 'n_estimators': 80}

Model with rank: 2
Mean validation score: 0.889 (std: 0.003)
Parameters: {'max_depth': 54, 'max_features': 256, 'min_samples_split': 18, 'n_estimators': 75}

Model with rank: 3
Mean validation score: 0.889 (std: 0.001)
Parameters: {'max_depth': 54, 'max_features': 266, 'min_samples_split': 18, 'n_estimators': 80}

Model with rank: 4
Mean validation score: 0.889 (std: 0.002)
Parameters: {'max_depth': 54, 'max_features': 256, 'min_samples_split': 18, 'n_estimators': 80}

Model with rank: 5
Mean validation score: 0.889 (std: 0.002)
Parameters: {'max_depth': 59, 'max_features': 256, 'min_samples_split': 18, 'n_estimators': 75}

Best Decision Tree from grid search has prediction accuracy = 88.6%



### Train the best Random Forest classifier with whole training set

In [None]:
# Build the final forest from the hyperparameters the grid search selected,
# rather than from hand-copied values that can drift from the search result.
best_RF = RandomForestClassifier(criterion = "gini",
                                 n_jobs = -1,
                                 random_state = 0,
                                 **grid_search_RF.best_params_)
best_RF.fit(X_train, Y_train.reshape(-1, ))
# The score is computed on the data the forest was fitted on, so it is a
# training accuracy and not a held-out estimate.
score = 100.0 * best_RF.score(X_train, Y_train.reshape(-1, ))
print(f"Best Random Forest trained with whole training set has training accuracy ={score:5.1f}%\n")

Best Random Forest from grid search has prediction accuracy = 95.3%



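We can see an improvement compared to the Decision Tree (88.6% vs. 87.2% on the validation set). Note that the 95.3% above is measured on the same data the forest was fitted on, so it overstates the real gain.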

### Show the feature importance

In [None]:
# Ten most important features of the tuned forest, largest first
vals = best_RF.feature_importances_
ind = np.argsort(vals)[::-1]

for i in ind[:10]:
    print(f"{features[i]:<36}{100.0 * vals[i]:5.2f}%")

dividends from stocks               13.54%
weeks worked in year                11.71%
capital gains                       11.45%
age                                  6.57%
 Professional specialty              3.90%
 Executive admin and managerial      3.49%
 Male                                3.24%
 Female                              3.19%
capital losses                       2.42%
num persons worked for employer      1.92%


### Predict testing data

In [None]:
# Write the test-set predictions as an `id,label` csv, one row per test sample
predictions = best_RF.predict(X_test)
with open(output_fpath.format('random_forest'), 'w') as f:
    f.write('id,label\n')
    f.writelines('{},{}\n'.format(i, int(label)) for i, label in enumerate(predictions))

The testing result in https://www.kaggle.com/c/ml2020spring-hw2/data shows the accuracy = 0.89790

## Gradient Boosted Decision Tree (GBDT)
Build a GBDT with XGBoost.

In [None]:
# Base XGBoost classifier shared by the hyperparameter searches below
xgb_base_kwargs = dict(n_estimators=100, learning_rate=0.1, n_jobs=-1)
xgb_clf = xgb.XGBClassifier(**xgb_base_kwargs)

### Search for good hyperparameters
We first execute `RandomizedSearchCV` to quickly spot a promising region of the hyperparameter space, and then use `GridSearchCV` to search that region thoroughly.

#### Random Search
Run a Random Search using 5 fold cross validation

In [None]:
# Search space for the GBDT: integer tree depth plus the row and column
# sampling fractions, each drawn from uniform(0, 1)
rand_search_params = dict(max_depth=sp_randint(4, 10),
                          subsample=uniform(0, 1),
                          colsample_bytree=uniform(0, 1))

candidates = 50

In [None]:
# Run a random search cross validation.
# n_iter is passed by keyword (it is keyword-only in current scikit-learn), and
# random_state makes the sampled candidates repeatable.
random_search_xgb = RandomizedSearchCV(xgb_clf, rand_search_params, n_iter=candidates,
                                       cv = 5, n_jobs = -1, verbose=5, random_state=0)

random_search_xgb.fit(X_train_set, Y_train_set.reshape(-1, ))



Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 24.4min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 40.1min finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=0.1,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n...
                                           subsample=None, tree_method=None,
                                           validate_parameters=None,
                                  

In [None]:
# Top-ranked candidates of the random search
report(random_search_xgb.cv_results_)

# Accuracy of the refit best candidate on the held-out validation split
val_accuracy = random_search_xgb.score(X_val_set, Y_val_set.reshape(-1,))
score = 100.0 * val_accuracy
print(f"Best GBDT from random search has prediction accuracy{score:5.1f}% in validation set\n")

Model with rank: 1
Mean validation score: 0.895 (std: 0.003)
Parameters: {'colsample_bytree': 0.45460082307426564, 'max_depth': 8, 'subsample': 0.7157885779916943}

Model with rank: 2
Mean validation score: 0.894 (std: 0.002)
Parameters: {'colsample_bytree': 0.7162063928708141, 'max_depth': 9, 'subsample': 0.5788796158396107}

Model with rank: 3
Mean validation score: 0.894 (std: 0.002)
Parameters: {'colsample_bytree': 0.6502535610172493, 'max_depth': 8, 'subsample': 0.8290062940712538}

Model with rank: 4
Mean validation score: 0.894 (std: 0.002)
Parameters: {'colsample_bytree': 0.5977346884057299, 'max_depth': 7, 'subsample': 0.7035756863210445}

Model with rank: 5
Mean validation score: 0.894 (std: 0.002)
Parameters: {'colsample_bytree': 0.6546980081519742, 'max_depth': 9, 'subsample': 0.31963642271839987}

Best GBDT from random search has prediction accuracy 89.1% in validation set



#### Grid Search
Run a Grid Search using 5 fold cross validation

##### Using the parameters found in the random search to do grid search

In [None]:
# Centre of the follow-up grid: the candidate the random search itself selected.
# best_index_ / best_params_ are used instead of argsort on the ranks, whose
# default sort is not stable and may return a different candidate on ties.
best_ind = random_search_xgb.best_index_
best_params = random_search_xgb.best_params_
best_depth = best_params["max_depth"]
best_subsample = best_params["subsample"]
best_colsample_bytree = best_params["colsample_bytree"]

In [None]:
# Grid search
# Probe a slightly smaller value of each hyperparameter next to the
# random-search optimum. XGBoost requires subsample and colsample_bytree to lie
# in (0, 1], so a lowered fraction that is not strictly positive is dropped.
subsample_candidates = (best_subsample - 0.1, best_subsample)
colsample_candidates = (best_colsample_bytree - 0.1, best_colsample_bytree)
grid_params_xgb = {"max_depth": [best_depth - 1, best_depth],
                   "subsample": [s for s in subsample_candidates if s > 0],
                   "colsample_bytree": [c for c in colsample_candidates if c > 0] }

xgb_clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, n_jobs = -1)

# Run a grid search CV.
grid_search_xgb = GridSearchCV(xgb_clf, grid_params_xgb, cv=5, n_jobs=-1, verbose=10)

grid_search_xgb.fit(X_train_set, Y_train_set.reshape(-1,))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  30 out of  40 | elapsed:  5.4min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  35 out of  40 | elapsed:  6.7min remaining:   57.8s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  7.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  7.2min finished


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=-1,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, va

In [None]:
# Top-ranked candidates of the grid search
report(grid_search_xgb.cv_results_)

# Accuracy of the refit best candidate on the held-out validation split
val_accuracy = grid_search_xgb.score(X_val_set, Y_val_set)
score = 100.0 * val_accuracy
print(f"Best GBDT from grid search has prediction accuracy ={score:5.1f}%\n")

Model with rank: 1
Mean validation score: 0.895 (std: 0.003)
Parameters: {'colsample_bytree': 0.45460082307426564, 'max_depth': 8, 'subsample': 0.7157885779916943}

Model with rank: 2
Mean validation score: 0.894 (std: 0.003)
Parameters: {'colsample_bytree': 0.35460082307426566, 'max_depth': 8, 'subsample': 0.6157885779916943}

Model with rank: 3
Mean validation score: 0.894 (std: 0.003)
Parameters: {'colsample_bytree': 0.35460082307426566, 'max_depth': 8, 'subsample': 0.7157885779916943}

Model with rank: 4
Mean validation score: 0.894 (std: 0.002)
Parameters: {'colsample_bytree': 0.45460082307426564, 'max_depth': 8, 'subsample': 0.6157885779916943}

Model with rank: 5
Mean validation score: 0.894 (std: 0.002)
Parameters: {'colsample_bytree': 0.35460082307426566, 'max_depth': 7, 'subsample': 0.7157885779916943}

Best GBDT from grid search has prediction accuracy = 89.1%



### Train the best GBDT classifier with whole training set

In [None]:
# Hyperparameters of the candidate the grid search itself selected.
# best_index_ / best_params_ are used instead of argsort on the ranks, whose
# default sort is not stable and may return a different candidate on ties.
best_ind = grid_search_xgb.best_index_
best_params = grid_search_xgb.best_params_
best_depth = best_params["max_depth"]
best_subsample = best_params["subsample"]
best_colsample_bytree = best_params["colsample_bytree"]

In [None]:
# Build the final GBDT straight from the grid-search result. The best_depth /
# best_subsample globals are reassigned by several sections of this notebook,
# so reading them here could silently pick up another model's values.
best_xgb_clf = xgb.XGBClassifier(n_estimators=100,
                                 learning_rate=0.1,
                                 n_jobs = -1,
                                 **grid_search_xgb.best_params_)

best_xgb_clf.fit(X_train, Y_train.reshape(-1, ))
# The score is computed on the data the model was fitted on, so it is a
# training accuracy and not a held-out estimate.
score = 100.0 * best_xgb_clf.score(X_train, Y_train.reshape(-1, ))
print(f"Best GBDT trained with whole training set has training accuracy ={score:5.1f}%\n")

Best GBDT trained with whole training set has prediction accuracy = 91.0%



### Show the feature importance

In [None]:
# Ten most important features of the tuned GBDT, largest first
vals = best_xgb_clf.feature_importances_
ind = np.argsort(vals)[::-1]

for i in ind[:10]:
    print(f"{features[i]:<36}{100.0 * vals[i]:5.2f}%")

 2                                   2.94%
 Male                                2.63%
 Professional specialty              2.59%
 Child under 18 never married        2.43%
 Nonfiler                            2.33%
weeks worked in year                 2.19%
capital gains                        1.97%
 Not in universe or children         1.72%
 0                                   1.58%
 Executive admin and managerial      1.48%


### Predict testing data

In [None]:
# Write the test-set predictions as an `id,label` csv, one row per test sample
predictions = best_xgb_clf.predict(X_test)
with open(output_fpath.format('XGBDT'), 'w') as f:
    f.write('id,label\n')
    f.writelines('{},{}\n'.format(i, int(label)) for i, label in enumerate(predictions))

The testing result in https://www.kaggle.com/c/ml2020spring-hw2/data shows the accuracy = 0.90297