# Hyperparameter Tuning

In this notebook, we apply hyperparameter tuning to different estimators.

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from src.pipelines import build_pipeline
from src.pipelines import pipeline_utils
from src.pipelines import pipeline_cleaning
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer, matthews_corrcoef
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real

# disable warnings globally
import warnings
warnings.filterwarnings("ignore")

## LGBM

In [2]:
# define the test steps for this notebook
def add_test_steps(custom_pipeline: build_pipeline.CustomPipeline):    
    outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0, zscore_threshold=4)
    pipeline_utils.add_outlier_handling(
            custom_pipeline=custom_pipeline,
            outlier_handling_func=outlier_remover.handle_outliers
        )
    
    # additional feature selection by removing certain columns
    pipeline_utils.add_remove_feature_transformer(custom_pipeline, ['age'])
    
    # discretize numerical features
    pipeline_utils.add_kbinsdiscretizer(custom_pipeline, number_of_bins=2)

    # add encoder and scaler
    pipeline_utils.add_binary_encoder_and_minmaxscaler(custom_pipeline)

    # add estimator
    #apply_knn_classifier(custom_pipeline, 9)
    pipeline_utils.apply_lgbm_classifier(custom_pipeline)

Try out GridSearchCV

In [4]:
# Build pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True, 
    skip_storing_prediction=True, 
    use_validation_set=True,
    use_cross_validation=False)
add_test_steps(pipe)

loading data


In [23]:
# run the pipeline once to have the static steps done
pipe.run()

preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.7179 [std=0.]
    validation_f1-score: 0.6461 [std=0.]
    validation_mcc: 0.4692 [std=0.]


In [28]:
param_grid = {
    'estimator__n_estimators': [100, 200, 300, 400, 500],
    'estimator__learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
    'estimator__max_depth': [-1, 2, 3, 4, 5, 6, 7, 8, 9]
}

# Perform grid search cross-validation
grid_search = GridSearchCV(
    estimator=pipe.get_pipeline(),
    param_grid=param_grid,
    scoring=make_scorer(matthews_corrcoef),
    cv=2,
    n_jobs=-1,
    verbose=10
)

# Fit the grid search to the training data
grid_search.fit(pipe.X_train, pipe.y_train)

Fitting 2 folds for each of 225 candidates, totalling 450 fits


In [30]:
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_model.predict(pipe.X_val)

# Print the best MCC score
best_score = grid_search.best_score_
print("Best MCC score:", best_score)

# Print the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Best MCC score: 0.5007491382398774
Best parameters: {'estimator__learning_rate': 0.3, 'estimator__max_depth': -1, 'estimator__n_estimators': 400}


Let us get the cross validated performances of the tuned and not tuned models.

In [3]:
# Build pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True, 
    skip_storing_prediction=True, 
    use_validation_set=True,
    use_cross_validation=True)
add_test_steps(pipe)

pipe.run()

loading data
preparing data
running pipeline
evaluating pipeline
    fit_time: 13.5341 [std=1.2449]
    score_time: 1.2355 [std=0.1053]
    cv_accuracy: 0.712 [std=0.0029]
    cv_f1-score: 0.6413 [std=0.003]
    cv_mcc: 0.457 [std=0.0056]
    validation_accuracy: 0.7099 [std=0.]
    validation_f1-score: 0.6359 [std=0.]
    validation_mcc: 0.452 [std=0.]


In [4]:
# Build pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True, 
    skip_storing_prediction=True, 
    use_validation_set=True,
    use_cross_validation=True)
add_test_steps(pipe)
pipeline_utils.apply_partly_tuned_lgbm_classifier(pipe)

pipe.run()

loading data
preparing data
running pipeline
evaluating pipeline
    fit_time: 22.4414 [std=1.3748]
    score_time: 2.254 [std=0.2994]
    cv_accuracy: 0.7378 [std=0.0019]
    cv_f1-score: 0.6829 [std=0.0024]
    cv_mcc: 0.5124 [std=0.0035]
    validation_accuracy: 0.7423 [std=0.]
    validation_f1-score: 0.6899 [std=0.]
    validation_mcc: 0.52 [std=0.]


In [5]:
# Build pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True, 
    skip_storing_prediction=True, 
    use_validation_set=True,
    use_cross_validation=True)
add_test_steps(pipe)
pipeline_utils.apply_tuned_lgbm_classifier(pipe)

pipe.run()

loading data
preparing data
running pipeline
evaluating pipeline
    fit_time: 23.383 [std=1.52]
    score_time: 2.3453 [std=0.2771]
    cv_accuracy: 0.7395 [std=0.0011]
    cv_f1-score: 0.6875 [std=0.0017]
    cv_mcc: 0.5164 [std=0.0021]
    validation_accuracy: 0.7415 [std=0.]
    validation_f1-score: 0.6891 [std=0.]
    validation_mcc: 0.5199 [std=0.]


## KNN

Try out GridSearchCV for KNN:

In [2]:
# define the test steps for this notebook
def add_test_steps(custom_pipeline: build_pipeline.CustomPipeline):    
    outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0, zscore_threshold=4)
    pipeline_utils.add_outlier_handling(
            custom_pipeline=custom_pipeline,
            outlier_handling_func=outlier_remover.handle_outliers
        )
    
    # additional feature selection by removing certain columns
    pipeline_utils.add_remove_feature_transformer(custom_pipeline, ['age', 'position'])

    # discretize numerical features
    pipeline_utils.add_kbinsdiscretizer(custom_pipeline, number_of_bins=2)

    # add encoder and scaler
    pipeline_utils.add_binary_encoder_and_minmaxscaler(custom_pipeline)
    # add estimator
    pipeline_utils.apply_knn_classifier(custom_pipeline, 9, 'uniform', 1)
    #pipeline_utils.apply_lgbm_classifier(custom_pipeline)

In [3]:
# Build pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True, 
    skip_storing_prediction=True, 
    use_validation_set=True,
    use_cross_validation=False)
add_test_steps(pipe)

loading data


In [4]:
# run the pipeline once to have the static steps done
pipe.run()

preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.7323 [std=0.]
    validation_f1-score: 0.6802 [std=0.]
    validation_mcc: 0.5065 [std=0.]


In [5]:
param_grid = {
    "estimator__n_neighbors": [7, 9, 12],
    "estimator__weights": ["uniform", "distance"],
    "estimator__p": [
        1,
        2,
    ],
}


# Perform grid search cross-validation
grid_search = GridSearchCV(
    estimator=pipe.get_pipeline(),
    param_grid=param_grid,
    scoring=make_scorer(matthews_corrcoef),
    cv=2,
    n_jobs=-1,
    verbose=10,
)

# Fit the grid search to the training data
grid_search.fit(pipe.X_train, pipe.y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


In [6]:
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_model.predict(pipe.X_val)

# Print the best MCC score
best_score = grid_search.best_score_
print("Best MCC score:", best_score)

# Print the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Best MCC score: 0.4851473098350467
Best parameters: {'estimator__n_neighbors': 9, 'estimator__p': 1, 'estimator__weights': 'uniform'}


GridSearch for KNN after removing position and age doesn't lead to improvement regarding MCC. MCC dropped a little bit to 0.48

Let us get the cross validated performances of the tuned and not tuned models.

In [3]:
# Build pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True, 
    skip_storing_prediction=True, 
    use_validation_set=True,
    use_cross_validation=True)
add_test_steps(pipe)
pipeline_utils.apply_default_knn_classifier(pipe)

pipe.run()

loading data
preparing data
running pipeline
evaluating pipeline
    fit_time: 7.5837 [std=0.6334]
    score_time: 41.2043 [std=4.29]
    cv_accuracy: 0.7179 [std=0.0018]
    cv_f1-score: 0.6687 [std=0.0025]
    cv_mcc: 0.4833 [std=0.0034]
    validation_accuracy: 0.7156 [std=0.]
    validation_f1-score: 0.6689 [std=0.]
    validation_mcc: 0.4792 [std=0.]


In [4]:
# Build pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True, 
    skip_storing_prediction=True, 
    use_validation_set=True,
    use_cross_validation=True)
add_test_steps(pipe)
pipeline_utils.apply_tuned_knn_classifier(pipe)

pipe.run()

loading data
preparing data
running pipeline
evaluating pipeline
    fit_time: 8.2333 [std=1.0919]
    score_time: 242.5973 [std=3.3116]
    cv_accuracy: 0.7264 [std=0.0009]
    cv_f1-score: 0.6746 [std=0.0017]
    cv_mcc: 0.4954 [std=0.0018]
    validation_accuracy: 0.7273 [std=0.]
    validation_f1-score: 0.6746 [std=0.]
    validation_mcc: 0.4968 [std=0.]


## RandomForestClassifier

In this section, we will run a hyperparameter tuning with a RandomForestClassifier as estimator.

In [5]:
# define the test steps for this notebook
def add_test_steps(custom_pipeline: build_pipeline.CustomPipeline):    
    outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0, zscore_threshold=4)
    pipeline_utils.add_outlier_handling(
            custom_pipeline=custom_pipeline,
            outlier_handling_func=outlier_remover.handle_outliers
        )
    
    # additional feature selection by removing certain columns
    pipeline_utils.add_remove_feature_transformer(custom_pipeline, ['age', 'position'])

    # discretize numerical features
    pipeline_utils.add_kbinsdiscretizer(custom_pipeline, number_of_bins=2)

    # add encoder and scaler
    pipeline_utils.add_binary_encoder_and_minmaxscaler(custom_pipeline)
    # add estimator
    pipeline_utils.apply_randomforest_classifier(custom_pipeline)

In [6]:
# Build pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True, 
    skip_storing_prediction=True, 
    use_validation_set=True,
    use_cross_validation=False)
add_test_steps(pipe)

loading data


In [13]:
# run the pipeline once to have the static steps done
pipe.run()

preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.7303 [std=0.]
    validation_f1-score: 0.6794 [std=0.]
    validation_mcc: 0.5028 [std=0.]


In [14]:
#param_grid = {
#    'estimator__bootstrap': [True, False], # default=True
#    'estimator__max_depth': [5, 8, 13, 21, 34, None], # default=None
#    'estimator__max_features': ['auto', 'sqrt'], # default=”sqrt”
#    'estimator__min_samples_leaf': [1, 2, 4], # default=1
#    'estimator__min_samples_split': [2, 5, 10], # default=2
#    'estimator__n_estimators': [25, 50, 100, 200, 300, 400, 500], # default=100
#}

param_grid = {
    'estimator__bootstrap': Categorical([True, False]), # default=True
    'estimator__max_depth': Integer(2, 50), # default=None
    'estimator__max_features': Categorical(['auto', 'sqrt']), # default=”sqrt”
    'estimator__min_samples_leaf': Integer(1, 10), # default=1
    'estimator__min_samples_split': Integer(2, 10), # default=2
    'estimator__n_estimators': Integer(25, 500), # default=100
}

# Perform grid search cross-validation
bayes_search = BayesSearchCV(
    estimator=pipe.get_pipeline(),
    search_spaces=param_grid,
    scoring=make_scorer(matthews_corrcoef),
    cv=2,
    n_jobs=-1,
    verbose=10,
    n_iter=400,
    n_points=8
)

# Fit the grid search to the training data
bayes_search.fit(pipe.X_train, pipe.y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


In [None]:
# Get the best model from the grid search
best_model = bayes_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_model.predict(pipe.X_val)

# Print the best MCC score
best_score = bayes_search.best_score_
print("Best MCC score:", best_score)

# Print the best parameters
best_params = bayes_search.best_params_
print("Best parameters:", best_params)

#### Calculated on Server Infrastructure

Best MCC score: 0.5021965928227141

Best parameters: OrderedDict([('estimator__bootstrap', False), ('estimator__max_depth', 25), ('estimator__max_features', 'auto'), ('estimator__min_samples_leaf', 4), ('estimator__min_samples_split', 10), ('estimator__n_estimators', 500)])

Let us get the cross validated performances of the tuned and not tuned models.

In [7]:
# Build pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True, 
    skip_storing_prediction=True, 
    use_validation_set=True,
    use_cross_validation=True)
add_test_steps(pipe)

pipe.run()

loading data
preparing data
running pipeline
evaluating pipeline
    fit_time: 59.128 [std=3.579]
    score_time: 3.3441 [std=0.4279]
    cv_accuracy: 0.7223 [std=0.0012]
    cv_f1-score: 0.6684 [std=0.0021]
    cv_mcc: 0.4868 [std=0.0022]
    validation_accuracy: 0.7259 [std=0.]
    validation_f1-score: 0.6755 [std=0.]
    validation_mcc: 0.4947 [std=0.]


In [8]:
# Build pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True, 
    skip_storing_prediction=True, 
    use_validation_set=True,
    use_cross_validation=True)
add_test_steps(pipe)
pipeline_utils.apply_tuned_randomforest_classifier(pipe)

pipe.run()

loading data
preparing data
running pipeline
evaluating pipeline
    fit_time: 258.14 [std=27.7921]
    score_time: 11.9446 [std=3.0933]
    cv_accuracy: 0.7405 [std=0.0011]
    cv_f1-score: 0.6802 [std=0.0011]
    cv_mcc: 0.5147 [std=0.002]
    validation_accuracy: 0.7456 [std=0.]
    validation_f1-score: 0.6861 [std=0.]
    validation_mcc: 0.5247 [std=0.]
