# Hyperparameter Tuning

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from src.pipelines import build_pipeline
from src.pipelines import pipeline_utils
from src.pipelines import pipeline_cleaning
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer, matthews_corrcoef

# Disable warnings globally: with no category or message given, the "ignore"
# filter applies to every warning raised in this kernel process.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — remove the filter when debugging unexpected results.
import warnings
warnings.filterwarnings("ignore")

# define the test steps for this notebook (LightGBM variant)
def add_test_steps(custom_pipeline: build_pipeline.CustomPipeline):
    """Register the experiment steps of the LightGBM run on ``custom_pipeline``.

    The ``pipeline_utils`` helpers are called in this order:
    ``add_outlier_handling`` (with an ``OutlierRemover`` configured with
    ``cat_threshold=0`` and ``zscore_threshold=4``),
    ``add_remove_feature_transformer`` (for the ``age`` column),
    ``add_kbinsdiscretizer`` (``number_of_bins=2``),
    ``add_binary_encoder_and_minmaxscaler`` and ``apply_lgbm_classifier``.

    Args:
        custom_pipeline: the pipeline object handed to every helper.
            Presumably it is modified in place — nothing is returned here.
    """
    columns_to_drop = ['age']
    remover = pipeline_cleaning.OutlierRemover(cat_threshold=0, zscore_threshold=4)

    # outlier handling is delegated to the remover's bound method
    pipeline_utils.add_outlier_handling(
        custom_pipeline=custom_pipeline,
        outlier_handling_func=remover.handle_outliers,
    )

    # additional feature selection, then discretization of numerical features
    pipeline_utils.add_remove_feature_transformer(custom_pipeline, columns_to_drop)
    pipeline_utils.add_kbinsdiscretizer(custom_pipeline, number_of_bins=2)

    # encoder and scaler, followed by the estimator
    pipeline_utils.add_binary_encoder_and_minmaxscaler(custom_pipeline)
    pipeline_utils.apply_lgbm_classifier(custom_pipeline)

Try out GridSearchCV for the LightGBM (LGBM) classifier:

In [4]:
# Build pipeline: collect the constructor flags in one place, then unpack them.
# Evaluation is configured to use a validation set instead of cross-validation.
pipeline_options = {
    "skip_storing_cleaning": True,
    "skip_storing_prediction": True,
    "use_validation_set": True,
    "use_cross_validation": False,
}
pipe = build_pipeline.CustomPipeline(**pipeline_options)

# register the notebook-specific steps on the freshly built pipeline
add_test_steps(pipe)

loading data


In [23]:
# run the pipeline once to have the static steps done
# The printed validation accuracy / F1 / MCC belong to the untuned pipeline
# and serve as the baseline for the grid search below.
# NOTE(review): presumably this run also populates pipe.X_train / pipe.y_train /
# pipe.X_val, which the following cells read — confirm in CustomPipeline.
pipe.run()

preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.7179 [std=0.]
    validation_f1-score: 0.6461 [std=0.]
    validation_mcc: 0.4692 [std=0.]


In [28]:
# Search space for the pipeline's "estimator" step:
# 5 x 5 x 9 = 225 parameter combinations
param_grid = dict(
    estimator__n_estimators=list(range(100, 501, 100)),
    estimator__learning_rate=[0.1, 0.2, 0.3, 0.4, 0.5],
    estimator__max_depth=[-1, *range(2, 10)],
)

# Candidates are ranked by the Matthews correlation coefficient
mcc_scorer = make_scorer(matthews_corrcoef)

# Exhaustive grid search with 2-fold cross-validation; n_jobs=-1 uses all processors
grid_search = GridSearchCV(
    pipe.get_pipeline(),
    param_grid,
    scoring=mcc_scorer,
    cv=2,
    n_jobs=-1,
    verbose=10,
)

# Fit every candidate on the training split only
grid_search.fit(pipe.X_train, pipe.y_train)

Fitting 2 folds for each of 225 candidates, totalling 450 fits


In [30]:
# Pipeline with the best parameter combination found by the grid search
best_model = grid_search.best_estimator_

# Predict on the validation split (pipe.X_val), not on a test set.
# NOTE(review): y_pred is not used further in this cell, so no validation
# score of the tuned model is reported here.
y_pred = best_model.predict(pipe.X_val)

# best_score_ is the mean cross-validated MCC of the best candidate,
# best_params_ the parameter setting that achieved it
best_score, best_params = grid_search.best_score_, grid_search.best_params_
print("Best MCC score:", best_score)
print("Best parameters:", best_params)

Best MCC score: 0.5007491382398774
Best parameters: {'estimator__learning_rate': 0.3, 'estimator__max_depth': -1, 'estimator__n_estimators': 400}


Try out GridSearchCV for KNN:

In [2]:
# Disable warnings globally (same setup as in the imports cell at the top of
# the notebook, repeated so this section can be run on its own after the imports).
# With no category or message given, the "ignore" filter applies to every warning.
import warnings
warnings.filterwarnings("ignore")

# define the test steps for this notebook (KNN variant)
def add_test_steps(custom_pipeline: build_pipeline.CustomPipeline):
    """Register the experiment steps of the KNN run on ``custom_pipeline``.

    NOTE: this rebinds the name ``add_test_steps`` that the LightGBM section
    of this notebook defines as well; whichever definition was executed last
    is the one in effect.

    The ``pipeline_utils`` helpers are called in this order:
    ``add_outlier_handling`` (with an ``OutlierRemover`` configured with
    ``cat_threshold=0`` and ``zscore_threshold=4``),
    ``add_remove_feature_transformer`` (for the ``age`` and ``position``
    columns), ``add_kbinsdiscretizer`` (``number_of_bins=2``),
    ``add_binary_encoder_and_minmaxscaler`` and ``apply_knn_classifier``.

    Args:
        custom_pipeline: the pipeline object handed to every helper.
            Presumably it is modified in place — nothing is returned here.
    """
    columns_to_drop = ['age', 'position']
    remover = pipeline_cleaning.OutlierRemover(cat_threshold=0, zscore_threshold=4)

    # outlier handling is delegated to the remover's bound method
    pipeline_utils.add_outlier_handling(
        custom_pipeline=custom_pipeline,
        outlier_handling_func=remover.handle_outliers,
    )

    # additional feature selection, then discretization of numerical features
    pipeline_utils.add_remove_feature_transformer(custom_pipeline, columns_to_drop)
    pipeline_utils.add_kbinsdiscretizer(custom_pipeline, number_of_bins=2)

    # encoder and scaler
    pipeline_utils.add_binary_encoder_and_minmaxscaler(custom_pipeline)

    # estimator; the positional values are presumably n_neighbors, weights and p
    # (they match the grid-search keys used later) — TODO confirm in pipeline_utils
    pipeline_utils.apply_knn_classifier(custom_pipeline, 9, 'uniform', 1)

In [3]:
# Build pipeline: collect the constructor flags in one place, then unpack them.
# Evaluation is configured to use a validation set instead of cross-validation.
pipeline_options = {
    "skip_storing_cleaning": True,
    "skip_storing_prediction": True,
    "use_validation_set": True,
    "use_cross_validation": False,
}
pipe = build_pipeline.CustomPipeline(**pipeline_options)

# register the notebook-specific steps on the freshly built pipeline
add_test_steps(pipe)

loading data


In [4]:
# run the pipeline once to have the static steps done
# The printed validation accuracy / F1 / MCC belong to the untuned KNN pipeline
# and serve as the baseline for the grid search below.
# NOTE(review): presumably this run also populates pipe.X_train / pipe.y_train /
# pipe.X_val, which the following cells read — confirm in CustomPipeline.
pipe.run()

preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.7323 [std=0.]
    validation_f1-score: 0.6802 [std=0.]
    validation_mcc: 0.5065 [std=0.]


In [5]:
# Search space for the pipeline's "estimator" step:
# 3 x 2 x 2 = 12 parameter combinations
param_grid = dict(
    estimator__n_neighbors=[7, 9, 12],
    estimator__weights=["uniform", "distance"],
    estimator__p=[1, 2],
)

# Candidates are ranked by the Matthews correlation coefficient
mcc_scorer = make_scorer(matthews_corrcoef)

# Exhaustive grid search with 2-fold cross-validation; n_jobs=-1 uses all processors
grid_search = GridSearchCV(
    pipe.get_pipeline(),
    param_grid,
    scoring=mcc_scorer,
    cv=2,
    n_jobs=-1,
    verbose=10,
)

# Fit every candidate on the training split only
grid_search.fit(pipe.X_train, pipe.y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


In [6]:
# Pipeline with the best parameter combination found by the grid search
best_model = grid_search.best_estimator_

# Predict on the validation split (pipe.X_val), not on a test set.
# NOTE(review): y_pred is not used further in this cell, so no validation
# score of the tuned model is reported here.
y_pred = best_model.predict(pipe.X_val)

# best_score_ is the mean cross-validated MCC of the best candidate,
# best_params_ the parameter setting that achieved it
best_score, best_params = grid_search.best_score_, grid_search.best_params_
print("Best MCC score:", best_score)
print("Best parameters:", best_params)

Best MCC score: 0.4851473098350467
Best parameters: {'estimator__n_neighbors': 9, 'estimator__p': 1, 'estimator__weights': 'uniform'}


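GridSearchCV for KNN after removing `position` and `age` doesn't lead to an improvement regarding MCC: the best cross-validated MCC is about 0.485, a little below the 0.5065 validation MCC of the untuned pipeline run above. The best parameters found (`n_neighbors=9`, `p=1`, `weights='uniform'`) are the same values that `add_test_steps` already passes to the KNN classifier. Note that the two scores are not directly comparable: the grid-search score is the mean of a 2-fold cross-validation on the training data, whereas the baseline score is measured on the validation set.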