# Hyperparameter Tuning

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from src.pipelines import build_pipeline
from src.pipelines import pipeline_utils
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer, matthews_corrcoef

# disable warnings globally
import warnings
warnings.filterwarnings("ignore")

# Pipeline steps used by the experiments in this notebook
def add_test_steps(custom_pipeline: build_pipeline.CustomPipeline):
    """Register this notebook's steps on ``custom_pipeline``, in order.

    The steps are added through ``pipeline_utils`` helpers:

    1. ``add_remove_feature_transformer`` with the column list ``['age']``
    2. ``add_kbinsdiscretizer`` with ``number_of_bins=2``
    3. ``add_binary_encoder_and_minmaxscaler``
    4. ``apply_knn_classifier`` with the argument ``9``

    Args:
        custom_pipeline: The pipeline object handed to each helper.

    Returns:
        None. The helpers are called for their effect on ``custom_pipeline``.
    """
    columns_to_drop = ['age']
    bin_count = 2
    # presumably the number of neighbours for the KNN estimator -- confirm
    # against pipeline_utils.apply_knn_classifier
    knn_setting = 9

    # feature selection: drop the listed columns
    pipeline_utils.add_remove_feature_transformer(custom_pipeline, columns_to_drop)

    # discretization of numerical features
    pipeline_utils.add_kbinsdiscretizer(custom_pipeline, number_of_bins=bin_count)

    # encoding and scaling
    pipeline_utils.add_binary_encoder_and_minmaxscaler(custom_pipeline)

    # estimator
    pipeline_utils.apply_knn_classifier(custom_pipeline, knn_setting)

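Try out `GridSearchCV`: an exhaustive search over a small grid of estimator hyperparameters, scored with the Matthews correlation coefficient (MCC).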

In [2]:
# Assemble the pipeline: construct it, register the notebook's steps,
# then apply the LightGBM classifier helper.
pipe = build_pipeline.CustomPipeline(
    skip_storing_cleaning=True,
    skip_storing_prediction=True,
    use_validation_set=True,
)
add_test_steps(pipe)
# NOTE(review): add_test_steps already calls apply_knn_classifier; presumably
# this call replaces that estimator -- confirm in pipeline_utils.
pipeline_utils.apply_lgbm_classifier(pipe)

loading data


In [3]:
# Hyperparameter grid: 3 x 3 x 3 = 27 candidate combinations.
# Keys follow scikit-learn's `<step>__<param>` convention; this assumes the
# pipeline registers its final step under the name "estimator" -- confirm in
# build_pipeline.
param_grid = {
    'estimator__n_estimators': [100, 200, 300],
    'estimator__learning_rate': [0.1, 0.05, 0.01],
    'estimator__max_depth': [3, 5, 7]
}

# Use the same target column for fitting and for evaluation, so the metrics
# below compare 1-D labels against 1-D predictions.
target_column = 'damage_grade'
y_train_target = pipe.y_train[target_column]
y_test_target = pipe.y_test[target_column]

# Perform grid search cross-validation, ranking candidates by MCC
grid_search = GridSearchCV(
    estimator=pipe.get_pipeline(),
    param_grid=param_grid,
    scoring=make_scorer(matthews_corrcoef),
    cv=2,
    n_jobs=-1,
    verbose=3
)

# Fit the grid search to the training data
grid_search.fit(pipe.X_train, y_train_target)

# Get the best model from the grid search (refit on the full training data)
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out split exposed as X_test / y_test
y_pred = best_model.predict(pipe.X_test)

# Calculate evaluation metrics
# NOTE(review): these use scikit-learn's default `average='binary'`; if
# damage_grade has more than two classes an explicit `average=` is required
# -- confirm against the data.
precision = precision_score(y_test_target, y_pred)
recall = recall_score(y_test_target, y_pred)
f1 = f1_score(y_test_target, y_pred)

# Print the evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Print the winning hyperparameters and their mean cross-validated MCC
print("Best parameters:", grid_search.best_params_)
best_score = grid_search.best_score_
print("Best MCC score:", best_score)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
