# Test for Hyperparameter Optimization

Since the base model with default hyperparameter values already shows satisfying metrics, this notebook is merely an attempt to improve the results further by tuning the hyperparameters with Bayesian Optimization.

Bayesian Optimization is used to optimize the R2 in a first attempt, and in a second attempt to optimize the RMSE. Metrics and learning curves are used for comparison.

The results are a bit disappointing. The trained models show lower metric scores than the base model.

The number of iterations is limited to 20 due to resource restrictions. It is therefore possible that a larger number of iterations, such as 50 or 100, would yield a better model.

In [None]:
# importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# data preparation, modeling, interpretation
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

# For Bayesian Optimization
import time
import optuna 
from optuna.samplers import TPESampler
from sklearn.model_selection import cross_val_score

# importing plotly and enable jupyter notebooks for showing optuna visualisations 
import plotly.io as pio
pio.renderers.default = 'iframe'

In [None]:
# reading data
# Load the CSV at the relative path 'data/Clean_Dataset.csv' into a DataFrame.
df = pd.read_csv('data/Clean_Dataset.csv')
# Preview the first rows; as the last expression of the cell this is its output.
df.head()

In [None]:
# Train-Test-Split: 30 % of the rows are held out, the rest is used for training
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
print('df_train: ', df_train.shape)
print('df_test: ', df_test.shape)

# Second split: the held-out part is divided again, 33 % of it becomes the val set
df_test, df_val = train_test_split(df_test, test_size=0.33, random_state=42)
print('df_test: ', df_test.shape)
print('df_val: ', df_val.shape)

# Separate every split into its features and the 'price' target
features_train, target_train = df_train.drop(columns='price'), df_train['price']
features_test, target_test = df_test.drop(columns='price'), df_test['price']
features_val, target_val = df_val.drop(columns='price'), df_val['price']

In [None]:
# clean data function
def clean_data(df):
    """Return a cleaned copy of the feature frame.

    The columns 'Unnamed: 0' and 'flight' are removed and the 'class'
    column is encoded as 0 ('Business') / 1 ('Economy'). The input frame
    is not modified.

    Args:
        df: DataFrame containing at least the columns 'Unnamed: 0',
            'flight' and 'class'.

    Returns:
        A new DataFrame without the two dropped columns and with the
        encoded 'class' column at its original position.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    class_codes = {'Business': 0, 'Economy': 1}

    # dropping the 'Unnamed: 0' column and the flight numbers
    # (drop returns a new frame, so the caller's data stays untouched)
    cleaned = df.drop(columns=['Unnamed: 0', 'flight'])

    # changing class into binary; values outside the mapping are kept as they
    # are. Mapping element-wise avoids the deprecated silent downcasting of
    # Series.replace and yields an integer column when every value is mapped.
    encoded_class = cleaned['class'].map(lambda value: class_codes.get(value, value))

    return cleaned.assign(**{'class': encoded_class})

In [None]:
# clean the training features first and preview the result
features_train_cleaned = clean_data(features_train)
display(features_train_cleaned.head())

# run the same cleaning step on the test and val features
features_test_cleaned, features_val_cleaned = map(clean_data, (features_test, features_val))

## Optimization on R2

In [None]:
# columns that are one-hot encoded by the preprocessing step
cat_cols = [
    'airline',
    'source_city',
    'departure_time',
    'stops',
    'arrival_time',
    'destination_city',
]

# columns that are standard-scaled by the preprocessing step
num_cols = [
    'duration',
    'days_left',
]

In [None]:
# One small pipeline per column type, combined in a ColumnTransformer below.

# numerical columns: standard scaling
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# categorical columns: dense one-hot encoding; categories unseen during fit are ignored
categorical_transformer = Pipeline(steps=[
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# data preparation: columns not listed in num_cols / cat_cols are passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder='passthrough',
)

In [None]:
# Fit the preprocessor on the training features only; test and val are only transformed.
features_train_preprocessed = preprocessor.fit_transform(features_train_cleaned)

# Column labels for the transformed arrays: numeric columns, then the one-hot
# names, then 'class' (presumably the column left over by remainder='passthrough').
ohe_col_list = preprocessor.transformers_[1][1].named_steps['ohe'].get_feature_names_out(cat_cols)
preprocessed_cols = [*num_cols, *ohe_col_list, 'class']

# recreating the DataFrames after preprocessing, keeping the original row index
features_train_preprocessed = pd.DataFrame(features_train_preprocessed,
                                           columns=preprocessed_cols,
                                           index=features_train.index)
display(features_train_preprocessed.head())

features_test_preprocessed = pd.DataFrame(preprocessor.transform(features_test_cleaned),
                                          columns=preprocessed_cols,
                                          index=features_test.index)
display(features_test_preprocessed.head())

features_val_preprocessed = pd.DataFrame(preprocessor.transform(features_val_cleaned),
                                         columns=preprocessed_cols,
                                         index=features_val.index)
display(features_val_preprocessed.head())

In [None]:
# Bayesian Optimization on R2
def objective(trial):
    """Return the mean cross-validated R2 of a random forest for one trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the 5-fold 'r2' scores obtained with cross_val_score on
        features_train_preprocessed / target_train.
    """
    # search space; the suggest_* calls are kept in their original order
    # NOTE(review): with a seeded sampler the call order presumably influences
    # the sampled values, so do not reorder without re-running the study
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 250),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'max_features': trial.suggest_categorical('max_features', choices=['sqrt', 'log2', None]),
        'min_samples_split': trial.suggest_int(name="min_samples_split", low=2, high=10, step=2),
        'min_samples_leaf': trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1),
    }

    # random forest regressor with the sampled hyperparameters
    model_rf = RandomForestRegressor(random_state=42, **params)

    # 5-fold cross-validation, folds evaluated in parallel
    fold_scores = cross_val_score(estimator=model_rf,
                                  X=features_train_preprocessed,
                                  y=target_train,
                                  scoring='r2',
                                  cv=5,
                                  n_jobs=-1)

    return fold_scores.mean()

# Number of optimization trials, defined once so that the optimize call and the
# results table below always agree (kept at 20 due to resource restrictions).
n_trials = 20

# create a study (aim to maximize the score) and set a seed for reproducibility
study = optuna.create_study(sampler=TPESampler(seed = 42), direction='maximize')

# perform hyperparameter tuning (while timing the process);
# perf_counter is a monotonic clock intended for measuring elapsed time
time_start = time.perf_counter()
# starting the optimization process with our objective function and n_trials iterations
study.optimize(objective, n_trials=n_trials)
time_bayesian = time.perf_counter() - time_start

# store result in a data frame
values_bayesian = [n_trials, study.best_trial.number, study.best_trial.value, time_bayesian]
results_bayesian = pd.DataFrame([values_bayesian], columns = ['Number of iterations', 
                                                                        'Iteration Number of Optimal Hyperparamters', 
                                                                        'Score', 
                                                                        'Time Elapsed (s)'])

In [None]:
# show results
# One-row summary table: number of trials, best trial number, best score, elapsed time.
display(results_bayesian)
# Hyperparameters of the best trial; as the last expression of the cell this is its output.
study.best_params

In [None]:
# Hyperparameters of the best model optimized on R2, written out by hand
# (presumably copied from study.best_params of an earlier run, so the
# optimization does not have to be repeated - verify after re-running the study).
best_params = dict(
    n_estimators=176,
    max_depth=15,
    max_features=None,
    min_samples_split=4,
    min_samples_leaf=1,
)

In [None]:
# fit a random forest with the selected hyperparameters on the training data
best_model = RandomForestRegressor(random_state=42, **best_params)
best_model.fit(features_train_preprocessed, target_train)

# predict on the test data and report both metrics
target_test_pred = best_model.predict(features_test_preprocessed)
for label, metric in (('R2: ', r2_score), ('RMSE: ', root_mean_squared_error)):
    print(label, metric(target_test, target_test_pred))

In [None]:
# predict on the val data with the fitted model
target_val_pred = best_model.predict(features_val_preprocessed)

# report both metrics for the val predictions
for label, metric in (('R2: ', r2_score), ('RMSE: ', root_mean_squared_error)):
    print(label, metric(target_val, target_val_pred))

In [None]:
# Computing the learning curve for the R2-optimized model.
# The original serial run was reported to take 16 to 23 minutes on the author's
# PC; n_jobs=-1 distributes the fits over all available cores, as the
# cross-validation inside the objective function already does.
train_sizes, train_scores, test_scores = learning_curve(estimator=best_model, 
                                                        X=features_train_preprocessed, 
                                                        y=target_train, 
                                                        cv=5, 
                                                        scoring='r2',
                                                        n_jobs=-1)

# average the per-fold scores for every training set size
train_sizes_lc = train_sizes
train_mean_lc = train_scores.mean(axis=1)
test_mean_lc = test_scores.mean(axis=1)

In [None]:
# plotting the learning curve: mean training vs. mean validation score per training set size
fig_lc, ax = plt.subplots(figsize=(6, 4))
for curve, name, colour in ((train_mean_lc, "train", 'red'),
                            (test_mean_lc, "validation", 'blue')):
    ax.plot(train_sizes_lc, curve, label=name, color=colour)

ax.set_title("Learning Curve")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("R2")
ax.legend(loc="best")
fig_lc;

## Optimization on RMSE

In [None]:
# optimization on RMSE
def objective(trial):
    """Return the mean cross-validated negative RMSE of a random forest for one trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the 5-fold 'neg_root_mean_squared_error' scores obtained
        with cross_val_score on features_train_preprocessed / target_train.
        The score is the negated RMSE, so larger values are better.
    """
    # search space; the suggest_* calls are kept in their original order
    # NOTE(review): with a seeded sampler the call order presumably influences
    # the sampled values, so do not reorder without re-running the study
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 250),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'max_features': trial.suggest_categorical('max_features', choices=['sqrt', 'log2', None]),
        'min_samples_split': trial.suggest_int(name="min_samples_split", low=2, high=10, step=2),
        'min_samples_leaf': trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1),
    }

    # random forest regressor with the sampled hyperparameters
    model_rf = RandomForestRegressor(random_state=42, **params)

    # 5-fold cross-validation, folds evaluated in parallel
    fold_scores = cross_val_score(estimator=model_rf,
                                  X=features_train_preprocessed,
                                  y=target_train,
                                  scoring='neg_root_mean_squared_error',
                                  cv=5,
                                  n_jobs=-1)

    return fold_scores.mean()

# Number of optimization trials, defined once so that the optimize call and the
# results table below always agree (kept at 20 due to resource restrictions).
n_trials = 20

# create a study and set a seed for reproducibility; the objective returns the
# negated RMSE, so maximizing the score minimizes the RMSE
study = optuna.create_study(sampler=TPESampler(seed = 42), direction='maximize')

# perform hyperparameter tuning (while timing the process);
# perf_counter is a monotonic clock intended for measuring elapsed time
time_start = time.perf_counter()
# starting the optimization process with our objective function and n_trials iterations
study.optimize(objective, n_trials=n_trials)
time_bayesian = time.perf_counter() - time_start

# store result in a data frame
values_bayesian = [n_trials, study.best_trial.number, study.best_trial.value, time_bayesian]
results_bayesian = pd.DataFrame([values_bayesian], columns = ['Number of iterations', 
                                                                        'Iteration Number of Optimal Hyperparamters', 
                                                                        'Score', 
                                                                        'Time Elapsed (s)'])

In [None]:
# Hyperparameters of the best model, written out by hand.
# NOTE(review): these values are identical to the ones hard-coded for the
# R2-optimized model earlier in this file - confirm that they really match
# study.best_params of the RMSE study and were not just copied over.
best_params = dict(
    n_estimators=176,
    max_depth=15,
    max_features=None,
    min_samples_split=4,
    min_samples_leaf=1,
)

In [None]:
# fit a random forest with the selected hyperparameters on the training data
best_model = RandomForestRegressor(random_state=42, **best_params)
best_model.fit(features_train_preprocessed, target_train)

# predict on the test data and report both metrics
target_test_pred = best_model.predict(features_test_preprocessed)
for label, metric in (('R2: ', r2_score), ('RMSE: ', root_mean_squared_error)):
    print(label, metric(target_test, target_test_pred))

In [None]:
# predict on the val data with the fitted model
target_val_pred = best_model.predict(features_val_preprocessed)

# report both metrics for the val predictions
for label, metric in (('R2: ', r2_score), ('RMSE: ', root_mean_squared_error)):
    print(label, metric(target_val, target_val_pred))

In [None]:
# Computing the learning curve for the RMSE-optimized model.
# The original serial run was reported to take 16 to 23 minutes on the author's
# PC; n_jobs=-1 distributes the fits over all available cores, as the
# cross-validation inside the objective function already does.
# The scores are negated RMSE values ('neg_root_mean_squared_error'),
# i.e. they are <= 0 and larger means better.
train_sizes, train_scores, test_scores = learning_curve(estimator=best_model, 
                                                        X=features_train_preprocessed, 
                                                        y=target_train, 
                                                        cv=5, 
                                                        scoring='neg_root_mean_squared_error',
                                                        n_jobs=-1)

# average the per-fold scores for every training set size
train_sizes_lc = train_sizes
train_mean_lc = train_scores.mean(axis=1)
test_mean_lc = test_scores.mean(axis=1)

In [None]:
# plotting the learning curve: mean training vs. mean validation score per training set size
fig_lc, ax = plt.subplots(figsize=(6,4))
ax.plot(train_sizes_lc, train_mean_lc, label="train", color = 'red')
ax.plot(train_sizes_lc, test_mean_lc, label="validation", color = 'blue')

ax.set_title("Learning Curve")
ax.set_xlabel("Training Set Size")
# The curves in this section are computed with scoring='neg_root_mean_squared_error',
# so the y-axis shows the negated RMSE (not R2, as the label previously claimed).
ax.set_ylabel("Negative RMSE")
ax.legend(loc="best")
fig_lc;