**THE GOAL OF THIS NOTEBOOK IS TO ANALYSE THE HYPERPARAMETER IMPORTANCE OF A SIMPLE KERAS MODEL.
We use the dataset from the 'Regression of Used Car Prices' competition.**

**- We start with a brief exploration and cleaning of the data.**

**- Next, we perform preliminary preprocessing using Scikit-learn.**

**- We then build a simple model with two Keras dense layers, using the preprocessed data as input.**

**- Afterward, we conduct a hyperparameter optimization study on the Keras model using Optuna.**

**- Finally, we analyze the importance of the hyperparameters, which is the main topic of this notebook.**

In [1]:
import datetime as dt
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import optuna
import os
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
import warnings

# ---- Silence every warning to keep the cell outputs readable
# NOTE(review): this blanket filter also hides the deprecation warnings raised by the libraries used below
#
warnings.filterwarnings("ignore")

# ---- Adapt size of columns when displaying a DataFrame (None removes the width limit)
#
pd.set_option('display.max_colwidth', None)

In [None]:
# ---- Print the working directory of the kernel: the relative file paths used in this notebook are resolved from it
#
current_directory = os.getcwd()
print("Current directory :", current_directory)

In [3]:
#---- SKIP THIS SECTION ON KAGGLE ----
# ---- Section dedicated to script.py import
#
import sys

# ---- Specify path to the folder containing the scripts
#
script_path = './scripts'

# ---- Register the folder only once, so that re-running this cell does not
#      keep appending duplicate entries to sys.path
if script_path not in sys.path:
    sys.path.append(script_path)

# ---- Import the useful functions from the scripts
#
from visualization import make_barplot, make_hist, make_box_for_discrete_features, make_box_for_continuous_features

In [4]:
# ---- Define path to all csv input files

FILEPATH_train = './data/train.csv'
FILEPATH_test = './data/test.csv'
FILEPATH_sample_submission = './data/sample_submission.csv'
# FILEPATH_train = '/kaggle/input/playground-series-s4e9/train.csv'
# FILEPATH_test = '/kaggle/input/playground-series-s4e9/test.csv'
# FILEPATH_sample_submission = '/kaggle/input/playground-series-s4e9/sample_submission.csv'

# ---- Define path that will be used to store output files
#
OUTPUT_PATH_results = './results'
FILEPATH_model_results = f'{OUTPUT_PATH_results}/model_results_KERAS.csv'
DIRPATH_saved_models = f'{OUTPUT_PATH_results}/model'

# ---- Define path to store results of study Optuna
#      (derived from OUTPUT_PATH_results so that every output lives in the same folder)
relative_path = f'{OUTPUT_PATH_results}/study_KERAS_hyperparams.db'
STUDY_STORAGE_PATH = f'sqlite:///{relative_path}'

# ---- The results folder must exist before the sqlite study database is created in it
#
os.makedirs(OUTPUT_PATH_results, exist_ok=True)

# ---- Initialize seed
#      random_state is reused for the train/test split; the same value also seeds
#      the Python, NumPy and TensorFlow random generators so that weight
#      initialisation and batch shuffling are repeatable after a kernel restart
random_state = 32
tf.keras.utils.set_random_seed(random_state)

# I. EXPLORATORY DATA ANALYSIS

## I.1 LOAD DATA AND QUICK EXPLORATION

In [5]:
# ---- Load the training set (features and the 'price' target)
#
data = pd.read_csv(FILEPATH_train)

In [None]:
# ---- Preview the first three rows
#
data.head(3)

In [None]:
# ---- Summary statistics of the numerical columns
#
data.describe()

In [None]:
# ---- Column dtypes and non-null counts
#
data.info()

In [None]:
# ---- Collect the names of the integer (int64) columns
#
numerical_columns = data.select_dtypes(include=['int64'])
numerical_columns_name = list(numerical_columns.columns)
numerical_columns_name


In [None]:
# ---- Get numerical features names
#
# Build a new list instead of aliasing `numerical_columns_name`: the list of
# column names stays intact and this cell can be re-run safely (list.remove
# raises ValueError once 'id' / 'price' have already been removed).
numerical_features = [
    column for column in numerical_columns_name
    if column not in ('id', 'price')
]
numerical_features

In [None]:
# ---- Collect the names of the categorical (object dtype) features
#
categorical_columns = data.select_dtypes(include=['object'])
categorical_features = list(categorical_columns.columns)
categorical_features

In [None]:
# ---- Print the number of distinct categories of each categorical feature
#
for feature in categorical_features:
    print(feature, ': ', data[feature].nunique())

In [None]:
# ---- Explore deeper into categories: show, for each feature, the 100 most frequent values and their counts
#
for feature in categorical_features:
    print(data[feature].value_counts().nlargest(100), '\n\n****************************')

## I.2 SIMPLE DATA CLEANING

In [None]:
# ---- Count missing values per column
#
data.isna().sum()

In [15]:
def simple_cleaning(data):
    '''
    Replaces missing values with 'Unknown' in specific columns
    and converts text in categorical columns to lowercase.

    The cleaning is applied to a copy: the DataFrame given by the caller is
    left untouched, so the function can be called again on the same input.

    Parameters:
    ___________
    data: pd.DataFrame
        The DataFrame to clean. It must contain the columns 'clean_title',
        'fuel_type' and 'accident'.

    Returns:
    ________
    cleaned: pd.DataFrame
        A cleaned copy of the input, with missing values filled and the text
        of the object-dtype columns converted to lowercase.

    Raises:
    _______
    KeyError
        If one of the three columns listed above is missing.
    '''
    cleaned = data.copy()

    # ---- Assign 'Unknown' to the missing values of the columns known to contain some
    #      (lowercased to 'unknown' by the next step)
    for column in ('clean_title', 'fuel_type', 'accident'):
        cleaned[column] = cleaned[column].fillna('Unknown')

    # ---- Lowercase text in categorical columns
    #
    text_columns = cleaned.select_dtypes(include=['object']).columns
    cleaned[text_columns] = cleaned[text_columns].apply(lambda column: column.str.lower())

    return cleaned

In [16]:
# ---- Apply the cleaning: from here on, `data` refers to the cleaned DataFrame
#
data = simple_cleaning(data)

In [None]:
# ---- Count the missing values remaining in each column after cleaning
#
data.isna().sum()

In [None]:
# ---- Look for fully duplicated rows (identifier included)
#
print(data.duplicated().sum())

# ---- Look for duplicated rows skipping column id
#      (dropped by name rather than by position, so the check does not rely on 'id' being the first column)
print(data.drop(columns=['id']).duplicated().sum())

## I.3 DATA VISUALIZATION


### I.3.1 LIST OF USEFUL PLOTTING FUNCTIONS

### I.3.2. PLOTS

In [None]:
# ---- Bar plots of the categorical features
# NOTE(review): the third argument (10) presumably limits the number of categories drawn per feature — confirm in scripts/visualization.py
#
make_barplot(data, categorical_features, 10)

In [None]:
# ---- Histograms of the numerical features and of the target 'price'
#
make_hist(data, numerical_features + ['price'])

In [22]:
# ---- Separate the features (every column except 'price', so 'id' is still included) from the target
#
X_df = data.drop(['price'], axis=1)
y_df = data.price

# ---- Features whose relation with the price is plotted in the next cell
#
discrete_features_to_plot = ['model_year']
continious_features_to_plot = ['milage']

In [None]:
# ---- Box plots of the price against 'milage' and against 'model_year'
# NOTE(review): `bins` and `ylim` are presumably the number of milage bins and the limits of the price axis — confirm in scripts/visualization.py
#
make_box_for_continuous_features(X_df, continious_features_to_plot, y_df, bins=100, ylim=(-1000,180000))
make_box_for_discrete_features(X_df, discrete_features_to_plot, y_df)

**In this final graph, we observe a trend between the price and the model year. We could therefore treat the model_year feature as a numeric variable rather than one-hot encoding it like the categorical features. (I tried this, but it did not improve the results, so I kept the encoding.)**

# II. SCIKIT LEARN DATA PREPROCESSING

In [None]:
# ---- Prepare data: keep the numerical and categorical features (the 'id' and 'price' columns are left out) and the target
#
X = data[numerical_features + categorical_features]
y = data['price']

X.info()

In [25]:
# ---- Split data: 80 % for training, 20 % held out
#      The held-out part (X_test, y_test) is the validation data used when fitting the models in `objective`
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

In [None]:
# ----- Specify a preprocessing method to group of features
#
# ----- Continuous numeric features to normalize
features_to_scale = ['milage']
# ----- Categorical type object features to encode ('model_year' is encoded like a category, see the remark above)
features_to_encode = categorical_features + ['model_year']
features_to_encode

In [27]:
# ---- Build preprocessor
#
# A dense output is requested from the encoder to avoid raising the
# XLA_GPU_JIT: SparseFillEmptyRows error when using GPU.
# The argument was renamed from `sparse` to `sparse_output` in scikit-learn 1.2
# and the old name was removed in 1.4: try the current name first and fall back
# to the old one for older releases.
try:
    one_hot_encoder = OneHotEncoder(drop="first", handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(drop="first", handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), features_to_scale),
        ("enc", one_hot_encoder, features_to_encode)
    ],
    remainder='passthrough',
)

In [28]:
# ---- Preprocess data with sklearn: the preprocessor is fitted on the training split only, then applied to the held-out split
#
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# ---- Keep the shape of the preprocessed data: its second value sizes the input layer of the model
#
X_shape = X_train_preprocessed.shape

# III. KERAS MODEL AND OPTUNA OBJECTIVE FUNCTION

## III.1. FIRST, SOME USEFUL FUNCTIONS WE WILL USE LATER

In [29]:
def format_results_to_df(history, hyperparameters, random_state):
    '''
    Build a one-row DataFrame summarising a training run.

    The row contains, in this order: the path of the most recently changed
    file of DIRPATH_saved_models, the best validation RMSE, the training RMSE
    of the same epoch, the number (starting at 1) of that epoch, the
    hyperparameters and the random state.

    Parameters
    ----------
    history : History
        The training history object containing the metrics. Its `history`
        dictionary must hold the keys 'root_mean_squared_error' and
        'val_root_mean_squared_error'.
    hyperparameters : dict
        Hyperparameters of the trained model.
    random_state : int
        Random state use to split data for reproducibility.

    Returns
    -------
    results_df: pd.Dataframe
        'model_filepath' is None when DIRPATH_saved_models contains no file.
    '''
    # ---- List all files in 'DIRPATH_saved_models' directory and find the latest
    #      (default=None: an empty directory must not abort the whole study)
    saved_models = glob.glob(os.path.join(DIRPATH_saved_models, '*'))
    latest_file = max(saved_models, key=os.path.getctime, default=None)

    # ----- Get back metrics and locate the epoch with the lowest validation RMSE
    #
    train_rmse_history = history.history['root_mean_squared_error']
    val_rmse_history = history.history['val_root_mean_squared_error']
    best_val_rmse = min(val_rmse_history)
    best_index = val_rmse_history.index(best_val_rmse)

    # ---- Gather filepath, metrics, hyperparameters and random_state
    #      (the key order defines the column order of the DataFrame)
    results_dict = {
        'model_filepath': latest_file,
        'best_val_rmse': best_val_rmse,
        'associated_train_rmse': train_rmse_history[best_index],
        'best_epoch_rmse': best_index + 1,
        **hyperparameters,
        'random_state': random_state,
    }

    return pd.DataFrame([results_dict])

In [30]:
def save_metrics(result_df, column_to_sort='best_val_rmse', comment=None, filepath=FILEPATH_model_results):
    '''
    Append the results of a training run to a CSV file, creating the file
    (and its directory) when it does not exist yet.

    The rows already stored at `filepath` are loaded, the new rows are added
    with an extra 'comment' column, and the whole table is sorted,
    de-duplicated and written back with ';' as separator.
    The DataFrame given by the caller is not modified.

    Parameters
    ----------
    result_df : pandas.DataFrame
        The results to add to the file.
    column_to_sort : str
        Column to use to sort the dataset.
    comment: str
        Optional comment, stored in the 'comment' column of the new rows.
    filepath: str
        Complete path to store file including name and extension. Default FILEPATH_model_results.

    Returns
    -------
    None. Builds or updates the CSV file at the given filepath.
    '''
    # ---- Add comment on a copy so that the caller's DataFrame keeps its columns
    #
    new_rows = result_df.copy()
    new_rows['comment'] = comment

    # ---- If the file already exists, append the new rows to its content
    #
    if os.path.exists(filepath):
        existing_df = pd.read_csv(filepath, sep=';')
        updated_df = pd.concat([existing_df, new_rows], ignore_index=True)
    else:
        updated_df = new_rows

    # ---- Sort and drop duplicates
    #
    updated_df = updated_df.sort_values(column_to_sort).drop_duplicates()

    # ---- Create the directory of the file if needed
    #      (`directory` is empty when filepath is a bare file name)
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # ---- Save file to .csv
    #
    updated_df.to_csv(filepath, index=False, sep=';')

In [31]:
def get_callbacks():
    '''
    Build the callbacks passed to `model.fit`.

    Each call targets a new checkpoint file, named after the current date and
    time, inside DIRPATH_saved_models (the directory is created when missing).

    Returns:
    --------
    [savemodel_callback, early_stopping_callback]: list
        The ModelCheckpoint callback followed by the EarlyStopping callback.
    '''
    monitored_metric = 'val_root_mean_squared_error'

    # ---- Make sure the directory receiving the saved models exists
    #
    os.makedirs(DIRPATH_saved_models, exist_ok=True)

    # ---- Checkpoint: only the best model (lowest monitored value) is kept in the file
    #
    run_stamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    checkpoint_settings = {
        'filepath': f"{DIRPATH_saved_models}/model_{run_stamp}.keras",
        'monitor': monitored_metric,
        'mode': 'min',
        'save_best_only': True,
    }

    # ---- Early stopping: give up after 5 epochs without improvement
    #
    early_stopping_settings = {
        'monitor': monitored_metric,
        'min_delta': 0,
        'patience': 5,
        'verbose': 0,
        'mode': "auto",
        'baseline': None,
        'restore_best_weights': False,
        'start_from_epoch': 0,
    }

    return [
        tf.keras.callbacks.ModelCheckpoint(**checkpoint_settings),
        tf.keras.callbacks.EarlyStopping(**early_stopping_settings),
    ]

## III.2. BUILD THE OBJECTIVE FUNCTION WITH THE KERAS MODEL

In [32]:
def objective(trial):
    '''
    Defines and trains a model using hyperparameters suggested by Optuna.

    The model is made of two Dense/Dropout pairs followed by a single-unit
    output layer. Only the hyperparameters of the first pair are suggested:
    the second pair reuses the same values. The model is fitted on the
    preprocessed training data (X_train_preprocessed, y_train) and validated
    on (X_test_preprocessed, y_test); the results of the run are appended to
    the results file through `save_metrics`.

    Parameters:
    -----------
    trial : optuna.trial.Trial
        The Optuna trial object used for hyperparameter optimization.

    Returns:
    --------
    validation_score : float
        The best validation root mean squared error (RMSE) achieved.
    '''
    # ---- Choose hyperparameters:
    #      suggest_float replaces suggest_uniform / suggest_loguniform, which Optuna
    #      deprecated in version 3.0 (log=True gives the log-uniform sampling)
    parameters = {
        'dense_1_units' : trial.suggest_int('dense_1_units', 16, 128, step=16),
        'dense_1_kernel_regul_l1' : trial.suggest_float('dense_1_kernel_regul_l1', 0, 0.01),
        'dropout_1_dropout' :  trial.suggest_float('dropout_1_dropout', 0, 0.1),
        'batch_size': trial.suggest_int('batch_size', 64, 1280, step=64),
        'learning_rate': trial.suggest_float('learning_rate', 0.0005, 0.1, log=True),
    }

    # ---- The second hidden block shares the hyperparameters of the first one
    #
    parameters['dense_2_units'] = parameters['dense_1_units']
    parameters['dense_2_kernel_regul_l1'] = parameters['dense_1_kernel_regul_l1']
    parameters['dropout_2_dropout'] = parameters['dropout_1_dropout']

    # ---- The input layer receives the features already preprocessed with sklearn
    #      (named `inputs` so that the builtin `input` is not shadowed)
    inputs = Input(shape=(X_shape[1],))

    dense_1 = tf.keras.layers.Dense(
        parameters['dense_1_units'],
        activation='relu',
        kernel_regularizer=regularizers.L1(parameters['dense_1_kernel_regul_l1']),
        name='Dense_n1'
    )(inputs)

    dropout_1 = tf.keras.layers.Dropout(parameters['dropout_1_dropout'], name='Dropout_n1')(dense_1)

    dense_2 = tf.keras.layers.Dense(
        parameters['dense_2_units'],
        activation='relu',
        kernel_regularizer=regularizers.L1(parameters['dense_2_kernel_regul_l1']),
        name='Dense_n2'
    )(dropout_1)

    dropout_2 = tf.keras.layers.Dropout(parameters['dropout_2_dropout'], name='Dropout_n2')(dense_2)

    # ---- Single linear unit: the model predicts the price directly
    #
    output = tf.keras.layers.Dense(1, name='Output')(dropout_2)

    # ---- Build model
    #
    model = Model(inputs=inputs, outputs=output)

    # ---- Compile model
    #
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=parameters['learning_rate']),
        loss='mse',
        metrics=['root_mean_squared_error']
    )

    callbacks = get_callbacks()
    # ---- Upper bound only: the EarlyStopping callback may stop the training before
    epochs = 100

    # ---- Fit model
    #
    history = model.fit(
        x=X_train_preprocessed,
        y=y_train,
        validation_data=(X_test_preprocessed, y_test),
        epochs=epochs,
        batch_size=parameters['batch_size'],
        verbose=1,
        callbacks=callbacks
    )

    # ---- Format result to dataframe and save results
    #
    result_df = format_results_to_df(history, parameters, random_state)
    save_metrics(result_df=result_df,
                 column_to_sort='best_val_rmse',
                 comment='hyper parameters optimisation')

    # ---- Return score
    #
    validation_score = min(history.history['val_root_mean_squared_error'])

    return validation_score

## III.3. RUN THE STUDY, SAVE MODEL AND RESULTS - MODEL WITH SKLEARN PREPROCESSING

In [None]:
# ---- Run optuna optimization
#      The study is named after the current date and time (minute resolution) and stored in the sqlite database,
#      so `load_if_exists` only reuses a study when this cell is run twice within the same minute
#
study = optuna.create_study(study_name=f'{dt.datetime.now().strftime("%Y%m%d_%H%M")}',
                            direction='minimize',
                            storage=STUDY_STORAGE_PATH,
                            load_if_exists=True
                           )
# ---- Train 50 models, each with the hyperparameters suggested for its trial
study.optimize(objective, n_trials=50)

In [None]:
# ---- Have a look at the results: the file is written sorted by 'best_val_rmse', so the best models come first
#
general_results_df = pd.read_csv(FILEPATH_model_results, sep=';')
general_results_df.head(5)

# IV. HYPERPARAMETER ANALYSIS

## IV.1. HYPERPARAMETER ANALYSIS WITH OPTUNA

In [35]:
# ---- List the names of all the Optuna studies stored in the database
#
studies = optuna.get_all_study_summaries(storage=STUDY_STORAGE_PATH)
print([study.study_name for study in studies])

In [None]:
# ---- Reload the last study of the list to analyse it
#
study_name = studies[-1].study_name
print(f'Name of the choosen study: {study_name}')

study = optuna.load_study(study_name=study_name, storage=STUDY_STORAGE_PATH)

# ---- Gather the names of every parameter suggested in at least one trial
#
param_names = {name for trial in study.trials for name in trial.params}

print(f"Suggested parameters of the study : {param_names}")

In [None]:
# ---- Plot the importance of each hyperparameter as evaluated by Optuna
#
optuna.visualization.plot_param_importances(study, params=list(param_names))

In [None]:
# ---- Slice plot: objective value of the trials against each hyperparameter
#
optuna.visualization.plot_slice(study, params=list(param_names))

## IV.2. ANALYSIS OF THE INTERACTIONS BETWEEN HYPERPARAMETERS

In [None]:
# ---- Reload the DataFrame containing the results and filter it
# NOTE(review): save_metrics appends every run to this file, so it may hold rows
#               from other studies than the one loaded above
#
general_results_df = pd.read_csv(FILEPATH_model_results, sep=';')

# ---- Select only the hyperparameter columns and 'best_val_rmse'
#      (names are sorted because a set has no stable order: the layout of the
#      grid of plots stays the same from one run to the next)
selected_column = sorted(param_names) + ['best_val_rmse']
filtered_df = general_results_df[selected_column]

# ---- Display a pairplot between parameters, coloured by validation RMSE
#
sns.pairplot(data=filtered_df, hue='best_val_rmse')

# V. MAKE A SUBMISSION

**We did not attempt any feature engineering but let's make a prediction anyway!**

## V.I. RELOAD BEST MODEL

In [None]:
# ---- Retrieve the best model: the row(s) of the results with the lowest validation RMSE
#
best_val_rmse = general_results_df['best_val_rmse'].min()
best_model_data = general_results_df[general_results_df['best_val_rmse'] == best_val_rmse]
display(best_model_data)

# ---- Path of the saved model (the first row is used when several rows share the best score)
#
best_model_filepath = best_model_data['model_filepath'].values[0]

In [None]:
# ---- Reload best model from the file written by the ModelCheckpoint callback and show its architecture
#
best_model = tf.keras.models.load_model(best_model_filepath)
best_model.summary()

## V.II. PREPROCESS SUBMISSION DATA

In [42]:
# ---- Load data
#
X_submission = pd.read_csv(FILEPATH_test)

# ---- Keep the identifiers aside: they are needed to build the submission file
#
X_submission_id = X_submission['id']

# ---- Clean as we did for the data used to train the model, then keep exactly
#      the feature columns the preprocessor was fitted on (same names, same order)
X_submission = simple_cleaning(X_submission.drop(['id'], axis=1))
X_submission = X_submission[X.columns]

# ---- Preprocess data with sklearn (transform only: the preprocessor is already fitted)
#
X_sub_preprocessed = preprocessor.transform(X_submission)

## V.III. PREDICT AND MAKE SUBMISSION

In [None]:
# ---- Predict
#
y_submission = best_model.predict(X_sub_preprocessed)

# ---- Build the submission DataFrame
#      - a dedicated name is used instead of `y_df`, which holds the target
#        Series needed by the box plots of the exploration section
#      - prices are rounded to the nearest integer (int() alone truncates)
#      - columns are assembled from raw values, so no index alignment is involved
predicted_price = pd.Series(y_submission.ravel()).round().astype(int)
submission_df = pd.DataFrame({
    'id': X_submission_id.to_numpy(),
    'price': predicted_price.to_numpy(),
})

# ---- Save prediction in csv file
#
timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")

filepath = f'{OUTPUT_PATH_results}/submission_KERAS{timestamp}.csv'
submission_df.to_csv(filepath, index=False)

# ---- Preview the submission
#
submission_df.head()
