In [None]:
import datetime as dt
import glob
from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import optuna
import os
import re
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
import shap
import string
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Normalization, CategoryEncoding, Input, StringLookup, Concatenate
from tensorflow.keras.utils import plot_model
import warnings

warnings.filterwarnings("ignore")

# ---- Adapt size of columns when displaying a DataFrame
#
pd.set_option('display.max_colwidth', None)

In [None]:
# ---- Get the current working directory where python is executing to adapt filepath used in this notebook
#
current_directory = os.getcwd()
print("Current directory :", current_directory)

In [3]:
#---- SKIP THIS SECTION ON KAGGLE ----
# ---- Section dedicated to script.py import
# 
import sys

# ---- Specify path to the folder containing the scripts
#
script_path = './scripts'
sys.path.append(script_path)

# ---- Import the usefull function in the scripts
#
from visualization import make_barplot, make_hist, make_box_for_discrete_features, make_box_for_continuous_features

In [23]:
# ---- Define path to all csv input files
#
FILEPATH_train = './data/train.csv'
FILEPATH_test = './data/test.csv'
FILEPATH_sample_submission = './data/sample_submission.csv'
# ---- On Kaggle, use the paths below instead of the local ones above
# FILEPATH_train = '/kaggle/input/playground-series-s4e9/train.csv'
# FILEPATH_test = '/kaggle/input/playground-series-s4e9/test.csv'
# FILEPATH_sample_submission = '/kaggle/input/playground-series-s4e9/sample_submission.csv'

# ---- Define path that will be used to store output files
#
OUTPUT_PATH_results = './results'
FILEPATH_model_results = f'{OUTPUT_PATH_results}/model_results_KERAS.csv'
DIRPATH_saved_models = f'{OUTPUT_PATH_results}/model'

# ---- Define path to store results of study Optuna
#      (Optuna expects a database URL, hence the 'sqlite:///' prefix)
relative_path = f'{OUTPUT_PATH_results}/study_KERAS.db'
STUDY_STORAGE_PATH = f'sqlite:///{relative_path}'

# ---- Create the output directory up front so the SQLite file can be created in it
os.makedirs(OUTPUT_PATH_results, exist_ok=True)

# ---- Initialize seed
#
# ---- `random_state` is passed explicitly to train_test_split; it is also applied
#      to Python, NumPy and TensorFlow global generators so that weight initialization,
#      dropout and batch shuffling are repeatable from one run to the next
random_state = 32
tf.keras.utils.set_random_seed(random_state)

# I. EXPLORATORY DATA ANALYSIS

## I.1 LOAD DATA AND QUICK EXPLORATION

In [5]:
data = pd.read_csv(FILEPATH_train)

In [None]:
data.head(3)

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
# ---- Get numerical columns names
#
numerical_columns = data.select_dtypes(include=['int64'])
numerical_columns_name = numerical_columns.columns.to_list()
numerical_columns_name 


In [None]:
# ---- Get numerical features names
#
# ---- Build a new list rather than aliasing `numerical_columns_name`:
#      removing items from an alias also altered the original list and made
#      this cell fail with a ValueError when it was executed a second time
non_feature_columns = ('id', 'price')  # identifier and target are not features
numerical_features = [name for name in numerical_columns_name if name not in non_feature_columns]
numerical_features

In [None]:
# ---- Get categorical features names
# 
categorical_columns = data.select_dtypes(include=['object'])
categorical_features = categorical_columns.columns.to_list()
categorical_features

In [None]:
# ---- Print number of category for each feature
# 
for feature in categorical_features:
    print(feature, ': ', data[feature].nunique())

In [None]:
# ---- Explore deeper into categories
# 
for feature in categorical_features:
    print(data[feature].value_counts().nlargest(100), '\n\n****************************')

## I.2 SIMPLE DATA CLEANING

In [None]:
data.isna().sum()

In [9]:
def simple_cleaning(data):
    '''
    Fill missing values with 'Unknown' in 'clean_title', 'fuel_type' and 'accident',
    then lowercase the text of every object-typed column.

    The input DataFrame is left untouched: the cleaning is applied to a copy.

    Parameters:
    ___________
    data: pd.DataFrame
        The DataFrame to clean. It must contain the columns 'clean_title',
        'fuel_type' and 'accident'.

    Returns:
    ________
    cleaned: pd.DataFrame
        A cleaned copy of the input, with text values converted to lowercase
        (so the filled value ends up as 'unknown').
    '''
    # ---- Work on a copy so the caller's DataFrame is never modified
    #
    cleaned = data.copy()

    # ---- Assign 'Unknown' to missing values in the columns known to contain gaps
    #
    for column in ('clean_title', 'fuel_type', 'accident'):
        cleaned[column] = cleaned[column].fillna('Unknown')

    # ---- Lowercase text in categorical (object) columns
    #
    text_columns = cleaned.select_dtypes(include=['object']).columns
    cleaned[text_columns] = cleaned[text_columns].apply(lambda text_column: text_column.str.lower())

    return cleaned

In [10]:
data = simple_cleaning(data)

In [None]:
# ---- Check if values are still missing
# 
data.isna().sum()

In [None]:
# ---- Look for duplicated rows
#
print(data.duplicated().sum())

# ---- Look for duplicated rows skipping column id
#
print(data.iloc[:,1:].duplicated().sum())

## I.3 DATA VISUALIZATION


### I.3.1 LIST OF USEFUL PLOTTING FUNCTIONS

### I.3.2. PLOTS

In [None]:
make_barplot(data, categorical_features, 10)

In [None]:
make_hist(data, numerical_features + ['price'])

In [22]:
X_df = data.drop(['price'], axis=1)
y_df = data.price

discrete_features_to_plot = ['model_year']
continious_features_to_plot = ['milage']

In [None]:
make_box_for_continuous_features(X_df, continious_features_to_plot, y_df, bins=100, ylim=(-1000,180000))
make_box_for_discrete_features(X_df, discrete_features_to_plot, y_df)

**In this final graph, we observe a trend between price and the model year. Therefore, we could decide to treat the model_year feature as a numeric variable rather than encoding it as we will for the categorical features. (I tried it, but it did not improve the results, so I changed my mind!)**

In [None]:
# ---- Quick overview of direct correlation between features and target
# 
def make_mi_scores(data_df, categorical_features, numerical_features, target):
    """
    Rank features by their mutual information (MI) with the target.

    Each categorical feature is replaced by its integer codes (`factorize`) and
    the scores are computed with `mutual_info_regression`. Columns with an
    integer dtype are declared as discrete features.

    Parameters
    ----------
    data_df : pd.DataFrame
        DataFrame holding both the features and the target column.
    categorical_features : list
        Names of the categorical feature columns.
    numerical_features : list
        Names of the numerical feature columns.
    target : str
        Name of the target column.

    Returns
    -------
    pd.Series
        MI score per feature, named "MI Scores", sorted in descending order.
    """
    target_values = data_df[target]

    # ---- Keep only the requested features, categorical ones first
    #
    feature_df = data_df[categorical_features + numerical_features].copy()

    # ---- Replace each categorical column by its integer codes
    #
    for name in categorical_features:
        codes, _ = data_df[name].factorize()
        feature_df[name] = codes

    # ---- Flag integer-typed columns as discrete
    #
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in feature_df.dtypes]

    # ---- Compute the scores and return them as a sorted, labelled Series
    #
    raw_scores = mutual_info_regression(feature_df, target_values, discrete_features=is_discrete, random_state=0)
    labelled_scores = pd.Series(raw_scores, name="MI Scores", index=feature_df.columns)

    return labelled_scores.sort_values(ascending=False)

make_mi_scores(data, categorical_features, numerical_features, 'price')



# II. MODELISATION DEEP LEARNING

## II.1. FIRST, SOME USEFUL FUNCTIONS WE WILL USE LATER

In [12]:
def format_results_to_df(history, hyperparameters, random_state, models_dir=None):
    '''
    Build a one-row DataFrame summarizing a training run.

    The row contains, in this order: the path of the most recently created file
    in the models directory, the best validation RMSE, the training RMSE of the
    same epoch, the (1-based) epoch of the best validation RMSE, every
    hyperparameter, and the random state.

    Parameters
    ----------
    history : History
        The training history object; its `history` dict must contain the keys
        'root_mean_squared_error' and 'val_root_mean_squared_error'.
    hyperparameters : dict
        Hyperparameters of the trained model.
    random_state : int
        Random state used to split data for reproducibility.
    models_dir : str, optional
        Directory scanned for the latest saved model.
        Default None, meaning DIRPATH_saved_models.

    Returns
    -------
    results_df: pd.Dataframe
        'model_filepath' is None when the directory contains no file.
    '''
    if models_dir is None:
        models_dir = DIRPATH_saved_models

    # ---- List all files in the models directory and find the latest.
    #      `default=None` avoids a ValueError (and the loss of the metrics)
    #      when no checkpoint has been written
    #
    saved_files = glob.glob(os.path.join(models_dir, '*'))
    latest_file = max(saved_files, key=os.path.getctime, default=None)

    # ----- Get back metrics: locate the epoch with the lowest validation RMSE
    #
    train_rmse_history = history.history['root_mean_squared_error']
    val_rmse_history = history.history['val_root_mean_squared_error']
    best_val_rmse = min(val_rmse_history)
    best_index = val_rmse_history.index(best_val_rmse)

    # ---- Assemble the row; later keys override earlier ones, so a hyperparameter
    #      sharing the name of a metric replaces it, and `random_state` always wins
    #
    results_dict = {
        'model_filepath': latest_file,
        'best_val_rmse': best_val_rmse,
        'associated_train_rmse': train_rmse_history[best_index],
        'best_epoch_rmse': best_index + 1,
        **hyperparameters,
        'random_state': random_state,
    }

    return pd.DataFrame([results_dict])

In [13]:
def save_metrics(result_df, column_to_sort='best_val_rmse', comment=None, filepath=FILEPATH_model_results):
    '''
    Append results to a ';'-separated CSV file, creating the file (and its
    directory) when it does not exist yet.

    A 'comment' column is added to the new rows, the rows are merged with the
    content already in the file, sorted by `column_to_sort`, de-duplicated and
    written back. The DataFrame passed by the caller is not modified.

    Parameters
    ----------
    result_df : pandas.DataFrame
        The results to add to the file.
    column_to_sort : str
        Column to use to sort the dataset.
    comment: str
        Optional comment stored in the 'comment' column of the new rows.
    filepath: str
        Complete path to store file including name and extension. Default FILEPATH_model_results.

    Returns
    -------
    None. Builds or appends to a CSV file (at the given filepath) with the results.
    '''
    # ---- Add comment to the results; `assign` returns a new DataFrame so the
    #      caller's object does not gain a 'comment' column as a side effect
    #
    new_rows = result_df.assign(comment=comment)

    # ---- Merge with the rows already stored in the file, if any
    #
    if os.path.exists(filepath):
        existing_df = pd.read_csv(filepath, sep=';')
        updated_df = pd.concat([existing_df, new_rows], ignore_index=True)
    else:
        updated_df = new_rows

    # ---- Sort and drop duplicates
    #
    updated_df = updated_df.sort_values(column_to_sort).drop_duplicates()

    # ---- Create the target directory if needed (dirname is '' for a bare file name)
    #
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # ---- Save file to .csv
    #
    updated_df.to_csv(filepath, index=False, sep=';')

In [14]:
def get_callbacks():
    '''
    Build the callbacks passed to `model.fit`.

    A checkpoint callback saves the best model (lowest validation RMSE) to a
    timestamped '.keras' file in DIRPATH_saved_models, and an early-stopping
    callback ends training after 5 epochs without improvement.

    Returns:
    --------
    list
        [ModelCheckpoint callback, EarlyStopping callback], in that order.
    '''
    monitored_metric = 'val_root_mean_squared_error'

    # ---- Timestamped file name so each call saves to its own file
    #
    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs(DIRPATH_saved_models, exist_ok=True)
    save_dir = f"{DIRPATH_saved_models}/model_{timestamp}.keras"

    # ---- Checkpoint: keep only the best model seen so far
    #
    checkpoint_options = dict(
        filepath=save_dir,
        monitor=monitored_metric,
        mode='min',
        save_best_only=True,
    )
    savemodel_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

    # ---- Early stopping: weights are not restored here, the best model is
    #      the one written by the checkpoint callback
    #
    early_stopping_options = dict(
        monitor=monitored_metric,
        min_delta=0,
        patience=5,
        verbose=0,
        mode="auto",
        baseline=None,
        restore_best_weights=False,
        start_from_epoch=0,
    )
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

    return [savemodel_callback, early_stopping_callback]

## II.2. FIRST METHOD : MODEL USING KERAS PREPROCESSING LAYERS

### II.2.1. DATA PREPARATION  - MODEL WITH KERAS PREPROCESSING LAYERS

In [None]:
# ---- Prepare data
#
X = data[numerical_features + categorical_features]
y = data['price']

X.info()

In [16]:
# ---- Specify a preprocessing method to group of features
#
# ---- Continuous numeric features to normalize
numeric_continuous_features = ['milage']

# ---- Categorical type object features to encode
categorical_object_features = categorical_features

# ---- Discrete numeric features to encode
numeric_discrete_features = ['model_year']


In [17]:
# ---- Split data: 80% train / 20% test, reproducible through random_state
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# ---- Build, for each set, a dictionary {column name: column values}.
#      Every column is reshaped to (n_samples, 1), one key per model Input layer
#
X_train_np = {
    column: np.array(X_train[column]).reshape(-1, 1)
    for column in X_train.columns
}
X_test_np = {
    column: np.array(X_test[column]).reshape(-1, 1)
    for column in X_train.columns
}

# ---- Targets as plain numpy arrays
#
y_train_np, y_test_np = np.array(y_train), np.array(y_test)

### II.2.2. BUILD MODEL AND GRID FOR PARAMETERS OPTIMIZATION  - MODEL WITH KERAS PREPROCESSING LAYERS

In [18]:
# ---- Prepare preprocessing layers
#
# ---- Dictionary collecting all Input layers, keyed by feature name
all_inputs = {}
# ---- List collecting the output of every preprocessing branch
preprocessed_features = []

# ---- Apply StringLookup and CategoryEncoding to categorical features
#
for feature in categorical_object_features:
    # ---- Build feature Input layer
    feature_input = Input(shape=(1,), name=feature, dtype="string")
    all_inputs[feature] = feature_input
    # ---- Fit StringLookup layer on the training values only
    lookup = StringLookup(output_mode="int", name=f'{feature}_Lookup')
    lookup.adapt(X_train_np[feature])
    # ---- First map each string to its integer index
    lookup_encoded_feature = lookup(feature_input)
    # ---- Then turn the index into a 0/1 vector of size vocabulary_size
    onehot_encoded_feature = CategoryEncoding(
        num_tokens=lookup.vocabulary_size(),
        name=f'{feature}_OneHotEncoding',
        output_mode="multi_hot"
    )(lookup_encoded_feature)

    preprocessed_features.append(onehot_encoded_feature)

# ---- Apply Normalization to continuous numeric features
#
for feature in numeric_continuous_features:
    # ---- Build feature Input layer
    feature_input = Input(shape=(1,), name=feature)
    all_inputs[feature] = feature_input
    # ---- Fit Normalization layer on the training values only
    normalizer = Normalization(name=f'{feature}_Normalization')
    normalizer.adapt(X_train_np[feature])
    # ---- Transform feature Input
    normalized_feature = normalizer(feature_input)
    preprocessed_features.append(normalized_feature)

# ---- Apply IntegerLookup and CategoryEncoding to discrete numeric features
#
# ---- CategoryEncoding expects indices in [0, num_tokens). Raw values such as a
#      model year are far outside that range, so they are first mapped to
#      contiguous indices with an IntegerLookup fitted on the training values
#      (unseen values fall into the out-of-vocabulary index)
for feature in numeric_discrete_features:
    # ---- Build feature Input layer
    feature_input = Input(shape=(1,), name=feature, dtype="int64")
    all_inputs[feature] = feature_input
    # ---- Fit IntegerLookup layer on the training values only
    lookup = tf.keras.layers.IntegerLookup(output_mode="int", name=f'{feature}_Lookup')
    lookup.adapt(X_train_np[feature])
    # ---- First map each value to its integer index
    lookup_encoded_feature = lookup(feature_input)
    # ---- Then turn the index into a 0/1 vector of size vocabulary_size
    onehot_encoded_feature = CategoryEncoding(
        num_tokens=lookup.vocabulary_size(),
        name=f'{feature}_OneHotEncoding',
        output_mode="multi_hot"
    )(lookup_encoded_feature)
    preprocessed_features.append(onehot_encoded_feature)

# ---- Concatenate all preprocessing branches into a single feature vector
#
all_encoded_features = Concatenate()(preprocessed_features)

In [19]:
def objectiveKERAS(trial):
    '''
    Optuna objective: build, train and score one model fed by the Keras
    preprocessing layers (`all_inputs` / `all_encoded_features`).

    The network is two (Dense + Dropout) blocks followed by a single-unit output.
    Both blocks share the same sampled hyperparameters. After training, the
    metrics and hyperparameters are stored with `format_results_to_df` and
    `save_metrics`.

    Parameters:
    -----------
    trial : optuna.trial.Trial
        The Optuna trial object used for hyperparameter optimization.

    Returns:
    --------
    validation_score : float
        The best validation root mean squared error (RMSE) achieved.
    '''
    # ---- Choose hyperparameters:
    #
    # ---- `suggest_float` replaces `suggest_uniform`, which Optuna has deprecated.
    #      The Optuna parameter name 'dense_1_kernel_regul' is kept as is so new
    #      trials stay consistent with a study already stored in the database
    parameters = {
        'dense_1_units' : trial.suggest_int('dense_1_units', 16, 128, step=16),
        'dense_1_kernel_regul_l1' : trial.suggest_float('dense_1_kernel_regul', 0, 0.02),
        'dropout_1_dropout' :  trial.suggest_float('dropout_1_dropout', 0, 0.2),
        'batch_size': 100,
        'learning_rate': 0.001,
    }
    # ---- The second block reuses the values sampled for the first one
    parameters['dense_2_units'] = parameters['dense_1_units']
    parameters['dense_2_kernel_regul_l1'] = parameters['dense_1_kernel_regul_l1']
    parameters['dropout_2_dropout'] = parameters['dropout_1_dropout']

    # ---- Feed model with preprocessing layers
    #
    dense_1 = tf.keras.layers.Dense(
        parameters['dense_1_units'],
        activation='relu',
        kernel_regularizer=regularizers.L1(parameters['dense_1_kernel_regul_l1']),
        name='Dense_n1'
    )(all_encoded_features)

    dropout_1 = tf.keras.layers.Dropout(parameters['dropout_1_dropout'], name='Dropout_n1')(dense_1)

    dense_2 = tf.keras.layers.Dense(
        parameters['dense_2_units'],
        activation='relu',
        kernel_regularizer=regularizers.L1(parameters['dense_2_kernel_regul_l1']),
        name='Dense_n2'
    )(dropout_1)

    dropout_2 = tf.keras.layers.Dropout(parameters['dropout_2_dropout'], name='Dropout_n2')(dense_2)

    output = tf.keras.layers.Dense(1, name='Output')(dropout_2)

    # ---- Build model with the multiple inputs
    #
    model = Model(inputs=all_inputs, outputs=output)

    # ---- Compile model
    #
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=parameters['learning_rate']),
        loss='mse',
        metrics=['root_mean_squared_error']
    )

    callbacks = get_callbacks()
    epochs = 100  # upper bound; early stopping usually ends training sooner

    # ---- Fit model
    #
    history = model.fit(
        x=X_train_np,
        y=y_train_np,
        validation_data=(X_test_np, y_test_np),
        epochs=epochs,
        batch_size=parameters['batch_size'],
        verbose=1,
        callbacks=callbacks
    )

    # ---- Format result to dataframe and save results
    #
    result_df = format_results_to_df(history, parameters, random_state)
    save_metrics(result_df=result_df,
                 column_to_sort='best_val_rmse',
                 comment='keras prepro layers')

    # ---- Return score
    #
    validation_score = min(history.history['val_root_mean_squared_error'])

    return validation_score

### II.2.3. RUN THE STUDY AND SAVE MODEL AND RESULTS - MODEL WITH KERAS PREPROCESSING LAYERS

In [None]:
# ---- Run optuna optimization
#
study = optuna.create_study(study_name='keras prepro',
                            direction='minimize',
                            storage=STUDY_STORAGE_PATH,
                            load_if_exists=True
                           )
study.optimize(objectiveKERAS, n_trials=2) #We set to only 2 trials beacause it's very slow!

In [None]:
# ---- Have a look at the results
#
general_results_df = pd.read_csv(FILEPATH_model_results, sep=';')
general_results_df.head(3)

**We can see that the training took quite a long time, maybe because, with Keras preprocessing layers, each sample is re-processed at every epoch. Let's try preprocessing all the data with sklearn before training to see the difference.**

## II.3. SECOND METHOD : MODEL USING SKLEARN PREPROCESSING

### II.3.1. DATA PREPARATION  - MODEL WITH SKLEARN PREPROCESSING

In [None]:
# ----- Specify a preprocessing method to group of features
#
# ----- Continuous numeric features to normalize
features_to_scale = ['milage']
# ----- Categorical type object features to encode
features_to_encode = categorical_features + ['model_year']
features_to_encode

In [36]:
# ---- Build preprocessor
#
# ---- Columns in `features_to_scale` are standardized, columns in
#      `features_to_encode` are one-hot encoded, and any other column is passed
#      through unchanged (remainder='passthrough')
# NOTE(review): `sparse=False` is the older scikit-learn spelling; recent releases
#      name this argument `sparse_output` — confirm against the installed version
# NOTE(review): with drop="first" and handle_unknown='ignore', an unseen category
#      presumably encodes to all zeros, like the dropped first category — verify
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), features_to_scale),
        ("enc", OneHotEncoder(drop="first", handle_unknown='ignore', sparse=False), features_to_encode), #Specify sparse=False to avoid raising the XLA_GPU_JIT: SparseFillEmptyRows error when using GPU
    ],
    remainder='passthrough',
)

In [37]:
# ---- Split data
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

In [38]:
# ---- Preprocess data with sklearn
#
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

X_shape = X_train_preprocessed.shape

### II.3.2. BUILD MODEL AND GRID FOR PARAMETERS OPTIMIZATION  - MODEL WITH SKLEARN PREPROCESSING

In [39]:
def objective_SK1(trial):
    '''
    Optuna objective: build, train and score one model on the data already
    preprocessed with sklearn (`X_train_preprocessed` / `X_test_preprocessed`).

    The network is two (Dense + Dropout) blocks followed by a single-unit output.
    Both blocks share the same sampled hyperparameters. After training, the
    metrics and hyperparameters are stored with `format_results_to_df` and
    `save_metrics`.

    Parameters:
    -----------
    trial : optuna.trial.Trial
        The Optuna trial object used for hyperparameter optimization.

    Returns:
    --------
    validation_score : float
        The best validation root mean squared error (RMSE) achieved.
    '''
    # ---- Choose hyperparameters:
    #
    # ---- `suggest_float` replaces `suggest_uniform`, which Optuna has deprecated.
    #      The Optuna parameter name 'dense_1_kernel_regul' is kept as is so new
    #      trials stay consistent with a study already stored in the database
    parameters = {
        'dense_1_units' : trial.suggest_int('dense_1_units', 16, 128, step=16),
        'dense_1_kernel_regul_l1' : trial.suggest_float('dense_1_kernel_regul', 0, 0.01),
        'dropout_1_dropout' :  trial.suggest_float('dropout_1_dropout', 0, 0.1),
        'batch_size': 100,
        'learning_rate': 0.001,
    }

    # ---- The second block reuses the values sampled for the first one
    parameters['dense_2_units'] = parameters['dense_1_units']
    parameters['dense_2_kernel_regul_l1'] = parameters['dense_1_kernel_regul_l1']
    parameters['dropout_2_dropout'] = parameters['dropout_1_dropout']

    # ---- Single input: one value per column of the preprocessed matrix
    #      (named `model_input` to avoid shadowing the builtin `input`)
    #
    model_input = Input(shape=(X_shape[1],))

    dense_1 = tf.keras.layers.Dense(
        parameters['dense_1_units'],
        activation='relu',
        kernel_regularizer=regularizers.L1(parameters['dense_1_kernel_regul_l1']),
        name='Dense_n1'
    )(model_input)

    dropout_1 = tf.keras.layers.Dropout(parameters['dropout_1_dropout'], name='Dropout_n1')(dense_1)

    dense_2 = tf.keras.layers.Dense(
        parameters['dense_2_units'],
        activation='relu',
        kernel_regularizer=regularizers.L1(parameters['dense_2_kernel_regul_l1']),
        name='Dense_n2'
    )(dropout_1)

    dropout_2 = tf.keras.layers.Dropout(parameters['dropout_2_dropout'], name='Dropout_n2')(dense_2)

    output = tf.keras.layers.Dense(1, name='Output')(dropout_2)

    # ---- Build model
    #
    model = Model(inputs=model_input, outputs=output)

    # ---- Compile model
    #
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=parameters['learning_rate']),
        loss='mse',
        metrics=['root_mean_squared_error']
    )

    callbacks = get_callbacks()
    epochs = 100  # upper bound; early stopping usually ends training sooner

    # ---- Fit model
    #
    history = model.fit(
        x=X_train_preprocessed,
        y=y_train,
        validation_data=(X_test_preprocessed, y_test),
        epochs=epochs,
        batch_size=parameters['batch_size'],
        verbose=1,
        callbacks=callbacks
    )

    # ---- Format result to dataframe and save results
    #
    result_df = format_results_to_df(history, parameters, random_state)
    save_metrics(result_df=result_df,
                 column_to_sort='best_val_rmse',
                 comment='sk_learn prepro - Standard Scaler: milage, OneHotencoder: others')

    # ---- Return score
    #
    validation_score = min(history.history['val_root_mean_squared_error'])

    return validation_score

### II.3.3. RUN THE STUDY, SAVE MODEL AND RESULTS - MODEL WITH SKLEARN PREPROCESSING

In [None]:
# ---- Run optuna optimization
#
study = optuna.create_study(study_name='sk_learn prepro',
                            direction='minimize',
                            storage=STUDY_STORAGE_PATH,
                            load_if_exists=True
                           )
study.optimize(objective_SK1, n_trials=8)

In [None]:
# ---- Have a look at the results
#
general_results_df = pd.read_csv(FILEPATH_model_results, sep=';')
general_results_df.head(5)

**Preprocessing all data before training is about 3 times faster than using Keras preprocessing layers during training.**

## II.4. RELOAD BEST MODEL

In [42]:
# ---- Retrieve best score
#
best_val_rmse = general_results_df['best_val_rmse'].min()
best_model_data = general_results_df[general_results_df['best_val_rmse'] == best_val_rmse]
best_model_filepath = best_model_data['model_filepath'].values[0]

In [None]:
# ---- Reload best model
# 
best_model = tf.keras.models.load_model(best_model_filepath)
best_model.summary()

In [None]:
# ---- Check parameters of each layer of the loaded model
#
for layer in best_model.layers:
    print(f"Layer name: {layer.name}")
    print(f"Config: {layer.get_config()}")

In [None]:
# ---- Build graph of the model
#
graph_filepath = f'{OUTPUT_PATH_results}/graph_model_keras.png'

plot_model(best_model, to_file=graph_filepath, show_shapes=True)

## II.5. MAKE PREDICTION FOR SUBMISSION

### II.5.1. PREPARE DATA

**Again, we need to identify whether the model we have chosen uses Keras preprocessing or sklearn preprocessing, and prepare the data in the appropriate way for that model.**

In [None]:
# ---- Load data
#
X_submission = pd.read_csv(FILEPATH_test)
X_submission.head(2)

In [None]:
X_submission.info()

In [None]:
# ---- Prepare data as we did for training data
#
X_submission_id = X_submission['id']
X_submission = X_submission.drop(['id'], axis=1)
X_submission.isna().sum()

In [49]:
# ---- Clean as we did for data used to train model
#
X_submission = simple_cleaning(X_submission)

In [None]:
# ---- Get categorical features names
# 
object_columns = X_submission.select_dtypes(include=['object'])
categorical_features = object_columns.columns.to_list()
categorical_features

In [51]:
# Choose the relevant preprocessing depending on the selected model
#
# ---- The 'comment' column stored with the results tells which pipeline the model expects.
#      `.iloc[0]` reads the first selected row by position; `[0]` was a label lookup
#      that only worked when the best row happened to carry the index label 0
best_model_comment = best_model_data['comment'].iloc[0]

# ---- IF MODEL WITH KERAS PREPROCESSING LAYER ----
# ---- Build the dictionary to pass to the model as we did for model training
#
if best_model_comment == 'keras prepro layers':
    X_sub_preprocessed = {
        column: np.array(X_submission[column]).reshape(-1, 1)
        for column in X_submission.columns
    }

# ---- IF MODEL WITH SKLEARN PREPROCESSING ----
# ---- Preprocess data with the preprocessor we used for model training
elif best_model_comment == 'sk_learn prepro - Standard Scaler: milage, OneHotencoder: others':
    X_sub_preprocessed = preprocessor.transform(X_submission)

# ---- Stop here rather than printing: continuing would leave X_sub_preprocessed
#      undefined (or holding values from a previous run) for the prediction cell
else:
    raise ValueError(
        f'UNEXPECTED MODEL, CHECK MODEL AND PREPROCESSING REQUIRED (comment={best_model_comment!r})'
    )

### II.5.2. MAKE PREDICTION AND SAVE RESULTS TO CSV

In [None]:
# ---- Predict
# 
y_submission = best_model.predict(X_sub_preprocessed)

# ---- Build a dataframe
#
y_df = pd.DataFrame(y_submission, columns=['price'])
y_df['price'] = y_df['price'].apply(lambda x: int(x))
submission_df = pd.concat([X_submission_id, y_df], axis=1)
submission_df

In [53]:
# ---- Save prediction in csv file
#
timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
os.makedirs(OUTPUT_PATH_results, exist_ok=True)
filepath = f'{OUTPUT_PATH_results}/submission_KERAS{timestamp}.csv'
submission_df.to_csv(filepath, index=False)

# III. TRY DATA ENGINEERING

## III.1. PREPROCESS A FEW TEXTUAL FEATURES TO HELP MODEL TO CATCH MORE INFORMATION

### III.1.1. FEATURE: "TRANSMISSION"

In [54]:
# ---- Functions to preprocess column "transmission"
#
# ---- Get transmission speed number
def get_transmission_speed(sentence):
    '''
    Extract the number of speeds from a transmission description.

    Parameters:
    -----------
    sentence : str
        Text of the 'transmission' column.

    Returns:
    --------
    int
        The first run of digits found in the text, or 0 when there is none.
    '''
    # ---- The 'speed' suffix is optional, so any first number is taken
    speed_match = re.search(r"(\d+)[-\s]*(speed)?", sentence, flags=re.IGNORECASE)

    return int(speed_match.group(1)) if speed_match else 0

# ---- Remove transmission speed number out of the text    
def remove_transmission_pattern(sentence):
    '''
    Strip the speed information out of a transmission description.

    Parameters:
    -----------
    sentence : str
        Text of the 'transmission' column.

    Returns:
    --------
    str
        The text without its numbers (and the 'speed' word that follows them),
        trimmed and lowercased.
    '''
    speed_regex = re.compile(r"\d+[-\s]*(speed)?", flags=re.IGNORECASE)

    return speed_regex.sub('', sentence).strip().lower()

# ---- Synonym dictionary: canonical transmission type -> spellings found in the data
dict_transmission = {
    'Auto': ['A/T', 'Automatic', 'AT'],
    'Hybrid': ['Transmission w/Dual Shift Mode', 'At/Mt', 'Variable'],
    'Manuel': ['M/T', 'Mt', 'Manual'],
    # ---- An empty text or a lone dash is treated as an unknown transmission
    'Unknown': ['', '–'],
}

# ---- Reverse lookup: lowercased spelling -> lowercased canonical type
reversed_dict = {
    synonym.lower(): canonical.lower()
    for canonical, synonyms in dict_transmission.items()
    for synonym in synonyms
}

# ---- Substitute tokens with their synonyms when possible
def map_transmission_tokens(column, mapping_dict=reversed_dict):
    '''
    Replace each value of a column by its canonical synonym.

    Parameters:
    -----------
    column : pd.Series
        Values to translate.
    mapping_dict : dict
        Lookup from a value to its replacement. Default reversed_dict.

    Returns:
    --------
    pd.Series
        The translated column; values absent from the lookup are kept unchanged.
    '''
    def to_canonical(token):
        # ---- Fall back on the token itself when it has no known synonym
        return mapping_dict.get(token, token)

    return column.apply(to_canonical)

# ---- Apply complete preprocessing to the column transmission
def process_column_transmission(X_df):
    '''
    Replace the 'transmission' column by two derived columns.

    'transmission_speed' holds the number extracted by `get_transmission_speed`,
    'transmission_type' holds the remaining text, cleaned by
    `remove_transmission_pattern` and normalized with `map_transmission_tokens`.

    Parameters:
    -----------
    X_df : DataFrame
        The input DataFrame containing the 'transmission' column.

    Returns:
    --------
    results_df: DataFrame
        A new DataFrame with the two derived columns appended and
        'transmission' removed. The input is not modified.
    '''
    raw_transmission = X_df['transmission']

    # ---- Text left once the speed is removed, mapped to its canonical type
    transmission_type = map_transmission_tokens(
        raw_transmission.apply(remove_transmission_pattern),
        mapping_dict=reversed_dict,
    )

    results_df = (
        X_df
        .assign(
            transmission_speed=raw_transmission.apply(get_transmission_speed),
            transmission_type=transmission_type,
        )
        .drop(['transmission'], axis=1)
    )

    return results_df

### III.1.2. FEATURE: "ENGINE"

In [55]:
# ----- Functions to preprocess column "engine"
#
# ---- Regular expressions used to pull numeric fields out of the 'engine' text.
#      The helper functions of this cell apply them with re.IGNORECASE.
dict_pattern = {
    # ---- Digits, a dot, exactly one digit, then 'hp' (e.g. '172.0hp')
    'pattern_HP' : r'(\d+)\.(\d)hp',
    # ---- Digits, a dot, optional digits, optional spaces, then 'l' or 'liter' (e.g. '1.6l')
    'pattern_size' : r'(\d+)\.(\d*)\s*l(iter)?',
    # ---- Optional letters, digits, one whitespace, optional word 'cylinder' (e.g. '4 cylinder')
    'pattern_cylinder_1' : r'[a-z]*(\d+)\s(cylinder)?',
    # ---- One letter immediately followed by one digit (e.g. 'v6', 'i4')
    'pattern_cylinder_2' : r'[a-z](\d)',
    # ---- Two digits followed by 'v' (e.g. '16v'); 'soupape' is French for valve
    'pattern_soupape' : r'(\d\d)v'
}

# ---- Get engine Horse Power value
def get_engine_HP(sentence, pattern_HP=dict_pattern['pattern_HP']):
    '''
    Extract the horsepower value from an engine description.

    Parameters:
    -----------
    sentence : str
        Text of the 'engine' column.
    pattern_HP : str
        Regular expression with two groups: integer part and decimal part.

    Returns:
    --------
    int
        The horsepower truncated to an integer, or 0 when the pattern is not found.
    '''
    hp_match = re.search(pattern_HP, sentence, flags=re.IGNORECASE)
    if hp_match is None:
        return 0

    whole_part, decimal_part = hp_match.group(1, 2)

    return int(float(f"{whole_part}.{decimal_part}"))

# ---- Get engine size
def get_engine_size(sentence, pattern_size=dict_pattern['pattern_size']):
    '''
    Extract the engine size from an engine description.

    Parameters:
    -----------
    sentence : str
        Text of the 'engine' column.
    pattern_size : str
        Regular expression with two leading groups: integer part and decimal part.

    Returns:
    --------
    float or int
        The size as a float, or the integer 0 when the pattern is not found.
    '''
    size_match = re.search(pattern_size, sentence, flags=re.IGNORECASE)
    if not size_match:
        return 0

    whole_part = size_match.group(1)
    decimal_part = size_match.group(2)  # may be empty, e.g. '3.l' gives 3.0

    return float(f"{whole_part}.{decimal_part}")

# ---- Extract the engine number of cylinders
def get_engine_cylinder(sentence, pattern_cylinder_1=dict_pattern['pattern_cylinder_1'], pattern_cylinder_2=dict_pattern['pattern_cylinder_2']):
    '''
    Return the number of cylinders found in an engine description.

    pattern_cylinder_1 is tried first and pattern_cylinder_2 only when the first
    one does not match (both case-insensitive). Returns 0 when neither matches,
    or when the sentence contains 'ah' or 'kw'.
    '''
    # ---- Sentences containing 'ah' or 'kw' are skipped to avoid confusion with
    # ---- electric motors, for example with the pattern 'i4'
    if 'ah' in sentence or 'kw' in sentence:
        return 0

    for candidate_pattern in (pattern_cylinder_1, pattern_cylinder_2):
        found = re.search(candidate_pattern, sentence, flags=re.IGNORECASE)
        if found:
            return int(f"{found.group(1)}")

    return 0

# ---- Extract the engine number of valves ('soupape' is French for valve)
def get_engine_soupape(sentence, pattern_soupape=dict_pattern['pattern_soupape']):
    '''
    Return the number captured by pattern_soupape in an engine description.

    The search is case-insensitive. Returns 0 when nothing matches, or when the
    sentence contains 'ah' or 'kw'.
    '''
    # ---- Sentences containing 'ah' or 'kw' are skipped to avoid confusion with
    # ---- electric motors, for example with the pattern '697V'
    if 'ah' in sentence or 'kw' in sentence:
        return 0

    found = re.search(pattern_soupape, sentence, flags=re.IGNORECASE)

    return int(f"{found.group(1)}") if found else 0

# ---- Strip every known pattern from a sentence
def remove_pattern(sentence, pattern_values=dict_pattern.values()):
    '''
    Remove every match of each pattern from the sentence (case-insensitive).

    The sentence is stripped of surrounding whitespace after each pattern.
    Sentences containing 'ah' or 'kw' are returned untouched.
    '''
    # ---- Sentences containing 'ah' or 'kw' are skipped to avoid confusion with electric motors
    if 'ah' in sentence or 'kw' in sentence:
        return sentence

    remaining_text = sentence
    for regex in pattern_values:
        remaining_text = re.sub(regex, '', remaining_text, flags=re.IGNORECASE)
        remaining_text = remaining_text.strip()

    return remaining_text

# ---- First-pass processing of the engine sentences using the functions above
def previous_version_process_column_engine(column):
    '''
    Build a DataFrame of features extracted from a column of engine descriptions.

    Returned columns: 'engine_HP', 'engine_size', 'engine_cylinder',
    'engine_soupape' and 'engine_text' (the sentence after remove_pattern).

    NOTE(review): only 'engine_HP' is computed on lowercased text; the four other
    columns are computed on the column as received, so the case-sensitive
    'ah' / 'kw' checks of the helpers only trigger on lowercase input.
    '''
    lowercased = column.apply(lambda text: text.lower())

    # to_frame names the single column directly
    df = lowercased.apply(get_engine_HP).to_frame(name='engine_HP')

    remaining_extractors = (
        ('engine_size', get_engine_size),
        ('engine_cylinder', get_engine_cylinder),
        ('engine_soupape', get_engine_soupape),
        ('engine_text', remove_pattern),
    )
    for feature_name, extractor in remaining_extractors:
        df[feature_name] = column.apply(extractor)

    return df

Let's have a look at what remains in column 'engine_text' after preprocessing column 'engine' with the function previous_version_process_column_engine().

In [None]:
# ---- Check the previous result: run the first-pass extraction on column 'engine' of X.
# ---- Column 'engine_text' of the result holds each sentence as returned by remove_pattern.
temporary_result_df = previous_version_process_column_engine(X['engine'])
temporary_result_df

Let's build a Bag of Words from column 'engine_text' to see the remaining vocabulary.

In [None]:
# ---- Drop punctuation, then split the sentence into a list of words
#
def process_text(sentence):
    '''
    Tokenize a sentence: every character of string.punctuation is deleted and
    the result is split on whitespace.

    Returns a list of str (empty when the sentence holds no word).
    '''
    # Mapping a code point to None makes str.translate delete that character
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))

    return sentence.translate(punctuation_table).split()

# ---- Fit vectorizer on column_engine_text to get back the vocabulary and the bag of words
#
column_engine_text = temporary_result_df['engine_text']
vectorizer = CountVectorizer(analyzer=process_text)
bag_of_words = vectorizer.fit_transform(column_engine_text)

# ---- vocabulary_ maps each token to its COLUMN INDEX in the bag of words, not to a frequency
#
vocabulary_dict = vectorizer.vocabulary_

# ---- Explore vocabulary in a DataFrame: sum each column of the bag of words
# ---- to get the number of occurrences of every token over the whole column
#
token_counts = np.asarray(bag_of_words.sum(axis=0)).ravel()
df_vocabulary_df = pd.DataFrame({'count': token_counts}, index=vectorizer.get_feature_names_out())
df_vocabulary_df.sort_values('count', ascending=False)

In [None]:
# ---- Display again the distinct values of column 'fuel_type' of the original dataset,
# ---- to compare them with the vocabulary left in 'engine_text'
data['fuel_type'].unique()

In [59]:
# ---- Build a synonym dictionary related to fuel_type: token -> fuel type
# ---- (each fuel type is listed once, followed by the tokens that map to it)
#
fuel_type_dict = {
    token: fuel_type
    for fuel_type, tokens in (
        ('gasoline', ('gasoline', 'gas')),
        ('hybrid', ('hybrid', 'electricgas', 'gaselectric')),
        ('e85 flex fuel', ('e85 flex fuel', 'flexible')),
        ('diesel', ('diesel',)),
        ('hydrogen', ('hydrogen',)),
        ('electric', ('electric', 'battery', 'ah', '70kw', '697v', '160kw', '1112ah')),
    )
    for token in tokens
}

def match_fuel_type(sentence):
    '''
    Return the fuel type associated with the first token of the sentence that is
    a key of fuel_type_dict, or None when no token matches.

    Parameters
    ----------
    sentence: str
        String to search for a matching fuel type. It is tokenized with
        process_text and each token is lowercased before the lookup.

    Returns:
    --------
    mapping_value : str or None
        The matched fuel type or None if no match is found.
    '''
    # ---- Look up every token lazily, in sentence order
    #
    candidate_values = (fuel_type_dict.get(token.lower()) for token in process_text(sentence))

    # ---- Keep the first successful lookup; None when the sentence has no token or no match
    #
    mapping_value = next((value for value in candidate_values if value is not None), None)

    return mapping_value

Let's now build the final version of the preprocessing function for column 'engine'.

In [60]:
def process_columns_engine_and_fuel_type(X_df):
    '''
    Derive engine features from 'engine' and refine 'fuel_type' with them.

    Parameters:
    -----------
    X_df : DataFrame
        The input DataFrame containing both 'engine' and 'fuel_type' (string
        values). It is not modified.

    Returns:
    --------
    results_df: DataFrame
        Copy of X_df without 'engine', with 'fuel_type' lowercased (or replaced
        by the fuel type found in the engine description when there is one) and
        with the new columns 'engine_HP', 'engine_size', 'engine_cylinder' and
        'engine_soupape'.
    '''
    results_df = X_df.copy()

    # ---- Lowercase both text columns
    #
    for text_column in ('engine', 'fuel_type'):
        results_df[text_column] = results_df[text_column].apply(lambda x: x.lower())

    # ---- Add the new columns to the DataFrame, each computed from the lowercased engine text
    #
    extractors = {
        'engine_HP': get_engine_HP,
        'engine_size': get_engine_size,
        'engine_cylinder': get_engine_cylinder,
        'engine_soupape': get_engine_soupape,
        'engine_text': remove_pattern,
    }
    for new_column, extractor in extractors.items():
        results_df[new_column] = results_df['engine'].apply(extractor)

    # ---- Fuel type read from what remains of the engine text (None when nothing matches)
    #
    fuel_from_engine = results_df['engine_text'].apply(match_fuel_type)

    # ---- The fuel type found in the engine text takes precedence over the fuel_type column
    #
    results_df['fuel_type'] = np.where(fuel_from_engine.notna(), fuel_from_engine, results_df['fuel_type'])

    # ---- Drop the intermediate column engine_text and the raw column engine
    #
    return results_df.drop(columns=['engine_text', 'engine'])

### III.1.3. BUILD THE PREPROCESSOR FOR BOTH COLUMN ENGINE AND TRANSMISSION

In [None]:
# ---- BUILD PREPRO_ENGINE_TRANSMISSION (preprocessing step 1)
#

# ---- Transformer for columns 'engine' and 'fuel_type'
# ---- (validate=False: the DataFrame is handed to the function as is)
engine_col_transformer = FunctionTransformer(process_columns_engine_and_fuel_type, validate=False)

# ---- Transformer for column 'transmission'
#
transmission_col_transformer = FunctionTransformer(process_column_transmission, validate=False)

# ---- Full preprocessor step1: every other column is passed through untouched.
# ---- Later cells reference the output columns with the prefixes 'engine__', 'trans__' and 'remainder__'.
prepro_engine_transmission = ColumnTransformer(
    transformers=[
        ("engine", engine_col_transformer, ['engine', 'fuel_type']),
        ("trans", transmission_col_transformer, ['transmission'])
    ],
    remainder='passthrough',
)
# ---- Ask for a pandas DataFrame as output of transform, so that column names are kept
prepro_engine_transmission.set_output(transform='pandas')

### III.1.4. CHECK OUTPUT OF PREPROCESSOR_STEP1

In [None]:
# ---- Fit step 1 on the training set and display the first rows of its output
X_train_preprocessed_step1 = prepro_engine_transmission.fit_transform(X_train)
X_train_preprocessed_step1.head(5)

In [None]:
# ---- Check the number of distinct values of each feature after step 1
#
X_train_preprocessed_step1.nunique()

In [64]:
# ---- Separate the newly generated features into groups for visualization
#
# NOTE(review): despite the list name, 'engine__fuel_type' holds strings, not numbers
new_numeric_discrete_features = [ 
    'trans__transmission_speed',
    'engine__engine_soupape',
    'engine__engine_cylinder',
    'engine__engine_size',
    'engine__fuel_type'
    ]

# ---- Features plotted as continuous variables
new_continious_features = [ 
    'engine__engine_HP',
    ]

In [None]:
# ---- Visualize newly generated data with graphs
#
# 'engine__fuel_type' may already be listed in new_numeric_discrete_features:
# dict.fromkeys drops repeated names while keeping order, so each feature is plotted once
discrete_features_to_plot = list(dict.fromkeys(new_numeric_discrete_features + ['engine__fuel_type']))

make_box_for_discrete_features(X_train_preprocessed_step1, discrete_features_to_plot, y_train)
make_box_for_continuous_features(X_train_preprocessed_step1, new_continious_features, y_train, bins=100, ylim=(-1000,180000))

**In these graphs, we observe simple trends between price and the variables trans__transmission_speed, engine__engine_soupape, engine__engine_cylinder, and engine__engine_HP. The numeric value of each of these features seems to be related to price. Therefore, we can decide to treat these variables as numeric.**

**In contrast, there is no evident trend between price and engine__engine_size, so we could treat it as a categorical feature and encode it accordingly without losing any information.**

## III.2. RE-BUILD COMPLETE MODEL

### III.2.1 ADAPT PREVIOUS PREPROCESSING PIPELINE AND PREPROCESS DATA

In [None]:
# ---- Check dtypes and non-null counts of the step 1 output (dtypes drive the column grouping of step 2)
X_train_preprocessed_step1.info()

In [None]:
# ---- Specify a preprocessing strategy to apply to groups of features for preprocessing step 2
#
categorical_columns_step2 = X_train_preprocessed_step1.select_dtypes(include=['object']).columns.to_list()
numerical_columns_step2 = X_train_preprocessed_step1.select_dtypes(include=['int64', 'float64']).columns.to_list()

# ---- Features we decide to scale
#
features_to_scale_step2 = ['remainder__milage', 'engine__engine_HP', 'engine__engine_size']
print(f'features_to_scale_step2 : {features_to_scale_step2}')

# ---- Every scaled feature must be one of the numeric columns: fail loudly otherwise
#
missing_features_step2 = [feature for feature in features_to_scale_step2 if feature not in numerical_columns_step2]
if missing_features_step2:
    raise ValueError(f'features_to_scale_step2 not found among numerical columns : {missing_features_step2}')

# ---- Features we decide to encode: all categorical columns + the numerical columns that are not scaled.
# ---- The scaled features are excluded from the single list features_to_scale_step2 and a NEW list
# ---- is built, so that categorical_columns_step2 is no longer modified as a side effect.
#
numerical_columns_step2 = [column for column in numerical_columns_step2 if column not in features_to_scale_step2]
features_to_encode_step2 = categorical_columns_step2 + numerical_columns_step2
print(f'features_to_encode_step2 : {features_to_encode_step2}')

In [68]:
# ---- BUILD PREPROCESSOR STEP 2
#
# StandardScaler on the features to scale, OneHotEncoder on all the others.
# - sparse_output=False returns a dense array, to avoid raising the
#   XLA_GPU_JIT: SparseFillEmptyRows error when using GPU.
#   ('sparse_output' replaces the deprecated 'sparse' argument; it is available in every
#   scikit-learn version that provides set_output, which step 1 already relies on.)
# - handle_unknown='ignore': a category unseen during fit is encoded as all zeros.
prepro_step2 = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), features_to_scale_step2),
        ("cat", OneHotEncoder(drop="first", handle_unknown='ignore', sparse_output=False), features_to_encode_step2),
    ],
    remainder='passthrough',
)

# ---- CHAIN BOTH PREPROCESSOR STEP 1 AND STEP 2 IN A PIPELINE
#
preprocessor_global = Pipeline(
    steps=[
        ("step1", prepro_engine_transmission),
        ("step2", prepro_step2)
    ]
)

In [69]:
# ---- Fit the full preprocessor on the training set only, then apply it to both sets
#
X_train_preprocessed = preprocessor_global.fit_transform(X_train)
X_test_preprocessed = preprocessor_global.transform(X_test)

# ---- Keep the shape of the training matrix: X_shape[1] sizes the model input in objective_SK2
X_shape = X_train_preprocessed.shape

### III.2.2 RE-BUILD MODEL AND GRID FOR PARAMETERS OPTIMIZATION  - MODEL WITH SKLEARN PREPROCESSING

In [70]:
def objective_SK2(trial):
    '''
    Defines and trains a model using hyperparameters suggested by Optuna.

    The model is a dense network (Dense -> Dropout -> Dense -> Dropout -> Dense(1))
    fed with the data already preprocessed by the sklearn pipeline
    (X_train_preprocessed / X_test_preprocessed). The second Dense/Dropout pair
    reuses the hyperparameters of the first one. Results of the run are stored
    with format_results_to_df and save_metrics.

    Parameters:
    -----------
    trial : optuna.trial.Trial
        The Optuna trial object used for hyperparameter optimization.

    Returns:
    --------
    validation_score : float
        The best (lowest) validation root mean squared error (RMSE) over the epochs.
    '''

    # ---- Choose hyperparameters:
    #
    # suggest_float replaces the deprecated suggest_uniform (same uniform sampling).
    # The Optuna name 'dense_1_kernel_regul' is kept as is, so that trials already
    # stored in the study remain compatible.
    parameters = {
        'dense_1_units' : trial.suggest_int('dense_1_units', 16, 80, step=16),
        'dense_1_kernel_regul_l1' : trial.suggest_float('dense_1_kernel_regul', 0, 0.01),
        'dropout_1_dropout' :  trial.suggest_float('dropout_1_dropout', 0, 0.1),
        'batch_size': 100,
        'learning_rate': 0.001, #it is the default value
    }

    # ---- Second block shares the hyperparameters of the first one
    #
    parameters['dense_2_units'] = parameters['dense_1_units']
    parameters['dense_2_kernel_regul_l1'] = parameters['dense_1_kernel_regul_l1']
    parameters['dropout_2_dropout'] = parameters['dropout_1_dropout']
    
    # ---- Single input: one vector per sample, as wide as the preprocessed matrix
    # ---- (named 'inputs' to avoid shadowing the builtin input())
    #
    inputs = Input(shape=(X_shape[1],))

    dense_1 = tf.keras.layers.Dense(parameters.get('dense_1_units'),
                                    activation='relu',
                                    kernel_regularizer=regularizers.L1(parameters.get('dense_1_kernel_regul_l1')),
                                    name='Dense_n1'
                                    )(inputs)
    
    dropout_1 = tf.keras.layers.Dropout(parameters.get('dropout_1_dropout'), name='Dropout_n1')(dense_1)

    dense_2 = tf.keras.layers.Dense(
                                    parameters.get('dense_2_units'),
                                    activation='relu',
                                    kernel_regularizer=regularizers.L1(parameters.get('dense_2_kernel_regul_l1')),
                                    name='Dense_n2'
                                    )(dropout_1)
    
    dropout_2 = tf.keras.layers.Dropout(parameters.get('dropout_2_dropout'), name='Dropout_n2')(dense_2)
    
    # ---- Regression head: a single linear unit
    #
    output = tf.keras.layers.Dense(1, name='Output')(dropout_2)

    # ---- Build model
    #
    model = Model(inputs=inputs, outputs=output)

    # ---- Compile model
    #
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=parameters['learning_rate']),
        loss='mse',
        metrics=['root_mean_squared_error']
    )

    callbacks = get_callbacks()
    epochs = 100

    # ---- Fit model (the hold-out set X_test / y_test is used as validation data)
    #
    history = model.fit(x=X_train_preprocessed,
                    y=y_train,
                    validation_data = (X_test_preprocessed , y_test),
                    epochs=epochs,
                    batch_size=parameters['batch_size'],
                    verbose=1,
                    callbacks=callbacks
                    )
    
    # ---- Format result to dataframe and save results
    #
    result_df = format_results_to_df(history, parameters, random_state)
    save_metrics(
        result_df=result_df,
        column_to_sort='best_val_rmse',
        comment='sklearn prepro - Standard Scaler: [milage, engine_HP, engine_size] + OneHotEncoder: others + DATA ENG: [transmission, engine]'
        )

    # ---- Return score: lowest validation RMSE reached during training
    #
    validation_score = min(history.history['val_root_mean_squared_error'])

    return validation_score

## III.3. RUN A NEW STUDY AND SAVE RESULTS

In [None]:
# ---- Run optuna optimization
#
# load_if_exists=True: if a study with this name already exists in the storage,
# it is reused and the new trials are added to it
study = optuna.create_study(study_name='sklearn prepro + DATA ENG',
                            direction='minimize',
                            storage=STUDY_STORAGE_PATH,
                            load_if_exists=True
                            )

# ---- Each trial trains one model through objective_SK2
study.optimize(objective_SK2, n_trials=8)

In [None]:
# ---- Have a look at the results stored in the results file (';' separated)
#
general_results_df = pd.read_csv(FILEPATH_model_results, sep=';')
general_results_df.head(5)

## III.4 MAKES NEW SUBMISSION ....

**Models fed with the data-engineered, preprocessed data do not achieve the best ranking...**

# IV. MODEL EXPLAINABILITY

## IV.1. FEATURES IMPORTANCE

In [73]:
# ---- Let's choose the best model with keras preprocessing layers to determine feature importance
#
mask = general_results_df['comment'] == 'keras prepro layers'
selected_val_rmse = general_results_df.loc[mask, 'best_val_rmse'].min()

# ---- Look for the file path among the 'keras prepro layers' rows ONLY: a model of another
# ---- experiment that happens to have the same RMSE must not be selected
#
is_selected_row = mask & (general_results_df['best_val_rmse'] == selected_val_rmse)
selected_model_filepath = general_results_df.loc[is_selected_row, 'model_filepath'].values[0]

In [None]:
# ---- Reload the selected model from its saved file and display its architecture
#
selected_model = tf.keras.models.load_model(selected_model_filepath)
selected_model.summary()

In [None]:
# ---- Specify a set of samples to be used as reference by SHAP for comparison with the samples to analyze.
# ---- We will select the first 50 samples for this reference set.
X_sample_df = X.head(50)

# ---- Specify the samples to analyze.
# ---- We take the 10 rows at positions 200 to 209.
sample_to_analyse = X.iloc[200:210, :]


# ---- Wrap the model so that it accepts the input format used by shap.KernelExplainer.
#
def predict_f(X):
    '''
    Make predictions for a given set of samples.

    Parameters:
    -----------
    X: np.array
        A NumPy array containing the data for which predictions are made.
        Rows correspond to samples, and columns correspond to features, in the
        same order as the columns of X_sample_df.

    Returns:
    --------
    np.array:
        A 1D array containing the predicted values from the model.
    '''
    # X_sample_df provides the column names and the data types.
    # When shap.KernelExplainer converts a DataFrame to a NumPy array, the data types may change,
    # which can cause issues with the model's prediction function, so each column is cast back.
    reference_dtypes = X_sample_df.dtypes

    model_inputs = {}
    for position, column in enumerate(X_sample_df.columns):
        column_values = X[:, position].astype(reference_dtypes.iloc[position])
        # One (n_samples, 1) array per feature, keyed by column name
        model_inputs[column] = column_values.reshape(-1, 1)

    return selected_model.predict(model_inputs).flatten()


# ---- Initialize SHAP (loads the JavaScript used by SHAP's interactive plots)
#
shap.initjs()

# ---- Initialize the SHAP explainer using the reference set of samples.
# ---- predict_f is the prediction function called by the explainer.
explainer = shap.KernelExplainer(predict_f, X_sample_df)

# ---- Calculate the SHAP values of each individual sample to analyse.
#
shap_values = explainer(sample_to_analyse)

# ---- Generate the barplot of mean feature importance calculated on the samples to analyse
#
shap.plots.bar(shap_values)

In [None]:
# ---- More detailed overview of the feature importance, for the same samples
#
shap.summary_plot(shap_values, sample_to_analyse)

In [None]:
# ---- We could also plot the force_plot for a specific sample, let's choose one
#
choosen_sample =  X.iloc[1000:1001, :] # We need it to be a DataFrame, that's why we take a one-row slice

# ---- Calculate the shap_values and generate the shap._explanation.Explanation object
#
shap_values_choosen_sample = explainer(choosen_sample)

# ---- Display the plot (rendered with matplotlib; the output is labelled with the index of the sample)
#
shap.force_plot(shap_values_choosen_sample,
                feature_names=choosen_sample.columns.to_list(),
                matplotlib=True,
                figsize=(40, 3), 
                out_names=f'sample_index = {choosen_sample.index[0]}'
                )