In [7]:
import datetime as dt
import lightgbm
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import numpy as np
import optuna
import os
import pandas as pd
import plotly
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
import seaborn as sns
import string
import warnings

# ---- Reduction of verbosity
#
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)


# ---- Adapt size of column when displaying DataFrame
#
pd.set_option('display.max_colwidth', None)

In [None]:
# ---- Get the current working directory where python is executing to adapt filepath used in this notebook
#
current_directory = os.getcwd()
print("Current directory :", current_directory)

In [9]:
# ---- SKIP THIS SECTION ON KAGGLE ----

# ---- Section dedicated to script.py import
# 
import sys

# ---- Specify path to the folder containing the scripts
#
script_path = './scripts'
sys.path.append(script_path)

# ---- Import the usefull function in the scripts
#
from visualization import make_barplot, make_hist, make_box_for_discrete_features, make_box_for_continuous_features

In [10]:
# ---- Define path to all csv input files
#
FILEPATH_train = './data/train.csv'
FILEPATH_test = './data/test.csv'
FILEPATH_sample_submission = './data/sample_submission.csv'
# FILEPATH_train = '/kaggle/input/playground-series-s4e9/train.csv'
# FILEPATH_test = '/kaggle/input/playground-series-s4e9/test.csv'
# FILEPATH_sample_submission = '/kaggle/input/playground-series-s4e9/sample_submission.csv'

# ---- Define path that will be used to store output files
#
OUTPUT_PATH_results = './results'
FILEPATH_model_results = f'{OUTPUT_PATH_results}/model_results_LGBM.csv'

# ---- Define path to store results of study Optuna
relative_path = './results/study_LGBM.db'
STUDY_STORAGE_PATH = f'sqlite:///{relative_path}'

# ---- Create OUTPUT_PATH_results directory
os.makedirs(OUTPUT_PATH_results, exist_ok=True)

# ---- Initialize seed
# 
random_state = 32

# I. EXPLORATORY DATA ANALYSIS

## I.1 LOAD DATA AND QUICK EXPLORATION

In [11]:
# ---- Load train data
#
data = pd.read_csv(FILEPATH_train)

In [None]:
data.head(3)

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
# ---- Get numerical features names
#
numerical_features = ['model_year', 'milage']
numerical_features

In [None]:
# ---- Get categorical features names
# 
object_columns = data.select_dtypes(include=['object'])
categorical_features = object_columns.columns.to_list()
categorical_features

In [None]:
# ---- Print number of category for each feature
# 
for feature in categorical_features:
    print(feature, ': ', data[feature].nunique())

In [None]:
# ---- Explore deeper
# 
for feature in categorical_features:
    print(data[feature].value_counts().nlargest(100), '\n\n****************************')

## I.2 SIMPLE DATA CLEANING

In [None]:
data.isna().sum()

In [14]:
def simple_cleaning(data):
    '''
    Replaces missing values with 'Unknown' in specific columns
    and converts text in categorical columns to lowercase.

    The cleaning is done on a copy: the DataFrame passed by the caller is
    left untouched, so the cell calling this function can be re-run safely.

    Parameters:
    ___________
    data: pd.DataFrame
        The DataFrame to clean. Must contain the columns 'clean_title',
        'fuel_type' and 'accident'.

    Returns:
    ________
    cleaned: pd.DataFrame
        A cleaned copy of `data`, with text values converted to lowercase.
    '''
    # ---- Work on a copy so the caller's DataFrame is not mutated
    #
    cleaned = data.copy()

    # ---- Assign 'Unknown' to missing values (lowercased to 'unknown' just below)
    #
    for column in ('clean_title', 'fuel_type', 'accident'):
        cleaned[column] = cleaned[column].fillna('Unknown')

    # ---- Lowercase text in categorical (object dtype) columns
    #
    text_columns = cleaned.select_dtypes(include=['object']).columns
    cleaned[text_columns] = cleaned[text_columns].apply(lambda x: x.str.lower())

    return cleaned

In [15]:
data = simple_cleaning(data)

In [None]:
# ---- Check if values are still missing
# 
data.isna().sum()

In [None]:
# ---- Look for duplicated rows
#
print(data.duplicated().sum())
# ---- Look for duplicated rows skipping column id
print(data.iloc[:,1:].duplicated().sum())

## I.3 DATA VISUALIZATION


### I.3.1. LIST OF USEFUL PLOTTING FUNCTIONS

### I.3.2. PLOTS

In [None]:
make_barplot(data, categorical_features, 10)

In [None]:
make_hist(data, numerical_features + ['price'])

In [21]:
X_df = data.drop(['price'], axis=1)
y_df = data.price

discrete_features_to_plot = ['model_year'] + categorical_features
continious_features_to_plot = ['milage']

In [None]:
make_box_for_continuous_features(X_df, continious_features_to_plot, y_df, bins=100, ylim=(-1000,180000))
make_box_for_discrete_features(X_df, discrete_features_to_plot, y_df)

**We observe a clear relationship between the feature milage and price, as well as between model_year and price. However, no significant trend is apparent between the categorical features and price when examining the "raw data."**

**For an initial model, we will apply standard scaling to the numerical features and one-hot encoding to the categorical features. In a subsequent step, we may explore further data engineering, such as reorganizing categorical features and applying ordinal encoding, to potentially improve the model.**

# **II. MODELING**

## II.1. FIRST, SOME USEFUL FUNCTIONS WE WILL USE LATER TO STORE MODEL RESULTS

In [23]:
def save_metrics(results_df, column_to_sort='mean_test_rmse', comment=None, filepath=FILEPATH_model_results):
    '''
    Save the metrics of the trained models to a CSV file at the specified filepath.
    If the file exists, it appends new results; otherwise, it creates a new file.

    The DataFrame given by the caller is not modified: the 'comment' column is
    added to a copy.

    Parameters
    ----------
    results_df : pandas.DataFrame
        The DataFrame containing the results of all trained models.
    column_to_sort : str, optional
        Column to use for sorting the dataset, by default 'mean_test_rmse'.
    comment : str, optional
        An optional comment stored in a 'comment' column for every row of
        `results_df`, by default None.
    filepath: str, optional
        The path to the results file (';'-separated CSV). By default, uses FILEPATH_model_results.

    Returns
    -------
    None
        The function either creates or appends to the CSV file at `filepath`.
    '''
    # ---- Add the comment column on a copy (the caller's DataFrame stays untouched)
    #
    new_results_df = results_df.assign(comment=comment)

    # ---- Append to the existing results when the file is already there
    #
    if os.path.exists(filepath):
        existing_df = pd.read_csv(filepath, sep=';')
        updated_df = pd.concat([existing_df, new_results_df], ignore_index=True)
    else:
        updated_df = new_results_df

    # ---- Sort, then drop rows that are exact duplicates
    #
    updated_df = updated_df.sort_values(column_to_sort).drop_duplicates()

    # ---- Make sure the target directory exists (dirname is '' for a bare filename)
    #
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # ---- Save file to .csv
    #
    updated_df.to_csv(filepath, index=False, sep=';')


In [24]:
def format_study_results_to_df(study, random_state):
    '''
    Extract trial results and parameters from an Optuna study into a DataFrame.

    This function extracts completion date, user-defined attributes, and hyperparameters
    for each finished trial in the given Optuna study. It compiles this data into a
    DataFrame and sorts it by the 'mean_test_rmse' column in descending order.

    Trials without a completion date (`datetime_complete` is None, e.g. a trial that is
    still running or was interrupted) are skipped instead of raising an AttributeError.

    Parameters:
    ----------
        study: optuna.study.Study
            The Optuna study object containing trials.
        random_state: int
            The random state used during model training, to be included in the DataFrame.

    Returns:
    --------
        results_df: pd.DataFrame
            A DataFrame with trial completion date (`study_date`), user attributes,
            hyperparameters and `random_state`. It is returned unsorted (possibly empty)
            when no trial provides a 'mean_test_rmse' value.
    '''

    results_list = []

    for trial in study.trials:

        # ---- A trial that never finished has no completion timestamp to format
        #
        if trial.datetime_complete is None:
            continue

        results_list.append({
            'study_date': trial.datetime_complete.strftime("%Y%m%d_%H%M%S"),
            **trial.user_attrs,
            **trial.params,
            'random_state': random_state
        })

    results_df = pd.DataFrame(results_list)

    # ---- Sort only when the metric column exists (it does not when there is no usable trial)
    #
    if 'mean_test_rmse' in results_df.columns:
        results_df = results_df.sort_values(['mean_test_rmse'], ascending=False)

    return results_df


## II.2. DATA PREPARATION

In [25]:
# ---- Select features and target
#
y = data['price']
X = data[numerical_features + categorical_features]

In [None]:
# ----- Check y_train
#
y.head(5)

In [None]:
# ----- Check X_train
#
X.head(5)

## II.3. PERFORM A SIMPLE MODEL (LINEAR REGRESSION)

In [None]:
# ----- Specify a preprocessing strategy to apply to group of features

# ----- Features to scale
#
features_to_scale = numerical_features.copy()
print(f'features to scale: {features_to_scale}')
      
# ----- Features to encode
#
features_to_encode = categorical_features.copy()
print(f'features_to_encode: {features_to_encode}')

In [29]:
# ---- Build the preprocessor for linear regression
#
preprocessor_LR = ColumnTransformer(
    transformers=[
        ("scaler",  StandardScaler(), numerical_features),
        ("encoder", OneHotEncoder(drop="first",
                                  handle_unknown='ignore',
                                 ),
         features_to_encode
        ),
    ]
)

In [30]:
# ---- Build complete pipeline
#
model = LinearRegression()
pipe = Pipeline(steps=[("prepro_LR", preprocessor_LR), ("model", model)])

In [None]:
# ---- Evaluate model
#
results = cross_validate(estimator=pipe, X=X, y=y, cv=3, scoring="neg_root_mean_squared_error", return_train_score=True)

# ---- Results on train and test sets
#
train_scores = results['train_score']
test_scores = results['test_score']
print(f"train_scores: {train_scores}")
print(f"test_scores: {test_scores}")

# ---- Mean of the results on test set
#
mean_test_score = np.mean(test_scores)
print(f"Mean of the results on test set: {mean_test_score}")

**We observe a significant variance between the results on the training and test sets. It appears that the linear model is not effectively capturing the underlying patterns in the data in its current form.**

## II.4. PERFORM A QUICK STUDY ON LGBM REGRESSOR WITH OPTUNA

**We chose LGBM Regressor because it is both fast and efficient, so it will have quick results on various configurations of model and features.**

### II.4.1. BUILD THE OPTUNA STUDY

In [32]:
# ---- This function make an random selection of features. It will be used in the Optuna studies
#
def feature_selector(X_df, features_to_suggest, trial):
    """
    Build a 0/1 feature mask, letting Optuna decide for some features only.

    Every column of `X_df` is registered as an integer parameter of the trial,
    in column order and under the column's own name:
    - columns listed in `features_to_suggest` are suggested in [0, 1];
    - all other columns are suggested in [1, 1], i.e. always kept.

    Parameters:
    ----------
    X_df : pd.DataFrame
        DataFrame containing all features (excluding the target variable).
    features_to_suggest : list of str
        Names of the features Optuna is allowed to drop.
    trial : optuna.trial.Trial
        Trial object used to draw the selection (only `suggest_int` is called).

    Returns:
    -------
    feature_mask : np.ndarray
        Integer array with one 0/1 entry per column of `X_df`, in column order.
    """
    mask_values = []

    for column_name in X_df.columns.to_list():
        # ---- Lower bound 0 leaves the choice to Optuna, lower bound 1 forces the feature in
        lower_bound = 0 if column_name in features_to_suggest else 1
        mask_values.append(trial.suggest_int(f'{column_name}', lower_bound, 1))

    return np.array(mask_values, dtype=int)
    

In [None]:
# ---- Specify the list of features on which we will let Optuna apply a random selection. We force the selection for the other features.
#
features_to_suggest = [
    'ext_col',
    'int_col',
    'clean_title'
]
print(f'features_to_suggest: {features_to_suggest}')

In [None]:
# ----- Specify the preprocessing strategy for each group of features.
# ----- Numerical features are not scaled for LGBM: scaling should not be needed for a tree-based model.
#
# ----- Features to Ordinal Encode with sklearn. We don't apply OneHot Encoding, to avoid increasing
# ----- dimensionality; ordinal codes should be sufficient for a tree-based model.
# ----- NOTE: this is an alias of `categorical_features` (same list object), not a copy.
features_to_encode = categorical_features
print(f'features_to_encode: {features_to_encode}')

In [35]:
# ---- Build the optuna study
#
def objective_quick_performing(trial):
    '''
    Objective function for Optuna optimization.

    This function selects features and optimizes hyperparameters of a LGBM Regressor model.
    For each trial, a subset of features is selected using a binary mask, and several hyperparameters
    of the LGBM Regressor model are adjusted. The candidate is evaluated with a 3-fold
    cross-validation of a pipeline (OrdinalEncoder on categorical features + LGBMRegressor).

    The mean train/test RMSE and the model name are stored in the trial's user attributes
    ('mean_train_rmse', 'mean_test_rmse', 'model_name').

    Parameters:
    ----------
        trial:  optuna.trial.Trial
            An Optuna trial object that allows exploring different values for hyperparameters and feature selection.

    Returns:
    --------
        mean_test_rmse: float
            The mean RMSE (positive, lower is better) on the validation folds of the
            3-fold cross-validation, or np.inf when no feature is selected.
    '''
    # ---- Hyperparameter optimization
    #
    # ---- `suggest_uniform` / `suggest_loguniform` are deprecated in Optuna:
    # ---- `suggest_float` (with `log=True` for the log-uniform case) is used instead.
    # ---- NOTE(review): `subsample` presumably has no effect in LightGBM unless
    # ---- `subsample_freq` is also set to a non-zero value - confirm against the LightGBM docs.
    hyper_params = {
        'cat_smooth': trial.suggest_int('cat_smooth', 1, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'learning_rate' : trial.suggest_float('learning_rate', 0.0005, 0.03, log=True),
        'max_depth' : trial.suggest_int('max_depth', 3, 8),
        'min_child_samples' : trial.suggest_int('min_child_samples', 50, 1000),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.000001, 0.5, log=True),
        'n_estimators' : trial.suggest_int('n_estimators', 300, 700),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'subsample' : trial.suggest_float('subsample', 0.2, 0.4, step=0.01)
    }

    # ---- Make the feature selection
    #
    # ---- Generate the mask
    feature_mask = feature_selector(X_df=X, features_to_suggest=features_to_suggest, trial=trial)

    # ---- Ensure at least one feature is selected (worst possible score otherwise)
    if feature_mask.sum() == 0:
        return np.inf

    # ---- Select features based on the mask
    selected_features = X.columns[feature_mask == 1]
    X_selected = X[selected_features]

    # ---- Update the list features_to_encode based on selected features
    #
    features_to_encode_updated = [feature for feature in features_to_encode if feature in selected_features]

    # ---- Build the preprocessor with the updated list features_to_encode
    # ---- (categories unseen during fit are encoded as -1, other columns pass through untouched)
    #
    preprocessor_LGBM = ColumnTransformer(
        transformers=[
            ("ord", OrdinalEncoder(handle_unknown='use_encoded_value',
                                   unknown_value=-1,
                                  ),
             features_to_encode_updated
            )
        ],
        remainder='passthrough'
    )

    # ---- Create the model with the chosen parameters
    #
    model = LGBMRegressor(
        **hyper_params,
        random_state=random_state,
        verbose=-1
    )

    # ---- Build the pipeline
    #
    pipe = Pipeline(
        steps=[
            ("prepro_LGBM", preprocessor_LGBM),
            ("model", model)
        ]
    )

    # ---- Cross-Validation on selected features
    #
    kf = 3

    # ---- Perform cross-validation
    results = cross_validate(pipe, X_selected, y, cv=kf, scoring="neg_root_mean_squared_error", return_train_score=True)

    # ---- Compute the mean RMSE for train and test sets (the scorer returns negated RMSE)
    mean_train_rmse = - np.mean(results['train_score'])
    mean_test_rmse = - np.mean(results['test_score'])

    # ---- Store results in the trial's user attribute
    #
    trial.set_user_attr("mean_train_rmse", mean_train_rmse)
    trial.set_user_attr("mean_test_rmse", mean_test_rmse)
    trial.set_user_attr("model_name", type(model).__name__)

    # ---- Return the mean score on test for the objective function
    #
    return mean_test_rmse


### II.4.2. RUN THE STUDY AND SAVE RESULTS

In [36]:
# ----- Run the optimization with Optuna
#
# ---- The sampler is seeded with `random_state` so that the search itself
# ---- (not only the model fitting) can be reproduced on a fresh study.
# ---- TPESampler is Optuna's default sampler: only the seed is added here.
study = optuna.create_study(study_name=f'{dt.datetime.now().strftime("%Y%m%d_%H%M")}',
                            direction='minimize',
                            storage=STUDY_STORAGE_PATH,
                            load_if_exists=True,
                            sampler=optuna.samplers.TPESampler(seed=random_state)
                            )
study.optimize(objective_quick_performing, n_trials=30)

In [37]:
results_df = format_study_results_to_df(study, random_state)
comment = 'OrdinalEncoder: categorical features'
save_metrics(results_df, comment=comment)

In [None]:
# ---- Have a look at the results
#
general_results_df = pd.read_csv(FILEPATH_model_results, sep=';')

general_results_df.transpose()

## II.5. MAKE SUBMISSION

### II.5.1. REBUILD AND REFIT BEST MODEL

In [None]:
# ---- Retrieve the best model from the final study we conducted
#
comment = 'OrdinalEncoder: categorical features'

# ---- Restrict the search to the rows of this study (identified by its comment)
study_results_df = general_results_df[general_results_df['comment'] == comment]
best_rmse = study_results_df['mean_test_rmse'].min()

# ---- Keep exactly one row, taken from this study only: comparing the RMSE value
# ---- against the whole file could also match rows saved with another comment.
best_model_configuration = study_results_df.loc[[study_results_df['mean_test_rmse'].idxmin()]]
display(best_model_configuration.transpose())

In [None]:
# ---- Select features to the same way the model was fitted
#
# ---- Keep a feature when the best configuration has a column for it and its stored flag is 1
selected_features = [
    feature
    for feature in X.columns
    if best_model_configuration.get(feature) is not None
    and best_model_configuration.get(feature).iloc[0] == 1
]
selected_features

In [None]:
# ----- Specify the preprocessing strategy used for evaluating model
#
# ----- Features to Ordinal Encode with sklearn
features_to_encode = categorical_features
print(f'features_to_encode: {features_to_encode}')

In [42]:
# ---- Re-build the model
# ---- We fit the model on all data this time (X and y).

# ---- Retrieve ALL the hyperparameters tuned during the study, so that the refitted
# ---- model is the one that was actually evaluated.
# ---- Values are cast explicitly: after a round trip through the CSV file, integer
# ---- columns may come back as floats (e.g. when other rows hold missing values).
#
n_estimators = int(best_model_configuration.n_estimators.iloc[0])
max_depth = int(best_model_configuration.max_depth.iloc[0])
min_child_samples = int(best_model_configuration.min_child_samples.iloc[0])
num_leaves = int(best_model_configuration.num_leaves.iloc[0])
cat_smooth = int(best_model_configuration.cat_smooth.iloc[0])
learning_rate = float(best_model_configuration.learning_rate.iloc[0])
subsample = float(best_model_configuration.subsample.iloc[0])
colsample_bytree = float(best_model_configuration.colsample_bytree.iloc[0])
min_split_gain = float(best_model_configuration.min_split_gain.iloc[0])

# ---- Retrieve the feature selection
#
X_selected = X[selected_features]

# ---- Update the list features_to_encode based on selected features
#
features_to_encode_updated = [feature for feature in features_to_encode if feature in selected_features]

# ---- Build the preprocessor with the updated list features_to_encode
#
preprocessor_LGBM = ColumnTransformer(
    transformers=[
        ("ord", OrdinalEncoder(handle_unknown='use_encoded_value',
                               unknown_value=-1,
                              ),
         features_to_encode_updated
        )
    ],
    remainder='passthrough'
)

# ---- Create the model with the chosen parameters
#
model = LGBMRegressor(
    cat_smooth=cat_smooth,
    colsample_bytree=colsample_bytree,
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    min_split_gain=min_split_gain,
    n_estimators=n_estimators,
    num_leaves=num_leaves,
    subsample=subsample,
    random_state=random_state,
    verbose=-1
)

# ---- Build the pipeline
#
pipe = Pipeline(
    steps=[
        ("prepro_LGBM", preprocessor_LGBM),
        ("model", model)
    ]
)

In [None]:
# ---- Refit the model
pipe.fit(X_selected, y)

### II.5.2. PREDICT AND SAVE PREDICTION

In [None]:
# ---- Load test.csv file
# 
data_final = pd.read_csv(FILEPATH_test)

# ---- Prepare data to the same way the model was fitted
#
data_final = simple_cleaning(data_final)

# ---- Select features to the same way the model was fitted
#
X_final_selected = data_final[selected_features]
X_final_selected.head(5)

In [None]:
# ---- Predict
# 
y_submission = pipe.predict(X_final_selected)
y_df = pd.DataFrame(y_submission, columns=['price'])
y_df

In [None]:
# ---- Build submission dataFrame
# 
y_df['price'] = y_df['price'].apply(lambda x: int(x))
submission_df = pd.concat([data_final['id'], y_df], axis=1)
submission_df

In [47]:
# ---- Save prediction to csv
# 
timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")

filepath = f'{OUTPUT_PATH_results}/submission_LGBM{timestamp}.csv'
submission_df.to_csv(filepath, index=False)

# **III. TRY DATA ENGINEERING - 1ST ATTEMPT**

## III.1. BUILD PREPROCESSING FUNCTIONS FOR COLUMNS WITH TEXT TO HELP MODEL TO CATCH MORE INFORMATION

### III.1.1. PREPROCESS FUNCTION FOR FEATURE: "TRANSMISSION"

In [48]:
# ---- Functions to preprocess column "transmission"
#
# ---- Get transmission speed number
def get_transmission_speed(sentence):
    '''
    Return the first integer found in a transmission description.

    The number may be followed by dashes/spaces and the word 'speed'
    (case-insensitive), but 'speed' is optional: any first run of digits matches.

    Parameters:
    -----------
    sentence : str
        Transmission description, e.g. '6-speed a/t'.

    Returns:
    --------
    int
        The number found, or 0 when the text contains no digit.
    '''
    found = re.search(r"(\d+)[-\s]*(speed)?", sentence, flags=re.IGNORECASE)

    return int(found.group(1)) if found else 0

# ---- Remove transmission speed number out of the text    
def remove_transmission_pattern(sentence):
    '''
    Strip every '<number>[-/space]speed' fragment from a transmission description.

    Parameters:
    -----------
    sentence : str
        Transmission description, e.g. '6-Speed A/T'.

    Returns:
    --------
    str
        The remaining text, trimmed and lowercased (e.g. 'a/t').
    '''
    without_speed = re.sub(r"\d+[-\s]*(speed)?", '', sentence, flags=re.IGNORECASE)

    return without_speed.strip().lower()

# ---- Build a synonym dictionnary
# ---- Canonical transmission type -> list of spellings considered synonyms
dict_transmission = {
    'Auto' : ['A/T', 'Automatic', 'AT'],
    'Hybrid': ['Transmission w/Dual Shift Mode','At/Mt', 'Variable'],
    'Manuel' : ['M/T', 'Mt', 'Manual'],
    # ----- Empty text and the dash placeholder are mapped to 'unknown'
    #
    'Unknown' : ['', '–']
}

# ---- Inverted lookup table: lowercased spelling -> lowercased canonical type
reversed_dict = {
    spelling.lower(): canonical_type.lower()
    for canonical_type, spellings in dict_transmission.items()
    for spelling in spellings
}

# ---- Substitute tokens with their synonyms when possible
def map_transmission_tokens(column, mapping_dict=reversed_dict):
    '''
    Replace each value of a Series by its synonym when one is known.

    Parameters:
    -----------
    column : pd.Series
        Values to translate.
    mapping_dict : dict, optional
        Lookup table value -> synonym. By default, uses reversed_dict.

    Returns:
    --------
    pd.Series
        Same values, where every value found in `mapping_dict` is replaced
        by its mapping and every other value is kept as is.
    '''
    return column.apply(lambda token: mapping_dict.get(token, token))

# ---- Apply complete preprocessing to the column transmission
def process_column_transmission(X_df):
    '''
    Split the 'transmission' column into a speed count and a normalized type.

    Parameters:
    -----------
    X_df : DataFrame
        The input DataFrame containing a 'transmission' column. It is not modified.

    Returns:
    --------
    results_df: DataFrame
        Copy of `X_df` where 'transmission' is replaced by two new columns:
        'transmission_speed' (from get_transmission_speed) and 'transmission_type'
        (from remove_transmission_pattern, then mapped through reversed_dict).
    '''
    raw_transmission = X_df['transmission']

    # ---- Text left once the speed is removed, translated to its canonical name when known
    transmission_type = map_transmission_tokens(
        raw_transmission.apply(remove_transmission_pattern),
        mapping_dict=reversed_dict
    )

    results_df = X_df.copy()
    results_df['transmission_speed'] = raw_transmission.apply(get_transmission_speed)
    results_df['transmission_type'] = transmission_type

    return results_df.drop(columns=['transmission'])

### III.1.2. PREPROCESSING FUNCTION FEATURE: "ENGINE"

In [49]:
# ----- Functions to preprocess column "engine"
#
# Regular expressions used to pull numeric facts out of the free-text 'engine' description.
dict_pattern = {
    # digits, a dot, exactly one digit, then 'hp' (e.g. '300.0hp')
    'pattern_HP' : r'(\d+)\.(\d)hp',
    # digits, a dot, optional digits, optional spaces, then 'l' optionally followed by 'iter' (e.g. '3.5l', '3.5 liter')
    'pattern_size' : r'(\d+)\.(\d*)\s*l(iter)?',
    # optional letters, digits, ONE whitespace character, then optionally 'cylinder'.
    # NOTE(review): 'cylinder' being optional, any number followed by a whitespace matches - confirm this is intended
    'pattern_cylinder_1' : r'[a-z]*(\d+)\s(cylinder)?',
    # one letter immediately followed by a single digit (e.g. 'v6', 'i4')
    'pattern_cylinder_2' : r'[a-z](\d)',
    # two digits followed by 'v' (e.g. '24v'); 'soupape' presumably stands for 'valve' (French)
    'pattern_soupape' : r'(\d\d)v'
}

# ---- Get engine Horse Power value
def get_engine_HP(sentence, pattern_HP=dict_pattern['pattern_HP']):
    '''
    Extract the horsepower from an engine description.

    Parameters:
    -----------
    sentence : str
        Engine description.
    pattern_HP : str, optional
        Regex with two groups (integer part, decimal part), searched case-insensitively.

    Returns:
    --------
    int
        Horsepower truncated to an integer, or 0 when the pattern is not found.
    '''
    hp_match = re.search(pattern_HP, sentence, flags=re.IGNORECASE)
    if not hp_match:
        return 0

    integer_part, decimal_part = hp_match.group(1), hp_match.group(2)

    return int(float(f"{integer_part}.{decimal_part}"))

# ---- Get engine size
def get_engine_size(sentence, pattern_size=dict_pattern['pattern_size']):
    '''
    Extract the engine size from an engine description.

    Parameters:
    -----------
    sentence : str
        Engine description.
    pattern_size : str, optional
        Regex with two leading groups (integer part, decimal part), searched case-insensitively.

    Returns:
    --------
    float or int
        The size as a float, or 0 when the pattern is not found.
    '''
    size_match = re.search(pattern_size, sentence, flags=re.IGNORECASE)
    if not size_match:
        return 0

    return float(f"{size_match.group(1)}.{size_match.group(2)}")

# ---- Get engine number of cylinder
def get_engine_cylinder(sentence, pattern_cylinder_1=dict_pattern['pattern_cylinder_1'], pattern_cylinder_2=dict_pattern['pattern_cylinder_2']):
    '''
    Extract the number of cylinders from an engine description.

    Parameters:
    -----------
    sentence : str
        Engine description.
    pattern_cylinder_1 : str, optional
        First regex tried; its group 1 holds the number.
    pattern_cylinder_2 : str, optional
        Fallback regex, tried only when the first one does not match.

    Returns:
    --------
    int
        The number captured by the first matching pattern, or 0 when nothing
        matches or when the sentence contains 'ah' or 'kw'.
    '''
    # ---- Sentences mentioning 'ah' or 'kw' are skipped to avoid confusion with electric motors (e.g. the pattern 'i4')
    if 'ah' in sentence or 'kw' in sentence:
        return 0

    for cylinder_pattern in (pattern_cylinder_1, pattern_cylinder_2):
        found = re.search(cylinder_pattern, sentence, flags=re.IGNORECASE)
        if found:
            return int(f"{found.group(1)}")

    return 0

# ---- Get engine number of soupape
def get_engine_soupape(sentence, pattern_soupape=dict_pattern['pattern_soupape']):
    '''
    Extract the 'soupape' number (the digits captured by `pattern_soupape`) from an engine description.

    Parameters:
    -----------
    sentence : str
        Engine description.
    pattern_soupape : str, optional
        Regex whose group 1 holds the number, searched case-insensitively.

    Returns:
    --------
    int
        The captured number, or 0 when the pattern is not found or when the
        sentence contains 'ah' or 'kw'.
    '''
    # ---- Sentences mentioning 'ah' or 'kw' are skipped to avoid confusion with electric motors (e.g. the pattern '697V')
    if 'ah' in sentence or 'kw' in sentence:
        return 0

    found = re.search(pattern_soupape, sentence, flags=re.IGNORECASE)

    return int(f"{found.group(1)}") if found else 0

# ---- Remove all the pattern from a sentence
def remove_pattern(sentence, pattern_values=dict_pattern.values()):
    '''
    Remove every occurrence of the given regex patterns from a sentence.

    Parameters:
    -----------
    sentence : str
        Engine description.
    pattern_values : iterable of str, optional
        Regex patterns removed one after the other (case-insensitive).
        By default, all the patterns of dict_pattern.

    Returns:
    --------
    str
        The sentence without the matched fragments (stripped after each removal).
        A sentence containing 'ah' or 'kw' is returned unchanged.
    '''
    # ---- Sentences mentioning 'ah' or 'kw' are left as is to avoid confusion with electric motors
    if 'ah' in sentence or 'kw' in sentence:
        return sentence

    cleaned = sentence
    for regex in pattern_values:
        cleaned = re.sub(regex, '', cleaned, flags=re.IGNORECASE).strip()

    return cleaned

# ---- Apply a first processing of the sentence using the funcion above
def previous_version_process_column_engine(column):
    '''
    First (exploratory) version of the 'engine' preprocessing.

    Every extractor is applied to the LOWERCASED text. The helper functions guard
    on the lowercase substrings 'ah' / 'kw', so feeding them the raw text (as was
    done for all columns except 'engine_HP') made the result depend on the case
    of the input.

    Parameters:
    -----------
    column : pd.Series
        The 'engine' column (strings).

    Returns:
    --------
    df : pd.DataFrame
        DataFrame indexed like `column` with the columns 'engine_HP', 'engine_size',
        'engine_cylinder', 'engine_soupape' and 'engine_text' (what remains of the
        lowercased description once the known patterns are removed).
    '''
    lowered = column.apply(lambda x: x.lower())

    # ---- Built from a dict of Series: no need to blank the Series name to control the column label
    df = pd.DataFrame({
        'engine_HP': lowered.apply(get_engine_HP),
        'engine_size': lowered.apply(get_engine_size),
        'engine_cylinder': lowered.apply(get_engine_cylinder),
        'engine_soupape': lowered.apply(get_engine_soupape),
        'engine_text': lowered.apply(remove_pattern),
    })

    return df


Let's have a look at what remains in column 'engine_text' after preprocessing column 'engine' with the function previous_version_process_column_engine().

In [None]:
# ---- Check the previous result
#
temporary_result_df = previous_version_process_column_engine(X['engine'])
temporary_result_df

Let's build a Bag of Words from column 'engine_text' to see the remaining vocabulary.

In [None]:
# ---- Remove punctuation and split sentence into a list of word
#
def process_text(sentence):
    '''
    Tokenize a sentence: drop every ASCII punctuation character, then split on whitespace.

    Parameters:
    -----------
    sentence : str
        Text to tokenize.

    Returns:
    --------
    list of str
        The tokens (empty list for an empty or blank sentence). Case is preserved.
    '''
    without_punctuation = sentence.translate(str.maketrans('', '', string.punctuation))

    return without_punctuation.split()

# ---- Fit vectorizer on column_engine_text to get back vocabulary and token frequencies
#
column_engine_text = temporary_result_df['engine_text']
vectorizer = CountVectorizer(analyzer=process_text)
term_matrix = vectorizer.fit_transform(column_engine_text)

# ---- CAUTION: `vocabulary_` maps each token to its COLUMN INDEX in the term matrix, not to a frequency
vocabulary_dict = vectorizer.vocabulary_

# ---- Explore vocabulary in a DataFrame: the real number of occurrences of each token
# ---- is the sum of its column over all documents
#
token_counts = np.asarray(term_matrix.sum(axis=0)).ravel()
df_vocabulary_df = pd.DataFrame({'count': token_counts}, index=vectorizer.get_feature_names_out())
df_vocabulary_df.sort_values('count', ascending=False)

In [None]:
# ---- Print again values of the column fuel type of the original dataset
#
data['fuel_type'].unique()

In [53]:
# ---- Build a synonym dictionary related to fuel_type
#
# Lookup table: single token found in the engine text -> normalized fuel type.
# Keys are compared by match_fuel_type() against lowercased tokens produced by
# process_text(), i.e. words with punctuation removed and split on whitespace.
fuel_type_dict = {
    'gasoline' : 'gasoline',
    'gas' : 'gasoline',
    'hybrid' : 'hybrid',
    # punctuation is stripped before lookup - presumably 'electric/gas' and 'gas/electric' in the raw text
    'electricgas': 'hybrid',
    'gaselectric' : 'hybrid',
    # NOTE(review): this key contains spaces, so it can never equal a single whitespace-split token
    'e85 flex fuel' : 'e85 flex fuel',
    'flexible' : 'e85 flex fuel',
    'diesel' : 'diesel',
    'hydrogen' : 'hydrogen',
    'electric' : 'electric',
    'battery' : 'electric',
    # tokens below look like battery / motor ratings and are all treated as electric
    'ah': 'electric',
    '70kw': 'electric',
    '697v': 'electric',
    '160kw': 'electric',
    '1112ah': 'electric'
}

def match_fuel_type(sentence):
    '''
    Return the fuel type associated with the first recognized token of a sentence.

    The sentence is tokenized with process_text(); each token is lowercased and
    looked up in fuel_type_dict, in order of appearance.

    Parameters
    ----------
    sentence: str
        String to search for matching fuel type.

    Returns:
    --------
    mapping_value : str or None
        The value of fuel_type_dict for the first matching token,
        or None if no token matches (or if the sentence has no token).
    '''
    # ---- Lazy lookup: tokens are examined one by one and the scan stops at the first hit
    #
    candidate_values = (fuel_type_dict.get(token.lower()) for token in process_text(sentence))

    return next((value for value in candidate_values if value is not None), None)

Let's now build the final version of the preprocessing function for column 'engine'.

In [54]:
def process_columns_engine_and_fuel_type(X_df):
    '''
    Turn the free-text 'engine' column into numeric features and refine 'fuel_type'.

    Parameters:
    -----------
    X_df : DataFrame
        The input DataFrame containing both 'engine' and 'fuel_type'. It is not modified.

    Returns:
    --------
    results_df: DataFrame
        Copy of `X_df` with:
        - new columns 'engine_HP', 'engine_size', 'engine_cylinder', 'engine_soupape';
        - 'fuel_type' lowercased, and replaced by the fuel type detected in the
          engine text whenever one is found;
        - columns 'engine' and 'engine_text' dropped.
    '''
    results_df = X_df.copy()

    # ---- Lowercase both text columns
    #
    for text_column in ('engine', 'fuel_type'):
        results_df[text_column] = results_df[text_column].apply(lambda x: x.lower())

    # ---- One new column per extractor, in this order
    #
    extractors = {
        'engine_HP': get_engine_HP,
        'engine_size': get_engine_size,
        'engine_cylinder': get_engine_cylinder,
        'engine_soupape': get_engine_soupape,
    }
    for new_column, extractor in extractors.items():
        results_df[new_column] = results_df['engine'].apply(extractor)

    # ---- Fuel type hinted by what is left of the engine text (None when nothing is recognized)
    #
    results_df['engine_text'] = results_df['engine'].apply(remove_pattern).apply(match_fuel_type)

    # ---- The hint from the engine text wins over the declared fuel_type
    #
    has_hint = results_df['engine_text'].notna()
    results_df['fuel_type'] = np.where(has_hint, results_df['engine_text'], results_df['fuel_type'])

    # ---- Drop the helper column and the raw engine text
    #
    return results_df.drop(['engine_text', 'engine'], axis=1)

### III.1.3. PREPARE DATA WITH THE NEW PREPROCESSING FUNCTIONS

In [55]:
data_enhanced = process_column_transmission(data)
data_enhanced = process_columns_engine_and_fuel_type(data_enhanced)

### III.1.4. EXPLORE AND DISPLAY NEW PREPARED FEATURES

In [None]:
# ---- Check features values count
# 
data_enhanced.nunique()

In [57]:
# ---- Separate the new features we have created into group for plotting
#
new_numeric_discrete_features = [ 
    'transmission_speed',
    'transmission_type',
    'engine_soupape',
    'engine_cylinder',
    'engine_size',
    'fuel_type'
    ]

new_continious_features = [ 
    'engine_HP',
    ]

In [None]:
# ---- Visualize the new features with graphs
#
make_box_for_discrete_features(data_enhanced.drop(['price'], axis=1), new_numeric_discrete_features, data_enhanced['price'])
make_box_for_continuous_features(data_enhanced.drop(['price'], axis=1), new_continious_features, data_enhanced['price'], bins=100, ylim=(-1000,180000))

**In these graphs, we see that the values of features such as transmission_speed, engine_soupape, engine_cylinder, and engine_HP appear to be ordered by their importance with respect to the price. This suggests that these features have a hierarchical impact on the price, and we can treat them as numeric variables, their numerical values are directly related to changes in the target value (price).**

**In contrast, the relationship between price and engine_size remains complex. Therefore, treating engine size as a categorical feature and encoding it accordingly will not result in the loss of information and may even benefit the model. We will try this later!**

## III.2. BUILD PREPROCESSOR FOR CATEGORICAL FEATURES

**In this section we will deal with columns with 'object' type.**

**In the first study with Optuna and LGBM, we Ordinal Encoded them instead of OneHot Encoding them, to avoid increasing dimensionality.
However, the values of these features were not sorted by their importance with respect to the car prices. Maybe we can help the model by giving it more information, using a customized encoder that uses the mean value of the target for each category of each feature.**

**Let's build our TargetEncoder class.**

**For this step we need to be careful about data leakage, because computing the mean price uses the target! It must be computed on the train set only, never on the test set. That's why we can't preprocess all the data once before training the model with cross-validation: the preprocessor has to be fitted on the train set only!**

In [59]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    '''
    Custom Target Encoder for categorical features using the mean of the target variable.

    This transformer encodes categorical features by replacing each category with the mean of the target
    variable for that category, as observed on the data given to `fit`.

    Categories that were not seen during `fit` are mapped to NaN by `transform`; handling those
    missing values (e.g. with an imputer) is left to the next step of the pipeline.

    Attributes
    ----------
    target_means_ : dict
        A dictionary storing, for each column name, a pd.Series indexed by category
        that holds the mean target value of that category.

    features_names_ : list
        A list of feature names provided during the fit process.

    Methods
    -------
    fit(X, y)
        Fits the encoder by calculating the mean target value for each category in the input DataFrame `X`.

    transform(X)
        Returns a copy of the input DataFrame `X` where each category is mapped to its mean target value.

    get_feature_names_out(input_features=None)
        Returns the names of the features that have undergone transformation.
    '''
    def __init__(self):
        '''
        Initializes the TargetEncoder with empty storage for target means and feature names.
        '''
        self.target_means_ = {}
        self.features_names_ = []

    def fit(self, X, y):
        '''
        For each column of a DataFrame, calculate the mean target value for each category.

        Parameters:
        ----------
        X : pd.DataFrame
            The input data to fit on, containing the features to process.
        y : pd.Series or np.ndarray
            Target variable (price) to calculate the mean for each category.

        Returns:
        --------
        self : TargetEncoder
            The fitted encoder.
        '''
        # ---- Work with the target as a Series sharing the index of X.
        # A pd.Series is used as is (pandas aligns it with X on the index);
        # any other array-like is matched with the rows of X by position.
        # Grouping the target directly (instead of concatenating it to X) also avoids relying
        # on y.name, which may be missing or identical to the name of a feature.
        #
        if isinstance(y, pd.Series):
            target = y
        else:
            target = pd.Series(np.asarray(y).ravel(), index=X.index)

        self.features_names_ = X.columns.to_list()

        # ---- Calculate the mean target value for each category
        # The dictionary is rebuilt from scratch so that fitting again does not keep stale columns
        #
        self.target_means_ = {
            column: target.groupby(X[column]).mean()
            for column in X.columns
        }

        return self

    def transform(self, X):
        '''
        Transform the input by mapping each value to the mean target value of its category.

        Parameters:
        ----------
        X : pd.DataFrame
            The input data to transform, containing the features to process.
        Returns:
        --------
        X_encoded : pd.DataFrame
            A new DataFrame (the input is left untouched) with each original value mapped to the
            mean target value of the corresponding category. Values with no known category become NaN.

        '''
        # ---- Work on a copy so the DataFrame of the caller is never modified
        #
        X_encoded = X.copy()

        # ---- Replace each category by the associated mean value of the target
        #
        for column in X_encoded.columns:
            X_encoded[column] = X_encoded[column].map(self.target_means_[column])
        return X_encoded

    def get_feature_names_out(self, input_features=None):
        # This function will be useful to retrieve the names of the features to plot feature importance later
        # We specify an unused argument `input_features` due to sklearn's internal structure
        '''
        Return the names of the features given to the transformer.

        Parameters:
        -----------
        input_features : list of str, optional
            Input feature names (unused).

        Returns:
        -------
        list
            Names of the features that have undergone transformation.
        '''
        return self.features_names_

## III.3. PERFORM A NEW STUDY WITH LGBM REGRESSOR

### III.3.1 PREPARE DATA

In [60]:
# ---- Features: every column except 'id' and the target 'price'
#
X = data_enhanced.drop(columns=['id', 'price'])

# ---- Target: the price
#
y = data_enhanced['price']

In [None]:
# ---- Summary of the feature DataFrame (columns, dtypes, non-null counts)
#
X.info()

In [None]:
# ---- Choose the features that will go through the Target Encoder:
# ---- all the columns of X with the 'object' dtype
#
categorical_features_enhanced = X.select_dtypes(include=['object']).columns.tolist()
features_to_encode = categorical_features_enhanced
print(f'features_to_encode: {features_to_encode}')

# ---- The other (numerical) features are not listed: they pass through the preprocessing unchanged

In [None]:
# ---- Features whose selection is left to Optuna (randomly kept or dropped at each trial).
# ---- Every other feature is always selected.
#
features_to_suggest = ['int_col', 'engine_size']
print(f'features_to_suggest: {features_to_suggest}')

### III.3.2 RE-BUILD A STUDY FOR OPTUNA

In [64]:
# ---- Build the optuna study
#
def objective_with_data_eng(trial):
    '''
    Objective function for Optuna optimization.

    This function selects features and optimizes hyperparameters of a LGBM Regressor model.
    For each trial, a subset of features is selected using a binary mask, and several hyperparameters
    of the LGBM Regressor model are adjusted. The features listed in the global `features_to_encode`
    are target encoded inside the pipeline, so the encoder is fitted on the train folds only.

    Parameters:
    ----------
        trial:  optuna.trial.Trial
            An Optuna trial object that allows exploring different values for hyperparameters and feature selection.

    Returns:
    --------
        mean_test_rmse: float
            The mean root mean squared error (RMSE) on the validation folds of a 3-fold
            cross-validation (lower is better), or np.inf when the trial selects no feature.
    '''
    # ---- Hyperparameter optimization
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform (same sampling)
    #
    # NOTE(review): LGBMRegressor is built below without `subsample_freq`; according to the LightGBM
    # documentation bagging is only enabled when that frequency is non zero, so `subsample` probably
    # has no effect here. Likewise `cat_smooth` only concerns features declared as categorical,
    # and none are declared to LightGBM in this pipeline. To be confirmed.
    #
    hyper_params = {
        'cat_smooth': trial.suggest_int('cat_smooth', 1, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'learning_rate': trial.suggest_float('learning_rate', 0.0005, 0.03, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 50, 1000),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.000001, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 300, 700),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.2, 0.4, step=0.01)
        }

    # ---- Make the feature selection
    #
    # ---- Generate the mask (one 0/1 flag per column of X)
    feature_mask = feature_selector(X_df=X, features_to_suggest=features_to_suggest, trial=trial)

    # ---- Ensure at least one feature is selected
    # Returning +inf makes such a trial the worst possible one for a study that minimizes
    if feature_mask.sum() == 0:
        return np.inf

    # ---- Select features based on the mask
    selected_features = X.columns[feature_mask == 1]
    X_selected = X[selected_features]

    # ---- Update the list features_to_encode based on selected features
    #
    features_to_encode_updated = [feature for feature in features_to_encode if feature in selected_features]

    # ---- Build the pipeline for complete mean target encoding
    # TargetEncoder maps to NaN the values that were not present in the data it was fitted on.
    # The imputer replaces those NaN by the mean value of the associated column.
    #
    mean_target_pipe = Pipeline(
        steps=[
            ('target_encoder', TargetEncoder()),
            ('imputer', SimpleImputer(strategy='mean'))
        ]
    )

    # ---- Build the preprocessor to apply mean_target_pipe to the updated list features_to_encode
    # The remaining selected features pass through unchanged
    #
    preprocessor_LGBM = ColumnTransformer(
        transformers=[
            ('enc', mean_target_pipe, features_to_encode_updated),
        ],
        remainder='passthrough'
    )

    # ---- Create the model with the chosen parameters
    #
    model = LGBMRegressor(
        **hyper_params,
        random_state=random_state,
        verbose=-1
    )

    # ---- Build the model pipeline
    #
    pipe = Pipeline(
        steps=[
            ("prepro_LGBM", preprocessor_LGBM),
            ("model", model)
        ]
    )

    # ---- Cross-Validation on selected features
    #
    kf = 3

    # ---- Perform cross-validation
    # The whole pipeline is refitted on each train fold, so the target means never see the validation fold
    #
    results = cross_validate(pipe, X_selected, y, cv=kf, scoring="neg_root_mean_squared_error", return_train_score=True)

    # ---- Compute the mean RMSE for train and test sets
    # The scorer returns the negated RMSE, hence the sign change
    #
    mean_train_rmse = - np.mean(results['train_score'])
    mean_test_rmse = - np.mean(results['test_score'])

    # ---- Store results in the trial's user attribute
    #
    trial.set_user_attr("mean_train_rmse", mean_train_rmse)
    trial.set_user_attr("mean_test_rmse", mean_test_rmse)
    trial.set_user_attr("model_name", type(model).__name__)

    # ---- Return the mean score on test for the objective function
    #
    return mean_test_rmse

### III.3.3 RUN THE STUDY AND SAVE RESULTS

In [65]:
# ---- Create the study (named after the current date and time, stored in STUDY_STORAGE_PATH)
# ---- and let Optuna minimize the objective over 30 trials
#
study = optuna.create_study(
    storage=STUDY_STORAGE_PATH,
    study_name=dt.datetime.now().strftime("%Y%m%d_%H%M"),
    direction='minimize',
    load_if_exists=True,
)
study.optimize(objective_with_data_eng, n_trials=30)

In [66]:
# ---- Label used to identify this study in the saved results
#
comment = 'Data Eng: [transmission, engine], Target Encoder: categorical features'

# ---- Format the trials of the study as a DataFrame and save them with the label
#
results_df = format_study_results_to_df(study, random_state)
save_metrics(results_df, comment=comment)

In [None]:
# ---- Reload every saved result (';' separated csv) and display one column per saved row
#
general_results_df = pd.read_csv(FILEPATH_model_results, sep=';')
general_results_df.T

## III.4. MAKE SUBMISSION

### III.4.1. REBUILD AND REFIT BEST MODEL

In [None]:
# ---- Retrieve the best model from the final study we conducted
#
comment = 'Data Eng: [transmission, engine], Target Encoder: categorical features'

# ---- Best (lowest) mean test RMSE among the rows saved for this study
#
best_rmse = general_results_df.loc[general_results_df['comment'] == comment, 'mean_test_rmse'].min()

# ---- Select the matching row. The filter on the comment is kept here as well, otherwise a row
# ---- saved for another study with the same RMSE could be picked instead.
#
best_model_configuration = general_results_df[
    (general_results_df['comment'] == comment)
    & (general_results_df['mean_test_rmse'] == best_rmse)
]

# ---- Fail here with a clear message rather than later with an IndexError on .iloc[0]
#
if best_model_configuration.empty:
    raise ValueError(f'No saved result found for the study: {comment!r}')

display(best_model_configuration.transpose())

In [69]:
# ---- Select features the same way the model was fitted
# A feature is kept when the best configuration has a column for it whose flag equals 1.
# The flag is compared directly (no int() cast): a missing flag (NaN) then simply means
# "not selected" instead of raising an error.
#
selected_features = []
for feature in X.columns:
    if feature in best_model_configuration.columns and best_model_configuration[feature].iloc[0] == 1:
        selected_features.append(feature)

In [None]:
# ---- Choose the features that will go through the Target Encoder:
# ---- all the columns of X with the 'object' dtype
#
categorical_features_enhanced = X.select_dtypes(include=['object']).columns.tolist()
features_to_encode = categorical_features_enhanced
print(f'features_to_encode: {features_to_encode}')

# ---- The other (numerical) features are not listed: they pass through the preprocessing unchanged

In [71]:
# ---- Re-build the model
# ---- We fit the model on all data this time (X and y).

# ---- Retrieve model parameters
#
n_estimators = best_model_configuration.n_estimators.iloc[0]
max_depth = best_model_configuration.max_depth.iloc[0]
min_child_samples = best_model_configuration.min_child_samples.iloc[0]
learning_rate = best_model_configuration.learning_rate.iloc[0]
subsample = best_model_configuration.subsample.iloc[0]

# ---- Retrieve the other hyperparameters tuned by the objective function
# Without them the model would be refitted with the LGBM default values instead of the
# configuration that obtained the best score. They are only forwarded when the results
# file actually holds a value for them (column present and value not missing).
# Integer parameters are cast back to int, as a csv column with missing values is read as float.
#
extra_hyper_params = {}
for param_name, cast in (
    ('cat_smooth', int),
    ('colsample_bytree', float),
    ('min_split_gain', float),
    ('num_leaves', int),
):
    if param_name in best_model_configuration.columns:
        param_value = best_model_configuration[param_name].iloc[0]
        if pd.notna(param_value):
            extra_hyper_params[param_name] = cast(param_value)

# ---- Retrieve the feature selection
#
X_selected = X[selected_features]

# ---- Update the list features_to_encode based on selected features
#
features_to_encode_updated = [feature for feature in features_to_encode if feature in selected_features]

# ---- Build the pipeline for complete mean target encoding
# TargetEncoder maps to NaN the values that were not present in the data it was fitted on.
# The imputer replaces those NaN by the mean value of the associated column.
#
mean_target_pipe = Pipeline(
    steps=[
        ('target_encoder', TargetEncoder()),
        ('imputer', SimpleImputer(strategy='mean'))
    ]
)

# ---- Build the preprocessor to apply mean_target_pipe to the updated list features_to_encode
#
preprocessor_LGBM = ColumnTransformer(
    transformers=[
        ('enc', mean_target_pipe, features_to_encode_updated),
    ],
    remainder='passthrough'
)

# ---- Create the model with the chosen parameters
#
model = LGBMRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    random_state=random_state,
    learning_rate=learning_rate,
    verbose=-1,
    **extra_hyper_params
)

# ---- Build the pipeline
#
pipe = Pipeline(
    steps=[
        ("prepro_LGBM", preprocessor_LGBM),
        ("model", model)
    ]
)

In [None]:
# ---- Refit the model
# ---- The whole pipeline (preprocessing + LGBM) is fitted on the selected features and the full target
#
pipe.fit(X_selected, y)

### III.4.2. PREDICT AND SAVE PREDICTION

In [None]:
# ---- Load test.csv and apply the same cleaning as for the training data
#
data_final = simple_cleaning(pd.read_csv(FILEPATH_test))

# ---- Apply the same data engineering steps, in the same order:
# ---- first the transmission column, then the engine and fuel type columns
#
data_final_enhanced = process_columns_engine_and_fuel_type(
    process_column_transmission(data_final)
)

# ---- Keep only the features the model was fitted on
#
X_final_selected = data_final_enhanced[selected_features]
X_final_selected.head()

In [None]:
# ---- Predict the prices of the test set and store them in a one-column DataFrame
#
y_submission = pipe.predict(X_final_selected)
y_df = pd.DataFrame({'price': y_submission})
y_df

In [None]:
# ---- Build submission dataFrame
#
# ---- Truncate the predicted prices to integers
#
y_df['price'] = y_df['price'].astype('int64')

# ---- Ids and predictions are related by row position only: check that there is one
# ---- prediction per id and drop both indexes before concatenating, so that pandas
# ---- cannot misalign the rows on index labels.
#
assert len(data_final) == len(y_df), 'There must be exactly one prediction per id'
submission_df = pd.concat(
    [data_final['id'].reset_index(drop=True), y_df.reset_index(drop=True)],
    axis=1
)
submission_df

In [76]:
# ---- Save prediction to csv
#
timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")

# ---- Make sure the output folder exists, otherwise to_csv fails
#
os.makedirs(OUTPUT_PATH_results, exist_ok=True)

# ---- The timestamp gives each submission file its own name
#
filepath = f'{OUTPUT_PATH_results}/submission_LGBM{timestamp}.csv'
submission_df.to_csv(filepath, index=False)

# IV. TRY DATA ENGINEERING - 2ND ATTEMPT

**Except 'milage' and maybe 'model_year', all the numerical features we have take predetermined discrete values. We could treat them as categorical and apply to them the same Target Encoding preprocessing as we did for the categorical features. Let's see if it improves the results!**

## IV.1. PREPROCESS ALSO NUMERIC DISCRETE FEATURES

In [None]:
# ---- Specify preprocessing strategy for each feature
#
all_columns = X.columns.to_list()

# ---- Target encode every feature except 'milage' and 'model_year'.
# A new list is built so that all_columns keeps holding all the columns.
#
features_to_encode = [column for column in all_columns if column not in ('milage', 'model_year')]
print(f'features_to_encode: {features_to_encode}')

# ---- 'milage' and 'model_year' pass through the preprocessing unchanged

In [None]:
# ---- Features whose selection is left to Optuna (randomly kept or dropped at each trial).
# ---- Every other feature is always selected.
#
features_to_suggest = ['int_col', 'engine_size']
print(f'features_to_suggest: {features_to_suggest}')

## IV.2. PERFORM A NEW STUDY WITH OPTUNA

### IV.2.1 RE-BUILD A STUDY FOR OPTUNA

In [79]:
# ---- Build the optuna study
#
def objective_with_data_eng_2(trial):
    '''
    Objective function for Optuna optimization.

    This function selects features and optimizes hyperparameters of a LGBM Regressor model.
    For each trial, a subset of features is selected using a binary mask, and several hyperparameters
    of the LGBM Regressor model are adjusted. The features listed in the global `features_to_encode`
    are target encoded inside the pipeline, so the encoder is fitted on the train folds only.

    Parameters:
    ----------
        trial:  optuna.trial.Trial
            An Optuna trial object that allows exploring different values for hyperparameters and feature selection.

    Returns:
    --------
        mean_test_rmse: float
            The mean root mean squared error (RMSE) on the validation folds of a 3-fold
            cross-validation (lower is better), or np.inf when the trial selects no feature.
    '''
    # ---- Hyperparameter optimization
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform (same sampling)
    #
    # NOTE(review): LGBMRegressor is built below without `subsample_freq`; according to the LightGBM
    # documentation bagging is only enabled when that frequency is non zero, so `subsample` probably
    # has no effect here. Likewise `cat_smooth` only concerns features declared as categorical,
    # and none are declared to LightGBM in this pipeline. To be confirmed.
    #
    hyper_params = {
        'cat_smooth': trial.suggest_int('cat_smooth', 1, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'learning_rate': trial.suggest_float('learning_rate', 0.0005, 0.03, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 50, 1000),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.000001, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 300, 700),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.2, 0.4, step=0.01)
        }

    # ---- Make the feature selection
    #
    # ---- Generate the mask (one 0/1 flag per column of X)
    feature_mask = feature_selector(X_df=X, features_to_suggest=features_to_suggest, trial=trial)

    # ---- Ensure at least one feature is selected
    # Returning +inf makes such a trial the worst possible one for a study that minimizes
    if feature_mask.sum() == 0:
        return np.inf

    # ---- Select features based on the mask
    selected_features = X.columns[feature_mask == 1]
    X_selected = X[selected_features]

    # ---- Update the list features_to_encode based on selected features
    #
    features_to_encode_updated = [feature for feature in features_to_encode if feature in selected_features]

    # ---- Build the pipeline for complete mean target encoding
    # TargetEncoder maps to NaN the values that were not present in the data it was fitted on.
    # The imputer replaces those NaN by the mean value of the associated column.
    #
    mean_target_pipe = Pipeline(
        steps=[
            ('target_encoder', TargetEncoder()),
            ('imputer', SimpleImputer(strategy='mean'))
        ]
    )

    # ---- Build the preprocessor to apply mean_target_pipe to the updated list features_to_encode
    # The remaining selected features pass through unchanged
    #
    preprocessor_LGBM = ColumnTransformer(
        transformers=[
            ('enc', mean_target_pipe, features_to_encode_updated),
        ],
        remainder='passthrough'
    )

    # ---- Create the model with the chosen parameters
    #
    model = LGBMRegressor(
        **hyper_params,
        random_state=random_state,
        verbose=-1
    )

    # ---- Build the pipeline
    #
    pipe = Pipeline(
        steps=[
            ("prepro_LGBM", preprocessor_LGBM),
            ("model", model)
        ]
    )

    # ---- Cross-Validation on selected features
    #
    kf = 3

    # ---- Perform cross-validation
    # The whole pipeline is refitted on each train fold, so the target means never see the validation fold
    #
    results = cross_validate(pipe, X_selected, y, cv=kf, scoring="neg_root_mean_squared_error", return_train_score=True)

    # ---- Compute the mean RMSE for train and test sets
    # The scorer returns the negated RMSE, hence the sign change
    #
    mean_train_rmse = - np.mean(results['train_score'])
    mean_test_rmse = - np.mean(results['test_score'])

    # ---- Store results in the trial's user attribute
    #
    trial.set_user_attr("mean_train_rmse", mean_train_rmse)
    trial.set_user_attr("mean_test_rmse", mean_test_rmse)
    trial.set_user_attr("model_name", type(model).__name__)

    # ---- Return the mean score on test for the objective function
    #
    return mean_test_rmse

### IV.2.2. RUN THE STUDY AND SAVE RESULTS

In [80]:
# ---- Create the study (named after the current date and time, stored in STUDY_STORAGE_PATH)
# ---- and let Optuna minimize the objective over 50 trials
#
study = optuna.create_study(
    storage=STUDY_STORAGE_PATH,
    study_name=dt.datetime.now().strftime("%Y%m%d_%H%M"),
    direction='minimize',
    load_if_exists=True,
)
study.optimize(objective_with_data_eng_2, n_trials=50)

In [81]:
# ---- Label used to identify this study in the saved results
#
comment = 'Data Eng: [transmission, engine], Target Encoder: all features, except [milages, model_year]'

# ---- Format the trials of the study as a DataFrame and save them with the label
#
results_df = format_study_results_to_df(study, random_state)
save_metrics(results_df, comment=comment)

In [None]:
# ---- Reload every saved result (';' separated csv) and display one column per saved row
#
general_results_df = pd.read_csv(FILEPATH_model_results, sep=';')
general_results_df.T

## IV.3. MAKE SUBMISSION

### IV.3.1. REBUILD AND REFIT BEST MODEL

In [None]:
# ---- Retrieve the best model from the final study we conducted
#
comment = 'Data Eng: [transmission, engine], Target Encoder: all features, except [milages, model_year]'

# ---- Best (lowest) mean test RMSE among the rows saved for this study
#
best_rmse = general_results_df.loc[general_results_df['comment'] == comment, 'mean_test_rmse'].min()

# ---- Select the matching row. The filter on the comment is kept here as well, otherwise a row
# ---- saved for another study with the same RMSE could be picked instead.
#
best_model_configuration = general_results_df[
    (general_results_df['comment'] == comment)
    & (general_results_df['mean_test_rmse'] == best_rmse)
]

# ---- Fail here with a clear message rather than later with an IndexError on .iloc[0]
#
if best_model_configuration.empty:
    raise ValueError(f'No saved result found for the study: {comment!r}')

display(best_model_configuration.transpose())

In [None]:
# ---- Rebuild the feature selection of the best configuration:
# ---- a feature is kept when the configuration has a column for it and its flag equals 1
#
selected_features = []
for feature in X.columns:
    if feature in best_model_configuration.columns and best_model_configuration[feature].iloc[0] == 1:
        selected_features.append(feature)
selected_features

In [None]:
# ---- Specify preprocessing strategy for each feature
#
all_columns = X.columns.to_list()

# ---- Target encode every feature except 'milage' and 'model_year'.
# A new list is built so that all_columns keeps holding all the columns.
#
features_to_encode = [column for column in all_columns if column not in ('milage', 'model_year')]
print(f'features_to_encode: {features_to_encode}')

# ---- 'milage' and 'model_year' pass through the preprocessing unchanged

In [86]:
# ---- Re-build the model
# ---- We fit the model on all data this time (X and y).

# ---- Retrieve model parameters
#
n_estimators = best_model_configuration.n_estimators.iloc[0]
max_depth = best_model_configuration.max_depth.iloc[0]
min_child_samples = best_model_configuration.min_child_samples.iloc[0]
learning_rate = best_model_configuration.learning_rate.iloc[0]
subsample = best_model_configuration.subsample.iloc[0]

# ---- Retrieve the other hyperparameters tuned by the objective function
# Without them the model would be refitted with the LGBM default values instead of the
# configuration that obtained the best score. They are only forwarded when the results
# file actually holds a value for them (column present and value not missing).
# Integer parameters are cast back to int, as a csv column with missing values is read as float.
#
extra_hyper_params = {}
for param_name, cast in (
    ('cat_smooth', int),
    ('colsample_bytree', float),
    ('min_split_gain', float),
    ('num_leaves', int),
):
    if param_name in best_model_configuration.columns:
        param_value = best_model_configuration[param_name].iloc[0]
        if pd.notna(param_value):
            extra_hyper_params[param_name] = cast(param_value)

# ---- Retrieve the feature selection
#
X_selected = X[selected_features]

# ---- Update the list features_to_encode based on selected features
#
features_to_encode_updated = [feature for feature in features_to_encode if feature in selected_features]

# ---- Build the pipeline for complete mean target encoding
# TargetEncoder maps to NaN the values that were not present in the data it was fitted on.
# The imputer replaces those NaN by the mean value of the associated column.
#
mean_target_pipe = Pipeline(
    steps=[
        ('target_encoder', TargetEncoder()),
        ('imputer', SimpleImputer(strategy='mean'))
    ]
)

# ---- Build the preprocessor to apply mean_target_pipe to the updated list features_to_encode
#
preprocessor_LGBM = ColumnTransformer(
    transformers=[
        ('enc', mean_target_pipe, features_to_encode_updated),
    ],
    remainder='passthrough'
)

# ---- Create the model with the chosen parameters
#
model = LGBMRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    random_state=random_state,
    learning_rate=learning_rate,
    verbose=-1,
    **extra_hyper_params
)

# ---- Build the pipeline
#
pipe = Pipeline(
    steps=[
        ("prepro_LGBM", preprocessor_LGBM),
        ("model", model)
    ]
)

In [None]:
# ---- Refit the model
# ---- The whole pipeline (preprocessing + LGBM) is fitted on the selected features and the full target
#
pipe.fit(X_selected, y)

### IV.3.2. PREDICT AND SAVE PREDICTION

In [None]:
# ---- Load test.csv and apply the same cleaning as for the training data
#
data_final = simple_cleaning(pd.read_csv(FILEPATH_test))

# ---- Apply the same data engineering steps, in the same order:
# ---- first the transmission column, then the engine and fuel type columns
#
data_final_enhanced = process_columns_engine_and_fuel_type(
    process_column_transmission(data_final)
)

# ---- Keep only the features the model was fitted on
#
X_final_selected = data_final_enhanced[selected_features]
X_final_selected.head()

In [None]:
# ---- Predict the prices of the test set and store them in a one-column DataFrame
#
y_submission = pipe.predict(X_final_selected)
y_df = pd.DataFrame({'price': y_submission})
y_df

In [None]:
# ---- Build submission dataFrame
#
# ---- Truncate the predicted prices to integers
#
y_df['price'] = y_df['price'].astype('int64')

# ---- Ids and predictions are related by row position only: check that there is one
# ---- prediction per id and drop both indexes before concatenating, so that pandas
# ---- cannot misalign the rows on index labels.
#
assert len(data_final) == len(y_df), 'There must be exactly one prediction per id'
submission_df = pd.concat(
    [data_final['id'].reset_index(drop=True), y_df.reset_index(drop=True)],
    axis=1
)
submission_df

In [91]:
# ---- Save prediction to csv
#
timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")

# ---- Make sure the output folder exists, otherwise to_csv fails
#
os.makedirs(OUTPUT_PATH_results, exist_ok=True)

# ---- The timestamp gives each submission file its own name
#
filepath = f'{OUTPUT_PATH_results}/submission_LGBM{timestamp}.csv'
submission_df.to_csv(filepath, index=False)

# V. MODEL EXPLAINABILITY AND ANALYSIS OF MODEL

## V.I. FEATURES IMPORTANCE

In [92]:
# ---- Read both pieces of information from the steps of the fitted pipeline:
# ---- the names of the features produced by the preprocessor...
#
feature_names = pipe.named_steps["prepro_LGBM"].get_feature_names_out()

# ---- ...and the importance the LGBM model gives to each of them
#
importances = pipe.named_steps["model"].feature_importances_

In [None]:
# ---- One row per feature, most important feature first
#
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# ---- Horizontal bar plot of the importances with Seaborn
#
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', palette='Blues_d')
plt.xlabel('Feature Importance')
plt.title('Best Model Feature Importance')
plt.show()

### V.II. HYPERPARAMETER IMPORTANCE

In [94]:
# ---- Fetch the summaries of all the studies kept in the storage and print their names
#
studies = optuna.get_all_study_summaries(storage=STUDY_STORAGE_PATH)
print([summary.study_name for summary in studies])

In [None]:
# ---- Hyperparameters whose importance will be plotted
#
hyper_params_to_plot = [
    'cat_smooth', 'colsample_bytree', 'learning_rate',
    'max_depth', 'min_child_samples', 'min_split_gain',
    'n_estimators', 'num_leaves', 'subsample',
]

# ---- Reload the study listed last in the storage
#
study_name = studies[-1].study_name
print(f'Name of the choosen study: {study_name}')

study = optuna.load_study(storage=STUDY_STORAGE_PATH, study_name=study_name)

# ---- Plot the importance of the selected hyperparameters for this study
#
optuna.visualization.plot_param_importances(study, params=hyper_params_to_plot)

In [None]:
# ---- Slice plot of the reloaded study, restricted to the selected hyperparameters
#
optuna.visualization.plot_slice(study, params=hyper_params_to_plot)