# 1/ Import packages and data :

## Import package 

In [1]:
import pandas as pd
pd.set_option("display.min_rows", 10)
pd.set_option("display.max_column", 1000)
import os
from unidecode import unidecode
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency, spearmanr
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import re
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, RFECV
import sys
from pre_processing import pre_processing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
#sys.path.append("pre_processing.py")
from statistics import mean
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error
import catboost
from catboost import CatBoostRegressor
import optuna
import xgboost
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
import joblib
import yaml
import shap

In [None]:
os.getcwd()

## Collect data 

In [None]:
df = pd.read_csv("./data/train.csv", encoding="utf-8")
df

In [None]:
# Rename col : if necessary
# df.columns = [name.split(" ")[0] for name in df.columns]

In [None]:
# target : Ewltp (g/km)
target = "Ewltp (g/km)"

# 2/ Data cleaning :

## Basic treatment 

In [None]:
# Detect useless col 

## Detect useless col after empirical analysis :
useless_columns = ["ID", "Vf" ,"De", "Ernedc (g/km)", "MMS", "Mp", "Mk", "Man", "Cn", "Date of registration"]

In [None]:
## Detect columns that contain only one value :
# A column with a single distinct value cannot help the model: queue it for removal.
useless_columns.extend(col for col in df.columns if df[col].nunique() == 1)

useless_columns

In [2]:
def basic_treatment(df :pd.DataFrame, useless_columns : list, drop_duplicate=True) -> pd.DataFrame:
    """Apply the basic cleaning steps to a raw DataFrame.

        Steps, in order:
        1. Optionally remove duplicated rows (and rebuild a 0..n-1 index).
        2. Drop the columns listed in ``useless_columns``.
        3. Normalise every string cell: lowercase, strip surrounding whitespace,
           collapse any run of spaces into one, replace spaces by "_" and
           transliterate accented characters with ``unidecode``.

        Args:
            df (pd.DataFrame): The input DataFrame to be treated.
            useless_columns (list): Column names to remove from the DataFrame.
            drop_duplicate (bool): When truthy, duplicated rows are removed before
                anything else. Defaults to True.

        Returns:
            pd.DataFrame: A new DataFrame with the steps above applied. Non-string
            cells (numbers, NaN, ...) are returned unchanged.
    """

    def _normalise(value):
        # Only real strings are rewritten; NaN and numbers pass through untouched.
        if not isinstance(value, str):
            return value
        text = value.lower().strip()
        # Collapse *any* run of spaces (double, triple, ...) so that the next step
        # never produces consecutive underscores.
        text = re.sub(r" {2,}", " ", text)
        text = text.replace(" ", "_")
        return unidecode(text)

    # step 1 : drop duplicates
    if drop_duplicate:
        df = df.drop_duplicates().reset_index(drop=True)

    # step 2 : drop useless columns
    df = df.drop(useless_columns, axis=1)

    # step 3 : normalise strings in a single element-wise pass.
    # DataFrame.applymap was renamed DataFrame.map in pandas 2.1; use whichever exists.
    elementwise = df.map if hasattr(df, "map") else df.applymap
    return elementwise(_normalise)


In [None]:
%%time

df = basic_treatment(df=df, useless_columns=useless_columns, drop_duplicate=True)
df

## Data filter (useless)

## Data transformation (useless)

## Check and change type of columns (all good)

In [None]:
for i in df.columns :
    print(f'{i} : {df[i].dtypes}')

## Check and handle abnormal features

In [None]:
# Analyze abnormal values (simple):

#Num col :
df.describe()

In [None]:
# Cat col : 
for i in df.columns :
    if df[i].dtypes == object:
        print(f'{i} : {df[i].unique()} \n')

In [None]:
# Treat anomalies (simple) :
# Num : No abnormal values
# Cat : No abnormal values

In [None]:
# Analyze and treat abnormal values (deeper) :
# Analyse and check the inconsistencies in the train and test sets (check whether the inconsistencies are present in both sets or not) :


# Inconsistency check : "Fuel consumption " 
# Solution : drop the inconsistent rows, because some electric cars were reported as consuming fuel, which is inconsistent.
df = df[~((df['Ft'] == 'electric') & (df['Fuel consumption '].notna()))]
df = df.reset_index(drop = True)

# Inconsistency check : "z (Wh/km)" 
# Solution : fix the inconsistent rows. Append "/electric" to the "Ft" column when : z (Wh/km) != NaN, Fuel consumption != NaN, and Ft does not contain "electric"
# NOTE(review): `~` on the result of str.contains fails if "Ft" holds NaN - presumably "Ft" is always filled here; confirm.
df.loc[(df['z (Wh/km)'].notna()) & (df['Fuel consumption '].notna() & ~(df["Ft"].str.contains("electric"))), "Ft"] += "/electric"

# Inconsistency check : "Electric range (km)" 
# Solution : fix the inconsistent rows. Same solution as for "z (Wh/km)"
# NO CODE NEEDED
    
df

In [None]:
#temp :
# Save df :
df.to_pickle("./data/train_clean.pkl")

## Check and handle missing features (NaN)

In [None]:
# Analyze Non-NaN :
# Define the possible groupby variables : ['Cr', 'Ft', 'Mh']
# Before choosing the groupby col, Check if the groupby var has not NaN in x_test.

def analyse_non_nan(df : pd.DataFrame):
    """List the columns that are completely filled (no NaN at all).

    Args:
        df (pd.DataFrame): database

    Print : one line per complete column -> "column name" : "type of column"
    """
    complete_columns = [name for name in df.columns if not df[name].isna().any()]
    for name in complete_columns:
        print(f'{name} : {df[name].dtypes}')

analyse_non_nan(df=df)

In [None]:
# Analyze NaN :
def analyse_nan(df : pd.DataFrame):
    """Report every column holding at least one NaN, with its dtype and NaN count.

    Args:
        df (pd.DataFrame): database

    Print : "column name" : "type of column" | Number of NaN : "nb"
    """
    nan_counts = df.isna().sum()
    incomplete = nan_counts[nan_counts > 0]
    for name, count in incomplete.items():
        print(f'{name} : {df[name].dtypes} | Number of NaN : {count}')

analyse_nan(df=df)

In [None]:
# Delete columns which contains more than 50% of NaN or useless :

Col_to_drop = ["Enedc (g/km)", "Erwltp (g/km)"]
df = df.drop(Col_to_drop, axis=1)

In [3]:
# Impute by fix value :
# df = df.dropna(subset=[""]).reset_index(drop=True)
# df[""] = df[""].fillna()


def fillna_fix_value(df, fillna_value):
    """Fill NaN with a fixed, per-column value.

    Args:
        df (pd.DataFrame): frame to impute. It is modified in place and also returned.
        fillna_value (dict): ``{column name: fill value}``. Keys that are not
            columns of ``df`` are ignored. Text columns require a ``str`` value,
            numeric columns an ``int``/``float`` (Python or numpy scalar).

    Special cases (both read the "Ft" fuel-type column):
        - "z (Wh/km)" is only filled on rows whose "Ft" is known and does not
          contain "electric".
        - "Fuel consumption " is only filled on rows whose "Ft" is exactly "electric".

    Returns:
        pd.DataFrame: ``df`` itself. When a value has the wrong type a message is
        printed and the remaining columns are left untouched.
    """
    # Keep only the entries that target an existing column.
    fillna_value = {col: value for col, value in fillna_value.items() if col in df.columns}

    for col, value in fillna_value.items():
        column = df[col]
        is_text = pd.api.types.is_object_dtype(column) or isinstance(column.dtype, pd.StringDtype)
        # Covers every numeric width (float32, Int64, ...), not only float64/int64.
        is_number = pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column)

        # Cat col :
        if is_text:
            if not isinstance(value, str):
                print(f"{col} must be a 'str' !")
                break
            df[col] = column.fillna(value)

        # Num col :
        elif is_number:
            valid_number = isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool)
            if not valid_number:
                print(f"{col} must be a 'float' or 'int' !")
                break

            if col == "z (Wh/km)":
                # na=True marks rows with an unknown fuel type as "maybe electric",
                # so they are left for a later imputation instead of being zeroed
                # (and `~` never meets a NaN).
                rows = ~df["Ft"].str.contains("electric", na=True).astype(bool)
                df.loc[rows, col] = df.loc[rows, col].fillna(value)
            elif col == "Fuel consumption ":
                rows = df["Ft"] == "electric"
                df.loc[rows, col] = df.loc[rows, col].fillna(value)
            else:
                df[col] = column.fillna(value)
    return df


In [None]:
# Dictionary of fixed-value imputations (numerical and categorical columns) :
dict_imputation_fix = {"Country" : "unknown", "z (Wh/km)": 0,
                       "Fuel consumption ": 0,"Electric range (km)": 0}

df = fillna_fix_value(df=df, fillna_value=dict_imputation_fix)

In [4]:
# Impute by non-fix value (create the function for the next imputation): 

def fillna_non_fix(x_train, x_test, fillna_method, groupby_col, display_groupby_col=False):
    """Impute NaN in train and test with statistics computed on ``x_train`` only.

    Both frames are expected to share the same column names, and the group-by
    columns of ``x_train`` are expected to contain no NaN.

    The statistic of each column is first computed per group of ``groupby_col``.
    If that fails (for instance ``x_test`` holds a group never seen in
    ``x_train``), the last group-by column is dropped and the imputation is
    retried; when no grouping works, the statistic of the whole training column
    is used. Values still missing after a grouped fill are filled with the
    whole-column statistic.

    Args:
        x_train (pd.DataFrame): training features, modified in place.
        x_test (pd.DataFrame): features imputed with the training statistics,
            modified in place.
        fillna_method (dict): ``{column: "mode" | "median" | "mean"}``. Keys that
            are not columns of ``x_train`` are ignored.
        groupby_col (list): columns used to group, from most to least important.
        display_groupby_col (bool, optional): print the group-by columns that
            were finally used. Defaults to False.

    Returns:
        tuple: ``(x_train, x_test)``. If a method name is invalid, a message is
        printed and both frames are returned unchanged.
    """
    def _group_mode(values):
        # Most frequent value of a group, NaN when the group is entirely missing.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    def _fill_with_train_statistic(col, method):
        # Fill what is left with the statistic of the whole training column.
        if method == "mode":
            value = x_train[col].mode()[0]
        elif method == "median":
            value = x_train[col].median()
        else:
            value = x_train[col].mean()
        x_test[col] = x_test[col].fillna(value)
        x_train[col] = x_train[col].fillna(value)

    # Keep only the entries that target an existing column.
    fillna_method = {col: method for col, method in fillna_method.items() if col in x_train.columns}

    # Check the requested methods before touching any data.
    for col, method in fillna_method.items():
        if method not in ("mode", "median", "mean"):
            print(f"{col} must be imputed by mode, median or mean")
            return x_train, x_test

    depth = len(groupby_col)
    grouped_fill_done = False
    while depth > 0 and not grouped_fill_done:
        keys = groupby_col[:depth]
        try:
            for col, method in fillna_method.items():
                # Re-group for every column: a group-by column may itself have
                # been imputed by a previous iteration.
                train_groups = x_train.groupby(keys)[col]
                if method == "mode":
                    per_group = train_groups.agg(_group_mode)
                elif method == "median":
                    per_group = train_groups.median()
                else:
                    per_group = train_groups.mean()

                # s.name is the key of the group being transformed; an unseen
                # test group raises KeyError and triggers the coarser retry.
                x_test[col] = x_test.groupby(keys)[col].transform(lambda s: s.fillna(per_group[s.name]))
                x_train[col] = x_train.groupby(keys)[col].transform(lambda s: s.fillna(per_group[s.name]))

                if x_train[col].isna().any() or x_test[col].isna().any():
                    _fill_with_train_statistic(col, method)

            grouped_fill_done = True
        except Exception:
            # Columns already imputed at this depth keep their values; retry the
            # rest with one group-by column less.
            depth -= 1

    if grouped_fill_done:
        if display_groupby_col:
            print(groupby_col[:depth])
    else:
        # No usable grouping: whole-column statistics only.
        for col, method in fillna_method.items():
            _fill_with_train_statistic(col, method)

    return x_train, x_test

In [None]:
#* For now, group by a single variable only :

# Before selecting the column used for the groupby, two checks are required :
"""
col_for_groupby = "Ft"
# 1/ Verifier les valeurs .unique() avant de choisir le variable groupby :
for i in [x_val1, x_val2, x_test]:
    if all(element in set(x_train[col_for_groupby].unique()) for element in set(i[col_for_groupby].unique()))== False:
        print(f"Cannot use the column {col_for_groupby} for groupby")
        break
    else :
        print(all(element in set(x_train[col_for_groupby].unique()) for element in set(i[col_for_groupby].unique())))

# 2/ Verifier si la variable ne possède pas de NaN 
for i in [x_train, x_val1, x_val2, x_test]:
    if i[col_for_groupby].isna().any()== True:
        print(f"Cannot use the column {col_for_groupby} for groupby")
        break
    else :
        print(i[col_for_groupby].isna().any())
"""

# Groupby_col : ['Ft'], []
# Attention : Before choosing the Groupby variable. 
# Verify for each variable, if :
# - Categorical
# - Number of unique value is <= 30
# Verify for each groupby combinaison, if :
# - The number of unique combinaison is <= 80
# - Same unique value in train, val1, val2, test
# - Same unique combinaison in val1, val2, test

# Check that the train/val imputation is the same as the train/val2 and train/test ones,
# i.e. that the imputation uses the same groupby combinations in every case.



# Fillna all groupby(Ft) :
groupby_col = ['Ft']
dict_imputation_non_fix = {"VFN": "mode", "T": "mode", "Tan": "mode",
                            "Va": "mode","Ve": "mode","Ct": "mode",
                            "m (kg)": "median","Mt": "median","W (mm)": "median",
                            "At1 (mm)": "median","At2 (mm)": "median","Fm": "mode",
                            "ec (cm3)": "median","ep (KW)": "median", "z (Wh/km)" : "median",
                            "IT": "mode","Fuel consumption ": "median", "Electric range (km)":"median", 'Cr':"mode", 'Mh':"mode"}

#x_train, x_test = fillna_non_fix(x_train=x_train, x_test=x_test, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col)

# 3/ Split data as train/val :

In [None]:
#Split :

# The target is continuous, so it is cut into equal-width bins that act as
# strata: train and validation then get a similar target distribution.
# NOTE(review): stratify fails if a bin holds a single row or if the target
# contains NaN - presumably neither happens with this dataset; confirm.
num_bins = 10  # Number of bins for stratification
y_bins = pd.cut(df[target], bins=num_bins, labels=False)
df_train, df_val = train_test_split(df, test_size=0.25, stratify=y_bins, random_state=42)

#reset_index :
# The split keeps the original row labels; rebuild a 0..n-1 index on both parts.
df_train.reset_index(drop = True, inplace=True)
df_val.reset_index(drop = True, inplace=True)

df_train

# 4/ Feature engineering :

## Feature creation (useless)

## Feature selection

In [None]:
## 2.1/ Drop columns used for features creation (option) 
## 2.2/ Drop columns with low variance (num) and have the same value more than 99.5% of time (cat)
## 2.3/ Select with correlation method
## 2.4/ Select with RFE method (option)

In [None]:
# 2.1/ Drop columns used for features creation (option) : useless


In [None]:
# 2.2/ Drop columns with low variance (num) and have the same value more than 99.5% of time (cat) :

def drop_col_with_same_value(df_train, df_test, target, threshold=0.995):
    """Drop the quasi-constant columns, as measured on the training set.

    A column is dropped from both frames when its most frequent (non-NaN)
    value covers at least ``threshold`` of the training rows. The target is
    never dropped. A column holding only NaN has no most frequent value and is
    kept; an empty training frame is returned unchanged.

    Args:
        df_train (pd.DataFrame): frame on which the frequencies are measured.
        df_test (pd.DataFrame): frame from which the same columns are removed.
        target (str): name of the target column.
        threshold (float, optional): minimum share of rows taken by the most
            frequent value for a column to be dropped. Defaults to 0.995.

    Returns:
        tuple: ``(df_train, df_test)`` without the quasi-constant columns.
    """
    n_rows = df_train.shape[0]
    col_to_drop = []
    if n_rows > 0:
        for col in df_train.columns:
            if col == target:
                continue
            # value_counts is sorted by decreasing frequency and ignores NaN.
            counts = df_train[col].value_counts()
            if not counts.empty and counts.iloc[0] / n_rows >= threshold:
                col_to_drop.append(col)

    df_train = df_train.drop(col_to_drop, axis=1)
    df_test = df_test.drop(col_to_drop, axis=1)
    return df_train, df_test

df_train , df_val = drop_col_with_same_value(df_train=df_train, df_test=df_val, target=target)

df_train.columns

In [None]:
# 2.3/ Select with correlation method : 

In [None]:
# Function to calculate Cramer's V
def cramers_v(x, y):
    """Cramer's V association between two categorical series.

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))), where chi2 comes from the
    chi-square test run on the contingency table of ``x`` against ``y``.
    """
    table = pd.crosstab(x, y)

    # Only the chi-square statistic (first element of the result) is needed.
    chi2 = chi2_contingency(table)[0]

    total = table.sum().sum()
    smallest_side = min(table.shape) - 1
    return np.sqrt(chi2 / (total * smallest_side))

In [None]:
# Function to calculate ANOVA eta_carre

def eta_carre(x, y) :
    """Eta squared (correlation ratio) between a categorical and a numerical series.

    eta^2 = SS_between / SS_total, i.e. the share of the variance of ``y`` that
    is explained by the categories of ``x`` (the effect size of a one-way ANOVA).
    It is computed directly from the group means, so category labels and column
    names may contain any character.

    Args:
        x (pd.Series): categorical values (x = Cat col).
        y (pd.Series): numerical values (y = Num col), aligned with ``x`` on the index.

    Returns:
        float: eta squared in [0, 1]; NaN when it is undefined (no complete
        (x, y) pair, or ``y`` constant).
    """
    # Align both series on their index and keep the complete pairs only.
    pairs = pd.concat([x.rename("category"), y.rename("value")], axis=1).dropna()
    if pairs.empty:
        return np.nan

    grand_mean = pairs["value"].mean()
    ss_total = ((pairs["value"] - grand_mean) ** 2).sum()
    if ss_total == 0:
        # A constant y has no variance to explain.
        return np.nan

    # Between-group sum of squares: group size * squared gap to the grand mean.
    groups = pairs.groupby("category", observed=True)["value"].agg(["mean", "count"])
    ss_between = (groups["count"] * (groups["mean"] - grand_mean) ** 2).sum()

    return ss_between / ss_total

In [None]:
# Features_selection : 
# Ne pas supprimer les col du groupby : ['Ft']

def Features_selection(df_train, df_test, target, prediction_type, groupby_col=groupby_col, threshold=0.8):
    """Remove redundant features: in every strongly associated pair, drop the
    feature that is the least associated with the target.

    Three passes are run in sequence, each one on the columns left by the
    previous one:
        1. Num-Num : Spearman correlation between continuous numeric columns.
        2. Cat-Cat : Cramer's V between categorical columns.
        3. Num-Cat : eta squared between continuous and categorical columns.

    Column roles:
        - continuous : numeric with more than 2 distinct values;
        - categorical : object columns with at most 100 distinct values, plus
          numeric columns with at most 2 distinct values;
        - the target and the ``groupby_col`` columns are never candidates.

    Association with the target is measured with Spearman / eta squared for
    ``prediction_type="regression"`` and eta squared / Cramer's V for
    ``"classification"``. Its absolute value is used, so a strong negative
    correlation counts as much as a strong positive one.

    Args:
        df_train (pd.DataFrame): frame on which every association is measured.
        df_test (pd.DataFrame): frame from which the same columns are dropped.
        target (str): name of the target column.
        prediction_type (str): "regression" or "classification".
        groupby_col (list): columns to protect. The default is the value the
            notebook-level ``groupby_col`` had when this function was defined.
        threshold (float): a pair is redundant when |association| > threshold.

    Returns:
        tuple: ``(df_train, df_test)`` without the dropped columns.

    Raises:
        ValueError: if ``prediction_type`` is not one of the two supported values.
    """
    if prediction_type not in ("regression", "classification"):
        raise ValueError(f"prediction_type must be 'regression' or 'classification', got {prediction_type!r}")

    def _candidate_columns(frame):
        # Split the candidate columns of `frame` into (continuous, categorical).
        numeric = [col for col in frame.select_dtypes(include=[np.number]).columns.tolist()
                   if col != target and col not in groupby_col]
        continuous = [col for col in numeric if frame[col].nunique() > 2]
        binary = [col for col in numeric if frame[col].nunique() <= 2]
        categorical = [col for col in frame.select_dtypes(include=['object']).columns.tolist()
                       if col != target and frame[col].nunique() <= 100 and col not in groupby_col]
        return continuous, categorical + binary

    def _target_strength(frame, col, is_continuous):
        # Strength of the link between `col` and the target, on rows where `col` is known.
        rows = frame.dropna(subset=[col]).reset_index(drop=True)
        if prediction_type == "regression":
            if is_continuous:
                correlation, p_value = spearmanr(rows[col], rows[target])
            else:
                correlation = eta_carre(rows[col], rows[target])
        else:
            if is_continuous:
                correlation = eta_carre(rows[target], rows[col])
            else:
                correlation = cramers_v(rows[col], rows[target])
        return abs(correlation)

    def _weaker_columns(frame, pairs, continuous_cols):
        # For each (first, second, association) pair, pick the column to drop.
        strength = {}
        for first, second, _ in pairs:
            for col in (first, second):
                if col not in strength:
                    strength[col] = _target_strength(frame, col, col in continuous_cols)

        to_drop = set()
        for first, second, _ in pairs:
            if strength[first] > strength[second]:
                to_drop.add(second)
            elif strength[first] <= strength[second]:
                # Neither branch runs when a strength is NaN: the pair is kept.
                to_drop.add(first)
        return to_drop

    list_num_col, list_cat_col = _candidate_columns(df_train)

    # 1/ Num-Num :
    print("Num-Num")
    corr_matrix_num = df_train[list_num_col].corr(numeric_only=True, method="spearman")
    names = corr_matrix_num.columns
    # Lower triangle only: every unordered pair is visited once.
    pairs_num = [(names[i], names[j], corr_matrix_num.iloc[i, j])
                 for i in range(len(names)) for j in range(i)
                 if abs(corr_matrix_num.iloc[i, j]) > threshold]
    col_to_drop_num = _weaker_columns(df_train, pairs_num, set(list_num_col))

    df_train = df_train.drop(list(col_to_drop_num), axis=1)
    df_test = df_test.drop(list(col_to_drop_num), axis=1)
    print(list(col_to_drop_num))
    print("End Num-Num")

    # 2/ Cat-Cat :
    print("Cat-Cat")
    pairs_cat = []
    for position, first in enumerate(list_cat_col):
        for second in list_cat_col[position + 1:]:
            rows = df_train.dropna(subset=[first, second])
            association = cramers_v(rows[first], rows[second])
            if abs(association) > threshold:
                pairs_cat.append((first, second, association))
    col_to_drop_cat = _weaker_columns(df_train, pairs_cat, set())

    df_train = df_train.drop(list(col_to_drop_cat), axis=1)
    df_test = df_test.drop(list(col_to_drop_cat), axis=1)
    print(list(col_to_drop_cat))
    print("End Cat-Cat")

    # 3/ Num-Cat :
    print("Num-Cat")
    # Columns were dropped by the first two passes: rebuild both lists.
    list_num_col, list_cat_col = _candidate_columns(df_train)
    pairs_num_cat = []
    for cat_col in list_cat_col:
        for num_col in list_num_col:
            rows = df_train.dropna(subset=[cat_col, num_col]).reset_index(drop=True)
            association = eta_carre(x=rows[cat_col], y=rows[num_col])
            if abs(association) > threshold:
                pairs_num_cat.append((num_col, cat_col, association))
    col_to_drop_num_cat = _weaker_columns(df_train, pairs_num_cat, set(list_num_col))

    df_train = df_train.drop(list(col_to_drop_num_cat), axis=1)
    df_test = df_test.drop(list(col_to_drop_num_cat), axis=1)
    print(list(col_to_drop_num_cat))
    print("End Num-Cat")

    return df_train, df_test


In [None]:
# 2.4/ RFE

def Feature_selection_RFE(df_train=df_train, df_test=df_val, target=target, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col, display_selected_features=False) :
    """Select features with Recursive Feature Elimination on a linear regression.

    The features are imputed, encoded and scaled on copies, ranked by one RFE
    run, and the subset size giving the lowest mean absolute error on
    ``df_test`` is kept. The returned frames hold the original (untransformed)
    values of the selected columns plus the target.

    NOTE(review): the subset size is chosen on ``df_test``, so the score
    measured on that same set afterwards is optimistic.

    Args:
        df_train (pd.DataFrame): training frame, target included.
        df_test (pd.DataFrame): evaluation frame, target included.
        target (str): name of the target column.
        fillna_method (dict): passed to ``fillna_non_fix``.
        groupby_col (list): passed to ``fillna_non_fix``.
        display_selected_features (bool): print the best score and subset.

    The defaults are the values the notebook-level variables had when this
    function was defined.

    Returns:
        tuple: ``(df_train_new, df_test_new)`` restricted to the selected
        features and the target.
    """
    # Create x, y :
    X_train = df_train.drop([target], axis=1)
    Y_train = df_train[target]
    X_test = df_test.drop([target], axis=1)
    Y_test = df_test[target]

    # fillna :
    X_train, X_test = fillna_non_fix(x_train=X_train, x_test=X_test, fillna_method=fillna_method, groupby_col=groupby_col)

    # encoding/scaling :
    # continuous = numeric with more than 2 distinct values; every object column is target-encoded.
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    list_cont_col = [col for col in numeric_cols if X_train[col].nunique() > 2]
    list_cat_col_TE = X_train.select_dtypes(include=['object']).columns.tolist()

    pre_process = pre_processing()
    X_train = pre_process.pre_processing(df=X_train, train=True, categorical_var_OHE=[],
                                         categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE, 
                                         target=Y_train, continious_var=list_cont_col, encoding_type_cont=StandardScaler())

    # presumably train=False reuses the encoders fitted above - verify in pre_processing.py
    X_test = pre_process.pre_processing(df=X_test, train=False, categorical_var_OHE=[],
                                     categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE, 
                                     target=Y_train, continious_var=list_cont_col, encoding_type_cont=StandardScaler())

    # Features selection :
    # With step=1 the elimination order does not depend on the requested number
    # of features, so a single run down to one feature ranks them all: the
    # subset of size k is made of the features ranked 1..k.
    ranking = RFE(LinearRegression(), n_features_to_select=1, step=1).fit(X_train, Y_train).ranking_

    # inf (not an arbitrary 1000) so that the first subset always becomes the best one.
    rfe_score = {"nb_var" : None, "score_train": float("inf"), "best_score_test" : float("inf"), "selected_features": []}

    for i in tqdm(range(1,len(X_train.columns)+1)) :
        kept = list(X_train.columns[ranking <= i])
        model = LinearRegression()
        model.fit(X_train[kept], Y_train)

        score_train = mean_absolute_error(Y_train, model.predict(X_train[kept]))
        score_test = mean_absolute_error(Y_test, model.predict(X_test[kept]))

        if score_test < rfe_score["best_score_test"] :
            rfe_score["nb_var"] = i
            rfe_score["score_train"] = score_train
            rfe_score["best_score_test"] = score_test
            rfe_score["selected_features"] = kept

    if display_selected_features==True:
        print(f"RFE_score : {rfe_score}")

    df_train_new = pd.concat([df_train[rfe_score["selected_features"]], df_train[target]], axis=1)
    df_test_new = pd.concat([df_test[rfe_score["selected_features"]], df_test[target]], axis=1)

    return df_train_new, df_test_new
    

In [None]:
%%time

df_train, df_val = Feature_selection_RFE(df_train=df_train, df_test=df_val, target=target, fillna_method=dict_imputation_non_fix, 
                                         groupby_col=groupby_col, display_selected_features=True)

In [None]:
display(df_train.shape)
display(df_val.shape)

In [None]:
df_train.columns

#  __________________________________ Beginning : ________________________________

In [5]:
#temp :
df_train = pd.read_pickle("./data/df_train.pkl")
df_val = pd.read_pickle("./data/df_val.pkl")

target = "Ewltp (g/km)"

## Row selection (useless)

## Create x and y

In [6]:
def create_x_y(df_train, df_test, target):
    """Split each frame into its features and its target.

    Returns ``(x_train, y_train, x_test, y_test)``; every piece is an
    independent copy of the source data.
    """
    def _features_and_target(frame):
        features = frame.drop(columns=[target]).copy()
        labels = frame[target].copy()
        return features, labels

    x_train, y_train = _features_and_target(df_train)
    x_test, y_test = _features_and_target(df_test)
    return x_train, y_train, x_test, y_test

x_train, y_train, x_val, y_val = create_x_y(df_train=df_train, df_test=df_val, target=target)

# 5/ Modelling :

## Display Data treatment methods before modelling 

#### A/ Fillna non-fix method

In [7]:
# Impute by non-fix value : 

# Fillna all groupby(Ft) :
groupby_col = ['Ft']
dict_imputation_non_fix = {"VFN": "mode", "T": "mode", "Tan": "mode",
                            "Va": "mode","Ve": "mode","Ct": "mode",
                            "m (kg)": "median","Mt": "median","W (mm)": "median",
                            "At1 (mm)": "median","At2 (mm)": "median","Fm": "mode",
                            "ec (cm3)": "median","ep (KW)": "median", "z (Wh/km)" : "median",
                            "IT": "mode","Fuel consumption ": "median", "Electric range (km)":"median", 'Cr':"mode", 'Mh':"mode"}

#x_train, x_test = fillna_non_fix(x_train=x_train, x_test=x_test, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col)

#### B/ Encoding/Scaling method

In [8]:
# Assign every feature of x_train to exactly one pre-processing family.

#numeric and > 2 :
# Continuous features: numeric columns with more than 2 distinct values.
list_cont_col = x_train.select_dtypes(include=[np.number]).columns.tolist()
list_cont_col = [col for col in list_cont_col if x_train[col].nunique() > 2]

#numeric and <= 2 :
# Binary features: numeric columns with at most 2 distinct values.
list_binary_col = x_train.select_dtypes(include=[np.number]).columns.tolist()
list_binary_col = [col for col in list_binary_col if x_train[col].nunique() <= 2]

#categorical col :
# Non-numeric columns are split on their cardinality: at most 30 distinct values
# go to the "OHE" list, more than 30 to the "TE" list.
# presumably OHE = one-hot encoding and TE = target encoding - confirm in pre_processing.py
list_cat_col = x_train.select_dtypes(exclude=[np.number]).columns.tolist()
list_cat_col_OHE = [col for col in list_cat_col if x_train[col].nunique() <= 30]
list_cat_col_TE =  [col for col in list_cat_col if x_train[col].nunique() > 30]

# Check if all columns are taken :
# The four lists must add up to the number of columns of x_train (expected output: True).
print(list_cont_col)
print(list_binary_col)
print(list_cat_col_OHE)
print(list_cat_col_TE)
len(list_cont_col) + len(list_binary_col) + len(list_cat_col_OHE) + len(list_cat_col_TE) == x_train.shape[1]

['m (kg)', 'Mt', 'W (mm)', 'At1 (mm)', 'At2 (mm)', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)', 'Fuel consumption ', 'Electric range (km)']
[]
['Ct', 'Cr', 'Ft', 'Fm']
['VFN', 'T', 'Va', 'Ve']


True

## Model testing (CV method)

#### 0/ Evaluation metric 

In [9]:
# MAE_score_CV :

def MAE_score_CV(x_train, y_train, model, fillna_method, groupby_col, list_cat_col_OHE=None, list_cat_col_TE=None, list_cont_col=None, cv=5, random_state=42, encoding=False, display_cv=False):
    """Return the mean validation MAE of `model` over a stratified K-fold CV.

    The continuous target is cut into 8 equal-width bins that are used only to
    stratify the folds. For each fold, the data is imputed with `fillna_non_fix`
    (training fold passed as `x_train`, validation fold as `x_test`), optionally
    encoded with `pre_processing`, then `model` is fitted on the training fold and
    scored on both parts with `mean_absolute_error`.

    Args:
        x_train (pd.DataFrame): Feature matrix.
        y_train (pd.Series): Target, row-aligned with `x_train`.
        model: Regressor exposing `fit` / `predict`; it is re-fitted on every fold.
        fillna_method (dict): Forwarded to `fillna_non_fix`.
        groupby_col (list): Forwarded to `fillna_non_fix`.
        list_cat_col_OHE (list, optional): Forwarded as `categorical_var_OHE` when `encoding` is set.
        list_cat_col_TE (list, optional): Forwarded as `categorical_var_TE` when `encoding` is set.
        list_cont_col (list, optional): Currently unused; kept so existing calls keep working.
        cv (int): Number of folds.
        random_state (int): Seed used to shuffle the folds.
        encoding (bool): Whether to run the `pre_processing` encoding step.
        display_cv (bool): Whether to display the per-fold train/val MAE.

    Returns:
        float: Mean of the per-fold validation MAE.
    """
    num_bins = 8  # Number of bins for stratification
    y_bins = pd.cut(y_train, bins=num_bins, labels=False)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state) #StratifiedKFold or #KFold

    list_score = []
    dict_score = {}

    for i, (train_idx, valid_idx) in enumerate(skf.split(x_train, y_bins)):
        # split() yields *positional* indices, so rows must be selected with .iloc;
        # .loc would pick wrong rows (or raise) whenever the index is not 0..n-1.
        X_train = x_train.iloc[train_idx].reset_index(drop=True)
        Y_train = y_train.iloc[train_idx].reset_index(drop=True)
        X_val = x_train.iloc[valid_idx].reset_index(drop=True)
        Y_val = y_train.iloc[valid_idx].reset_index(drop=True)

        # fillna :
        X_train, X_val = fillna_non_fix(x_train=X_train, x_test=X_val, fillna_method=fillna_method, groupby_col=groupby_col)

        # encoding : the same pre_processing instance handles the training fold
        # (train=True) and then the validation fold (train=False).
        if encoding:
            pre_process = pre_processing()
            X_train = pre_process.pre_processing(df=X_train, train=True, categorical_var_OHE=list_cat_col_OHE,
                                                 categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE,
                                                 target=Y_train, continious_var=[], encoding_type_cont=MinMaxScaler())

            X_val = pre_process.pre_processing(df=X_val, train=False, categorical_var_OHE=list_cat_col_OHE,
                                               categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE,
                                               target=Y_train, continious_var=[], encoding_type_cont=MinMaxScaler())

        # training on train set :
        model.fit(X_train, Y_train)

        # score :
        score_train = mean_absolute_error(Y_train, model.predict(X_train))
        score_val = mean_absolute_error(Y_val, model.predict(X_val))

        dict_score[f"cv {str(i)}"] = {"train": score_train, "val": score_val}
        list_score.append(score_val)

    if display_cv:
        display(dict_score)

    return mean(list_score)

### A/ Catboost (to review)

In [None]:
# Naive baseline: CatBoost with hand-picked hyperparameters, trained and scored on MAE.

model = CatBoostRegressor(iterations=150, # Number of boosting iterations
                                    depth=6, # Depth of the tree
                                    learning_rate=0.1, # Step size shrinkage
                                    loss_function='MAE', 
                                    eval_metric='MAE',
                                    cat_features=list_cat_col_OHE + list_cat_col_TE, # Names of the categorical columns (all non-numeric features)
                                    silent=True)

In [None]:
%%time

MAE_score_CV(x_train=x_train, y_train=y_train, model=model, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col, list_cat_col_OHE=None, 
             list_cat_col_TE=None, list_cont_col=None, cv=5, random_state=42, encoding=False, display_cv=True)

In [None]:
# Hyperparameter search (optuna) for CatBoost, scored by 3-fold CV MAE :

def objective(trial):
    """Sample one CatBoost configuration and return its cross-validated MAE."""
    # Search space. The sampling order is kept as-is: each suggest_* call
    # consumes the sampler in sequence.
    sampled = {
        "learning_rate": trial.suggest_float(name='learning_rate', low=0.01, high=1),
        "colsample_bylevel": trial.suggest_float(name="colsample_bylevel", low=0.3, high=1),
        "depth": trial.suggest_int(name="depth", low=1, high=4),
        "reg_lambda": trial.suggest_float(name="reg_lambda", low=0.01, high=10),
        "iterations": trial.suggest_int(name="iterations", low=5, high=300),
        "random_strength": trial.suggest_float(name="random_strength", low=0, high=10),
        "bagging_temperature": trial.suggest_float(name="bagging_temperature", low=0, high=10),
    }

    candidate = CatBoostRegressor(silent=True, loss_function='MAE', eval_metric='MAE',
                                  cat_features=list_cat_col_OHE + list_cat_col_TE,
                                  **sampled)

    # No explicit encoding: the categorical columns are handed to CatBoost via cat_features.
    return MAE_score_CV(x_train=x_train, y_train=y_train, model=candidate,
                        fillna_method=dict_imputation_non_fix, groupby_col=groupby_col,
                        list_cat_col_OHE=None, list_cat_col_TE=None, list_cont_col=None,
                        cv=3, random_state=42, encoding=False, display_cv=False)


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)

In [None]:
trial = study.best_trial
print('score : {}'.format(trial.value)) # replace scoring='accuracy' by "recall"  #or auc
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Rebuild CatBoost with the best hyperparameters found by the study :

best_cat_model = CatBoostRegressor(
    silent=True,
    loss_function='MAE',
    eval_metric='MAE',
    cat_features=list_cat_col_OHE + list_cat_col_TE,
    # Forward exactly the tuned hyperparameters, read from the best trial.
    **{name: (trial.params)[name]
       for name in ("learning_rate", "colsample_bylevel", "depth", 'reg_lambda',
                    "iterations", "random_strength", "bagging_temperature")},
)

In [None]:
%%time

MAE_score_CV(x_train=x_train, y_train=y_train, model=best_cat_model, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col, list_cat_col_OHE=None, 
             list_cat_col_TE=None, list_cont_col=None, cv=5, random_state=42, encoding=False, display_cv=True)

### B/ Xgboost

In [None]:
# Naiv modelling :

model = XGBRegressor(random_state=42)

In [None]:
%%time

MAE_score_CV(x_train=x_train, y_train=y_train, model=model, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col, list_cat_col_OHE=list_cat_col_OHE, 
             list_cat_col_TE=list_cat_col_TE, list_cont_col=[], cv=2, random_state=42, encoding=True, display_cv=True)

In [None]:
# Hyperparameter search (optuna) for XGBoost, scored by 5-fold CV MAE :

def objective(trial):
    """Sample one XGBRegressor configuration and return its cross-validated MAE."""
    sampled = {
        # Maximum tree depth.
        # NOTE(review): the range starts at 0, which XGBoost documents as "no depth limit" - confirm this is intended.
        "max_depth": trial.suggest_int('max_depth', 0, 50, step=2),
        "learning_rate": trial.suggest_categorical('learning_rate', [0.01,0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]),
        # Fraction of columns sampled for each tree.
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.1, 1, step=0.1),
        # Number of trees.
        "n_estimators": trial.suggest_int('n_estimators', 1,1001, step=50),
    }

    candidate = XGBRegressor(random_state=42, n_jobs=-1, **sampled)  #, tree_method='gpu_hist', predictor="gpu_predictor"

    # XGBoost is fed encoded features, hence encoding=True.
    return MAE_score_CV(x_train=x_train, y_train=y_train, model=candidate,
                        fillna_method=dict_imputation_non_fix, groupby_col=groupby_col,
                        list_cat_col_OHE=list_cat_col_OHE, list_cat_col_TE=list_cat_col_TE, list_cont_col=[],
                        cv=5, random_state=42, encoding=True, display_cv=False)


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

In [None]:
trial = study.best_trial
print('score : {}'.format(trial.value)) 
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Rebuild XGBoost with the best hyperparameters found by the study :

best_xgb_model = XGBRegressor(
    random_state=42,
    n_jobs=-1,
    # Forward exactly the tuned hyperparameters, read from the best trial.
    **{name: (trial.params)[name]
       for name in ("max_depth", "learning_rate", "colsample_bytree", "n_estimators")},
)  #, tree_method='gpu_hist'

In [None]:
%%time

MAE_score_CV(x_train=x_train, y_train=y_train, model=best_xgb_model, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col, list_cat_col_OHE=list_cat_col_OHE, 
             list_cat_col_TE=list_cat_col_TE, list_cont_col=[], cv=5, random_state=42, encoding=True, display_cv=True)

## Model testing (train/val1/val2 method)

#### 0*/ Data treatment

In [10]:
# Split the validation frame into two halves: val1 (used below for hyperparameter
# tuning) and val2 (used later for the final check).
target = "Ewltp (g/km)"

num_bins = 8  # Number of equal-width target bins used only for stratification
y_bins = pd.cut(df_val[target], bins=num_bins, labels=False)
# Stratifying on the binned target keeps a similar target distribution in both halves.
df_val1, df_val2 = train_test_split(df_val, test_size=0.5, stratify=y_bins, random_state=42)

# Reset both indexes to 0..n-1 after the shuffle.
df_val1 = df_val1.reset_index(drop = True)
df_val2 = df_val2.reset_index(drop = True)

# Create x_val1, y_val1, x_val2, y_val2 (create_x_y is reused with val1 as "train" and val2 as "test").
x_val1, y_val1, x_val2, y_val2 = create_x_y(df_train=df_val1, df_test=df_val2, target=target)

In [11]:
# Impute missing values in train and val1 (train passed as x_train, val1 as x_test).
x_train_imp, x_val1_imp = fillna_non_fix(x_train=x_train, x_test=x_val1, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col)

# Encoding/Scaling : one pre_processing instance is used with train=True on the
# training set, then with train=False on val1.
# NOTE(review): presumably train=False reuses the encoders fitted during the
# train=True call and ignores the fresh MinMaxScaler() - confirm in pre_processing.py.
pre_process = pre_processing()
x_train_process = pre_process.pre_processing(df=x_train_imp, train=True, categorical_var_OHE=list_cat_col_OHE,
                                     categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE, 
                                     target=y_train, continious_var=[], encoding_type_cont=MinMaxScaler())

x_val1_process = pre_process.pre_processing(df=x_val1_imp, train=False, categorical_var_OHE=list_cat_col_OHE,
                                 categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE, 
                                 target=y_train, continious_var=[], encoding_type_cont=MinMaxScaler())

`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


### A*/ Catboost (to review)

In [None]:
%%time
# Naive baseline: CatBoost with hand-picked hyperparameters, fitted on the imputed
# (not encoded) training set and scored on val1.

# Instantiate :
model = CatBoostRegressor(iterations=150, # Number of boosting iterations
                                    depth=6, # Depth of the tree
                                    learning_rate=0.1, # Step size shrinkage
                                    loss_function='MAE', 
                                    eval_metric='MAE',
                                    cat_features=list_cat_col_OHE + list_cat_col_TE, # Names of the categorical columns (all non-numeric features)
                                    silent=True)

# Training :
model.fit(x_train_imp, y_train)

# MAE score (CatBoost gets the imputed frames, not the encoded *_process ones) :
print(f"MAE on train : {mean_absolute_error(y_train, model.predict(x_train_imp))}")
print(f"MAE on val : {mean_absolute_error(y_val1, model.predict(x_val1_imp))}")

In [None]:
# Hyperparameter search (optuna) for CatBoost, scored by MAE on val1 :

def objective(trial):
    """Sample one CatBoost configuration, fit it on train and return its MAE on val1."""
    # Search space. The sampling order is kept as-is: each suggest_* call
    # consumes the sampler in sequence.
    sampled = {
        "learning_rate": trial.suggest_float(name='learning_rate', low=0.01, high=1),
        "colsample_bylevel": trial.suggest_float(name="colsample_bylevel", low=0.3, high=1),
        "depth": trial.suggest_int(name="depth", low=1, high=4),
        "reg_lambda": trial.suggest_float(name="reg_lambda", low=0.01, high=10),
        "iterations": trial.suggest_int(name="iterations", low=5, high=300),
        "random_strength": trial.suggest_float(name="random_strength", low=0, high=10),
        "bagging_temperature": trial.suggest_float(name="bagging_temperature", low=0, high=10),
    }

    candidate = CatBoostRegressor(silent=True, loss_function='MAE', eval_metric='MAE',
                                  cat_features=list_cat_col_OHE + list_cat_col_TE,
                                  **sampled)

    # CatBoost is trained on the imputed (not encoded) frames.
    candidate.fit(x_train_imp, y_train)

    return mean_absolute_error(y_val1, candidate.predict(x_val1_imp))


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

In [None]:
# Keep the best trial of the CatBoost study under its own name, and report *that*
# trial. `trial` is a leftover of an earlier study (or undefined on a fresh kernel).
Catboost_trial = study.best_trial
print('score : {}'.format(Catboost_trial.value))
print("Best hyperparameters: {}".format(Catboost_trial.params))

In [None]:
# Rebuild CatBoost with the best hyperparameters of the val1 study, then fit on train :

best_cat_model = CatBoostRegressor(
    silent=True,
    loss_function='MAE',
    eval_metric='MAE',
    cat_features=list_cat_col_OHE + list_cat_col_TE,
    # Forward exactly the tuned hyperparameters, read from the best trial.
    **{name: (Catboost_trial.params)[name]
       for name in ("learning_rate", "colsample_bylevel", "depth", 'reg_lambda',
                    "iterations", "random_strength", "bagging_temperature")},
)

# Training (imputed, not encoded, features) :
best_cat_model.fit(x_train_imp, y_train)

In [None]:
# Save the model temporary :
joblib.dump(value = best_cat_model, filename = './MODEL/Temp/train_val1_val2/best_cat_model.pkl')

### B*/ Xgboost

In [None]:
%%time
# Naiv modelling :

# Instanciate :
model = XGBRegressor(random_state=42, objective='reg:squarederror', booster= 'gbtree', eval_metric='mae', n_jobs=-1)

# Training :
model.fit(x_train_process, y_train)

# MAE score :
print(f"MAE on train : {mean_absolute_error(y_train, model.predict(x_train_process))}")
print(f"MAE on val : {mean_absolute_error(y_val1, model.predict(x_val1_process))}") #y_val1, x_val1_process

In [None]:
# Hyperparameter search (optuna) for XGBoost, scored by MAE on val1 :

def objective(trial):
    """Sample one XGBRegressor configuration, fit it on train and return its MAE on val1."""
    sampled = {
        "learning_rate": trial.suggest_float('learning_rate', 0.1, 0.4, step=0.001),
        "n_estimators": trial.suggest_int('n_estimators', 50, 2010, step=20),  # number of trees
        "max_depth": trial.suggest_int('max_depth', 3, 40, step=1),  # maximum tree depth
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),  # column fraction per tree
        "subsample": trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
        "gamma": trial.suggest_float('gamma', 0, 5, step=0.1),
        "reg_alpha": trial.suggest_float('reg_alpha', 0, 1, step=0.1),
        "reg_lambda": trial.suggest_float('reg_lambda', 0, 1, step=0.1),
    }

    candidate = XGBRegressor(random_state=42, n_jobs=-1, objective='reg:squarederror',
                             booster= 'gbtree', eval_metric='mae',
                             **sampled)  #, tree_method='gpu_hist', predictor="gpu_predictor"

    # XGBoost is trained on the encoded frames.
    candidate.fit(x_train_process, y_train)

    return mean_absolute_error(y_val1, candidate.predict(x_val1_process))


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

In [None]:
XGB_trial = study.best_trial
print('score : {}'.format(XGB_trial.value)) 
print("Best hyperparameters: {}".format(XGB_trial.params))

In [None]:
# Rebuild XGBoost with the best hyperparameters of the val1 study, then fit on train :

best_xgb_model = XGBRegressor(
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror',
    booster= 'gbtree',
    eval_metric='mae',
    # Forward exactly the tuned hyperparameters, read from the best trial.
    **{name: (XGB_trial.params)[name]
       for name in ("learning_rate", "n_estimators", "max_depth", "colsample_bytree",
                    "subsample", "gamma", "reg_alpha", "reg_lambda")},
)  #, tree_method='gpu_hist', predictor="gpu_predictor"

# Training (encoded features) :
best_xgb_model.fit(x_train_process, y_train)

In [None]:
# Save the model temporary :
joblib.dump(value = best_xgb_model, filename = './MODEL/Temp/train_val1_val2/best_xgb_model.pkl')

### C*/ Random forest

In [None]:
%%time
# Naiv modelling :

# Instanciate :
model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Training :
model.fit(x_train_process, y_train)

# MAE score :
print(f"MAE on train : {mean_absolute_error(y_train, model.predict(x_train_process))}")
print(f"MAE on val : {mean_absolute_error(y_val1, model.predict(x_val1_process))}") #y_val1, x_val1_process

In [None]:
# Hyperparameter search (optuna) for the random forest, scored by MAE on val1 :

def objective(trial):
    """Sample one RandomForestRegressor configuration, fit it on train and return its MAE on val1.

    Args:
        trial (optuna.trial.Trial): Trial used to sample the hyperparameters.

    Returns:
        float: Mean absolute error of the fitted forest on (x_val1_process, y_val1).
    """
    #hyperparameters :
    max_depth = trial.suggest_int('max_depth', 2, 50, step=2)  # maximum tree depth
    max_features = trial.suggest_categorical('max_features', ["log2","sqrt",None])
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10, step=1)  # min samples per leaf
    # scikit-learn requires max_leaf_nodes >= 2 (or None); sampling 1 made the fit
    # raise and aborted the whole study, so the range starts at 2.
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 10, step=1)  # max leaves per tree
    min_samples_split = trial.suggest_int('min_samples_split', 4, 20, step=1)
    n_estimators = trial.suggest_int('n_estimators', 40,2000, step=20)  # number of trees

    # instantiate :
    model = RandomForestRegressor(random_state=42, n_jobs=-1, max_depth=max_depth, max_features=max_features, min_samples_leaf=min_samples_leaf,
                                max_leaf_nodes=max_leaf_nodes, min_samples_split=min_samples_split, n_estimators=n_estimators)

    # Training (encoded features) :
    model.fit(x_train_process, y_train)

    # score :
    score = mean_absolute_error(y_val1, model.predict(x_val1_process))

    return score


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

In [None]:
rf_trial = study.best_trial
print('score : {}'.format(rf_trial.value)) 
print("Best hyperparameters: {}".format(rf_trial.params))

In [None]:
# Rebuild the random forest with the best hyperparameters of the study, then fit on train :

best_rf_model = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    # Forward exactly the tuned hyperparameters, read from the best trial.
    **{name: (rf_trial.params)[name]
       for name in ("max_depth", "max_features", "min_samples_leaf",
                    "max_leaf_nodes", "min_samples_split", "n_estimators")},
)

# Training (encoded features) :
best_rf_model.fit(x_train_process, y_train)

In [None]:
# Save the model temporary :
joblib.dump(value = best_rf_model, filename = './MODEL/Temp/train_val1_val2/best_rf_model.pkl')

### D*/ LGBM

In [None]:
%%time
# Naiv modelling :

# Instanciate :
model = LGBMRegressor(random_state=42, n_jobs=-1)

# Training :
model.fit(x_train_process, y_train)

# MAE score :
print(f"MAE on train : {mean_absolute_error(y_train, model.predict(x_train_process))}")
print(f"MAE on val : {mean_absolute_error(y_val1, model.predict(x_val1_process))}") #y_val1, x_val1_process

In [None]:
# Hyperparameter search (optuna) for LightGBM, scored by MAE on val1 :

def objective(trial):
    """Sample one LGBMRegressor configuration, fit it on train and return its MAE on val1."""
    # Search space. The sampling order is kept as-is: each suggest_* call
    # consumes the sampler in sequence.
    sampled = {
        # Maximum tree depth; None is one of the sampled choices.
        "max_depth": trial.suggest_categorical('max_depth', [5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 23, 25, 28, 30, 40, 50, None]),
        "learning_rate": trial.suggest_float('learning_rate', 0.1, 0.4, step=0.001),
        "n_estimators": trial.suggest_int('n_estimators', 40,2000, step=20),  # number of trees
        "boosting_type": trial.suggest_categorical('boosting_type', ["gbdt", "dart"]),
        "num_leaves": trial.suggest_int('num_leaves', 10,200,step=5),
        #"feature_fraction": trial.suggest_float('feature_fraction', 0.1,0.999),
        "subsample": trial.suggest_float('subsample', 0.1,0.999),
        "reg_alpha": trial.suggest_float('reg_alpha', 0.001,0.999),
        "reg_lambda": trial.suggest_float('reg_lambda', 0.001,0.999),
    }

    candidate = LGBMRegressor(random_state=42, **sampled)

    # LightGBM is trained on the encoded frames.
    candidate.fit(x_train_process, y_train)

    return mean_absolute_error(y_val1, candidate.predict(x_val1_process))


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

In [None]:
LGBM_trial = study.best_trial
print('score : {}'.format(LGBM_trial.value)) 
print("Best hyperparameters: {}".format(LGBM_trial.params))

In [None]:
# Rebuild LightGBM with the best hyperparameters of the study, then fit on train.
# The estimator must be LGBMRegressor: RandomForestRegressor does not accept
# learning_rate / num_leaves / boosting_type / subsample / reg_alpha / reg_lambda
# and would raise a TypeError here.

best_lgbm_model = LGBMRegressor(random_state=42, n_jobs=-1, max_depth=(LGBM_trial.params)['max_depth'],
                                learning_rate=(LGBM_trial.params)['learning_rate'],
                                n_estimators=(LGBM_trial.params)['n_estimators'], num_leaves=(LGBM_trial.params)['num_leaves'],
                                boosting_type=(LGBM_trial.params)['boosting_type'], subsample=(LGBM_trial.params)['subsample'],
                                reg_alpha=(LGBM_trial.params)['reg_alpha'], reg_lambda=(LGBM_trial.params)['reg_lambda'])

# Training (encoded features) :
best_lgbm_model.fit(x_train_process, y_train)

In [None]:
# Save the model temporary :
joblib.dump(value = best_lgbm_model, filename = './MODEL/Temp/train_val1_val2/best_lgbm_model.pkl')

## Combination testing (train/val1/val2)

#### 0/ Import all models

In [None]:
# Load all model :
best_cat_model = joblib.load(filename = './MODEL/Temp/train_val1_val2/best_cat_model.pkl')
best_xgb_model = joblib.load(filename = './MODEL/Temp/train_val1_val2/best_xgb_model.pkl')
best_rf_model = joblib.load(filename = './MODEL/Temp/train_val1_val2/best_rf_model.pkl')
best_lgbm_model = joblib.load(filename = './MODEL/Temp/train_val1_val2/best_lgbm_model.pkl')

### A/ Comb 1 (xgb, rf, lgbm)

In [None]:
# Combinaison method 1 :

def model_predict_comb1(x_test, dict_coeff):
    """Blend the XGBoost, random forest and LightGBM predictions with fixed weights.

    Uses the notebook-level models `best_xgb_model`, `best_rf_model` and
    `best_lgbm_model`.

    Args:
        x_test: Encoded feature matrix accepted by the three models.
        dict_coeff (dict): Weights under the keys "XGB", "RF" and "LGBM".
            The weights are applied as given (no normalisation).

    Returns:
        Weighted sum of the three prediction vectors.
    """
    # Predict with every base model first, then combine.
    base_predictions = (
        best_xgb_model.predict(x_test),
        best_rf_model.predict(x_test),
        best_lgbm_model.predict(x_test),
    )
    pred_xgb, pred_rf, pred_lgbm = base_predictions

    return pred_xgb * dict_coeff["XGB"] + pred_rf * dict_coeff["RF"] + pred_lgbm * dict_coeff["LGBM"]


In [None]:
# Naiv modelling :
dict_coeff = {"XGB" :0.7, "RF":0.15, "LGBM":0.15}
y_val1_pred_comb = model_predict_comb1(x_test=x_val1_process, dict_coeff=dict_coeff)
print(mean_absolute_error(y_val1, y_val1_pred_comb))

In [None]:
# Search (optuna) for the blending weights of the three models, scored by MAE on val1 :

# The base-model predictions on val1 do not depend on the sampled weights, so they
# are computed once here instead of being recomputed in each of the 10000 trials.
val1_pred_xgb = best_xgb_model.predict(x_val1_process)
val1_pred_rf = best_rf_model.predict(x_val1_process)
val1_pred_lgbm = best_lgbm_model.predict(x_val1_process)


def objective(trial):
    """Sample one set of blending weights and return the MAE of the blend on val1.

    Args:
        trial (optuna.trial.Trial): Trial used to sample the three weights in [0, 1].

    Returns:
        float: Mean absolute error of the weighted sum of predictions against y_val1.
    """
    xgb_coeff = trial.suggest_float('xgb_coeff', 0, 1, step=0.001)
    rf_coeff = trial.suggest_float('rf_coeff', 0, 1, step=0.001)
    lgbm_coeff = trial.suggest_float('lgbm_coeff', 0, 1, step=0.001)

    # Same weighted sum as model_predict_comb1, applied to the cached predictions.
    y_val1_pred_comb = val1_pred_xgb * xgb_coeff + val1_pred_rf * rf_coeff + val1_pred_lgbm * lgbm_coeff

    # score :
    score = mean_absolute_error(y_val1, y_val1_pred_comb)

    return score


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10000)

In [None]:
comb_trial = study.best_trial
print('score : {}'.format(comb_trial.value)) 
print("Best hyperparameters: {}".format(comb_trial.params))

In [None]:
# Create the best dict_coeff :
dict_coeff = {"XGB" :(comb_trial.params)["xgb_coeff"], "RF": (comb_trial.params)['rf_coeff'] , "LGBM": (comb_trial.params)['lgbm_coeff']}

In [None]:
# Save the blending weights and the models of this combination :

# Blending weights, written as YAML.
with open(r'./MODEL/Temp/Combinaison_model/Comb1/dict_coeff.yaml', 'w') as file:
    # NOTE(review): yaml.dump writes to `file`; `documents` is not used afterwards.
    documents = yaml.dump(dict_coeff, file)

# Save a copy of each base model used by this combination :
joblib.dump(value = best_xgb_model, filename = './MODEL/Temp/Combinaison_model/Comb1/best_xgb_model.pkl')
joblib.dump(value = best_rf_model, filename = './MODEL/Temp/Combinaison_model/Comb1/best_rf_model.pkl')
joblib.dump(value = best_lgbm_model, filename = './MODEL/Temp/Combinaison_model/Comb1/best_lgbm_model.pkl')

# 6/ Choose, Train, Valid, Save the best model : 

## Choose the model

## Treat x_train/x_val1/x_val2 for the best model

In [11]:
# Impute missing values. x_train is passed as the `x_train` argument in both calls.

# x_train and x_val1 :
x_train_imp, x_val1_imp = fillna_non_fix(x_train=x_train, x_test=x_val1, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col)

# x_val2 (the first return value, the imputed train set, is already available above) :
_ , x_val2_imp = fillna_non_fix(x_train=x_train, x_test=x_val2, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col)



# Encoding/Scaling : one pre_processing instance, used with train=True on the
# training set and then with train=False on val1 and val2.
# NOTE(review): presumably train=False reuses the encoders fitted during the
# train=True call - confirm in pre_processing.py.
pre_process = pre_processing()

# x_train :
x_train_process = pre_process.pre_processing(df=x_train_imp, train=True, categorical_var_OHE=list_cat_col_OHE,
                                     categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE, 
                                     target=y_train, continious_var=[], encoding_type_cont=MinMaxScaler())

# x_val1 :
x_val1_process = pre_process.pre_processing(df=x_val1_imp, train=False, categorical_var_OHE=list_cat_col_OHE,
                                 categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE, 
                                 target=y_train, continious_var=[], encoding_type_cont=MinMaxScaler())

# x_val2 :
x_val2_process = pre_process.pre_processing(df=x_val2_imp, train=False, categorical_var_OHE=list_cat_col_OHE,
                                 categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE, 
                                 target=y_train, continious_var=[], encoding_type_cont=MinMaxScaler())

`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


## Load the best model

In [13]:
#load the best regression model :
best_model_reg = joblib.load(filename = './MODEL/Temp/train_val1_val2/best_xgb_model.pkl')

## Validation

In [14]:
# MAE
print(f"MAE on train : {mean_absolute_error(y_train, best_model_reg.predict(x_train_process))}")
print(f"MAE on val1 : {mean_absolute_error(y_val1, best_model_reg.predict(x_val1_process))}")
print(f"MAE on val2 : {mean_absolute_error(y_val2, best_model_reg.predict(x_val2_process))}")

MAE on train : 2.6303748991392304
MAE on val1 : 2.9301284525904654
MAE on val2 : 2.938298250240637


## Save model

In [None]:
# Save the best model :
joblib.dump(value = best_model_reg, filename = './MODEL/best_model/best_model_reg.pkl')

In [None]:
#load model :
best_model_reg = joblib.load(filename = './MODEL/best_model/best_model_reg.pkl')

# 7/ Analyse output : (useless)

# 8/ Choose best threshold (option) : (useless)

# 9/ Feature impact analysis (model interpretation) :

In [None]:
# Identify the most important features :
def Features_importance(model) -> pd.DataFrame :
    """
    Build a table of the model's feature importances, largest first.

    Args:
        model: Fitted estimator exposing `feature_names_in_` and `feature_importances_`.

    Returns:
        pd.DataFrame: One row per feature with its name and its importance
        multiplied by 100, sorted by decreasing importance.
    """
    importance_col = 'Features importance (in %)'
    table_columns = {
        'Features': model.feature_names_in_,
        importance_col: (model.feature_importances_)*100,
    }
    importance_table = pd.DataFrame(table_columns)

    return importance_table.sort_values(by=importance_col, ascending=False)

Features_importance(model=best_model_reg).head(30)

In [None]:
# SHAP values

# Global explanation :
# compute the SHAP values of the selected tree-based model on the encoded val2 set
explainer = shap.TreeExplainer(best_model_reg)
shap_values = explainer.shap_values(x_val2_process)

In [None]:
shap.summary_plot(shap_values, x_val2_process)

# 10/ Deployment :

In [None]:
# 1/ Open data

# - read_csv :
x_test = pd.read_csv("./data/test.csv", encoding="utf-8")

# - Take the ID (option) :
ID = x_test["ID"]

In [None]:
# 2/ Data cleaning 


# - Basic treatment (duplicates are kept so that the rows stay aligned with `ID`) :
useless_columns = ['ID', 'Vf', 'De','Ernedc (g/km)','MMS','Mp','Mk','Man','Cn','Date of registration','r','Status']
x_test = basic_treatment(df=x_test, useless_columns=useless_columns, drop_duplicate=False)


# - Data filter : (useless)


# - Data transformation : (useless)


# - Check/change col type : (good)


# - Handle abnormal values : 
# Inconsistency check on "z (Wh/km)".
# Fix: append "/electric" to "Ft" on rows where z (Wh/km) is not NaN, Fuel consumption is not NaN, and Ft does not already contain "electric".
# NOTE(review): if "Ft" can contain NaN, str.contains returns NaN for those rows and `~` may fail - confirm "Ft" is never missing here.
x_test.loc[(x_test['z (Wh/km)'].notna()) & (x_test['Fuel consumption '].notna() & ~(x_test["Ft"].str.contains("electric"))), "Ft"] += "/electric"


# - Impute NaN (delete col, fillna_fix) :
# A/ Delete columns which contain more than 50% of NaN or are useless :
Col_to_drop = ["Enedc (g/km)", "Erwltp (g/km)"]
x_test = x_test.drop(Col_to_drop, axis=1)

# B/ Impute by fixed value (column name -> replacement value) :
dict_imputation_fix = {"Country" : "unknown", "z (Wh/km)": 0,"Fuel consumption ": 0, "Electric range (km)": 0}
x_test = fillna_fix_value(df=x_test, fillna_value=dict_imputation_fix)

In [None]:
# 3/ Feature eng 


# - Feature creation : (useless)


# - Feature selection (x_test = x_test[list(x_train.columns)]) :
x_test = x_test[list(x_train.columns)]


# - Row selection : (useless)

In [None]:
# 4/ Prediction


# - Impute NaN (fillna_nonfix), with x_train passed as the `x_train` argument :
groupby_col = ['Ft']
dict_imputation_non_fix = {"VFN": "mode","T": "mode", "Tan": "mode", "Va": "mode","Ve": "mode","Ct": "mode","m (kg)": "median","Mt": "median","W (mm)": "median","At1 (mm)": "median","At2 (mm)": "median","Fm": "mode",
                            "ec (cm3)": "median","ep (KW)": "median", "z (Wh/km)" : "median","IT": "mode","Fuel consumption ": "median", "Electric range (km)":"median", 'Cr':"mode", 'Mh':"mode"}
x_train_imp, x_test_imp = fillna_non_fix(x_train=x_train, x_test=x_test, fillna_method=dict_imputation_non_fix, groupby_col=groupby_col)


# - Encoding/Scaling : train=True on the training set first, then train=False on the test set.
# NOTE(review): presumably this reproduces the encoding best_model_reg was trained
# with - confirm that pre_processing is deterministic for the same training data.
pre_process = pre_processing()
x_train_pro = pre_process.pre_processing(df=x_train_imp, train=True, categorical_var_OHE=list_cat_col_OHE,
                                     categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE, 
                                     target=y_train, continious_var=[], encoding_type_cont=MinMaxScaler())

x_test_pro = pre_process.pre_processing(df=x_test_imp, train=False, categorical_var_OHE=list_cat_col_OHE,
                                 categorical_var_OrdinalEncoding={}, categorical_var_TE=list_cat_col_TE, 
                                 target=y_train, continious_var=[], encoding_type_cont=MinMaxScaler())


# - model.predict(x_test) :
y_test_pred = best_model_reg.predict(x_test_pro)


# - Save the prediction (submission.csv), pairing each prediction with the ID read from test.csv :
submission = pd.DataFrame({'ID': ID, 'Ewltp (g/km)': y_test_pred})
submission.to_csv(r'submission.csv', index=False)
submission