# Training model

In [61]:
## To get some insights
# https://github.com/ceptln/paris-bike-traffic-prediction/tree/main

from pathlib import Path
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
import lightgbm as lgb
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from submissions.external_data.estimator import _encode_dates, _merge_external_data
import utils.get_data as get_data

### Function submission kaggle

In [62]:
def submission_kaggle(model, X_test):
    y_pred = model.predict(X_test)
    print(y_pred)
    results = pd.DataFrame(
        dict(
            Id=np.arange(y_pred.shape[0]),
            log_bike_count=y_pred,
        )
    )
    results.to_csv("submission.csv", index=False)

## Read Data

In [63]:
def add_external_data(X):
    
    X = X.copy()
    
    # merge original data + external data
    merged_X_train_external_DATA = get_data._merge_external_data_weather(X)
    
    # merge original data + external data + holidays
    merged_X_train_external_HOLIDAYS = get_data._merge_holidays_week_end(merged_X_train_external_DATA)
    
    # merge original data + external data + holidays + data COVID
    merged_X_train_external_HOLIDAYS_COVID = get_data._merge_Curfews_lockdowns_COVID(merged_X_train_external_HOLIDAYS)
    merged_X_train_external_HOLIDAYS_COVID = get_data._merge_indicators_COVID(merged_X_train_external_HOLIDAYS_COVID)
    
    # merge original data + external data + holidays + data COVID + data accidents
    merged_X_train_external_HOLIDAYS_COVID_ACCIDENTS = get_data._merge_road_accidents(merged_X_train_external_HOLIDAYS_COVID)
    merged_X_train_external_HOLIDAYS_COVID_ACCIDENTS 
    return merged_X_train_external_HOLIDAYS_COVID_ACCIDENTS

In [64]:
# Read data
X_train, y_train = get_data.get_train_data()
X_test, y_test = get_data.get_test_data()
X_final_test = get_data.get_final_test_data()

In [65]:
X_train_plus = add_external_data(X_train)
X_train_plus.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,counter_technical_id,latitude,longitude,ff,...,is_holiday,is_weekend,is_lockdown,is_curfew,hosp,rea,incid_rea,rad,Max_Grav_accidents,Count_accidents
0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
30,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
31,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
32,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
33,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2020-09-01 01:00:00,2020-07-22,Y2H20073268,48.88529,2.32666,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0


In [17]:
X_test_plus = add_external_data(X_test)
X_train_plus.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,counter_technical_id,latitude,longitude,ff,...,is_holiday,is_weekend,is_lockdown,is_curfew,hosp,rea,incid_rea,rad,Max_Grav_accidents,Count_accidents
0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
30,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
31,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
32,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
33,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2020-09-01 01:00:00,2020-07-22,Y2H20073268,48.88529,2.32666,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0


In [18]:
X_final_test_plus = add_external_data(X_final_test)
X_final_test_plus.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,...,is_holiday,is_weekend,is_lockdown,is_curfew,hosp,rea,incid_rea,rad,Max_Grav_accidents,Count_accidents
0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2021-09-10 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,...,False,0,False,False,365,126,5.0,21675,0.0,0.0
43,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2021-09-10 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,...,False,0,False,False,365,126,5.0,21675,0.0,0.0
31,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2021-09-10 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,2.35702,...,False,0,False,False,365,126,5.0,21675,0.0,0.0
32,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2021-09-10 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,2.35702,...,False,0,False,False,365,126,5.0,21675,0.0,0.0
33,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2021-09-10 01:00:00,2020-07-22,"48.88529,2.32666",Y2H20073268,48.88529,2.32666,...,False,0,False,False,365,126,5.0,21675,0.0,0.0


##  Preprocessing

In [20]:
def _encode_dates(X):
    
    '''
    Splits the 'date' columns of the input DataFrame into several columns (year, month, day, weekday, hour)
    
    Parameters:
        X (pd.DataFrame): the dataframe to modify
    
    Returns:
        X (pd.DataFrame): the modified dataframe
    '''
    
    # Duplicate X to work on it
    X = X.copy()
    
    # Create new columns with date parts from X.date
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Adding cosinus and sinus features from date variables to enhance the date periodicity
    # X['cos_hour'] = np.cos(X['hour']*(2.*np.pi/24))
    # X['sin_hour'] = np.sin(X['hour']*(2.*np.pi/24))
    # X['cos_day'] = np.cos(X['day']*(2.*np.pi/30))
    # X['sin_day'] = np.sin(X['day']*(2.*np.pi/30))
    # X['cos_month'] = np.cos(X['month']*(2.*np.pi/12))
    # X['sin_month'] = np.sin(X['month']*(2.*np.pi/12))
    # X['cos_weekday'] = np.cos(X['weekday']*(2.*np.pi/7))
    # X['sin_weekday'] = np.sin(X['weekday']*(2.*np.pi/7))
    
    # Clean the new dataframe and return it
    X.drop(columns=["date"], inplace=True)
    #X.drop(columns=["year", 'month', 'day', 'weekday', 'hour'], inplace=True)
    return X

# Training

## Training without pipeline

In [28]:
def get_RMSE_local_wt_pipe(model, X_train, y_train, X_test, y_test):
    n_folds = 5

    # Perform cross-validation and compute the scores for the training set
    cv_scores_train = cross_val_score(model, X_train, y_train, cv=n_folds, scoring='neg_mean_squared_error')

    # Perform cross-validation and compute the scores for the testing set
    cv_scores_test = cross_val_score(model, X_test, y_test, cv=n_folds, scoring='neg_mean_squared_error')

    # Convert the scores to root mean squared error
    rmse_scores_train = np.sqrt(-cv_scores_train)
    rmse_scores_test = np.sqrt(-cv_scores_test)

    print(f"Train set, RMSE={np.mean(rmse_scores_train):.2f}")
    print(f"Test set, RMSE={np.mean(rmse_scores_test):.2f}")

### Select features for training

In [39]:
X_train_plus_chosen_FI

Unnamed: 0,t,u,rr3,is_holiday,is_weekend,is_lockdown,is_curfew,hosp,rea,rad,...,counter_name_Totem 64 Rue de Rivoli E-O,counter_name_Totem 64 Rue de Rivoli O-E,counter_name_Totem 73 boulevard de Sébastopol N-S,counter_name_Totem 73 boulevard de Sébastopol S-N,counter_name_Totem 85 quai d'Austerlitz NO-SE,counter_name_Totem 85 quai d'Austerlitz SE-NO,counter_name_Totem Cours la Reine E-O,counter_name_Totem Cours la Reine O-E,counter_name_Voie Georges Pompidou NE-SO,counter_name_Voie Georges Pompidou SO-NE
0,285.75,81,0.0,False,0,False,False,293,42,6641,...,False,False,False,False,False,False,False,False,False,False
30,285.75,81,0.0,False,0,False,False,293,42,6641,...,False,False,False,False,False,False,False,False,False,False
31,285.75,81,0.0,False,0,False,False,293,42,6641,...,False,False,False,False,False,False,False,False,False,False
32,285.75,81,0.0,False,0,False,False,293,42,6641,...,False,False,False,False,False,False,False,False,False,False
33,285.75,81,0.0,False,0,False,False,293,42,6641,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455129,291.45,72,0.0,False,0,False,False,285,80,21150,...,False,False,False,False,False,True,False,False,False,False
455130,291.45,72,0.0,False,0,False,False,285,80,21150,...,False,False,False,False,False,False,True,False,False,False
455119,291.45,72,0.0,False,0,False,False,285,80,21150,...,False,False,False,False,False,False,False,True,False,False
455136,291.45,72,0.0,False,0,False,False,285,80,21150,...,False,False,False,False,False,False,False,False,True,False


In [36]:
fixed_features = ['date', "site_name", "t"]

not_fixed_features = ["counter_id", "counter_technical_id", "counter_installation_date", "counter_name",
                      'site_id', 
                      'site_name'
                       'counter_installation_date', 
                      'counter_technical_id', 
                      'latitude',
                       'longitude', 
                      'ff', # the wind speed
                      'u', # the humidity
                      'ssfrai', # the fresh snowfall amount
                      'n', # the amount of cloud cover
                      'vv', # the visibility
                      'rr3', # the precipitation amount over 3 hours
                      't', # the temperature
                      'is_holiday', # is holidays
                       'is_weekend', # is week end
                      'is_lockdown', # Lockdown for COVID
                      'is_curfew', # Curfew for COVID
                      'hosp', # Number of patients currently hospitalised for COVID-19
                      'rea', # Number of patients currently in intensive care.
                      'incid_rea', # Number of new patients admitted to intensive care in the last 24 hours.
                       'rad', # Cumulative number of patients hospitalised for COVID-19 who have returned home due to an improvement in their state of health
                      'Max_Grav_accidents', # The maximum severity of all cyclists accidents at a given hour
                      'Count_accidents' # the number of accidents at a given hour in Paris
                     ]


chosen_not_fixed_features = ['u', 'rr3', 'is_holiday', "counter_name",
       'is_weekend', 'is_lockdown', 'is_curfew', 'hosp', 'rea',
       'rad']

# VARIABLES REMOVED BY rfecv
# {'Count_accidents',
#  'Max_Grav_accidents',
#  'cos_day',
#  'ff',
#  'incid_rea',
#  'is_curfew',
#  'is_lockdown',
#  'n',
#  'sin_day',
#  'sin_weekday',
#  'site_name_39 quai François Mauriac',
#  'site_name_6 rue Julia Bartet',
#  'site_name_90 Rue De Sèvres',
#  'ssfrai',
#  'vv'}


chosen_variables = fixed_features + chosen_not_fixed_features

X_train_plus_chosen = X_train_plus[chosen_variables]
X_test_plus_chosen = X_test_plus[chosen_variables]
X_final_test_plus_chosen = X_final_test_plus[chosen_variables]

In [44]:
# Identifying continuous variables (float type or int with wide range)
continuous_columns  = X_train_plus_chosen_FI.select_dtypes(include=['float64', 'float32', 'int64', 'int32']).columns.tolist()

# Initialize the scaler
scaler = StandardScaler()

X_train_plus_chosen_FI = _encode_dates(X_train_plus_chosen)
X_train_plus_chosen_FI = pd.get_dummies(X_train_plus_chosen_FI, columns=['site_name'])
X_train_plus_chosen_FI = pd.get_dummies(X_train_plus_chosen_FI, columns=['counter_name'])
X_train_plus_chosen_FI[continuous_columns] = scaler.fit_transform(X_train_plus_chosen_FI[continuous_columns])

X_test_plus_chosen_FI = _encode_dates(X_test_plus_chosen)
X_test_plus_chosen_FI = pd.get_dummies(X_test_plus_chosen_FI, columns=['site_name'])
X_test_plus_chosen_FI = pd.get_dummies(X_test_plus_chosen_FI, columns=['counter_name'])
X_test_plus_chosen_FI[continuous_columns] = scaler.fit_transform(X_test_plus_chosen_FI[continuous_columns])

X_final_test_plus_chosen_FI = _encode_dates(X_final_test_plus_chosen)
X_final_test_plus_chosen_FI = pd.get_dummies(X_final_test_plus_chosen_FI, columns=['site_name'])
X_final_test_plus_chosen_FI = pd.get_dummies(X_final_test_plus_chosen_FI, columns=['counter_name'])
X_final_test_plus_chosen_FI[continuous_columns] = scaler.fit_transform(X_final_test_plus_chosen_FI[continuous_columns])

In [60]:
# LGMBRegressor
Regressor = lgb.LGBMRegressor()

# Ridge
#Regressor = Ridge()

# Initialize the XGBRegressor
#Regressor = xgb.XGBRegressor(objective='reg:squarederror')

# Initialize the MLPRegressor
Regressor = MLPRegressor(hidden_layer_sizes=(100,), 
                         activation='relu', 
                         solver='adam', 
                         alpha=0.0001, 
                         batch_size='auto', 
                         learning_rate='constant', 
                         learning_rate_init=0.001, 
                         max_iter=100, 
                         shuffle=True, 
                         random_state=None, 
                         tol=0.0001, 
                         verbose=True, 
                         warm_start=True
                         )

print("coucou")
# Train model with selected features
Regressor.fit(X_train_plus_chosen_FI, y_train)

get_RMSE_local_wt_pipe(Regressor, X_train_plus_chosen_FI, y_train, X_test_plus_chosen_FI, y_test)

coucou
Iteration 1, loss = 0.46534587
Iteration 2, loss = 0.20989278
Iteration 3, loss = 0.18261925
Iteration 4, loss = 0.16417408
Iteration 5, loss = 0.15216812
Iteration 6, loss = 0.14426152
Iteration 7, loss = 0.13890702
Iteration 8, loss = 0.13427257
Iteration 9, loss = 0.13100493
Iteration 10, loss = 0.12772995
Iteration 11, loss = 0.12498858
Iteration 12, loss = 0.12268303
Iteration 13, loss = 0.12082833
Iteration 14, loss = 0.11898670
Iteration 15, loss = 0.11756054
Iteration 16, loss = 0.11610447
Iteration 17, loss = 0.11462506
Iteration 18, loss = 0.11360777
Iteration 19, loss = 0.11259269
Iteration 20, loss = 0.11167170
Iteration 21, loss = 0.11099569
Iteration 22, loss = 0.11016795
Iteration 23, loss = 0.10923900
Iteration 24, loss = 0.10870907
Iteration 25, loss = 0.10791002
Iteration 26, loss = 0.10734520
Iteration 27, loss = 0.10679282
Iteration 28, loss = 0.10633940
Iteration 29, loss = 0.10590216
Iteration 30, loss = 0.10550317
Iteration 31, loss = 0.10480556
Iteration 



Iteration 1, loss = 0.44863191
Iteration 2, loss = 0.20868476
Iteration 3, loss = 0.18196965
Iteration 4, loss = 0.16518282
Iteration 5, loss = 0.15425714
Iteration 6, loss = 0.14743985
Iteration 7, loss = 0.14182717
Iteration 8, loss = 0.13768581
Iteration 9, loss = 0.13379651
Iteration 10, loss = 0.13061385
Iteration 11, loss = 0.12762843
Iteration 12, loss = 0.12489099
Iteration 13, loss = 0.12260546
Iteration 14, loss = 0.12053087
Iteration 15, loss = 0.11875621
Iteration 16, loss = 0.11720312
Iteration 17, loss = 0.11579729
Iteration 18, loss = 0.11429756
Iteration 19, loss = 0.11313609
Iteration 20, loss = 0.11199613
Iteration 21, loss = 0.11112764
Iteration 22, loss = 0.10990574
Iteration 23, loss = 0.10934489
Iteration 24, loss = 0.10876020
Iteration 25, loss = 0.10772394
Iteration 26, loss = 0.10710631
Iteration 27, loss = 0.10650468
Iteration 28, loss = 0.10562910
Iteration 29, loss = 0.10521290
Iteration 30, loss = 0.10451419
Iteration 31, loss = 0.10406719
Iteration 32, los



Iteration 1, loss = 0.50557821
Iteration 2, loss = 0.22271979
Iteration 3, loss = 0.19668692
Iteration 4, loss = 0.17794582
Iteration 5, loss = 0.16429899
Iteration 6, loss = 0.15420554
Iteration 7, loss = 0.14680509
Iteration 8, loss = 0.14081855
Iteration 9, loss = 0.13616843
Iteration 10, loss = 0.13234011
Iteration 11, loss = 0.12890004
Iteration 12, loss = 0.12587637
Iteration 13, loss = 0.12340315
Iteration 14, loss = 0.12128122
Iteration 15, loss = 0.11927333
Iteration 16, loss = 0.11752020
Iteration 17, loss = 0.11587999
Iteration 18, loss = 0.11422941
Iteration 19, loss = 0.11311456
Iteration 20, loss = 0.11189450
Iteration 21, loss = 0.11099842
Iteration 22, loss = 0.10995228
Iteration 23, loss = 0.10933101
Iteration 24, loss = 0.10844320
Iteration 25, loss = 0.10768995
Iteration 26, loss = 0.10707032
Iteration 27, loss = 0.10644995
Iteration 28, loss = 0.10601911
Iteration 29, loss = 0.10543649
Iteration 30, loss = 0.10486399
Iteration 31, loss = 0.10462944
Iteration 32, los



Iteration 1, loss = 0.46704150
Iteration 2, loss = 0.21164737
Iteration 3, loss = 0.18535022
Iteration 4, loss = 0.16653494
Iteration 5, loss = 0.15428639
Iteration 6, loss = 0.14579910
Iteration 7, loss = 0.13937330
Iteration 8, loss = 0.13393887
Iteration 9, loss = 0.12936184
Iteration 10, loss = 0.12521869
Iteration 11, loss = 0.12198472
Iteration 12, loss = 0.11896741
Iteration 13, loss = 0.11633238
Iteration 14, loss = 0.11402214
Iteration 15, loss = 0.11199229
Iteration 16, loss = 0.11067670
Iteration 17, loss = 0.10901053
Iteration 18, loss = 0.10750275
Iteration 19, loss = 0.10647039
Iteration 20, loss = 0.10513344
Iteration 21, loss = 0.10414505
Iteration 22, loss = 0.10334782
Iteration 23, loss = 0.10249764
Iteration 24, loss = 0.10168295
Iteration 25, loss = 0.10110639
Iteration 26, loss = 0.10068082
Iteration 27, loss = 0.09960838
Iteration 28, loss = 0.09936923
Iteration 29, loss = 0.09867485
Iteration 30, loss = 0.09859074
Iteration 31, loss = 0.09790299
Iteration 32, los



Iteration 1, loss = 0.49767790
Iteration 2, loss = 0.22078866
Iteration 3, loss = 0.19231107
Iteration 4, loss = 0.16994052
Iteration 5, loss = 0.15538796
Iteration 6, loss = 0.14620355
Iteration 7, loss = 0.14000420
Iteration 8, loss = 0.13468050
Iteration 9, loss = 0.12993724
Iteration 10, loss = 0.12590127
Iteration 11, loss = 0.12291120
Iteration 12, loss = 0.12035610
Iteration 13, loss = 0.11834913
Iteration 14, loss = 0.11656859
Iteration 15, loss = 0.11493079
Iteration 16, loss = 0.11355454
Iteration 17, loss = 0.11246135
Iteration 18, loss = 0.11165750
Iteration 19, loss = 0.11033596
Iteration 20, loss = 0.10966284
Iteration 21, loss = 0.10880395
Iteration 22, loss = 0.10803564
Iteration 23, loss = 0.10741364
Iteration 24, loss = 0.10682728
Iteration 25, loss = 0.10609396
Iteration 26, loss = 0.10552881
Iteration 27, loss = 0.10516048
Iteration 28, loss = 0.10472461
Iteration 29, loss = 0.10402327
Iteration 30, loss = 0.10375646
Iteration 31, loss = 0.10350918
Iteration 32, los



Iteration 1, loss = 0.47545469
Iteration 2, loss = 0.20920951
Iteration 3, loss = 0.18535607
Iteration 4, loss = 0.16873304
Iteration 5, loss = 0.15618220
Iteration 6, loss = 0.14713759
Iteration 7, loss = 0.14066096
Iteration 8, loss = 0.13577346
Iteration 9, loss = 0.13174162
Iteration 10, loss = 0.12830739
Iteration 11, loss = 0.12546157
Iteration 12, loss = 0.12261560
Iteration 13, loss = 0.12025958
Iteration 14, loss = 0.11802267
Iteration 15, loss = 0.11583342
Iteration 16, loss = 0.11406760
Iteration 17, loss = 0.11259005
Iteration 18, loss = 0.11115122
Iteration 19, loss = 0.10999388
Iteration 20, loss = 0.10893936
Iteration 21, loss = 0.10793739
Iteration 22, loss = 0.10701378
Iteration 23, loss = 0.10622783
Iteration 24, loss = 0.10546332
Iteration 25, loss = 0.10473400
Iteration 26, loss = 0.10402575
Iteration 27, loss = 0.10369589
Iteration 28, loss = 0.10301705
Iteration 29, loss = 0.10263224
Iteration 30, loss = 0.10224360
Iteration 31, loss = 0.10162464
Iteration 32, los



Iteration 1, loss = 1.96123051
Iteration 2, loss = 0.49096385
Iteration 3, loss = 0.32583930
Iteration 4, loss = 0.23995191
Iteration 5, loss = 0.20771447
Iteration 6, loss = 0.19420650
Iteration 7, loss = 0.18329820
Iteration 8, loss = 0.17445178
Iteration 9, loss = 0.16684471
Iteration 10, loss = 0.16110348
Iteration 11, loss = 0.15440349
Iteration 12, loss = 0.14880410
Iteration 13, loss = 0.14395039
Iteration 14, loss = 0.13842588
Iteration 15, loss = 0.13399489
Iteration 16, loss = 0.12936010
Iteration 17, loss = 0.12548281
Iteration 18, loss = 0.12165266
Iteration 19, loss = 0.11750780
Iteration 20, loss = 0.11529621
Iteration 21, loss = 0.11186870
Iteration 22, loss = 0.10943395
Iteration 23, loss = 0.10708887
Iteration 24, loss = 0.10464441
Iteration 25, loss = 0.10276013
Iteration 26, loss = 0.10070447
Iteration 27, loss = 0.09904495
Iteration 28, loss = 0.09713869
Iteration 29, loss = 0.09577397
Iteration 30, loss = 0.09443994
Iteration 31, loss = 0.09238137
Iteration 32, los



Iteration 1, loss = 1.25180996
Iteration 2, loss = 0.44203853
Iteration 3, loss = 0.29243708
Iteration 4, loss = 0.21883664
Iteration 5, loss = 0.19055284
Iteration 6, loss = 0.17650656
Iteration 7, loss = 0.16762830
Iteration 8, loss = 0.16068282
Iteration 9, loss = 0.15456793
Iteration 10, loss = 0.15029877
Iteration 11, loss = 0.14588400
Iteration 12, loss = 0.14180054
Iteration 13, loss = 0.13859441
Iteration 14, loss = 0.13542278
Iteration 15, loss = 0.13343621
Iteration 16, loss = 0.13001174
Iteration 17, loss = 0.12838113
Iteration 18, loss = 0.12634327
Iteration 19, loss = 0.12390895
Iteration 20, loss = 0.12249144
Iteration 21, loss = 0.12030872
Iteration 22, loss = 0.11861044
Iteration 23, loss = 0.11699472
Iteration 24, loss = 0.11508947
Iteration 25, loss = 0.11283482
Iteration 26, loss = 0.11122868
Iteration 27, loss = 0.10964259
Iteration 28, loss = 0.10804424
Iteration 29, loss = 0.10610832
Iteration 30, loss = 0.10499604
Iteration 31, loss = 0.10289625
Iteration 32, los



Iteration 1, loss = 1.40105457
Iteration 2, loss = 0.47279395
Iteration 3, loss = 0.32706558
Iteration 4, loss = 0.23850318
Iteration 5, loss = 0.20320399
Iteration 6, loss = 0.18740700
Iteration 7, loss = 0.17593348
Iteration 8, loss = 0.16682112
Iteration 9, loss = 0.16042568
Iteration 10, loss = 0.15365549
Iteration 11, loss = 0.14859067
Iteration 12, loss = 0.14420250
Iteration 13, loss = 0.14006849
Iteration 14, loss = 0.13679342
Iteration 15, loss = 0.13307050
Iteration 16, loss = 0.12993141
Iteration 17, loss = 0.12703335
Iteration 18, loss = 0.12444375
Iteration 19, loss = 0.12131796
Iteration 20, loss = 0.11876816
Iteration 21, loss = 0.11688454
Iteration 22, loss = 0.11436839
Iteration 23, loss = 0.11197245
Iteration 24, loss = 0.11011338
Iteration 25, loss = 0.10774460
Iteration 26, loss = 0.10626535
Iteration 27, loss = 0.10444568
Iteration 28, loss = 0.10269956
Iteration 29, loss = 0.10156349
Iteration 30, loss = 0.09959090
Iteration 31, loss = 0.09838668
Iteration 32, los



Iteration 1, loss = 1.60274373
Iteration 2, loss = 0.46848293
Iteration 3, loss = 0.32066118
Iteration 4, loss = 0.22950910
Iteration 5, loss = 0.19458687
Iteration 6, loss = 0.17854060
Iteration 7, loss = 0.16728285
Iteration 8, loss = 0.15889555
Iteration 9, loss = 0.15210569
Iteration 10, loss = 0.14569998
Iteration 11, loss = 0.14020946
Iteration 12, loss = 0.13578084
Iteration 13, loss = 0.13280818
Iteration 14, loss = 0.12915375
Iteration 15, loss = 0.12602970
Iteration 16, loss = 0.12427628
Iteration 17, loss = 0.12145964
Iteration 18, loss = 0.11954986
Iteration 19, loss = 0.11754256
Iteration 20, loss = 0.11558108
Iteration 21, loss = 0.11366257
Iteration 22, loss = 0.11226561
Iteration 23, loss = 0.11098284
Iteration 24, loss = 0.10956515
Iteration 25, loss = 0.10806533
Iteration 26, loss = 0.10654284
Iteration 27, loss = 0.10568155
Iteration 28, loss = 0.10393200
Iteration 29, loss = 0.10297014
Iteration 30, loss = 0.10196054
Iteration 31, loss = 0.10043413
Iteration 32, los



Iteration 1, loss = 1.98495201
Iteration 2, loss = 0.46657537
Iteration 3, loss = 0.30683155
Iteration 4, loss = 0.21361748
Iteration 5, loss = 0.18256172
Iteration 6, loss = 0.16676454
Iteration 7, loss = 0.15465920
Iteration 8, loss = 0.14540333
Iteration 9, loss = 0.13796548
Iteration 10, loss = 0.13219449
Iteration 11, loss = 0.12690075
Iteration 12, loss = 0.12271509
Iteration 13, loss = 0.11928543
Iteration 14, loss = 0.11620177
Iteration 15, loss = 0.11335561
Iteration 16, loss = 0.11042621
Iteration 17, loss = 0.10847620
Iteration 18, loss = 0.10640525
Iteration 19, loss = 0.10422735
Iteration 20, loss = 0.10255572
Iteration 21, loss = 0.10105539
Iteration 22, loss = 0.09988517
Iteration 23, loss = 0.09860085
Iteration 24, loss = 0.09700264
Iteration 25, loss = 0.09589907
Iteration 26, loss = 0.09477276
Iteration 27, loss = 0.09375813
Iteration 28, loss = 0.09235423
Iteration 29, loss = 0.09127020
Iteration 30, loss = 0.08985153
Iteration 31, loss = 0.08945555
Iteration 32, los



### Final csv file with current model

In [52]:
# Get submission kaggle to csv
submission_kaggle(Regressor, X_final_test_plus_chosen_FI)

[2.33023798 2.22080372 1.70775034 ... 2.07673133 1.39920118 1.48307387]


### Feature selections with RFECV

In [None]:
from sklearn.feature_selection import RFECV
import xgboost as xgb

# Initialize the XGBRegressor
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')

# Initialize RFECV
selector = RFECV(estimator=xgb_reg, step=1, cv=5)

# Fit RFECV
selector = selector.fit(X_train_plus_FI, y_train)

# Print the optimal number of features
print("Optimal number of features : %d" % selector.n_features_)

In [None]:
# Print selected features
selected_features = [feature for feature, selected in zip(X_train_plus_FI.columns, selector.support_) if selected]
print("Selected features:", selected_features)

In [None]:
# Transform training and testing sets
X_train_selected = selector.transform(X_train_plus_FI)
X_test_selected = selector.transform(X_test_plus_FI)

# Train model with selected features
xgb_reg.fit(X_train_selected, y_train)

get_RMSE_local(xgb_reg, X_train_selected, y_train, X_test_selected, y_test)

## Train with pipeline

In [None]:
def preprocessing(X_train):
    
    date_encoder = FunctionTransformer(_encode_dates)
    date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

    categorical_encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_cols = ["counter_name", "site_name"]

    numeric_encoder = StandardScaler()
    numeric_cols = ['latitude', 'longitude', 't', 'ff', 'u', 'ssfrai', 'n', 'vv', 'rr3', 'hosp', 'rea', 'incid_rea', 'rad', 'Count_accidents']
    numeric_cols = ['t', 'ff', 'u', 'ssfrai', 'n', 'vv', 'rr3', 'hosp', 'rea', 'incid_rea', 'rad', 'Count_accidents']
    #numeric_cols = ['t', 'ff', 'u', 'ssfrai', 'n', 'vv', 'rr3']
    
    preprocessor = ColumnTransformer(
        [
            ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
            ("cat", categorical_encoder, categorical_cols),
            ("num", numeric_encoder, numeric_cols),
        ],
        remainder="passthrough"  # This will pass through other columns not specified
    )
    return preprocessor, date_encoder

In [None]:
def get_RMSE_local_pipe(pipe, X_train, y_train, X_test, y_test):
    
    n_folds = 5

    # Perform cross-validation and compute the scores
    cv_scores_train = cross_val_score(pipe, X_train, y_train, cv=n_folds, scoring='neg_mean_squared_error')
    cv_scores_test = cross_val_score(pipe, X_test, y_test, cv=n_folds, scoring='neg_mean_squared_error')

    # Convert the scores to root mean squared error
    rmse_scores_train = np.sqrt(-cv_scores_train)
    rmse_scores_test = np.sqrt(-cv_scores_test)
    
    print(
        f"Train set, RMSE={np.mean(rmse_scores_train):.2f}"
    )
    print(
        f"Test set, RMSE={np.mean(rmse_scores_test):.2f}"
    ) 

In [61]:
# Get preprocessor
preprocessor, date_encoder = preprocessing(X_train_plus_FI)

In [17]:
# Ridge pipe
regressor = Ridge()

pipe_Ridge = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_Ridge, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

Train set, RMSE=1.74
Test set, RMSE=1.45


In [18]:
# Lasso pipe
regressor = Lasso()

pipe_Lasso = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_Lasso, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

Train set, RMSE=1.70
Test set, RMSE=1.42


In [21]:
# ElasticNet pipe
regressor = ElasticNet()

pipe_ElasticNet = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_ElasticNet, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

Train set, RMSE=1.70
Test set, RMSE=1.42


In [None]:
# RandomForestRegressor pipe
regressor = RandomForestRegressor()

pipe_RandomForestRegressor = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_RandomForestRegressor, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

In [42]:
# LGMBRegressor pipe
regressor = lgb.LGBMRegressor()

pipe_LGMBRegressor = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_LGMBRegressor, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1916
[LightGBM] [Info] Number of data points in the train set: 364130, number of used features: 180
[LightGBM] [Info] Start training from score 2.979088
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1943
[LightGBM] [Info] Number of data points in the train set: 364130, number of used features: 181
[LightGBM] [Info] Start training from score 3.125360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011891 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

In [72]:
# XGBRegressor pipe

best_params = {'colsample_bytree': 0.6154469128110744,
              'gamma': 1,
              'learning_rate': 0.09803049874792026,
              'max_depth': 9,
              'n_estimators': 363,
              'subsample': 0.5171942605576092}

regressor = xgb.XGBRegressor(objective='reg:squarederror',
                    n_estimators=best_params['n_estimators'],
                    max_depth=best_params['max_depth'],
                    learning_rate=best_params['learning_rate'],
                    subsample=best_params['subsample'],
                    colsample_bytree=best_params['colsample_bytree'],
                    gamma=best_params['gamma'],
                    seed=42
            )

pipe_XGBRegressor = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_XGBRegressor, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

Train set, RMSE=0.66
Test set, RMSE=0.51


# Hyperparameters tuning

## LGBMRegressor

In [44]:
best_params_LGBM = {'lgbmregressor__colsample_bytree': 0.5232252063599989,
                    'lgbmregressor__learning_rate': 0.1315089703802877,
                    'lgbmregressor__max_depth': 7,
                    'lgbmregressor__n_estimators': 428,
                    'lgbmregressor__num_leaves': 26,
                    'lgbmregressor__subsample': 0.5066324805799333
                   }

# Define the hyperparameter space for LGBMRegressor
param_dist = {
    'lgbmregressor__n_estimators': randint(100, 500),
    'lgbmregressor__max_depth': randint(3, 10),
    'lgbmregressor__learning_rate': uniform(0.01, 0.2),
    'lgbmregressor__subsample': uniform(0.5, 0.5),
    'lgbmregressor__colsample_bytree': uniform(0.5, 0.5),
    'lgbmregressor__num_leaves': randint(20, 40),
}

# Get preprocessor
preprocessor, date_encoder = preprocessing(X_train_plus_FI)
pipe_LGMBRegressor = make_pipeline(date_encoder, preprocessor, regressor)

# Create a RandomizedSearchCV object for LightGBM
random_search_lgbm = RandomizedSearchCV(
    estimator= pipe_LGMBRegressor,  # Ensure your pipeline ends with a LGBMRegressor
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled
    scoring='neg_root_mean_squared_error',  # Scoring metric to optimize
    cv=5,  # Number of folds in cross-validation
    random_state=42
)

# Fit to the data
random_search_lgbm.fit(X_train_plus_FI, y_train)

# Print the best parameters and lowest RMSE
print("Best parameters found for LGBM: ", random_search_lgbm.best_params_)
print("Lowest RMSE found for LGBM: ", np.abs(random_search_lgbm.best_score_))

# To predict and get RMSE on the test set using the best LightGBM model
best_model_lgbm = random_search_lgbm.best_estimator_
y_pred_lgbm = best_model_lgbm.predict(X_test_plus_FI)
rmse_test_lgbm = np.sqrt(mean_squared_error(y_test, y_pred_lgbm))
print("Test set RMSE of best LGBM model: ", rmse_test_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1916
[LightGBM] [Info] Number of data points in the train set: 364130, number of used features: 180
[LightGBM] [Info] Start training from score 2.979088
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014289 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1943
[LightGBM] [Info] Number of data points in the train set: 364130, number of used features: 181
[LightGBM] [Info] Start training from score 3.125360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

## XGBregressor

In [34]:
# Define the hyperparameter space
param_dist = {
    'xgbregressor__n_estimators': randint(100, 500),
    'xgbregressor__max_depth': randint(3, 10),
    'xgbregressor__learning_rate': uniform(0.01, 0.2),
    'xgbregressor__subsample': uniform(0.5, 0.5),
    'xgbregressor__colsample_bytree': uniform(0.5, 0.5),
    'xgbregressor__gamma': [0, 0.1, 0.5, 1]
}

# Get preprocessor
preprocessor, date_encoder = preprocessing(X_train_plus_FI)
pipe_XGBregressor = make_pipeline(date_encoder, preprocessor, regressor)

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=pipe_XGBregressor,
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled
    scoring='neg_root_mean_squared_error',  # Scoring metric to optimize
    cv=5,  # Number of folds in cross-validation
    random_state=42
)

# Fit to the data
random_search.fit(X_train_plus_FI, y_train)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", random_search.best_params_)
print("Lowest RMSE found: ", np.abs(random_search.best_score_))

# To predict and get RMSE on the test set using the best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_plus_FI)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test set RMSE of best model: ", rmse_test)

Best parameters found:  {'xgbregressor__colsample_bytree': 0.6154469128110744, 'xgbregressor__gamma': 1, 'xgbregressor__learning_rate': 0.09803049874792026, 'xgbregressor__max_depth': 9, 'xgbregressor__n_estimators': 363, 'xgbregressor__subsample': 0.5171942605576092}
Lowest RMSE found:  0.677347199973148
Test set RMSE of best model:  0.4584068511534211
