# Training model

In [59]:
## To get some insights
# https://github.com/ceptln/paris-bike-traffic-prediction/tree/main

from pathlib import Path
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
import lightgbm as lgb
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from submissions.external_data.estimator import _encode_dates, _merge_external_data
import utils.get_data as get_data

### Function submission kaggle

In [63]:
def submission_kaggle(model, X_final_test):
    y_pred = model.predict(X_final_test)
    print(y_pred)
    results = pd.DataFrame(
        dict(
            Id=X_final_test.index,
            log_bike_count=y_pred,
        )
    )
    results.to_csv("submission.csv", index=False)

    return results

## Read Data

In [64]:
def add_external_data(X):
    
    X = X.copy()
    
    # merge original data + external data
    merged_X_train_external_DATA = get_data._merge_external_data_weather(X)
    
    # merge original data + external data + holidays
    merged_X_train_external_HOLIDAYS = get_data._merge_holidays_week_end(merged_X_train_external_DATA)
    
    # merge original data + external data + holidays + data COVID
    merged_X_train_external_HOLIDAYS_COVID = get_data._merge_Curfews_lockdowns_COVID(merged_X_train_external_HOLIDAYS)
    merged_X_train_external_HOLIDAYS_COVID = get_data._merge_indicators_COVID(merged_X_train_external_HOLIDAYS_COVID)
    
    # merge original data + external data + holidays + data COVID + data accidents
    merged_X_train_external_HOLIDAYS_COVID_ACCIDENTS = get_data._merge_road_accidents(merged_X_train_external_HOLIDAYS_COVID)
    merged_X_train_external_HOLIDAYS_COVID_ACCIDENTS 
    return merged_X_train_external_HOLIDAYS_COVID_ACCIDENTS

In [65]:
# Read data
X_train, y_train = get_data.get_train_data()
X_test, y_test = get_data.get_test_data()
X_final_test = get_data.get_final_test_data()

In [66]:
X_train_plus = add_external_data(X_train)
X_train_plus.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,counter_technical_id,latitude,longitude,ff,...,is_holiday,is_weekend,is_lockdown,is_curfew,hosp,rea,incid_rea,rad,Max_Grav_accidents,Count_accidents
0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
30,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
31,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
32,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0
33,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2020-09-01 01:00:00,2020-07-22,Y2H20073268,48.88529,2.32666,1.6,...,False,0,False,False,293,42,3.0,6641,0.0,0.0


In [67]:
X_test_plus = add_external_data(X_test)
X_test_plus.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,counter_technical_id,latitude,longitude,ff,...,is_holiday,is_weekend,is_lockdown,is_curfew,hosp,rea,incid_rea,rad,Max_Grav_accidents,Count_accidents
0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2021-08-10 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233,1.9,...,False,0,False,False,284,84,6.0,21167,0.0,0.0
31,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2021-08-10 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233,1.9,...,False,0,False,False,284,84,6.0,21167,0.0,0.0
32,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2021-08-10 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702,1.9,...,False,0,False,False,284,84,6.0,21167,0.0,0.0
33,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2021-08-10 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702,1.9,...,False,0,False,False,284,84,6.0,21167,0.0,0.0
34,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2021-08-10 01:00:00,2020-07-22,Y2H20073268,48.88529,2.32666,1.9,...,False,0,False,False,284,84,6.0,21167,0.0,0.0


In [68]:
X_final_test_plus = add_external_data(X_final_test)
X_final_test_plus.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,...,is_holiday,is_weekend,is_lockdown,is_curfew,hosp,rea,incid_rea,rad,Max_Grav_accidents,Count_accidents
0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2021-09-10 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,...,False,0,False,False,365,126,5.0,21675,0.0,0.0
43,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2021-09-10 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,...,False,0,False,False,365,126,5.0,21675,0.0,0.0
31,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2021-09-10 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,2.35702,...,False,0,False,False,365,126,5.0,21675,0.0,0.0
32,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2021-09-10 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,2.35702,...,False,0,False,False,365,126,5.0,21675,0.0,0.0
33,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2021-09-10 01:00:00,2020-07-22,"48.88529,2.32666",Y2H20073268,48.88529,2.32666,...,False,0,False,False,365,126,5.0,21675,0.0,0.0


##  Preprocessing

In [69]:
def _encode_dates(X):
    
    '''
    Splits the 'date' columns of the input DataFrame into several columns (year, month, day, weekday, hour)
    
    Parameters:
        X (pd.DataFrame): the dataframe to modify
    
    Returns:
        X (pd.DataFrame): the modified dataframe
    '''
    
    # Duplicate X to work on it
    X = X.copy()
    
    # Create new columns with date parts from X.date
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Adding cosinus and sinus features from date variables to enhance the date periodicity
    # X['cos_hour'] = np.cos(X['hour']*(2.*np.pi/24))
    # X['sin_hour'] = np.sin(X['hour']*(2.*np.pi/24))
    # X['cos_day'] = np.cos(X['day']*(2.*np.pi/30))
    # X['sin_day'] = np.sin(X['day']*(2.*np.pi/30))
    # X['cos_month'] = np.cos(X['month']*(2.*np.pi/12))
    # X['sin_month'] = np.sin(X['month']*(2.*np.pi/12))
    # X['cos_weekday'] = np.cos(X['weekday']*(2.*np.pi/7))
    # X['sin_weekday'] = np.sin(X['weekday']*(2.*np.pi/7))
    
    # Clean the new dataframe and return it
    X.drop(columns=["date"], inplace=True)
    #X.drop(columns=["year", 'month', 'day', 'weekday', 'hour'], inplace=True)
    return X

# Training

## Training without pipeline

In [71]:
def get_RMSE_local(model, X_train, y_train, X_test, y_test):

    # Fit the pipeline on the training data
    model.fit(X_train, y_train)

    # Predict on training and test data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate RMSE for training and test data
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

    print(f"Train set RMSE: {rmse_train:.2f}")
    print(f"Test set RMSE: {rmse_test:.2f}")


### Select features for training

In [117]:
fixed_features = ['date', "counter_name", "site_id"]

not_fixed_features = ["counter_id", "counter_technical_id", "counter_installation_date",
                      'site_name'
                       'counter_installation_date', 
                      'counter_technical_id', 
                      'latitude',
                       'longitude', 
                      'ff', # the wind speed
                      'u', # the humidity
                      'ssfrai', # the fresh snowfall amount
                      'n', # the amount of cloud cover
                      'vv', # the visibility
                      'rr3', # the precipitation amount over 3 hours
                      't', # the temperature
                      'is_holiday', # is holidays
                       'is_weekend', # is week end
                      'is_lockdown', # Lockdown for COVID
                      'is_curfew', # Curfew for COVID
                      'hosp', # Number of patients currently hospitalised for COVID-19
                      'rea', # Number of patients currently in intensive care.
                      'incid_rea', # Number of new patients admitted to intensive care in the last 24 hours.
                       'rad', # Cumulative number of patients hospitalised for COVID-19 who have returned home due to an improvement in their state of health
                      'Max_Grav_accidents', # The maximum severity of all cyclists accidents at a given hour
                      'Count_accidents' # the number of accidents at a given hour in Paris
                     ]


chosen_not_fixed_features = ['t', 'u', 'rr3', 'is_holiday', 'is_weekend']

chosen_not_fixed_features = [ 'latitude',
                              'longitude',
                              'ff', # the wind speed
                              'u', # the humidity
                              'ssfrai', # the fresh snowfall amount
                              'n', # the amount of cloud cover
                              'vv', # the visibility
                              'rr3', # the precipitation amount over 3 hours
                              't', # the temperature
                              'is_holiday', # is holidays
                               'is_weekend', # is week end
                              'is_lockdown', # Lockdown for COVID
                              'is_curfew', # Curfew for COVID
                              'hosp', # Number of patients currently hospitalised for COVID-19
                              'rea', # Number of patients currently in intensive care.
                              'incid_rea', # Number of new patients admitted to intensive care in the last 24 hours.
                               'rad', # Cumulative number of patients hospitalised for COVID-19 who have returned home due to an improvement in their state of health
                              'Max_Grav_accidents', # The maximum severity of all cyclists accidents at a given hour
                              'Count_accidents' # the number of accidents at a given hour in Paris
                            ]

chosen_fixed_features = [ 'ff', # the wind speed
                          'u', # the humidity
                          'n', # the amount of cloud cover
                          'vv', # the visibility
                          'rr3', # the precipitation amount over 3 hours
                          't', # the temperature
                          'is_holiday', # is holidays
                           'is_weekend', # is week end
                          'is_lockdown', # Lockdown for COVID
                          'is_curfew', # Curfew for COVID
                          'hosp', # Number of patients currently hospitalised for COVID-19
                          'rea', # Number of patients currently in intensive care.
                          'incid_rea', # Number of new patients admitted to intensive care in the last 24 hours.
                           'rad', # Cumulative number of patients hospitalised for COVID-19 who have returned home due to an improvement in their state of health
                          'Max_Grav_accidents', # The maximum severity of all cyclists accidents at a given hour
                          'Count_accidents' # the number of accidents at a given hour in Paris
                        ]



chosen_variables = fixed_features + chosen_fixed_features

X_train_plus_chosen = X_train_plus[chosen_variables]
X_test_plus_chosen = X_test_plus[chosen_variables]
X_final_test_plus_chosen = X_final_test_plus[chosen_variables]

In [119]:
# Identifying continuous variables (float type or int with wide range)

# Initialize the scaler
scaler = StandardScaler()

X_train_plus_chosen_FI = _encode_dates(X_train_plus_chosen)
#X_train_plus_chosen_FI = pd.get_dummies(X_train_plus_chosen_FI, columns=['site_name'])
X_train_plus_chosen_FI = pd.get_dummies(X_train_plus_chosen_FI, columns=['counter_name'])
X_train_plus_chosen_FI = pd.get_dummies(X_train_plus_chosen_FI, columns=['site_id'])
continuous_columns  = X_train_plus_chosen_FI.select_dtypes(include=['float64', 'float32', 'int64', 'int32']).columns.tolist()
X_train_plus_chosen_FI[continuous_columns] = scaler.fit_transform(X_train_plus_chosen_FI[continuous_columns])

X_test_plus_chosen_FI = _encode_dates(X_test_plus_chosen)
#X_test_plus_chosen_FI = pd.get_dummies(X_test_plus_chosen_FI, columns=['site_name'])
X_test_plus_chosen_FI = pd.get_dummies(X_test_plus_chosen_FI, columns=['counter_name'])
X_test_plus_chosen_FI = pd.get_dummies(X_test_plus_chosen_FI, columns=['site_id'])
X_test_plus_chosen_FI[continuous_columns] = scaler.fit_transform(X_test_plus_chosen_FI[continuous_columns])

X_final_test_plus_chosen_FI = _encode_dates(X_final_test_plus_chosen)
#X_final_test_plus_chosen_FI = pd.get_dummies(X_final_test_plus_chosen_FI, columns=['site_name'])
X_final_test_plus_chosen_FI = pd.get_dummies(X_final_test_plus_chosen_FI, columns=['counter_name'])
X_final_test_plus_chosen_FI = pd.get_dummies(X_final_test_plus_chosen_FI, columns=['site_id'])
X_final_test_plus_chosen_FI[continuous_columns] = scaler.fit_transform(X_final_test_plus_chosen_FI[continuous_columns])

In [120]:
# LGMBRegressor

params = {
        'colsample_bytree': 0.7, 
        'learning_rate': 0.01,
        'max_depth': 11,
        'min_child_samples': 198,
        'min_child_weight': 0.1,
        'n_estimators': 2000,
        'num_leaves': 99,
        'reg_alpha': 1, 
        'reg_lambda': 0.1,
        'subsample': 0.5
}

Regressor = lgb.LGBMRegressor(**params)

#Regressor = xgb.XGBRegressor(objectives = 'reg:squarederror')
# Ridge
#Regressor = Ridge()

# Initialize the XGBRegressor
# Regressor = xgb.XGBRegressor(objectives = 'reg:squarederror',
#  learning_rate= 0.01,
#  max_depth = 15,
#  n_estimators = 1000,
#  max_iter = 20,
#  verbose = True,
#  early_stopping = True)

# Initialize the MLPRegressor
# Regressor = MLPRegressor(hidden_layer_sizes=(300,), 
#                          activation='relu', 
#                          solver='adam', 
#                          alpha=0.0001, 
#                          batch_size='auto', 
#                          learning_rate='constant', 
#                          learning_rate_init=0.001, 
#                          max_iter=400, 
#                          shuffle=True, 
#                          random_state=None, 
#                          tol=0.0001, 
#                          verbose=True,
#                          early_stopping=True
#                          )

# Train model with selected features
Regressor.fit(X_train_plus_chosen_FI, y_train)

get_RMSE_local(Regressor, X_train_plus_chosen_FI, y_train, X_test_plus_chosen_FI, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1823
[LightGBM] [Info] Number of data points in the train set: 455163, number of used features: 107
[LightGBM] [Info] Start training from score 3.048589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012823 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1823
[LightGBM] [Info] Number of data points in the train set: 455163, number of used features: 107
[LightGBM] [Info] Start training from score 3.048589
Train set RMSE: 0.38
Test set RMSE: 0.82


In [85]:
from sklearn.model_selection import GridSearchCV, train_test_split

# Splitting for validation
X_train_cv, X_val_cv, y_train_cv, y_val_cv = train_test_split(X_train_plus_chosen_FI, y_train, test_size=0.2, random_state=42)

# Initialize the XGBRegressor
Regressor = xgb.XGBRegressor(objective='reg:squarederror',
                             max_iter = 5,
                             verbose=True,
                             early_stopping=True
                            )

# Define hyperparameter grid (example)
param_grid = {
    'n_estimators': [300, 500, 700, 900, 1100],
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
    'max_depth': [3, 6, 9, 12],
    'subsample': [0.7, 0.8]
}

# Randomized search with cross-validation
randomized_search = RandomizedSearchCV(Regressor, param_grid, n_iter=50, cv=5, scoring='neg_mean_squared_error', verbose = True, n_jobs=-1)
randomized_search.fit(X_train_cv, y_train_cv, eval_set=[(X_val_cv, y_val_cv)], verbose=True)

# Best estimator
best_regressor = randomized_search.best_estimator_

# Evaluate the model using your custom function
# Assuming get_RMSE_local_wt_pipe is your evaluation function
get_RMSE_local(best_regressor, X_train_plus_chosen_FI, y_train, X_test_plus_chosen_FI, y_test)

full_params = best_regressor.get_params()
print("Full Parameters of the Best Regressor:", full_params)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


KeyboardInterrupt: 

In [70]:
best_regressor.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.01,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 9,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 700,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 0.7,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'max_iter': 5,
 'verbose': True,
 'early_stopping': True}

### Final csv file with current model

#### Train on whole dataset train and test to get the last month

In [46]:
print(len(np.concatenate([y_train, y_test], axis=0)))
#pd.concat([X_train_plus_chosen_FI, X_test_plus_chosen_FI], ignore_index=False)
pd.concat([X_train_chosen_FI, X_test_chosen_FI], ignore_index=False)

496771


Unnamed: 0,year,month,day,weekday,hour,counter_name_152 boulevard du Montparnasse E-O,counter_name_152 boulevard du Montparnasse O-E,counter_name_18 quai de l'Hôtel de Ville NO-SE,counter_name_18 quai de l'Hôtel de Ville SE-NO,counter_name_20 Avenue de Clichy NO-SE,...,site_id_100056332,site_id_100056334,site_id_100056335,site_id_100056336,site_id_100057329,site_id_100057380,site_id_100057445,site_id_100060178,site_id_100063175,site_id_300014702
400125,-1.364391,0.736581,-1.629701,-1.000283,-1.517593,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
408305,-1.364391,0.736581,-1.629701,-1.000283,-1.517593,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
87516,-1.364391,0.736581,-1.629701,-1.000283,-1.517593,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
98518,-1.364391,0.736581,-1.629701,-1.000283,-1.517593,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
875137,-1.364391,0.736581,-1.629701,-1.000283,-1.517593,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792857,0.000000,1.561991,-0.783237,0.048554,1.661061,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
805182,0.000000,1.561991,-0.783237,0.048554,1.661061,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
815218,0.000000,1.561991,-0.783237,0.048554,1.661061,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
125979,0.000000,1.561991,-0.783237,0.048554,1.661061,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [47]:
# concatenated_X_train_test = pd.concat([X_train_plus_chosen_FI, X_test_plus_chosen_FI], ignore_index=False)
# concatenated_y_train_test = np.concatenate([y_train, y_test], axis=0)

concatenated_X_train_test = pd.concat([X_train_chosen_FI, X_test_chosen_FI], ignore_index=False)
concatenated_y_train_test = np.concatenate([y_train, y_test], axis=0)

Regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    verbose = True
)

# LGMBRegressor
# params = {
#         'colsample_bytree': 0.7, 
#         'learning_rate': 0.01,
#         'max_depth': 11,
#         'min_child_samples': 198,
#         'min_child_weight': 0.1,
#         'n_estimators': 2000,
#         'num_leaves': 99,
#         'reg_alpha': 1, 
#         'reg_lambda': 0.1,
#         'subsample': 0.5
# }

# Regressor = lgb.LGBMRegressor(**params)

# Ridge
#Regressor = Ridge()

# Initialize the XGBRegressor
# Regressor = xgb.XGBRegressor({'objective': 'reg:squarederror',
#  'learning_rate': 0.05,
#  'max_depth': 12,
#  'n_estimators': 700,
#  'max_iter': 5,
#  'verbose': True,
#  'early_stopping': True})

# Initialize the MLPRegressor
# Regressor = MLPRegressor(hidden_layer_sizes=(100,), 
#                          activation='relu', 
#                          solver='adam', 
#                          alpha=0.0001, 
#                          batch_size='auto', 
#                          learning_rate='constant', 
#                          learning_rate_init=0.001, 
#                          max_iter=100, 
#                          shuffle=True, 
#                          random_state=None, 
#                          tol=0.0001, 
#                          verbose=True, 
#                          warm_start=True
#                          )

# Train model with selected features
Regressor.fit(concatenated_X_train_test, concatenated_y_train_test)

Parameters: { "verbose" } are not used.



In [48]:
# Get submission kaggle to csv
submission_kaggle(Regressor, X_final_test_chosen_FI)

[0.6518004  0.5825644  0.47236848 ... 2.6631536  2.003648   2.0176    ]


### Feature selections with RFECV

In [112]:
from sklearn.feature_selection import RFECV
import xgboost as xgb

# Initialize the XGBRegressor
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')

# Initialize RFECV
selector = RFECV(estimator=xgb_reg, step=1, cv=2)

# Fit RFECV
selector = selector.fit(X_train_plus_chosen_FI, y_train)

# Print the optimal number of features
print("Optimal number of features : %d" % selector.n_features_)

Optimal number of features : 92


In [114]:
# Print selected features
selected_features = [feature for feature, selected in zip(X_train_plus_chosen_FI.columns, selector.support_) if selected]
print("Selected features:", selected_features)

Selected features: ['latitude', 'longitude', 'ff', 'u', 'n', 'vv', 'rr3', 't', 'is_holiday', 'is_weekend', 'is_lockdown', 'is_curfew', 'hosp', 'rea', 'incid_rea', 'rad', 'Max_Grav_accidents', 'Count_accidents', 'month', 'day', 'weekday', 'hour', 'counter_name_152 boulevard du Montparnasse E-O', 'counter_name_152 boulevard du Montparnasse O-E', "counter_name_18 quai de l'Hôtel de Ville NO-SE", 'counter_name_20 Avenue de Clichy NO-SE', 'counter_name_20 Avenue de Clichy SE-NO', 'counter_name_254 rue de Vaugirard NE-SO', 'counter_name_254 rue de Vaugirard SO-NE', 'counter_name_27 quai de la Tournelle NO-SE', 'counter_name_27 quai de la Tournelle SE-NO', 'counter_name_28 boulevard Diderot E-O', 'counter_name_28 boulevard Diderot O-E', 'counter_name_36 quai de Grenelle NE-SO', 'counter_name_36 quai de Grenelle SO-NE', 'counter_name_38 rue Turbigo NE-SO', 'counter_name_38 rue Turbigo SO-NE', 'counter_name_39 quai François Mauriac NO-SE', 'counter_name_39 quai François Mauriac SE-NO', 'counter

In [116]:
# Transform training and testing sets
X_train_selected = selector.transform(X_train_plus_chosen_FI)
X_test_selected = selector.transform(X_test_plus_chosen_FI)

# Train model with selected features
xgb_reg.fit(X_train_selected, y_train)

get_RMSE_local(xgb_reg, X_train_selected, y_train, X_test_selected, y_test)

Train set RMSE: 0.41
Test set RMSE: 0.95


## Train with pipeline

In [None]:
def preprocessing(X_train):
    
    date_encoder = FunctionTransformer(_encode_dates)
    date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

    categorical_encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_cols = ["counter_name", "site_name"]

    numeric_encoder = StandardScaler()
    numeric_cols = ['latitude', 'longitude', 't', 'ff', 'u', 'ssfrai', 'n', 'vv', 'rr3', 'hosp', 'rea', 'incid_rea', 'rad', 'Count_accidents']
    numeric_cols = ['t', 'ff', 'u', 'ssfrai', 'n', 'vv', 'rr3', 'hosp', 'rea', 'incid_rea', 'rad', 'Count_accidents']
    #numeric_cols = ['t', 'ff', 'u', 'ssfrai', 'n', 'vv', 'rr3']
    
    preprocessor = ColumnTransformer(
        [
            ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
            ("cat", categorical_encoder, categorical_cols),
            ("num", numeric_encoder, numeric_cols),
        ],
        remainder="passthrough"  # This will pass through other columns not specified
    )
    return preprocessor, date_encoder

In [None]:
def get_RMSE_local_pipe(pipe, X_train, y_train, X_test, y_test):
    
    n_folds = 5

    # Perform cross-validation and compute the scores
    cv_scores_train = cross_val_score(pipe, X_train, y_train, cv=n_folds, scoring='neg_mean_squared_error')
    cv_scores_test = cross_val_score(pipe, X_test, y_test, cv=n_folds, scoring='neg_mean_squared_error')

    # Convert the scores to root mean squared error
    rmse_scores_train = np.sqrt(-cv_scores_train)
    rmse_scores_test = np.sqrt(-cv_scores_test)
    
    print(
        f"Train set, RMSE={np.mean(rmse_scores_train):.2f}"
    )
    print(
        f"Test set, RMSE={np.mean(rmse_scores_test):.2f}"
    ) 

In [61]:
# Get preprocessor
preprocessor, date_encoder = preprocessing(X_train_plus_FI)

In [17]:
# Ridge pipe
regressor = Ridge()

pipe_Ridge = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_Ridge, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

Train set, RMSE=1.74
Test set, RMSE=1.45


In [18]:
# Lasso pipe
regressor = Lasso()

pipe_Lasso = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_Lasso, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

Train set, RMSE=1.70
Test set, RMSE=1.42


In [21]:
# ElasticNet pipe
regressor = ElasticNet()

pipe_ElasticNet = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_ElasticNet, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

Train set, RMSE=1.70
Test set, RMSE=1.42


In [None]:
# RandomForestRegressor pipe
regressor = RandomForestRegressor()

pipe_RandomForestRegressor = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_RandomForestRegressor, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

In [42]:
# LGMBRegressor pipe
regressor = lgb.LGBMRegressor()

pipe_LGMBRegressor = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_LGMBRegressor, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1916
[LightGBM] [Info] Number of data points in the train set: 364130, number of used features: 180
[LightGBM] [Info] Start training from score 2.979088
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1943
[LightGBM] [Info] Number of data points in the train set: 364130, number of used features: 181
[LightGBM] [Info] Start training from score 3.125360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011891 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

In [72]:
# XGBRegressor pipe

best_params = {'colsample_bytree': 0.6154469128110744,
              'gamma': 1,
              'learning_rate': 0.09803049874792026,
              'max_depth': 9,
              'n_estimators': 363,
              'subsample': 0.5171942605576092}

regressor = xgb.XGBRegressor(objective='reg:squarederror',
                    n_estimators=best_params['n_estimators'],
                    max_depth=best_params['max_depth'],
                    learning_rate=best_params['learning_rate'],
                    subsample=best_params['subsample'],
                    colsample_bytree=best_params['colsample_bytree'],
                    gamma=best_params['gamma'],
                    seed=42
            )

pipe_XGBRegressor = make_pipeline(date_encoder, preprocessor, regressor)

# Predict data and get RMSE
get_RMSE_local(pipe_XGBRegressor, X_train_plus_FI, y_train, X_test_plus_FI, y_test)

Train set, RMSE=0.66
Test set, RMSE=0.51


# Hyperparameters tuning

## LGBMRegressor

In [44]:
best_params_LGBM = {'lgbmregressor__colsample_bytree': 0.5232252063599989,
                    'lgbmregressor__learning_rate': 0.1315089703802877,
                    'lgbmregressor__max_depth': 7,
                    'lgbmregressor__n_estimators': 428,
                    'lgbmregressor__num_leaves': 26,
                    'lgbmregressor__subsample': 0.5066324805799333
                   }

# Define the hyperparameter space for LGBMRegressor
param_dist = {
    'lgbmregressor__n_estimators': randint(100, 500),
    'lgbmregressor__max_depth': randint(3, 10),
    'lgbmregressor__learning_rate': uniform(0.01, 0.2),
    'lgbmregressor__subsample': uniform(0.5, 0.5),
    'lgbmregressor__colsample_bytree': uniform(0.5, 0.5),
    'lgbmregressor__num_leaves': randint(20, 40),
}

# Get preprocessor
preprocessor, date_encoder = preprocessing(X_train_plus_FI)
pipe_LGMBRegressor = make_pipeline(date_encoder, preprocessor, regressor)

# Create a RandomizedSearchCV object for LightGBM
random_search_lgbm = RandomizedSearchCV(
    estimator= pipe_LGMBRegressor,  # Ensure your pipeline ends with a LGBMRegressor
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled
    scoring='neg_root_mean_squared_error',  # Scoring metric to optimize
    cv=5,  # Number of folds in cross-validation
    random_state=42
)

# Fit to the data
random_search_lgbm.fit(X_train_plus_FI, y_train)

# Print the best parameters and lowest RMSE
print("Best parameters found for LGBM: ", random_search_lgbm.best_params_)
print("Lowest RMSE found for LGBM: ", np.abs(random_search_lgbm.best_score_))

# To predict and get RMSE on the test set using the best LightGBM model
best_model_lgbm = random_search_lgbm.best_estimator_
y_pred_lgbm = best_model_lgbm.predict(X_test_plus_FI)
rmse_test_lgbm = np.sqrt(mean_squared_error(y_test, y_pred_lgbm))
print("Test set RMSE of best LGBM model: ", rmse_test_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1916
[LightGBM] [Info] Number of data points in the train set: 364130, number of used features: 180
[LightGBM] [Info] Start training from score 2.979088
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014289 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1943
[LightGBM] [Info] Number of data points in the train set: 364130, number of used features: 181
[LightGBM] [Info] Start training from score 3.125360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

## XGBregressor

In [34]:
# Define the hyperparameter space
param_dist = {
    'xgbregressor__n_estimators': randint(100, 500),
    'xgbregressor__max_depth': randint(3, 10),
    'xgbregressor__learning_rate': uniform(0.01, 0.2),
    'xgbregressor__subsample': uniform(0.5, 0.5),
    'xgbregressor__colsample_bytree': uniform(0.5, 0.5),
    'xgbregressor__gamma': [0, 0.1, 0.5, 1]
}

# Get preprocessor
preprocessor, date_encoder = preprocessing(X_train_plus_FI)
pipe_XGBregressor = make_pipeline(date_encoder, preprocessor, regressor)

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=pipe_XGBregressor,
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled
    scoring='neg_root_mean_squared_error',  # Scoring metric to optimize
    cv=5,  # Number of folds in cross-validation
    random_state=42
)

# Fit to the data
random_search.fit(X_train_plus_FI, y_train)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", random_search.best_params_)
print("Lowest RMSE found: ", np.abs(random_search.best_score_))

# To predict and get RMSE on the test set using the best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_plus_FI)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test set RMSE of best model: ", rmse_test)

Best parameters found:  {'xgbregressor__colsample_bytree': 0.6154469128110744, 'xgbregressor__gamma': 1, 'xgbregressor__learning_rate': 0.09803049874792026, 'xgbregressor__max_depth': 9, 'xgbregressor__n_estimators': 363, 'xgbregressor__subsample': 0.5171942605576092}
Lowest RMSE found:  0.677347199973148
Test set RMSE of best model:  0.4584068511534211
