In [1]:
# Import basic packages
import pandas as pd
import numpy as np
from pathlib import Path

# Import sklearn packages
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import _name_estimators, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

# Import LightGBM
import lightgbm as lgb

import utils.get_data_index_arange as get_data

In [2]:
def add_external_data(X):
    '''
    Enriches a copy of the input DataFrame by running it through every
    merge helper of the project-local `get_data` module, one after the other.

    The helpers are applied in this fixed order (what each one adds is
    inferred from its name only; see `utils.get_data_index_arange`):
        (1) _merge_external_data_weather
        (2) _merge_holidays_week_end
        (3) _merge_Curfews_lockdowns_COVID
        (4) _merge_indicators_COVID
        (5) _merge_road_accidents

    Parameters:
        X (pd.DataFrame): the original data; it is copied first, so this
            function itself does not modify the caller's object

    Returns:
        (pd.DataFrame): whatever the last merge helper returns
    '''

    # Work on a copy so the chain of merges starts from a private object
    enriched = X.copy()

    # Each helper receives the output of the previous one
    merge_steps = (
        get_data._merge_external_data_weather,
        get_data._merge_holidays_week_end,
        get_data._merge_Curfews_lockdowns_COVID,
        get_data._merge_indicators_COVID,
        get_data._merge_road_accidents,
    )
    for merge_step in merge_steps:
        enriched = merge_step(enriched)

    return enriched

In [3]:
def _encode_dates(X):
    
    '''
    Splits the 'date' columns of the input DataFrame into several columns (year, month, day, weekday, hour)
    
    Parameters:
        X (pd.DataFrame): the dataframe to modify
    
    Returns:
        X (pd.DataFrame): the modified dataframe
    '''
    
    # Duplicate X to work on it
    X = X.copy()
    
    # Create new columns with date parts from X.date
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour
    
    # Clean the new dataframe and return it
    X.drop(columns=["date"], inplace=True)
    return X

In [4]:
def get_estimator():
    '''
    Builds the (unfitted) modelling pipeline, made of four chained steps:
        (1) add_external_data(), wrapped in a FunctionTransformer,
        (2) _encode_dates(), wrapped in a FunctionTransformer,
        (3) a ColumnTransformer that one-hot encodes the categorical columns
            and standardises the numeric ones (columns not listed in either
            group are not passed on, as no `remainder` is set),
        (4) a LightGBM regressor with fixed hyper-parameters (described as
            tuned by the original author).

    Parameters:
        None

    Returns:
        pipe (sklearn Pipeline object): the assembled pipeline
    '''

    # Steps 1 and 2: plain functions exposed as transformers
    external_merger = FunctionTransformer(add_external_data, validate=False)
    date_splitter = FunctionTransformer(_encode_dates)

    # Step 3: per-column-group encoding
    column_encoder = ColumnTransformer(
        transformers=[
            (
                "cat",
                OneHotEncoder(handle_unknown="ignore"),
                ["counter_name", 'site_id'],
            ),
            (
                "numeric",
                StandardScaler(),
                ['t', 'u', 'rr3', 'year', 'month', 'day', 'weekday', 'hour'],
            ),
        ]
    )

    # Step 4: gradient-boosted regressor
    # NOTE(review): LightGBM appears to apply `subsample` only when
    # `subsample_freq` > 0 (default 0), so the 0.5 below may currently have
    # no effect - confirm against the LightGBM parameter docs before changing.
    booster = lgb.LGBMRegressor(
        colsample_bytree=0.7,
        learning_rate=0.01,
        max_depth=11,
        min_child_samples=198,
        min_child_weight=0.1,
        n_estimators=2000,
        num_leaves=99,
        reg_alpha=1,
        reg_lambda=0.1,
        subsample=0.5,
    )

    # make_pipeline derives the step names from the estimator classes
    return make_pipeline(
        external_merger,
        date_splitter,
        column_encoder,
        booster,
    )

In [5]:
def get_RMSE_local_pipe(pipe, X_train, y_train, X_test, y_test):
    '''
    Fits the pipeline on the training data and prints the RMSE obtained on
    both the training and the test data (two decimals each).

    Parameters:
        pipe: an estimator exposing fit(X, y) and predict(X); it is fitted
            in place by this function
        X_train, y_train: data the pipeline is fitted on and first scored on
        X_test, y_test: data the fitted pipeline is scored on afterwards

    Returns:
        None (the two scores are only printed)
    '''
    from sklearn.metrics import mean_squared_error

    # Train first, then predict on both sets before scoring anything
    pipe.fit(X_train, y_train)
    predicted_train = pipe.predict(X_train)
    predicted_test = pipe.predict(X_test)

    # RMSE = square root of the mean squared error, for train then test
    rmse_train, rmse_test = (
        np.sqrt(mean_squared_error(actual, predicted))
        for actual, predicted in (
            (y_train, predicted_train),
            (y_test, predicted_test),
        )
    )

    print(f"Train set RMSE: {rmse_train:.2f}")
    print(f"Test set RMSE: {rmse_test:.2f}")


# LOAD DATA

In [6]:
# Read data
# All three loaders come from the project-local `get_data` helper module.
# The train and test loaders each return a pair, unpacked as (X, y) and later
# passed to get_RMSE_local_pipe as features / target.
X_train, y_train = get_data.get_train_data()
X_test, y_test = get_data.get_test_data()
# NOTE(review): presumably the unlabeled set to predict on; it is not used in
# the cells visible here - confirm.
X_final_test = get_data.get_final_test_data()

# TRAINING

In [7]:
# Build the (still unfitted) preprocessing + LightGBM pipeline
pipe = get_estimator()

In [8]:
# Fit the pipeline on the training data, then predict on the train and test
# sets and print both RMSE values (the call fits `pipe` in place)
get_RMSE_local_pipe(pipe, X_train, y_train, X_test, y_test)

             counter_id                       counter_name    site_id  \
0   100049407-353255860  152 boulevard du Montparnasse E-O  100049407   
30  100049407-353255859  152 boulevard du Montparnasse O-E  100049407   
31  100036719-104036719  18 quai de l'Hôtel de Ville NO-SE  100036719   
32  100036719-103036719  18 quai de l'Hôtel de Ville SE-NO  100036719   
33  100063175-353277233          20 Avenue de Clichy NO-SE  100063175   

                        site_name                date  \
0   152 boulevard du Montparnasse 2020-09-01 01:00:00   
30  152 boulevard du Montparnasse 2020-09-01 01:00:00   
31    18 quai de l'Hôtel de Ville 2020-09-01 01:00:00   
32    18 quai de l'Hôtel de Ville 2020-09-01 01:00:00   
33            20 Avenue de Clichy 2020-09-01 01:00:00   

   counter_installation_date counter_technical_id   latitude  longitude  \
0                 2018-12-07          Y2H19070373  48.840801   2.333233   
30                2018-12-07          Y2H19070373  48.840801   2.333