In [1]:
# Import basic packages
import pandas as pd
import numpy as np
from pathlib import Path

# Import sklearn packages
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import _name_estimators, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

# Import xgboost
import xgboost as xgb

import utils.get_data_index_arange as get_data

In [2]:
def _encode_dates(X):
    
    '''
    Splits the 'date' columns of the input DataFrame into several columns (year, month, day, weekday, hour)
    
    Parameters:
        X (pd.DataFrame): the dataframe to modify
    
    Returns:
        X (pd.DataFrame): the modified dataframe
    '''
    
    # Duplicate X to work on it
    X = X.copy()
    
    # Create new columns with date parts from X.date
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour
    
    # Clean the new dataframe and return it
    X.drop(columns=["date"], inplace=True)
    return X

In [3]:
def add_external_data(X):
    '''
    Enrich a copy of X by passing it through the project's merge helpers in sequence.

    Every helper comes from the `get_data` module and receives the frame returned
    by the previous one. Judging by the helper names only (their code is not part
    of this notebook), they add weather, holiday/week-end, COVID curfew/lockdown,
    COVID indicator and road-accident data -- confirm against the module.

    Parameters:
        X (pd.DataFrame): the dataframe to enrich; the helpers are given a copy of it

    Returns:
        the object returned by the last merge helper (presumably a pd.DataFrame)
    '''

    # Order matters: each step consumes the output of the one before it
    merge_steps = (
        get_data._merge_external_data_weather,
        get_data._merge_holidays_week_end,
        get_data._merge_Curfews_lockdowns_COVID,
        get_data._merge_indicators_COVID,
        get_data._merge_road_accidents,
    )

    enriched = X.copy()
    for merge_step in merge_steps:
        enriched = merge_step(enriched)

    return enriched

In [112]:
def get_estimator():
    
    '''
    Creates a pipe which:
        (1) merges the external data into the input with add_external_data(),
        (2) splits the 'date' column into date parts with _encode_dates(),
        (3) one-hot encodes the categorical columns and standardises the numeric ones
            (columns listed in neither group are dropped, which is the default
            `remainder` behaviour of ColumnTransformer),
        (4) performs a XGBOOST regression with tuned parameters.

    Parameters:
        None

    Returns:
        pipe (sklearn Pipeline object): the unfitted pipeline
    '''

    # Wrap add_external_data so it can run as the first pipeline step
    merge_external = FunctionTransformer(add_external_data, validate=False)

    # Wrap _encode_dates to split the date column into several columns
    date_encoder = FunctionTransformer(_encode_dates)
    
    # Encode the final columns
    # Categories unseen during fit are encoded as all zeros instead of raising
    categorical_encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_cols = ["counter_name", 'site_id']
    numeric_encoder = StandardScaler()
    # 'year' ... 'hour' are created by _encode_dates; the remaining names are
    # presumably added by the merges in add_external_data -- confirm in get_data
    numeric_cols = ['t', 'u','rr3', 'is_holiday', 'rea', 'rad', 'hosp', 'year', 'month', 'day', 'weekday', 'hour']

    # Create a ColumnTransformer object to perform all encodings
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_encoder, categorical_cols),
            ("numeric", numeric_encoder, numeric_cols)
        ]
    )
    
    # Only the explicitly tuned hyper-parameters are listed; everything else is
    # left at the XGBoost default (the original dict spelled those out as None).
    # The former 'verbose' and 'early_stopping' entries were removed: XGBoost does
    # not use them and only emitted a "Parameters ... are not used" warning.
    params = {'objective': 'reg:squarederror',
              'enable_categorical': False,
              'gamma': 0,
              'learning_rate': 0.02,
              'max_depth': 13,
              'min_child_weight': 5,
              'n_estimators': 1500,
              'subsample': 0.7}

    # Create the regressor object 
    regressor = xgb.XGBRegressor(**params)

    # Create pipeline
    pipe = make_pipeline(
        merge_external,
        date_encoder, 
        preprocessor, 
        regressor
    )
    
    return pipe

In [113]:
def get_RMSE_local_pipe(pipe, X_train, y_train, X_test, y_test):
    '''
    Fit `pipe` on the training split and print its RMSE on both splits.

    Parameters:
        pipe: estimator exposing fit(X, y) and predict(X); it is fitted in place
        X_train, y_train: features and targets used for fitting and for the train score
        X_test, y_test: features and targets used for the test score

    Returns:
        None (the two scores are only printed, rounded to two decimals)
    '''
    from sklearn.metrics import mean_squared_error

    def _rmse(truth, estimate):
        # Root of the mean squared error between targets and predictions
        return np.sqrt(mean_squared_error(truth, estimate))

    pipe.fit(X_train, y_train)

    # Predict on both splits before scoring anything
    fitted_values = pipe.predict(X_train)
    held_out_values = pipe.predict(X_test)

    train_score = _rmse(y_train, fitted_values)
    test_score = _rmse(y_test, held_out_values)

    print(f"Train set RMSE: {train_score:.2f}")
    print(f"Test set RMSE: {test_score:.2f}")


# LOAD DATA

In [114]:
# Read data through the project helper module (imported above as `get_data`):
#   - a labelled training split (features + target),
#   - a labelled local test split (features + target),
#   - the final test features, which come without a target.
X_train, y_train = get_data.get_train_data()
X_test, y_test = get_data.get_test_data()
X_final_test = get_data.get_final_test_data()

# TRAINING

In [115]:
# Inspect the column dtypes of the raw training features (before any pipeline step)
X_train.dtypes

counter_id                         category
counter_name                       category
site_id                               int64
site_name                          category
date                         datetime64[us]
counter_installation_date    datetime64[us]
counter_technical_id               category
latitude                            float64
longitude                           float64
dtype: object

In [116]:
# Build the (still unfitted) preprocessing + XGBoost pipeline
pipe = get_estimator()

In [117]:
# Fit the pipeline on the training split, then print the RMSE on the train and test splits
get_RMSE_local_pipe(pipe, X_train, y_train, X_test, y_test)

Parameters: { "early_stopping", "verbose" } are not used.



Train set RMSE: 0.21
Test set RMSE: 0.48


# FINAL SUBMIT

In [118]:
# Stack the train and local test splits so the final model learns from all labelled rows.
# ignore_index=False keeps each row's original index label instead of renumbering.
concatenated_X_train_test = pd.concat([X_train, X_test], ignore_index=False)
concatenated_y_train_test = np.concatenate([y_train, y_test], axis=0)

# Refit the same pipeline object on the combined data; this replaces the fit from the evaluation cell
pipe.fit(concatenated_X_train_test, concatenated_y_train_test)

Parameters: { "early_stopping", "verbose" } are not used.



## Create csv

In [119]:
def submission_kaggle(model, X_final_test, output_path="submission.csv"):
    '''
    Predicts on the final test set and writes the predictions to a csv file.

    Parameters:
        model: fitted estimator exposing predict()
        X_final_test (pd.DataFrame): features to predict on; its index is written as the 'Id' column
        output_path (str or path-like): where the csv is written; defaults to
            "submission.csv" in the current working directory

    Returns:
        results (pd.DataFrame): the written table, with columns 'Id' and 'log_bike_count'
    '''
    y_pred = model.predict(X_final_test)

    # Echo the raw predictions as a quick visual sanity check
    print(y_pred)

    results = pd.DataFrame(
        dict(
            Id=X_final_test.index,
            log_bike_count=y_pred,
        )
    )

    # index=False: the row identifier is already stored in the 'Id' column
    results.to_csv(output_path, index=False)

    return results

In [120]:
# Predict on the final test features with the refitted pipeline and write submission.csv;
# `a` holds the returned table of ids and predictions
a = submission_kaggle(pipe, X_final_test)

[1.2623069 1.0714648 1.2098156 ... 3.3640792 2.3899894 2.1868339]


In [121]:
# Display the submission table returned by submission_kaggle
a

Unnamed: 0,Id,log_bike_count
0,17081,1.262307
1,18655,1.071465
2,3124,1.209816
3,4147,1.049714
4,48210,2.503058
...,...,...
51435,42131,3.730787
51436,43042,3.189433
51437,43929,3.364079
51438,5707,2.389989
