# T# Training model

In [6]:
## To get some insights
# https://github.com/ceptln/paris-bike-traffic-prediction/tree/main


from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

import problem
from submissions.external_data.estimator import _encode_dates, _merge_external_data

## Read Data

In [7]:
# Read data
X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()

##  Preprocessing

In [None]:
    # Adding cosinus and sinus features from date variables to enhance the date periodicity
    X['cos_hour'] = np.cos(X['hour']*(2.*np.pi/24))
    X['sin_hour'] = np.sin(X['hour']*(2.*np.pi/24))
    X['cos_day'] = np.cos(X['day']*(2.*np.pi/30))
    X['sin_day'] = np.sin(X['day']*(2.*np.pi/30))
    X['cos_month'] = np.cos(X['month']*(2.*np.pi/12))
    X['sin_month'] = np.sin(X['month']*(2.*np.pi/12))
    X['cos_weekday'] = np.cos(X['weekday']*(2.*np.pi/7))
    X['sin_weekday'] = np.sin(X['weekday']*(2.*np.pi/7))

In [21]:
def preprocessing(X_train):
    
    date_encoder = FunctionTransformer(_encode_dates)
    date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

    categorical_encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_cols = ["counter_name", "site_name"]

    preprocessor = ColumnTransformer(
        [
            ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
            ("cat", categorical_encoder, categorical_cols),
        ]
    )
    return preprocessor, date_encoder

In [22]:
# Get preprocessor
preprocessor, date_encoder = preprocessing(X_train)

## Training pipe

In [39]:
regressor = Ridge()

pipe = make_pipeline(date_encoder, preprocessor, regressor)
#pipe.fit(X_train, y_train)

# Training and Testing

In [40]:
def get_RMSE_local(pipe, X_train, y_train, X_test, y_test):
    
    n_folds = 5

    # Perform cross-validation and compute the scores
    cv_scores_train = cross_val_score(pipe, X_train, y_train, cv=n_folds, scoring='neg_mean_squared_error')
    cv_scores_test = cross_val_score(pipe, X_test, y_test, cv=n_folds, scoring='neg_mean_squared_error')

    # Convert the scores to root mean squared error
    rmse_scores_train = np.sqrt(-cv_scores_train)
    rmse_scores_test = np.sqrt(-cv_scores_test)
    
    print(
        f"Train set, RMSE={np.mean(rmse_scores_train):.2f}"
    )
    print(
        f"Test set, RMSE={np.mean(rmse_scores_test):.2f}"
    ) 

In [41]:
# Predict data and get RMSE
get_RMSE_local(pipe, X_train, y_train, X_test, y_test)

Train set, RMSE=0.93
Test set, RMSE=0.57
