# Initial model: random forest to predict 9:30 from 9:00 data

In [1]:
%load_ext autoreload
%autoreload 2

import os
from pathlib import Path
import pandas as pd

In [2]:
input_str = './../../data'
run_str = '../..'

save_path = Path().resolve()
data_path = Path(input_str).resolve()

os.chdir(Path(run_str))

from model.parameters import stations_montrouge, stations_paris, relevant_stations
from model.parameters import week_ends, week_days, capacity_series

In [3]:
from data_preprocessing import get_train_data, get_test_data

In [4]:
X_train, y_train = get_train_data()

In [5]:
X_train

Unnamed: 0,date,2006,2008,2009,2111,2112,21205,21209,21212,21215,week_day,week-end,school_holiday,bank_holiday
0,2022-11-09,0.625000,0.500000,0.517241,0.800000,0.647059,0.730769,1.000000,0.500000,1.000000,2,False,False,False
1,2022-11-30,0.708333,0.571429,0.517241,0.866667,0.588235,0.846154,0.900000,0.863636,0.939394,2,False,False,False
2,2022-05-23,0.583333,0.785714,0.344828,0.766667,0.588235,0.846154,0.633333,0.545455,0.575758,0,False,False,False
3,2022-07-06,0.416667,0.392857,0.241379,0.733333,0.529412,0.461538,0.933333,0.545455,0.696970,2,False,False,False
4,2022-11-14,0.208333,0.142857,0.517241,0.466667,0.117647,0.923077,0.933333,1.000000,0.757576,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,2022-08-31,0.791667,0.535714,0.551724,0.933333,0.588235,0.807692,0.900000,0.136364,0.757576,2,False,True,False
104,2022-09-14,0.541667,0.678571,0.465517,0.800000,0.529412,0.846154,0.966667,0.590909,0.757576,2,False,False,False
105,2022-11-11,1.000000,0.892857,1.000000,0.933333,0.764706,0.615385,0.166667,0.909091,0.606061,4,False,False,True
106,2022-09-13,0.750000,0.571429,0.465517,0.933333,0.764706,0.884615,0.966667,0.454545,0.878788,1,False,False,False


In [6]:
y_train

Unnamed: 0,date,2006,2008,2009,2111,2112
0,2022-11-09,0.208333,0.214286,0.086207,0.466667,0.352941
1,2022-11-30,0.416667,0.357143,0.172414,0.600000,0.411765
2,2022-05-23,0.333333,0.500000,0.086207,0.533333,0.294118
3,2022-07-06,0.333333,0.178571,0.000000,0.733333,0.058824
4,2022-11-14,0.000000,0.107143,0.017241,0.200000,0.000000
...,...,...,...,...,...,...
103,2022-08-31,0.375000,0.357143,0.172414,0.766667,0.176471
104,2022-09-14,0.125000,0.535714,0.137931,0.666667,0.411765
105,2022-11-11,1.000000,0.892857,1.000000,0.900000,0.529412
106,2022-09-13,0.291667,0.321429,0.086207,0.633333,0.235294


Remarks:
* in the dataset, 136 lines compared to 191 dates in the "date data". Where does the difference come from?

# ML training!

In this section, we perform the ML training properly speaking!

In [None]:
X_train = pd.read_csv(save_path/"features_train.csv", index_col="date")
y_train = pd.read_csv(save_path/"target_train.csv", index_col="date")

In [None]:
X, X_cv, y, y_cv = train_test_split(X_train.values, y_train.values, test_size=0.33)

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
regr = RandomForestRegressor()

In [None]:
regr.fit(X, y)

In [None]:
regr.predict(X_cv)

array([[0.0775    , 0.15357143, 0.12017241, 0.276     , 0.06882353],
       [0.48375   , 0.49785714, 0.5112069 , 0.71266667, 0.50117647],
       [0.49958333, 0.355     , 0.20431034, 0.613     , 0.32705882],
       [0.4375    , 0.30571429, 0.12293103, 0.49033333, 0.27647059],
       [0.28791667, 0.37857143, 0.22482759, 0.33433333, 0.19588235],
       [0.37666667, 0.26714286, 0.12344828, 0.555     , 0.21117647],
       [0.38375   , 0.29714286, 0.20844828, 0.486     , 0.32176471],
       [0.37208333, 0.51714286, 0.42931034, 0.64233333, 0.44588235],
       [0.365     , 0.24678571, 0.17448276, 0.52833333, 0.28058824],
       [0.115     , 0.21571429, 0.10172414, 0.41333333, 0.11823529],
       [0.16708333, 0.20178571, 0.10586207, 0.33766667, 0.1       ],
       [0.2525    , 0.46      , 0.30568966, 0.48833333, 0.32352941],
       [0.3025    , 0.28892857, 0.15224138, 0.41      , 0.21882353],
       [0.295     , 0.49428571, 0.40586207, 0.576     , 0.51352941],
       [0.25708333, 0.33321429, 0.

In [None]:
y_cv

array([[0.08333333, 0.03571429, 0.05172414, 0.4       , 0.        ],
       [0.45833333, 0.5       , 0.46551724, 0.56666667, 0.47058824],
       [0.54166667, 0.46428571, 0.22413793, 0.76666667, 0.35294118],
       [0.70833333, 0.42857143, 0.15517241, 0.53333333, 0.29411765],
       [0.29166667, 0.39285714, 0.10344828, 0.23333333, 0.17647059],
       [0.25      , 0.25      , 0.13793103, 0.86666667, 0.        ],
       [0.20833333, 0.35714286, 0.25862069, 0.56666667, 0.23529412],
       [0.41666667, 0.5       , 0.24137931, 0.7       , 0.52941176],
       [0.20833333, 0.21428571, 0.0862069 , 0.46666667, 0.35294118],
       [0.125     , 0.07142857, 0.34482759, 0.4       , 0.23529412],
       [0.04166667, 0.10714286, 0.01724138, 0.5       , 0.05882353],
       [0.04166667, 0.53571429, 0.32758621, 0.4       , 0.11764706],
       [0.33333333, 0.25      , 0.13793103, 0.43333333, 0.23529412],
       [0.16666667, 0.39285714, 0.25862069, 0.76666667, 0.64705882],
       [0.25      , 0.46428571, 0.

In [None]:
regr.score(X_cv, y_cv)

0.47430029951473596

Saving values of regr.score(X_cv, y_cv)
* 2022-12-23
0.47430029951473596

* 

## Random forest take 2: one hot encoding of days

## AutoML approach

To run the AutoML system, we rely on an *ad hoc* virtual environnement created with conda. It can be activated with:
```
conda activate predi-veli-ml-venv
```
To run from it more easily, we create a separate python script.

The AutoML library that we use is only available from Linux