In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder, LabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

# Dataset

In [2]:
# Read the modelling table from the CSV file; the cells below work from `dataset`.
dataset = pd.read_csv(filepath_or_buffer="master_lag_ml_inversed.csv")

In [3]:
# Quick overview of the table: size, first rows, summary statistics, missing-value rates.
n_rows, n_cols = dataset.shape
print(f"Number of rows : {n_rows}")
print(f"Number of columns : {n_cols}")
print()

print("Display of dataset: ")
display(dataset.head())
print()

# include='all' makes describe() summarise the non-numeric columns as well
# (count / unique / top / freq) next to the numeric statistics.
print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

# Number of null cells per column, expressed as a percentage of the row count.
print("Percentage of missing values: ")
missing_pct = dataset.isnull().sum() * 100 / n_rows
display(missing_pct)

Number of rows : 39600
Number of columns : 37

Display of dataset: 


Unnamed: 0,Date,code_region,Nom_region,Consommation (MW),year,month,brent_price,TIME_PERIOD,prix_kwh_elec,prix_gaz,...,rolling_mean_7,rolling_mean_15,lag_364,lag_inversed_1,lag_inversed_2,lag_inversed_3,lag_inversed_4,lag_inversed_5,lag_inversed_6,lag_inversed_7
0,2013-12-31,11,IDF,476296.0,2013,12,109.95,2013-01-01,0.1524,33.412419,...,446431.142857,467695.0,399392.0,476296.0,424366.0,455574.0,450735.0,425674.0,431730.0,470656.0
1,2014-01-01,11,IDF,424366.0,2014,1,,2014-01-01,0.1585,29.812258,...,450683.571429,461028.2,492157.0,424366.0,455574.0,450735.0,425674.0,431730.0,470656.0,466931.0
2,2014-01-02,11,IDF,455574.0,2014,1,107.94,2014-01-01,0.1585,29.812258,...,450867.571429,457468.0,487111.0,455574.0,450735.0,425674.0,431730.0,470656.0,466931.0,460424.0
3,2014-01-03,11,IDF,450735.0,2014,1,106.57,2014-01-01,0.1585,29.812258,...,449652.0,454167.8,470053.0,450735.0,425674.0,431730.0,470656.0,466931.0,460424.0,469782.0
4,2014-01-04,11,IDF,425674.0,2014,1,,2014-01-01,0.1585,29.812258,...,450649.571429,448541.0,433732.0,425674.0,431730.0,470656.0,466931.0,460424.0,469782.0,494143.0



Basics statistics: 


Unnamed: 0,Date,code_region,Nom_region,Consommation (MW),year,month,brent_price,TIME_PERIOD,prix_kwh_elec,prix_gaz,...,rolling_mean_7,rolling_mean_15,lag_364,lag_inversed_1,lag_inversed_2,lag_inversed_3,lag_inversed_4,lag_inversed_5,lag_inversed_6,lag_inversed_7
count,39600,39600.0,39600,39600.0,39600.0,39600.0,27492.0,35076,35076.0,39456.0,...,39600.0,39600.0,39600.0,39600.0,39588.0,39576.0,39564.0,39552.0,39540.0,39528.0
unique,3300,,12,,,,,9,,,...,,,,,,,,,,
top,2013-12-31,,IDF,,,,,2016-01-01,,,...,,,,,,,,,,
freq,12,,3300,,,,,4392,,,...,,,,,,,,,,
mean,,49.916667,,213100.266604,2018.016667,6.504848,66.441414,,0.175182,30.924968,...,213106.935281,213135.844902,215521.009798,213100.266604,213087.861637,213083.435959,213074.56449,213064.497333,213058.100013,213052.096046
std,,25.640326,,100100.702575,2.595605,3.459196,22.53491,,0.011112,32.80123,...,98722.111908,98297.006053,101542.767422,100100.702575,100095.710054,100096.80924,100094.894062,100092.445838,100092.972802,100093.46317
min,,11.0,,6570.0,2013.0,1.0,9.12,,0.1524,4.514677,...,66643.857143,71461.566667,59592.0,6570.0,6570.0,6570.0,6570.0,6570.0,6570.0,6570.0
25%,,27.75,,131327.25,2016.0,4.0,49.39,,0.1676,14.470833,...,129712.642857,130001.566667,132879.75,131327.25,131321.0,131319.875,131315.375,131311.625,131310.375,131307.5
50%,,48.0,,195702.5,2018.0,7.0,62.87,,0.1704,19.9658,...,196034.071429,196081.666667,197143.5,195702.5,195696.5,195690.0,195686.75,195681.75,195674.5,195667.0
75%,,75.25,,274129.75,2020.0,10.0,77.51,,0.1893,24.505,...,276470.928571,277235.816667,277276.75,274129.75,274104.25,274104.25,274083.5,274083.5,274083.5,274083.5



Percentage of missing values: 


Date                  0.000000
code_region           0.000000
Nom_region            0.000000
Consommation (MW)     0.000000
year                  0.000000
month                 0.000000
brent_price          30.575758
TIME_PERIOD          11.424242
prix_kwh_elec        11.424242
prix_gaz              0.363636
day                   0.000000
day_of_week           0.000000
lag_1                 0.000000
lag_2                 0.000000
lag_3                 0.000000
lag_4                 0.000000
lag_5                 0.000000
lag_6                 0.000000
lag_7                 0.000000
lag_8                 0.000000
lag_9                 0.000000
lag_10                0.000000
lag_11                0.000000
lag_12                0.000000
lag_13                0.000000
lag_14                0.000000
lag_15                0.000000
rolling_mean_7        0.000000
rolling_mean_15       0.000000
lag_364               0.000000
lag_inversed_1        0.000000
lag_inversed_2        0.030303
lag_inve

In [4]:
# Show the column labels of `dataset` (the bare expression is rendered as the cell output).
dataset.columns

Index(['Date', 'code_region', 'Nom_region', 'Consommation (MW)', 'year',
       'month', 'brent_price', 'TIME_PERIOD', 'prix_kwh_elec', 'prix_gaz',
       'day', 'day_of_week', 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5',
       'lag_6', 'lag_7', 'lag_8', 'lag_9', 'lag_10', 'lag_11', 'lag_12',
       'lag_13', 'lag_14', 'lag_15', 'rolling_mean_7', 'rolling_mean_15',
       'lag_364', 'lag_inversed_1', 'lag_inversed_2', 'lag_inversed_3',
       'lag_inversed_4', 'lag_inversed_5', 'lag_inversed_6', 'lag_inversed_7'],
      dtype='object')

# Features / Labels split

In [5]:
# Split the table into the model inputs (X) and the value to predict (y).
print("Separating labels from features...")

# Inputs: region name, lagged values, rolling means and calendar fields.
# Weather fields (temp_max, temp_min, hours_of_sun, precipitation, windspeed) and
# price fields (prix_kwh_elec, prix_gaz, brent_price) are currently not used as inputs.
features_list = [
    'Nom_region',
    'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5',
    'lag_6', 'lag_7', 'lag_8', 'lag_9', 'lag_10',
    'lag_11', 'lag_12', 'lag_13', 'lag_14', 'lag_15',
    'lag_364',
    'rolling_mean_7', 'rolling_mean_15',
    'day', 'year', 'month', 'day_of_week',
]

# Kept as a one-element list, so `y` below is a single-column DataFrame (not a Series).
target_variable = ['lag_inversed_1']

X = dataset[features_list]
y = dataset[target_variable]

Separating labels from features...


# Preprocessing

In [6]:
# Columns are assigned to each preprocessing branch by hand.
# Numeric branch: lag_1 ... lag_15, lag_364, both rolling means and the year.
# Weather and price columns (temp_max, temp_min, hours_of_sun, precipitation, windspeed,
# prix_kwh_elec, prix_gaz, brent_price) are currently left out.
numeric_features = [f"lag_{k}" for k in range(1, 16)] + [
    'lag_364',
    'rolling_mean_7',
    'rolling_mean_15',
    'year',
]

# Categorical branch: region name plus the calendar fields that are treated as categories.
categorical_features = ['Nom_region', 'day_of_week', 'day', 'month']

In [7]:
# Numeric branch of the preprocessing: impute first, then scale.
numeric_steps = [
    # Missing cells are filled using the single nearest neighbour (n_neighbors=1).
    ('imputer', KNNImputer(n_neighbors=1)),
    # Standardise each column with StandardScaler's default settings.
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [8]:
# Categorical branch of the preprocessing: one-hot encoding only.
# drop='first' removes the first category of each feature from the encoded output.
# The KNN imputer step is disabled for this branch.
categorical_steps = [
    ('encoder', OneHotEncoder(drop='first')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [9]:
# Bundle both branches into one preprocessor: each (name, transformer, columns) entry
# applies its transformer only to the listed columns.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [10]:
# Fit the preprocessor and transform the features.
# The input is re-selected from `dataset` instead of being read from `X`: this cell
# rebinds `X` to the transformed matrix, so feeding `X` back in on a second run would
# hand the ColumnTransformer an array without the named columns it selects by.
# Selecting from `dataset` gives the same result on the first run and keeps the cell
# safe to re-run.
# NOTE(review): no train/test split is visible above, so despite the message below the
# preprocessor is fitted on every row of `dataset`.
print("Performing preprocessings on train set...")
X = preprocessor.fit_transform(dataset.loc[:, features_list])
print('...Done.')

Performing preprocessings on train set...
...Done.


# Model

In [11]:
# Random forest regressor with a fixed seed; n_jobs=-1 uses all available cores.
rfr = RandomForestRegressor(n_jobs=-1, random_state=0)

# `y` is a single-column DataFrame; flatten it to a 1-D array for fitting.
rfr.fit(X, y.to_numpy().ravel())

# NOTE(review): predictions are made on the same X used for fitting, so any score
# computed from y_pred is an in-sample score.
y_pred = rfr.predict(X)