In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, ParameterGrid
from sklearn.metrics import r2_score, accuracy_score, mean_absolute_error, f1_score, ConfusionMatrixDisplay, RocCurveDisplay, classification_report
from sklearn.ensemble import RandomForestClassifier


import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

# Dataset

In [8]:
# Load the raw data from "IDF_364.csv" (relative path, resolved against the working directory).
dataset = pd.read_csv("IDF_364.csv")

In [9]:
# Quick overview of the loaded frame: size, first rows, summary statistics
# and the percentage of missing values per column.
print("Number of rows : {}".format(len(dataset)))
print("Number of columns : {}".format(len(dataset.columns)))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
# include='all' also summarises the non-numeric columns
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
# (number of nulls * 100) / number of rows, computed column by column
display(dataset.isna().sum().mul(100).div(len(dataset)))

Number of rows : 3294
Number of columns : 43

Display of dataset: 


Unnamed: 0.1,Unnamed: 0,Date,Code INSEE région,Consommation (MW),Thermique (MW),Nucléaire (MW),Eolien (MW),Solaire (MW),Hydraulique (MW),Bioénergies (MW),...,lag_9_IDF,lag_10_IDF,lag_11_IDF,lag_12_IDF,lag_13_IDF,lag_14_IDF,lag_15_IDF,rolling_mean_7_IDF,rolling_mean_15_IDF,lag_364_IDF
0,4368,2013-12-31,11,476296.0,13005.0,0.0,395.0,71.0,48.0,6514.0,...,446277.0,477600.0,510076.0,500238.0,508977.0,524368.0,525853.0,446431.142857,467695.0,399392.0
1,4380,2014-01-01,11,424366.0,12694.0,0.0,655.0,41.0,48.0,7465.0,...,471877.0,446277.0,477600.0,510076.0,500238.0,508977.0,524368.0,450683.571429,461028.2,492157.0
2,4392,2014-01-02,11,455574.0,12185.0,0.0,598.0,155.0,99.0,7397.0,...,450994.0,471877.0,446277.0,477600.0,510076.0,500238.0,508977.0,450867.571429,457468.0,487111.0
3,4404,2014-01-03,11,450735.0,12003.0,0.0,818.0,124.0,123.0,7571.0,...,394599.0,450994.0,471877.0,446277.0,477600.0,510076.0,500238.0,449652.0,454167.8,470053.0
4,4416,2014-01-04,11,425674.0,12041.0,0.0,475.0,131.0,96.0,7528.0,...,454286.0,394599.0,450994.0,471877.0,446277.0,477600.0,510076.0,450649.571429,448541.0,433732.0



Basics statistics: 


Unnamed: 0.1,Unnamed: 0,Date,Code INSEE région,Consommation (MW),Thermique (MW),Nucléaire (MW),Eolien (MW),Solaire (MW),Hydraulique (MW),Bioénergies (MW),...,lag_9_IDF,lag_10_IDF,lag_11_IDF,lag_12_IDF,lag_13_IDF,lag_14_IDF,lag_15_IDF,rolling_mean_7_IDF,rolling_mean_15_IDF,lag_364_IDF
count,3294.0,3294,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,...,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0
unique,,3294,,,,,,,,,...,,,,,,,,,,
top,,2013-12-31,,,,,,,,,...,,,,,,,,,,
freq,,1,,,,,,,,,...,,,,,,,,,,
mean,24124.010018,,11.0,380859.180024,11305.780358,0.0,823.41014,592.886005,292.537037,6731.604584,...,381100.229356,381125.574985,381156.929417,381223.190953,381279.034153,381332.339253,381379.17881,380964.682149,381073.9703,386901.053127
std,11409.12124,,0.0,87930.132963,13729.576386,0.0,928.075327,469.444345,146.50611,981.745845,...,87926.834149,87942.617598,87970.179497,87977.649839,88000.347222,88033.829405,88069.668771,83459.292017,81630.494676,89925.731779
min,4368.0,,11.0,23480.0,-816.0,0.0,0.0,0.0,0.0,3804.0,...,23480.0,23480.0,23480.0,23480.0,23480.0,23480.0,23480.0,249972.142857,252030.733333,226369.0
25%,14247.0,,11.0,311287.375,0.0,0.0,190.0,232.25,209.0,6104.25,...,311337.875,311337.875,311337.875,311418.0,311418.0,311418.0,311418.0,313190.357143,313095.9,316036.5
50%,24126.0,,11.0,358579.0,5070.75,0.0,475.0,507.0,291.0,6728.5,...,358747.5,358747.5,358747.5,358834.0,358908.0,358983.0,358983.0,356108.142857,357189.666667,363916.5
75%,34005.0,,11.0,447201.75,19692.25,0.0,1073.75,805.0,402.0,7314.75,...,447884.0,447973.0,448024.75,448070.75,448115.0,448196.5,448264.25,449925.607143,453995.166667,454261.0



Percentage of missing values: 


Unnamed: 0              0.000000
Date                    0.000000
Code INSEE région       0.000000
Consommation (MW)       0.000000
Thermique (MW)          0.000000
Nucléaire (MW)          0.000000
Eolien (MW)             0.000000
Solaire (MW)            0.000000
Hydraulique (MW)        0.000000
Bioénergies (MW)        0.000000
Ech. physiques (MW)     0.000000
Stockage batterie       0.000000
year                    0.000000
month                   0.000000
brent_price            30.449302
TIME_PERIOD            11.262902
prix_kwh_elec          11.262902
temp_max                0.091075
temp_min                0.091075
hours_of_sun            0.121433
precipitation           0.121433
windspeed               0.091075
prix_gaz                0.182149
day                     0.000000
day_of_week             0.000000
lag_1_IDF               0.000000
lag_2_IDF               0.000000
lag_3_IDF               0.000000
lag_4_IDF               0.000000
lag_5_IDF               0.000000
lag_6_IDF 

In [10]:
# Show the column labels of the loaded dataset.
dataset.columns

Index(['Unnamed: 0', 'Date', 'Code INSEE région', 'Consommation (MW)',
       'Thermique (MW)', 'Nucléaire (MW)', 'Eolien (MW)', 'Solaire (MW)',
       'Hydraulique (MW)', 'Bioénergies (MW)', 'Ech. physiques (MW)',
       'Stockage batterie', 'year', 'month', 'brent_price', 'TIME_PERIOD',
       'prix_kwh_elec', 'temp_max', 'temp_min', 'hours_of_sun',
       'precipitation', 'windspeed', 'prix_gaz', 'day', 'day_of_week',
       'lag_1_IDF', 'lag_2_IDF', 'lag_3_IDF', 'lag_4_IDF', 'lag_5_IDF',
       'lag_6_IDF', 'lag_7_IDF', 'lag_8_IDF', 'lag_9_IDF', 'lag_10_IDF',
       'lag_11_IDF', 'lag_12_IDF', 'lag_13_IDF', 'lag_14_IDF', 'lag_15_IDF',
       'rolling_mean_7_IDF', 'rolling_mean_15_IDF', 'lag_364_IDF'],
      dtype='object')

# Train test split

In [11]:
# Chronological hold-out split: rows dated on or before `split_date` form the
# training set, strictly later rows form the test set.
print("Dividing into train and test sets...")

split_date = '2021-01-01'
# NOTE(review): 'Date' is compared against a string; presumably the column holds
# ISO "YYYY-MM-DD" strings so the ordering is chronological — confirm.
train = dataset[dataset['Date'].le(split_date)].copy()
test = dataset[dataset['Date'].gt(split_date)].copy()

Dividing into train and test sets...


In [12]:
# Declare which columns are model inputs and which column is the target.
print("Separating labels from features...")
features_list = (
    # lags 1 to 15, followed by the 364-step lag
    ["lag_{}_IDF".format(k) for k in range(1, 16)]
    + ["lag_364_IDF"]
    # rolling means
    + ["rolling_mean_7_IDF", "rolling_mean_15_IDF"]
    # weather columns
    + ["temp_max", "temp_min", "hours_of_sun", "precipitation", "windspeed"]
    # price columns
    + ["prix_kwh_elec", "prix_gaz", "brent_price"]
    # calendar columns
    + ["day", "year", "month", "day_of_week"]
)
# Kept as a one-element list, so selecting with it yields a DataFrame rather than a Series.
target_variable = ["Consommation (MW)"]

# Feature matrices keep every column named in `features_list`.
X_train, X_test = train[features_list], test[features_list]

# `target_variable` is a list, so the targets are one-column DataFrames.
y_train, y_test = train[target_variable], test[target_variable]

Separating labels from features...


# Preprocessing

In [13]:
# Column groups declared by hand: each group gets its own preprocessing pipeline.
numeric_features = (
    # lags 1 to 15, followed by the 364-step lag
    ["lag_{}_IDF".format(k) for k in range(1, 16)]
    + ["lag_364_IDF"]
    + ["rolling_mean_7_IDF", "rolling_mean_15_IDF"]
    + ["temp_max", "temp_min", "hours_of_sun", "precipitation", "windspeed"]
    + ["prix_kwh_elec", "prix_gaz", "brent_price"]
    + ["day", "year", "month"]
)
# The only column treated as categorical.
categorical_features = ["day_of_week"]

In [14]:
# Numeric columns: fill missing values from the single nearest neighbour,
# then standardise each column.
# NOTE(review): imputation runs before scaling, so neighbour distances are
# computed on the unscaled values — confirm this is intended.
numeric_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=1)),
    ('scaler', StandardScaler()),
])

In [15]:
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (dropping the first level to avoid a redundant column).
#
# A KNNImputer is not suitable here: it is a numeric imputer, and when the only
# column of a row is missing it has no neighbour distance to rely on and falls
# back to the column mean. That mean is generally not an existing category, so
# the one-hot encoder would meet an unknown level at transform time.
# The most-frequent strategy always yields a value that was seen during fit.
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
    ])

In [16]:
# Route each column group to its dedicated pipeline; the ColumnTransformer
# concatenates the results ('num' block first, then 'cat').
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [17]:
# Fit the preprocessor on the training features only, then apply the fitted
# transformation to the test features.
#
# The raw feature frames are re-selected from `train` / `test` instead of being
# read back from `X_train` / `X_test`: this cell rebinds those two names to the
# transformed output, and the ColumnTransformer selects columns by name (which
# needs a DataFrame), so reading them back would make a second run of this
# cell fail. With this selection the cell can be re-executed safely.
X_train_raw = train.loc[:, features_list]
X_test_raw = test.loc[:, features_list]

# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train_raw.head())
X_train = preprocessor.fit_transform(X_train_raw)
print('...Done.')
print(X_train[0:5])  # positional slicing: X_train is no longer a pandas DataFrame
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test_raw.head())
X_test = preprocessor.transform(X_test_raw)  # transform only: never fit on the test set
print('...Done.')
print(X_test[0:5, :])  # positional slicing: X_test is no longer a pandas DataFrame
print()

Performing preprocessings on train set...
   lag_1_IDF  lag_2_IDF  lag_3_IDF  lag_4_IDF  lag_5_IDF  lag_6_IDF  \
0   494874.0   427028.0   418691.0   459244.0   454286.0   394599.0   
1   476296.0   494874.0   427028.0   418691.0   459244.0   454286.0   
2   424366.0   476296.0   494874.0   427028.0   418691.0   459244.0   
3   455574.0   424366.0   476296.0   494874.0   427028.0   418691.0   
4   450735.0   455574.0   424366.0   476296.0   494874.0   427028.0   

   lag_7_IDF  lag_8_IDF  lag_9_IDF  lag_10_IDF  ...  hours_of_sun  \
0   450994.0   471877.0   446277.0    477600.0  ...       2.21000   
1   394599.0   450994.0   471877.0    446277.0  ...       1.42500   
2   454286.0   394599.0   450994.0    471877.0  ...       3.92000   
3   459244.0   454286.0   394599.0    450994.0  ...       3.53250   
4   418691.0   459244.0   454286.0    394599.0  ...       2.71375   

   precipitation  windspeed  prix_kwh_elec   prix_gaz  brent_price  day  year  \
0         2.5750    21.7250        

# Model

In [18]:
# Linear regressor trained by stochastic gradient descent; the fixed
# random_state makes repeated fits reproducible.
sgdr = SGDRegressor(random_state=0)

In [19]:
# Manual search over the SGDRegressor hyper-parameters.
# 'alpha' is not part of the grid, so the estimator keeps its current value.
param_grid = {
    'penalty': ['l1'],
    'max_iter': [5000],
}

# Flatten the one-column target frames once instead of on every fit/score call.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

# `score` returns R², which can be negative. Starting from -inf (instead of 0)
# guarantees that the first candidate is always recorded, so `best_parameters`
# is bound even when every candidate scores <= 0.
best_train_score = float("-inf")
best_test_score = float("-inf")
best_parameters = None

# NOTE(review): candidates are ranked on the test score, so the reported
# "best test score" is optimistic as soon as the grid holds more than one
# combination. Consider ranking on a validation split carved out of the
# training period instead.
for g in ParameterGrid(param_grid):
    sgdr.set_params(**g)
    sgdr.fit(X_train, y_train_flat)
    train_score = sgdr.score(X_train, y_train_flat)
    test_score = sgdr.score(X_test, y_test_flat)
    # if we got a better score, store the score and parameters
    if test_score > best_test_score:
        best_train_score = train_score
        best_test_score = test_score
        best_parameters = g

print("Best train score: {}".format(best_train_score))
print("Best test score: {}".format(best_test_score))
print("Best parameters: {}".format(best_parameters))

Best train score: 0.9957172804321053
Best test score: 0.9851311371267116
Best parameters: {'max_iter': 5000, 'penalty': 'l1'}
