In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import r2_score, accuracy_score, mean_absolute_error, f1_score, ConfusionMatrixDisplay, RocCurveDisplay, classification_report
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

In [2]:
dataset = pd.read_csv("Centre_364.csv")

In [3]:
print("Number of rows : {}".format(dataset.shape[0]))
print("Number of columns : {}".format(dataset.shape[1]))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*dataset.isnull().sum()/dataset.shape[0])

Number of rows : 3294
Number of columns : 43

Display of dataset: 


Unnamed: 0.1,Unnamed: 0,Date,Code INSEE région,Consommation (MW),Thermique (MW),Nucléaire (MW),Eolien (MW),Solaire (MW),Hydraulique (MW),Bioénergies (MW),...,lag_9_Centre,lag_10_Centre,lag_11_Centre,lag_12_Centre,lag_13_Centre,lag_14_Centre,lag_15_Centre,rolling_mean_7_Centre,rolling_mean_15_Centre,lag_364_Centre
0,4369,2013-12-31,24,123702.0,3692.0,458112.0,18462.0,261.0,1417.0,1718.0,...,121410.0,132535.0,136381.0,130637.0,134732.0,144785.0,142761.0,118279.0,124541.933333,103218.0
1,4381,2014-01-01,24,111026.0,3587.0,432699.0,26090.0,184.0,912.0,2437.0,...,121604.0,121410.0,132535.0,136381.0,130637.0,134732.0,144785.0,119504.0,122291.333333,129493.0
2,4393,2014-01-02,24,118557.0,3624.0,426931.0,24354.0,513.0,1813.0,2452.0,...,118092.0,121604.0,121410.0,132535.0,136381.0,130637.0,134732.0,119392.285714,121213.0,128514.0
3,4405,2014-01-03,24,118939.0,3612.0,461726.0,29514.0,442.0,1241.0,2442.0,...,102451.0,118092.0,121604.0,121410.0,132535.0,136381.0,130637.0,118925.142857,120433.133333,123471.0
4,4417,2014-01-04,24,113776.0,3673.0,483816.0,22203.0,418.0,899.0,2425.0,...,119339.0,102451.0,118092.0,121604.0,121410.0,132535.0,136381.0,119325.428571,118926.133333,113936.0



Basics statistics: 


Unnamed: 0.1,Unnamed: 0,Date,Code INSEE région,Consommation (MW),Thermique (MW),Nucléaire (MW),Eolien (MW),Solaire (MW),Hydraulique (MW),Bioénergies (MW),...,lag_9_Centre,lag_10_Centre,lag_11_Centre,lag_12_Centre,lag_13_Centre,lag_14_Centre,lag_15_Centre,rolling_mean_7_Centre,rolling_mean_15_Centre,lag_364_Centre
count,3294.0,3294,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,...,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0
unique,,3294,,,,,,,,,...,,,,,,,,,,
top,,2013-12-31,,,,,,,,,...,,,,,,,,,,
freq,,1,,,,,,,,,...,,,,,,,,,,
mean,24125.010018,,24.0,102079.049636,1919.584092,396214.333637,12517.364602,2097.222526,592.917577,2632.889192,...,102143.547359,102152.262143,102159.500759,102177.372344,102193.767608,102209.917881,102225.280055,102106.553192,102136.857488,103210.080753
std,11409.12124,,0.0,24702.361361,2087.75204,88075.353596,10707.997696,1764.568592,712.85943,347.991379,...,24701.136916,24706.794872,24713.330377,24712.631632,24716.310189,24726.755287,24736.225797,23223.073475,22689.864076,25017.364847
min,4369.0,,24.0,7257.0,0.0,113551.0,168.0,0.0,0.0,1128.0,...,7257.0,7257.0,7257.0,7257.0,7257.0,7257.0,7257.0,66643.857143,71461.566667,61192.0
25%,14248.0,,24.0,83248.0,48.0,326685.375,4617.0,841.25,64.25,2395.25,...,83251.25,83251.25,83251.25,83252.5,83261.75,83261.75,83261.75,82242.0,82173.1,84052.25
50%,24127.0,,24.0,94815.0,599.5,391640.5,8992.0,1691.5,285.0,2632.5,...,94911.5,94911.5,94911.5,94935.0,94982.0,95022.5,95026.0,93900.714286,94362.433333,95701.0
75%,34006.0,,24.0,120924.5,3825.75,461317.5,17527.0,2721.75,840.25,2894.0,...,121018.75,121041.75,121076.0,121125.75,121148.0,121172.75,121195.75,121738.642857,122459.35,122298.25



Percentage of missing values: 


Unnamed: 0                 0.000000
Date                       0.000000
Code INSEE région          0.000000
Consommation (MW)          0.000000
Thermique (MW)             0.000000
Nucléaire (MW)             0.000000
Eolien (MW)                0.000000
Solaire (MW)               0.000000
Hydraulique (MW)           0.000000
Bioénergies (MW)           0.000000
Ech. physiques (MW)        0.000000
Stockage batterie          0.000000
year                       0.000000
month                      0.000000
brent_price               30.449302
TIME_PERIOD               11.262902
prix_kwh_elec             11.262902
temp_max                   0.091075
temp_min                   0.091075
hours_of_sun               0.121433
precipitation              0.121433
windspeed                  0.091075
prix_gaz                   0.182149
day                        0.000000
day_of_week                0.000000
lag_1_Centre               0.000000
lag_2_Centre               0.000000
lag_3_Centre               0

In [4]:
dataset.columns

Index(['Unnamed: 0', 'Date', 'Code INSEE région', 'Consommation (MW)',
       'Thermique (MW)', 'Nucléaire (MW)', 'Eolien (MW)', 'Solaire (MW)',
       'Hydraulique (MW)', 'Bioénergies (MW)', 'Ech. physiques (MW)',
       'Stockage batterie', 'year', 'month', 'brent_price', 'TIME_PERIOD',
       'prix_kwh_elec', 'temp_max', 'temp_min', 'hours_of_sun',
       'precipitation', 'windspeed', 'prix_gaz', 'day', 'day_of_week',
       'lag_1_Centre', 'lag_2_Centre', 'lag_3_Centre', 'lag_4_Centre',
       'lag_5_Centre', 'lag_6_Centre', 'lag_7_Centre', 'lag_8_Centre',
       'lag_9_Centre', 'lag_10_Centre', 'lag_11_Centre', 'lag_12_Centre',
       'lag_13_Centre', 'lag_14_Centre', 'lag_15_Centre',
       'rolling_mean_7_Centre', 'rolling_mean_15_Centre', 'lag_364_Centre'],
      dtype='object')

In [5]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")

split_date = '2021-01-01'
train = dataset.loc[dataset['Date'] <= split_date].copy()
test = dataset.loc[dataset['Date'] > split_date].copy()

Dividing into train and test sets...


In [6]:
# Separate target variable Y from features X
print("Separating labels from features...")
features_list = ["day_of_week", "temp_max","temp_min","hours_of_sun", "precipitation", "windspeed","prix_kwh_elec","lag_1_Centre", "lag_2_Centre",
       "lag_3_Centre", "lag_4_Centre", "lag_5_Centre", "lag_6_Centre", "lag_7_Centre",
       "lag_8_Centre", "lag_9_Centre", "lag_10_Centre", "lag_11_Centre", "lag_12_Centre","lag_364_Centre",
       "lag_13_Centre", "lag_14_Centre", "lag_15_Centre", "rolling_mean_7_Centre",
       "rolling_mean_15_Centre"]
target_variable = ["Consommation (MW)"]

X_train = train.loc[:,features_list]
X_test = test.loc[:,features_list]

Y_train = train.loc[:,target_variable]
Y_test = test.loc[:,target_variable]

Separating labels from features...


In [7]:
numeric_features = ["temp_max","temp_min", "hours_of_sun", "precipitation", "windspeed","prix_kwh_elec","lag_1_Centre", "lag_2_Centre",
       "lag_3_Centre", "lag_4_Centre", "lag_5_Centre", "lag_6_Centre", "lag_7_Centre",
       "lag_8_Centre", "lag_9_Centre", "lag_10_Centre", "lag_11_Centre", "lag_12_Centre","lag_12_Centre",
       "lag_13_Centre", "lag_14_Centre", "lag_15_Centre", "rolling_mean_7_Centre",
       "rolling_mean_15_Centre"]
categorical_features = ["day_of_week"]

In [8]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=1)),
    ('scaler', StandardScaler())
])

In [9]:
# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('imputer', KNNImputer(n_neighbors=1)),
    ('encoder', OneHotEncoder(drop="first"))
    ])

In [10]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [11]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5])
print()




# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[0:5,:])
print()

Performing preprocessings on train set...
   day_of_week   temp_max  temp_min  hours_of_sun  precipitation  windspeed  \
0            1   9.700000  5.933333      3.181667       2.166667  23.183333   
1            2  10.533333  4.983333      2.433333       3.716667  33.566667   
2            3  10.350000  7.833333      3.738333       2.566667  33.300000   
3            4  12.116667  7.316667      3.735000       1.683333  32.266667   
4            5  11.200000  5.933333      3.265000       2.050000  24.666667   

   prix_kwh_elec  lag_1_Centre  lag_2_Centre  lag_3_Centre  ...  lag_9_Centre  \
0         0.1524      133032.0      116246.0      110974.0  ...      121410.0   
1         0.1585      123702.0      133032.0      116246.0  ...      121604.0   
2         0.1585      111026.0      123702.0      133032.0  ...      118092.0   
3         0.1585      118557.0      111026.0      123702.0  ...      102451.0   
4         0.1585      118939.0      118557.0      111026.0  ...      119339.0 

In [12]:
# Train model
print("Train model...")
regressor = Lasso(alpha=86)
regressor.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


  model = cd_fast.enet_coordinate_descent(


In [13]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[125926.60780831 121150.4267409  119799.37120942 ... 136998.25184535
 136245.34072542 138598.68609915]



In [14]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[137406.41611494 136088.62556253 144190.96957984 150601.40326858
 151539.27418843 153227.9260176  157557.03408261 150508.60589612
 144410.62743432 156912.60700208 156909.77245697 141634.10033339
 135613.4919889  138407.67290084 138276.00519054 134874.6622298
 138303.20154796 140520.24472827 134938.98718037 138434.14939738
 144716.24419982 132460.07569465 133942.70892209 150890.70455827
 154367.63268564 153778.86749171 141082.9564093  128133.52318738
 117094.32240581 111831.27606141 123239.5808277  127130.00924137
 122990.69555255 122731.76920333 120740.96750305 112303.83379885
 117050.09411286 137396.87834097 147234.60366517 155765.86475384
 165071.07427407 172556.49276653 167321.52688681 161473.26157526
 161282.44681566 151833.6418994  144825.3651291  142848.24019676
 136534.67317365 116842.98437429 107330.53996865 110793.01353112
 112393.0970434  114968.55920719 112865.59327816 112240.4227971
 112154.44453178 113088.75845193 121083.87062184 121543.

In [15]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9756336881505661
R2 score on test set :  0.9674737280787808
