In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import r2_score, accuracy_score, mean_absolute_error, f1_score, ConfusionMatrixDisplay, RocCurveDisplay, classification_report
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

In [2]:
dataset = pd.read_csv("IDF_364.csv")

In [3]:
print("Number of rows : {}".format(dataset.shape[0]))
print("Number of columns : {}".format(dataset.shape[1]))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*dataset.isnull().sum()/dataset.shape[0])

Number of rows : 3294
Number of columns : 43

Display of dataset: 


Unnamed: 0.1,Unnamed: 0,Date,Code INSEE région,Consommation (MW),Thermique (MW),Nucléaire (MW),Eolien (MW),Solaire (MW),Hydraulique (MW),Bioénergies (MW),...,lag_9_IDF,lag_10_IDF,lag_11_IDF,lag_12_IDF,lag_13_IDF,lag_14_IDF,lag_15_IDF,rolling_mean_7_IDF,rolling_mean_15_IDF,lag_364_IDF
0,4368,2013-12-31,11,476296.0,13005.0,0.0,395.0,71.0,48.0,6514.0,...,446277.0,477600.0,510076.0,500238.0,508977.0,524368.0,525853.0,446431.142857,467695.0,399392.0
1,4380,2014-01-01,11,424366.0,12694.0,0.0,655.0,41.0,48.0,7465.0,...,471877.0,446277.0,477600.0,510076.0,500238.0,508977.0,524368.0,450683.571429,461028.2,492157.0
2,4392,2014-01-02,11,455574.0,12185.0,0.0,598.0,155.0,99.0,7397.0,...,450994.0,471877.0,446277.0,477600.0,510076.0,500238.0,508977.0,450867.571429,457468.0,487111.0
3,4404,2014-01-03,11,450735.0,12003.0,0.0,818.0,124.0,123.0,7571.0,...,394599.0,450994.0,471877.0,446277.0,477600.0,510076.0,500238.0,449652.0,454167.8,470053.0
4,4416,2014-01-04,11,425674.0,12041.0,0.0,475.0,131.0,96.0,7528.0,...,454286.0,394599.0,450994.0,471877.0,446277.0,477600.0,510076.0,450649.571429,448541.0,433732.0



Basics statistics: 


Unnamed: 0.1,Unnamed: 0,Date,Code INSEE région,Consommation (MW),Thermique (MW),Nucléaire (MW),Eolien (MW),Solaire (MW),Hydraulique (MW),Bioénergies (MW),...,lag_9_IDF,lag_10_IDF,lag_11_IDF,lag_12_IDF,lag_13_IDF,lag_14_IDF,lag_15_IDF,rolling_mean_7_IDF,rolling_mean_15_IDF,lag_364_IDF
count,3294.0,3294,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,...,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0,3294.0
unique,,3294,,,,,,,,,...,,,,,,,,,,
top,,2013-12-31,,,,,,,,,...,,,,,,,,,,
freq,,1,,,,,,,,,...,,,,,,,,,,
mean,24124.010018,,11.0,380859.180024,11305.780358,0.0,823.41014,592.886005,292.537037,6731.604584,...,381100.229356,381125.574985,381156.929417,381223.190953,381279.034153,381332.339253,381379.17881,380964.682149,381073.9703,386901.053127
std,11409.12124,,0.0,87930.132963,13729.576386,0.0,928.075327,469.444345,146.50611,981.745845,...,87926.834149,87942.617598,87970.179497,87977.649839,88000.347222,88033.829405,88069.668771,83459.292017,81630.494676,89925.731779
min,4368.0,,11.0,23480.0,-816.0,0.0,0.0,0.0,0.0,3804.0,...,23480.0,23480.0,23480.0,23480.0,23480.0,23480.0,23480.0,249972.142857,252030.733333,226369.0
25%,14247.0,,11.0,311287.375,0.0,0.0,190.0,232.25,209.0,6104.25,...,311337.875,311337.875,311337.875,311418.0,311418.0,311418.0,311418.0,313190.357143,313095.9,316036.5
50%,24126.0,,11.0,358579.0,5070.75,0.0,475.0,507.0,291.0,6728.5,...,358747.5,358747.5,358747.5,358834.0,358908.0,358983.0,358983.0,356108.142857,357189.666667,363916.5
75%,34005.0,,11.0,447201.75,19692.25,0.0,1073.75,805.0,402.0,7314.75,...,447884.0,447973.0,448024.75,448070.75,448115.0,448196.5,448264.25,449925.607143,453995.166667,454261.0



Percentage of missing values: 


Unnamed: 0              0.000000
Date                    0.000000
Code INSEE région       0.000000
Consommation (MW)       0.000000
Thermique (MW)          0.000000
Nucléaire (MW)          0.000000
Eolien (MW)             0.000000
Solaire (MW)            0.000000
Hydraulique (MW)        0.000000
Bioénergies (MW)        0.000000
Ech. physiques (MW)     0.000000
Stockage batterie       0.000000
year                    0.000000
month                   0.000000
brent_price            30.449302
TIME_PERIOD            11.262902
prix_kwh_elec          11.262902
temp_max                0.091075
temp_min                0.091075
hours_of_sun            0.121433
precipitation           0.121433
windspeed               0.091075
prix_gaz                0.182149
day                     0.000000
day_of_week             0.000000
lag_1_IDF               0.000000
lag_2_IDF               0.000000
lag_3_IDF               0.000000
lag_4_IDF               0.000000
lag_5_IDF               0.000000
lag_6_IDF 

In [4]:
dataset.columns

Index(['Unnamed: 0', 'Date', 'Code INSEE région', 'Consommation (MW)',
       'Thermique (MW)', 'Nucléaire (MW)', 'Eolien (MW)', 'Solaire (MW)',
       'Hydraulique (MW)', 'Bioénergies (MW)', 'Ech. physiques (MW)',
       'Stockage batterie', 'year', 'month', 'brent_price', 'TIME_PERIOD',
       'prix_kwh_elec', 'temp_max', 'temp_min', 'hours_of_sun',
       'precipitation', 'windspeed', 'prix_gaz', 'day', 'day_of_week',
       'lag_1_IDF', 'lag_2_IDF', 'lag_3_IDF', 'lag_4_IDF', 'lag_5_IDF',
       'lag_6_IDF', 'lag_7_IDF', 'lag_8_IDF', 'lag_9_IDF', 'lag_10_IDF',
       'lag_11_IDF', 'lag_12_IDF', 'lag_13_IDF', 'lag_14_IDF', 'lag_15_IDF',
       'rolling_mean_7_IDF', 'rolling_mean_15_IDF', 'lag_364_IDF'],
      dtype='object')

In [5]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")

split_date = '2021-01-01'
train = dataset.loc[dataset['Date'] <= split_date].copy()
test = dataset.loc[dataset['Date'] > split_date].copy()

# Y_train = dataset.loc[dataset['Date'] <= split_date].copy()
# Y_test = dataset.loc[dataset['Date'] > split_date].copy()

Dividing into train and test sets...


In [6]:
# Separate target variable Y from features X
print("Separating labels from features...")
features_list = ["day_of_week", "temp_max","temp_min","hours_of_sun", "precipitation", "windspeed","prix_kwh_elec","lag_1_IDF", "lag_2_IDF",
       "lag_3_IDF", "lag_4_IDF", "lag_5_IDF", "lag_6_IDF", "lag_7_IDF",
       "lag_8_IDF", "lag_9_IDF", "lag_10_IDF", "lag_11_IDF", "lag_12_IDF","lag_364_IDF",
       "lag_13_IDF", "lag_14_IDF", "lag_15_IDF", "rolling_mean_7_IDF",
       "rolling_mean_15_IDF"]
target_variable = ["Consommation (MW)"]

X_train = train.loc[:,features_list]
X_test = test.loc[:,features_list]

Y_train = train.loc[:,target_variable]
Y_test = test.loc[:,target_variable]

Separating labels from features...


In [7]:
# Soit à la main : 
numeric_features = ["temp_max","temp_min", "hours_of_sun", "precipitation", "windspeed","prix_kwh_elec","lag_1_IDF", "lag_2_IDF",
       "lag_3_IDF", "lag_4_IDF", "lag_5_IDF", "lag_6_IDF", "lag_7_IDF",
       "lag_8_IDF", "lag_9_IDF", "lag_10_IDF", "lag_11_IDF", "lag_12_IDF","lag_12_IDF",
       "lag_13_IDF", "lag_14_IDF", "lag_15_IDF", "rolling_mean_7_IDF",
       "rolling_mean_15_IDF"]
categorical_features = ["day_of_week"]

In [8]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=1)), # missing values will be replaced by columns' mean or median
    ('scaler', StandardScaler()) # inutile pour les decision trees, le modèle gèrera qd même les coefs des features
                                # Mais TJR le mettre, au pire il ne sert à rien, au mieux ça améliore les perfs (pas juste une question d'échelle, 
                                # certains modèles en ont vmt besoin)
])

In [9]:
# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('imputer', KNNImputer(n_neighbors=1)), # missing values will be replaced by columns' most frequent value
    ('encoder', OneHotEncoder(drop="first"))
    ])

In [10]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [11]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()




# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !!
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()

Performing preprocessings on train set...
   day_of_week  temp_max  temp_min  hours_of_sun  precipitation  windspeed  \
0            1    8.4875    5.4250       2.21000         2.5750    21.7250   
1            2   10.1625    4.6250       1.42500         2.7250    33.9375   
2            3   10.1750    7.8500       3.92000         1.5875    32.8250   
3            4   12.3750    7.8625       3.53250         1.9500    32.3625   
4            5   10.8125    6.5125       2.71375         1.1875    22.1250   

   prix_kwh_elec  lag_1_IDF  lag_2_IDF  lag_3_IDF  ...  lag_9_IDF  lag_10_IDF  \
0         0.1524   494874.0   427028.0   418691.0  ...   446277.0    477600.0   
1         0.1585   476296.0   494874.0   427028.0  ...   471877.0    446277.0   
2         0.1585   424366.0   476296.0   494874.0  ...   450994.0    471877.0   
3         0.1585   455574.0   424366.0   476296.0  ...   394599.0    450994.0   
4         0.1585   450735.0   455574.0   424366.0  ...   454286.0    394599.0   

  

In [12]:
# Train model
print("Train model...")
regressor = Lasso(alpha=86)
regressor.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


  model = cd_fast.enet_coordinate_descent(


In [13]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[480131.40626222 447374.73966057 448631.45869791 ... 475325.56167512
 468640.83424491 466676.57343439]



In [14]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[462326.76149617 472741.11597945 520793.63061015 544929.15863773
 545039.10548715 541063.3114182  545590.69329358 523723.19992398
 508935.66486348 556257.70609549 543146.4802049  507979.35449328
 487389.4339846  507175.41384341 521098.02459129 494303.82252294
 515216.56155565 519633.38914485 494700.45498219 473185.35877747
 476988.37196501 461529.05912873 479871.070206   538151.64100089
 549980.01745063 543275.31452044 489051.71754975 456397.8727157
 425578.07243564 408822.57020825 448479.91042975 439190.60635076
 418983.5364777  431415.35836407 442443.42514616 414777.46269684
 428976.77775728 519726.97570465 579098.65198802 597093.45043398
 600240.22007122 600192.59019637 571750.79689747 560222.11110993
 565119.85567598 508436.9557772  465742.70893217 454276.30106744
 437484.92907025 376946.9793306  347119.9221492  385773.01621042
 379146.83551964 365115.73254719 365051.24786386 370489.80893053
 365131.97803218 377733.91792745 420143.62904873 417469

In [15]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9925188783248788
R2 score on test set :  0.9850309515565837
