In [60]:
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_squared_error

## Data processing

### Load data

In [2]:
# Read the French load series ('FR') together with its timestamp column.
load_raw = pd.read_csv("./data_input/Load.csv", sep=';', usecols=['FR', 'DateTime'])
# Parse the timestamps and snap them to the nearest full hour.
# The lowercase 'h' alias matches the resample('h') calls used for the
# temperature data; the uppercase 'H' alias is deprecated in recent pandas.
load_timestamps = pd.to_datetime(
    load_raw['DateTime'], format='%d/%m/%Y %H:%M:%S.%f'
).dt.round('h')
# Keep only the load values, indexed by the rounded timestamps.
Load = load_raw.drop(columns=['DateTime'])
Load.index = load_timestamps
Load.columns = ['Load']

### Features

In [3]:
# Temperature: build a DatetimeIndex from the Year/Month/Day columns, then
# drop those columns so only the remaining (temperature) columns are kept.
TA = pd.read_csv("./data_input/TA_tab.csv")
TA.index = pd.to_datetime(TA[['Year','Month','Day']])
TA = TA.drop(columns=['Year','Month','Day'])

# Public holidays: 0/1 flag ('Férié') on the same index as the temperatures.
Raw_JF = pd.read_csv('./data_input/jours_feries_metropole.csv')
Raw_JF.index = pd.to_datetime(Raw_JF['date'])
holiday_dates = Raw_JF.index[(Raw_JF.index >= '2014') & (Raw_JF.index < '2020-01-01')]
JF = TA[[]].copy()  # no columns, same index as TA
JF['Férié'] = 0
# Single .loc assignment with a boolean mask instead of chained indexing:
# the chained form only works when the intermediate Series is a view of JF
# (it is a silent no-op under pandas Copy-on-Write) and raises KeyError when
# a holiday date is absent from the temperature index.
JF.loc[JF.index.isin(holiday_dates), 'Férié'] = 1

# Resample from day to hours (each row is forward-filled over the following hours)
TA = TA.resample('h').ffill()
JF = JF.resample('h').ffill()

The ten temperature series (columns of `TA`) most negatively correlated with the French load from 2015 through 2019.

In [4]:
# Restrict both frames to the same window, put them side by side and compute
# the pairwise correlation matrix of all columns.
corr_window = 'index>="2015" & index<"2020"'
load_with_temperature = pd.concat(
    [Load.query(corr_window), TA.query(corr_window)],
    axis=1,
)
correlations = load_with_temperature.corr()
# Show the ten columns with the lowest correlation to 'Load'.
correlations[['Load']].nsmallest(10, columns='Load')

Unnamed: 0,Load
ITC4,-0.789307
ITH2,-0.789079
ITH1,-0.787501
ITH4,-0.787022
DE27,-0.786647
ITC1,-0.786626
AT21,-0.786171
CH05,-0.786077
ITH3,-0.786007
CH07,-0.785964


In [5]:
from utils import temperature_pca, create_time_features

Feature creation

In [10]:
# Features initialisation
Features = pd.DataFrame(index=Load.index)
# Time features
# Features = Features.merge(
#     create_time_features(Load),
#     how='left',
#     left_index=True,
#     right_index=True
#     )
# Temperature PCA on all location features
Features = Features.merge(
    temperature_pca(TA,0.95,'all'),
    how='left',
    left_index=True,
    right_index=True
    )
# Temperature PCA on France features
Features = Features.merge(
    temperature_pca(TA,0.95,'FR'),
    how='left',
    left_index=True,
    right_index=True
    )
# Public Holiday features
Features = Features.merge(
    JF,
    how='left',
    left_index=True,
    right_index=True
    )

## Train & Test sets

In [47]:
def _to_prophet_frame(features, target):
    """Join features and target column-wise in the layout passed to Prophet.

    The 'Load' column is renamed to 'y' and the index is copied into a
    'ds' column.
    """
    frame = pd.concat([features, target], axis=1).rename(columns={'Load': 'y'})
    frame['ds'] = frame.index
    return frame


# Training period: 2015 to 2018 inclusive; test period: 2019.
X_train = Features.query('index >= "2015" & index < "2019"')
y_train = Load.query('index >= "2015" & index < "2019"')['Load']

X_test = Features.query('index >= "2019" & index < "2020"')
y_test = Load.query('index >= "2019" & index < "2020"')['Load']

train_set = _to_prophet_frame(X_train, y_train)
test_set = _to_prophet_frame(X_test, y_test)
# Features and load must be joined column-wise (axis=1), as for the train and
# test sets; a row-wise concat would stack the two frames and fill the
# non-shared columns with NaN.
all_set = _to_prophet_frame(
    Features.query('index >= "2015"'),
    Load.query('index >= "2015"'),
)

In [48]:
# Columns of train_set that are registered as additional regressors.
prophet_regressors = [
    'all_PCA_1',
    'all_PCA_2',
    'all_PCA_3',
    'all_PCA_4',
    'FR_PCA_1',
    'Férié',
]

model = Prophet()
for regressor_name in prophet_regressors:
    model.add_regressor(regressor_name)
# Fit on the training frame (expects the 'ds' and 'y' columns built earlier).
model.fit(train_set)

13:26:01 - cmdstanpy - INFO - Chain [1] start processing
13:26:51 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x26930c8a740>

In [49]:
# Predict over the test period; test_set carries the 'ds' column and the
# feature columns built for the test window.
forecast = model.predict(test_set)

In [65]:
# Merge the actual and predicted values into a single dataframe
df_all = pd.merge(test_set, forecast[['ds', 'yhat']], on='ds', how='outer')

# Make the plot
df_all.plot(x='ds', y=['y','yhat'], title=f"RMSE = {np.sqrt(mean_squared_error(df_all['y'],df_all['yhat']))}")