In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from keras.models import load_model
from sklearn.externals import joblib
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso, LassoLars, LinearRegression, ElasticNet, Ridge, PassiveAggressiveRegressor, \
SGDRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Day-ahead auction prices (hourly, per the file name); the first CSV column
# becomes the index and is parsed as timestamps.
price_csv = './raw_data/EPEX_spot_DA_auction_hour_prices_20070720-20170831.csv'
price_data = pd.read_csv(price_csv, index_col=0, parse_dates=True)

In [3]:
# Restore the previously saved price scaler.
# NOTE: joblib.load unpickles the file, so only load scaler files from a trusted source.
scaler_path = './models/DA_price_scaler.pkl'
scaler = joblib.load(scaler_path)

In [4]:
# The scaler is fed a single-feature column vector, hence the (-1, 1) reshape.
da_price_column = price_data['DA_price'].values.reshape(-1, 1)
price_data.loc[:, 'DA_price_scaled'] = scaler.transform(da_price_column)

In [5]:
# Work on an independent one-column frame so the lag features added later
# do not touch price_data.
scaled_prices = price_data['DA_price_scaled'].copy(deep=True)
timeseries_input = scaled_prices.to_frame()

In [6]:
# Number of hourly lags used as model input (168 = 7 * 24).
lags = 168

# Build every lag column first and attach them in one concat. Inserting 168
# columns one at a time fragments the frame and triggers pandas'
# PerformanceWarning. A list (not a dict) keeps the l_1 ... l_168 column order
# on every Python version.
lag_columns = [
    timeseries_input['DA_price_scaled'].shift(lag).rename('l_{}'.format(lag))
    for lag in range(1, lags + 1)
]
# Start again from the target column only, so re-running this cell does not
# duplicate the lag columns.
timeseries_input = pd.concat([timeseries_input[['DA_price_scaled']]] + lag_columns, axis=1)

In [7]:
# Shifting leaves NaNs in the first rows (no history yet); discard every incomplete row.
timeseries_input = timeseries_input.dropna()

In [8]:
# Keep only the lagged columns; the unshifted scaled price is not a model input.
timeseries_input = timeseries_input.drop('DA_price_scaled', axis=1)

In [9]:
# One feature row per day: keep only the rows stamped at hour 0.
midnight_mask = timeseries_input.index.hour == 0
features = timeseries_input.loc[midnight_mask]

In [10]:
# Two layouts of the same lag matrix, one per model input:
#   feature_arr      -> (rows, lags, 1)
#   feature_arr_lin  -> (rows, lags)
# `.values` replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0.
feature_values = features.values
feature_arr = feature_values.reshape(features.shape[0], features.shape[1], 1)
feature_arr_lin = feature_values.reshape(features.shape[0], features.shape[1])

In [11]:
# Load the saved Keras time-series model from disk.
timeseries_model_path = './models/timeseries_final_model_tuning.hdf5'
timeseries_model = load_model(timeseries_model_path)

In [12]:
# The model takes two inputs: the 3-D lag array and the flat 2-D lag array.
model_inputs = [feature_arr, feature_arr_lin]
pred = timeseries_model.predict(model_inputs)

In [13]:
# Undo the scaling and write the predictions back next to the real prices.
# `pred` holds one row per hour-0 timestamp in `features`; the flattened values
# are assigned positionally across the whole timestamp range of timeseries_input.
# NOTE(review): this presumably relies on each prediction row covering the 24
# hours of its day and on the range having no missing hours; otherwise the
# lengths will not match. TODO confirm against the model's output shape.
first_ts = timeseries_input.index[0]
last_ts = timeseries_input.index[-1]
predicted_prices = scaler.inverse_transform(pred).flatten()
price_data.loc[first_ts:last_ts, 'bottleneck_features'] = predicted_prices

In [14]:
# Keep only rows that have both a price and a model prediction, then discard
# the helper column holding the scaled price.
price_data = price_data.dropna().drop('DA_price_scaled', axis=1)

In [15]:
# Processed generation-per-production-type data and forecast data; both are
# indexed by their first column, parsed as timestamps.
gen_csv = './processed_data/20150101-20170830-gen_per_prod_type.csv'
forecast_csv = './processed_data/20150101-20170830-forecast_load_renewable_gen.csv'

gen = pd.read_csv(gen_csv, index_col=0, parse_dates=True)
forecast = pd.read_csv(forecast_csv, index_col=0, parse_dates=True)

In [16]:
# Place generation and forecast columns side by side, aligned on the timestamp index.
frames = [gen, forecast]
data = pd.concat(frames, axis=1)

### Simple Regression Model on important features

In [17]:
# Show the column labels of the combined generation/forecast frame.
data.columns

Index(['biomass', 'brown_coal', 'hard_coal', 'wind_offshore', 'pumped_hydro',
       'solar', 'river_hydro', 'wind_onshore', 'nuclear', 'other',
       'load_forecast', 'load_true', 'sum_forecast', 'solar_forecast',
       'offshore_forecast', 'onshore_forecast'],
      dtype='object')

In [50]:
# Columns built from actual generation and the actual load. An earlier,
# narrower selection left out 'pumped_hydro', 'river_hydro' and 'other'.
prim_cols = [
    'brown_coal',
    'hard_coal',
    'load_true',
    'nuclear',
    'solar',
    'wind_offshore',
    'wind_onshore',
    'pumped_hydro',
    'river_hydro',
    'other',
]

# Same idea, but with forecast columns in place of the actual load, solar and
# wind columns.
sub_cols = [
    'solar_forecast',
    'offshore_forecast',
    'onshore_forecast',
    'load_forecast',
    'brown_coal',
    'hard_coal',
    'nuclear',
    'pumped_hydro',
    'river_hydro',
    'other',
]

In [51]:
# Two column subsets of the combined frame: actual values vs. forecast values.
training_data, training_data_sub = data[prim_cols], data[sub_cols]

In [52]:
# Preview the first rows of the selected primary columns.
training_data.head()

Unnamed: 0_level_0,brown_coal,hard_coal,load_true,nuclear,solar,wind_offshore,wind_onshore,pumped_hydro,river_hydro,other
MTU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-01 00:00:00,15905.0,2760.0,49025.0,10742.0,0.0,520.0,8177.0,708.0,2689.0,4889.0
2015-01-01 00:15:00,15853.0,2900.0,48534.0,10585.0,0.0,517.0,8153.0,576.0,2672.0,4803.0
2015-01-01 00:30:00,15699.0,2798.0,48548.0,10643.0,0.0,514.0,8226.0,607.0,2662.0,4869.0
2015-01-01 00:45:00,15487.0,2673.0,48249.0,10872.0,0.0,515.0,8217.0,285.0,2656.0,4851.0
2015-01-01 01:00:00,15596.0,2096.0,47658.0,11089.0,0.0,515.0,8257.0,269.0,2622.0,4782.0


In [53]:
# Average the values within each hour so the rows line up with the hourly prices.
hourly_rule = '1H'
training_data = training_data.resample(hourly_rule).mean()
training_data_sub = training_data_sub.resample(hourly_rule).mean()

In [54]:
# Attach the price_data columns (target and time-series prediction) by
# timestamp; rows without a matching price row get NaN (left join).
training_data = training_data.join(price_data, how='left')
training_data_sub = training_data_sub.join(price_data, how='left')

In [55]:
# Keep only fully populated rows in both frames.
training_data = training_data.dropna()
training_data_sub = training_data_sub.dropna()

In [24]:
# Preview the first rows of the joined frame, including DA_price and bottleneck_features.
training_data.head()

Unnamed: 0_level_0,brown_coal,hard_coal,load_true,nuclear,solar,wind_offshore,wind_onshore,pumped_hydro,river_hydro,other,DA_price,bottleneck_features
MTU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-01 00:00:00,15736.0,2782.75,48589.0,10710.5,0.0,516.5,8193.25,544.0,2669.75,4853.0,25.02,26.270916
2015-01-01 01:00:00,15364.75,1929.75,47032.25,11086.25,0.0,516.25,8367.5,409.5,2617.0,4743.5,18.29,22.510576
2015-01-01 02:00:00,14852.75,1824.0,45619.0,11026.25,0.0,514.0,8604.0,632.75,2578.75,4836.5,16.04,20.091061
2015-01-01 03:00:00,14111.0,1959.0,44253.75,11027.75,0.0,517.75,8617.0,558.25,2545.25,4840.25,14.6,17.68614
2015-01-01 04:00:00,14149.0,2012.25,43765.5,10962.25,0.0,519.75,8707.5,602.75,2557.75,4820.75,14.95,17.597729


### Benchmark from the timeseries model

In [56]:
# Chronological split: everything up to and including 2016 is used for
# fitting, and 2017 is held out for evaluation.
# Partial date strings go through `.loc`: selecting rows with frame['2017']
# was deprecated and then removed in pandas 2.0.
train = training_data.loc[:'2016']
test = training_data.loc['2017']

train_sub = training_data_sub.loc[:'2016']
test_sub = training_data_sub.loc['2017']

In [57]:
# Baseline: mean absolute error of the time-series prediction alone on the 2017 hold-out.
benchmark_mae = mean_absolute_error(test['DA_price'], test['bottleneck_features'])
print(benchmark_mae)

5.55367538082


### Benchmark regression with perfect information

In [65]:
# Regressors for the "perfect information" benchmark: actual generation and
# load plus the time-series prediction. The list is defined once so that train
# and test are guaranteed to share the same column order.
perfect_info_cols = [
    'brown_coal', 'hard_coal', 'load_true', 'nuclear', 'solar',
    'wind_offshore', 'wind_onshore', 'bottleneck_features',
    'pumped_hydro', 'river_hydro', 'other',
]

features_train = train[perfect_info_cols]
labels_train = train[['DA_price']]
features_test = test[perfect_info_cols]

In [66]:
# Ordinary least squares on the selected features, scored by MAE on the hold-out.
regr = LinearRegression()

# `.values` replaces DataFrame.as_matrix() (removed in pandas 1.0).
regr.fit(features_train.values, labels_train.values)

# Predict on a plain array as well, so fit and predict receive the same kind
# of input (newer scikit-learn warns when only one side carries feature names).
pred = regr.predict(features_test.values)

print(mean_absolute_error(test.DA_price, pred))

4.62698059411


In [67]:
from sklearn.feature_selection import RFE

In [68]:
# Recursive feature elimination down to a single feature. ranking_ lists the
# features in the order of features_train's columns; rank 1 is the feature
# that was kept.
selector = RFE(LinearRegression(), n_features_to_select=1)
# `.values` replaces DataFrame.as_matrix() (removed in pandas 1.0). The target
# is flattened to 1-D because a column vector makes scikit-learn emit a
# DataConversionWarning.
selector.fit(features_train.values, labels_train.values.ravel())
selector.ranking_

  y = column_or_1d(y, warn=True)


array([ 3, 10,  9, 11,  8,  7,  5,  1,  2,  6,  4])

### Using forecast inputs

In [69]:
# Regressors for the forecast-based variant: forecast columns, the time-series
# prediction and the remaining generation columns. The list is defined once so
# that train and test share the same column order.
forecast_input_cols = [
    'solar_forecast', 'offshore_forecast', 'onshore_forecast', 'load_forecast',
    'bottleneck_features', 'brown_coal', 'hard_coal', 'nuclear', 'pumped_hydro',
    'river_hydro', 'other',
]

features_train = train_sub[forecast_input_cols]
labels_train = train_sub[['DA_price']]
features_test = test_sub[forecast_input_cols]

In [70]:
# Ordinary least squares on the forecast-based features, scored by MAE on the hold-out.
regr = LinearRegression()

# `.values` replaces DataFrame.as_matrix() (removed in pandas 1.0).
regr.fit(features_train.values, labels_train.values)

# Predict on a plain array as well, so fit and predict receive the same kind
# of input (newer scikit-learn warns when only one side carries feature names).
pred = regr.predict(features_test.values)

print(mean_absolute_error(test_sub.DA_price, pred))

4.52281743062


### Using 'predictable' inputs

In [76]:
# Reduced feature set: actual load, solar and wind plus the time-series
# prediction. The list is defined once so that train and test share the same
# column order.
predictable_cols = ['load_true', 'solar', 'wind_offshore', 'wind_onshore', 'bottleneck_features']

features_train = train[predictable_cols]
labels_train = train[['DA_price']]
features_test = test[predictable_cols]

In [78]:
# Ordinary least squares on the reduced feature set, scored by MAE on the hold-out.
regr = LinearRegression()

# `.values` replaces DataFrame.as_matrix() (removed in pandas 1.0).
regr.fit(features_train.values, labels_train.values)

# Predict on a plain array as well, so fit and predict receive the same kind
# of input (newer scikit-learn warns when only one side carries feature names).
pred = regr.predict(features_test.values)

print(mean_absolute_error(test.DA_price, pred))

4.98114110312


In [72]:
# Reduced feature set built from forecasts: load, solar and wind forecasts plus
# the time-series prediction. The list is defined once so that train and test
# share the same column order.
predictable_forecast_cols = ['load_forecast', 'solar_forecast', 'offshore_forecast', 'onshore_forecast',
                             'bottleneck_features']

features_train = train_sub[predictable_forecast_cols]
labels_train = train_sub[['DA_price']]
features_test = test_sub[predictable_forecast_cols]

In [74]:
# Ordinary least squares on the forecast-only feature set, scored by MAE on the hold-out.
regr = LinearRegression()

# `.values` replaces DataFrame.as_matrix() (removed in pandas 1.0).
regr.fit(features_train.values, labels_train.values)

# Predict on a plain array as well, so fit and predict receive the same kind
# of input (newer scikit-learn warns when only one side carries feature names).
pred = regr.predict(features_test.values)

print(mean_absolute_error(test_sub.DA_price, pred))

4.72861605
