In [5]:
import pandas as pd
import numpy as np
import datetime as dt
from keras.models import load_model
from sklearn.externals import joblib
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso, LassoLars, LinearRegression, ElasticNet, Ridge, PassiveAggressiveRegressor, \
SGDRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV 
%matplotlib inline

In [6]:
# Day-ahead auction prices; the first CSV column is parsed as the datetime index.
price_data = pd.read_csv(
    './raw_data/EPEX_spot_DA_auction_hour_prices_20070720-20170831.csv',
    index_col=0,
    parse_dates=True,
)

In [7]:
# Generation per production type and the load / renewable-generation forecasts,
# both indexed by the timestamp stored in the first CSV column.
actual = pd.read_csv(
    './processed_data/20150101-20170830-gen_per_prod_type.csv',
    index_col=0,
    parse_dates=True,
)
forecast = pd.read_csv(
    './processed_data/20150101-20170830-forecast_load_renewable_gen.csv',
    index_col=0,
    parse_dates=True,
)

In [8]:
# Remove the 'sum_forecast' column. Rebinding instead of inplace=True, combined with
# errors='ignore', keeps this cell idempotent: running it a second time no longer
# fails because the column has already been dropped.
forecast = forecast.drop('sum_forecast', axis=1, errors='ignore')

In [9]:
# Output of the time-series model (used further down as the 'timeseries_pred' feature).
timeseries_bottleneck = pd.read_csv(
    './bottleneck_features/bnf_timeseries.csv',
    index_col=0,
    parse_dates=True,
)

In [10]:
# Bottleneck features exported for solar and wind, one timestamp-indexed frame each.
solar_bottleneck = pd.read_csv(
    './bottleneck_features/bnf_solar.csv',
    index_col=0,
    parse_dates=True,
)
wind_bottleneck = pd.read_csv(
    './bottleneck_features/bnf_wind.csv',
    index_col=0,
    parse_dates=True,
)

In [11]:
# Put both frames on an hourly grid, averaging the values that fall within each hour.
actual, forecast = (frame.resample('1H').mean() for frame in (actual, forecast))

In [12]:
# Join every input column-wise on the timestamp index and keep only complete rows.
features = pd.concat(
    [actual, forecast, timeseries_bottleneck, solar_bottleneck, wind_bottleneck],
    axis=1,
).dropna()

In [13]:
# Timestamps that occur in both the feature table and the price series.
index = features.index.intersection(price_data.index)

In [14]:
# Restrict both the inputs and the price labels to the shared timestamps.
features, labels = features.loc[index], price_data.loc[index]

In [15]:
# Sanity check: list the columns available for building the feature subsets below.
features.columns

Index(['biomass', 'brown_coal', 'hard_coal', 'wind_offshore', 'pumped_hydro',
       'solar', 'river_hydro', 'wind_onshore', 'nuclear', 'other',
       'load_forecast', 'load_true', 'solar_forecast', 'offshore_forecast',
       'onshore_forecast', 'timeseries_pred', 'solar_bottleneck',
       'wind_bottleneck'],
      dtype='object')

In [16]:
# Peek at the first rows of the combined feature table.
features.head()

Unnamed: 0,biomass,brown_coal,hard_coal,wind_offshore,pumped_hydro,solar,river_hydro,wind_onshore,nuclear,other,load_forecast,load_true,solar_forecast,offshore_forecast,onshore_forecast,timeseries_pred,solar_bottleneck,wind_bottleneck
2015-01-01 01:00:00,4261.0,15364.75,1929.75,516.25,409.5,0.0,2617.0,8367.5,11086.25,4743.5,46952.5,47032.25,0.0,598.25,8161.75,16.366163,0.0,12577.362305
2015-01-01 02:00:00,4295.5,14852.75,1824.0,514.0,632.75,0.0,2578.75,8604.0,11026.25,4836.5,45751.5,45619.0,0.0,599.5,8324.75,13.697455,0.0,11986.717773
2015-01-01 03:00:00,4313.75,14111.0,1959.0,517.75,558.25,0.0,2545.25,8617.0,11027.75,4840.25,45306.25,44253.75,0.0,603.75,8440.25,10.958999,0.0,12911.827148
2015-01-01 04:00:00,4308.5,14149.0,2012.25,519.75,602.75,0.0,2557.75,8707.5,10962.25,4820.75,45423.0,43765.5,0.0,605.25,8621.25,10.259408,0.0,12743.263672
2015-01-01 05:00:00,4304.0,13509.5,1753.5,520.0,629.25,0.0,2554.75,8775.5,10696.0,4958.0,45701.5,43589.5,0.0,611.25,8825.75,11.00336,0.0,13013.442383


In [17]:
# Peek at the target: a one-column frame ('DA_price') on the same timestamps.
labels.head()

Unnamed: 0,DA_price
2015-01-01 01:00:00,18.29
2015-01-01 02:00:00,16.04
2015-01-01 03:00:00,14.6
2015-01-01 04:00:00,14.95
2015-01-01 05:00:00,14.5


In [47]:
def apply_predictor(predictor, features, labels, test_size=0.2, random_state=0):
    """Fit ``predictor`` on a random train split and return its MAE on the held-out rows.

    Parameters
    ----------
    predictor : object with ``fit(X, y)`` and ``predict(X)``
        The estimator to evaluate; it is fitted in place.
    features : array-like of shape (n_samples, n_features)
        Model inputs.
    labels : array-like of shape (n_samples,) or (n_samples, 1)
        Regression target. A single-column 2-D target (for example a one-column
        DataFrame) is flattened to 1-D so estimators do not emit a
        ``column_or_1d`` DataConversionWarning.
    test_size : float, optional
        Fraction of rows held out for evaluation (default 0.2).
    random_state : int, optional
        Seed for the shuffled split (default 0).

    Returns
    -------
    float
        Mean absolute error of the fitted predictor on the held-out rows.
    """
    # Estimators such as GradientBoostingRegressor expect a 1-D target.
    targets = np.asarray(labels)
    if targets.ndim == 2 and targets.shape[1] == 1:
        targets = targets.ravel()

    # NOTE: train_test_split shuffles rows by default, so for time-indexed data the
    # held-out hours are interleaved with training hours rather than lying after them.
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=test_size, random_state=random_state)

    predictor.fit(X_train, y_train)
    pred = predictor.predict(X_test)

    return mean_absolute_error(y_test, pred)

In [19]:
# Building blocks shared by several of the feature subsets below.
_plant_generation = ['biomass', 'brown_coal', 'hard_coal', 'pumped_hydro', 'river_hydro',
                     'nuclear', 'other']
_renewable_forecasts = ['solar_forecast', 'offshore_forecast', 'onshore_forecast']
_timeseries = ['timeseries_pred']

# cols1 / cols2: all generation columns plus load_true, without / with timeseries_pred.
cols1 = ['biomass', 'brown_coal', 'hard_coal', 'wind_offshore', 'pumped_hydro',
         'solar', 'river_hydro', 'wind_onshore', 'nuclear', 'other', 'load_true']
cols2 = cols1 + _timeseries

# cols3 / cols4: wind, solar and load replaced by their forecast columns,
# without / with timeseries_pred.
cols3 = _plant_generation + ['load_forecast'] + _renewable_forecasts
cols4 = cols3 + _timeseries

# cols5: wind/solar generation columns with load_forecast and timeseries_pred.
cols5 = ['wind_offshore', 'solar', 'wind_onshore', 'load_forecast'] + _timeseries

# cols6 / cols7: forecast columns only, using load_forecast / load_true respectively.
cols6 = ['load_forecast'] + _renewable_forecasts + _timeseries
cols7 = ['load_true'] + _renewable_forecasts + _timeseries

# cols8: load_forecast plus the three model-derived columns.
cols8 = ['load_forecast', 'solar_bottleneck', 'wind_bottleneck'] + _timeseries

In [20]:
# The feature subsets to compare, in the order their MAEs are printed below.
input_variations = [cols1, cols2, cols3, cols4, cols5, cols6, cols7, cols8]

In [21]:
# Baseline: ordinary least squares on each feature subset, one MAE per subset.
for column_subset in input_variations:
    subset_features = features[column_subset]
    mae = apply_predictor(LinearRegression(), subset_features, labels)
    print('MAE: ', mae)

MAE:  3.58887406018
MAE:  3.38890336459
MAE:  3.43930478581
MAE:  3.27488568743
MAE:  3.67634697445
MAE:  3.62665276734
MAE:  3.56286427771
MAE:  3.7734588474


In [22]:
# Same sweep with gradient boosting. The estimator seed is pinned (random_state=7, the
# value used for the grid search further down) so repeated runs print the same MAEs.
for curr_feat in input_variations:

    mae = apply_predictor(GradientBoostingRegressor(random_state=7), features[curr_feat], labels)
    print('MAE: ', mae)

  y = column_or_1d(y, warn=True)


MAE:  3.04566880347
MAE:  2.96973754622
MAE:  2.92887532147
MAE:  2.85845931214
MAE:  3.47389323628
MAE:  3.41083126679
MAE:  3.3592143902
MAE:  3.56219247035


In [24]:
# Exhaustive search over 3^5 = 243 hyper-parameter combinations (729 fits with the
# default 3-fold CV). The recorded run below took about 188 minutes on a single job.
parameters = {
    'loss': ('ls', 'lad', 'huber'),
    'learning_rate': [0.05, 0.1, 0.3],
    'n_estimators': [100, 1000, 5000],
    'min_samples_split': [2, 10, 20],
    'max_depth': [3, 5, 10],
}
predictor = GradientBoostingRegressor(random_state=7)
clf = GridSearchCV(predictor, parameters, verbose=2)

# `.values` replaces the deprecated DataFrame.as_matrix(); ravel() always yields a 1-D
# target, whereas squeeze() would collapse a single-row frame to a scalar.
clf.fit(features[cols8].values, labels.values.ravel())

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=100, total=   0.3s
[CV] learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=100, total=   0.2s
[CV] learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=100, total=   0.2s
[CV] learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=1000, total=   2.2s
[CV] learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=1000, total=   2.1s
[CV] learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=1000, total=   2.2s
[CV] learning_rate=0.05, loss=ls, max_depth=3, min_samples_split=2, n_estimators=5000 
[CV]  learning_rate=0.05, loss=ls, max_depth=3, min_sample

[CV]  learning_rate=0.05, loss=ls, max_depth=5, min_samples_split=20, n_estimators=100, total=   0.4s
[CV] learning_rate=0.05, loss=ls, max_depth=5, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.05, loss=ls, max_depth=5, min_samples_split=20, n_estimators=100, total=   0.4s
[CV] learning_rate=0.05, loss=ls, max_depth=5, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.05, loss=ls, max_depth=5, min_samples_split=20, n_estimators=100, total=   0.4s
[CV] learning_rate=0.05, loss=ls, max_depth=5, min_samples_split=20, n_estimators=1000 
[CV]  learning_rate=0.05, loss=ls, max_depth=5, min_samples_split=20, n_estimators=1000, total=   3.5s
[CV] learning_rate=0.05, loss=ls, max_depth=5, min_samples_split=20, n_estimators=1000 
[CV]  learning_rate=0.05, loss=ls, max_depth=5, min_samples_split=20, n_estimators=1000, total=   3.4s
[CV] learning_rate=0.05, loss=ls, max_depth=5, min_samples_split=20, n_estimators=1000 
[CV]  learning_rate=0.05, loss=ls, max_depth=5, mi

[CV]  learning_rate=0.05, loss=lad, max_depth=3, min_samples_split=2, n_estimators=5000, total=  20.2s
[CV] learning_rate=0.05, loss=lad, max_depth=3, min_samples_split=2, n_estimators=5000 
[CV]  learning_rate=0.05, loss=lad, max_depth=3, min_samples_split=2, n_estimators=5000, total=  21.2s
[CV] learning_rate=0.05, loss=lad, max_depth=3, min_samples_split=10, n_estimators=100 
[CV]  learning_rate=0.05, loss=lad, max_depth=3, min_samples_split=10, n_estimators=100, total=   0.5s
[CV] learning_rate=0.05, loss=lad, max_depth=3, min_samples_split=10, n_estimators=100 
[CV]  learning_rate=0.05, loss=lad, max_depth=3, min_samples_split=10, n_estimators=100, total=   0.6s
[CV] learning_rate=0.05, loss=lad, max_depth=3, min_samples_split=10, n_estimators=100 
[CV]  learning_rate=0.05, loss=lad, max_depth=3, min_samples_split=10, n_estimators=100, total=   0.8s
[CV] learning_rate=0.05, loss=lad, max_depth=3, min_samples_split=10, n_estimators=1000 
[CV]  learning_rate=0.05, loss=lad, max_dept

[CV]  learning_rate=0.05, loss=lad, max_depth=5, min_samples_split=20, n_estimators=1000, total=   7.0s
[CV] learning_rate=0.05, loss=lad, max_depth=5, min_samples_split=20, n_estimators=5000 
[CV]  learning_rate=0.05, loss=lad, max_depth=5, min_samples_split=20, n_estimators=5000, total=  30.7s
[CV] learning_rate=0.05, loss=lad, max_depth=5, min_samples_split=20, n_estimators=5000 
[CV]  learning_rate=0.05, loss=lad, max_depth=5, min_samples_split=20, n_estimators=5000, total=  30.0s
[CV] learning_rate=0.05, loss=lad, max_depth=5, min_samples_split=20, n_estimators=5000 
[CV]  learning_rate=0.05, loss=lad, max_depth=5, min_samples_split=20, n_estimators=5000, total=  25.8s
[CV] learning_rate=0.05, loss=lad, max_depth=10, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.05, loss=lad, max_depth=10, min_samples_split=2, n_estimators=100, total=   4.1s
[CV] learning_rate=0.05, loss=lad, max_depth=10, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.05, loss=lad, ma

[CV]  learning_rate=0.05, loss=huber, max_depth=3, min_samples_split=10, n_estimators=1000, total=   6.0s
[CV] learning_rate=0.05, loss=huber, max_depth=3, min_samples_split=10, n_estimators=1000 
[CV]  learning_rate=0.05, loss=huber, max_depth=3, min_samples_split=10, n_estimators=1000, total=   5.9s
[CV] learning_rate=0.05, loss=huber, max_depth=3, min_samples_split=10, n_estimators=1000 
[CV]  learning_rate=0.05, loss=huber, max_depth=3, min_samples_split=10, n_estimators=1000, total=   6.9s
[CV] learning_rate=0.05, loss=huber, max_depth=3, min_samples_split=10, n_estimators=5000 
[CV]  learning_rate=0.05, loss=huber, max_depth=3, min_samples_split=10, n_estimators=5000, total=  32.2s
[CV] learning_rate=0.05, loss=huber, max_depth=3, min_samples_split=10, n_estimators=5000 
[CV]  learning_rate=0.05, loss=huber, max_depth=3, min_samples_split=10, n_estimators=5000, total=  30.5s
[CV] learning_rate=0.05, loss=huber, max_depth=3, min_samples_split=10, n_estimators=5000 
[CV]  learning_

[CV]  learning_rate=0.05, loss=huber, max_depth=10, min_samples_split=2, n_estimators=100, total=  11.1s
[CV] learning_rate=0.05, loss=huber, max_depth=10, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.05, loss=huber, max_depth=10, min_samples_split=2, n_estimators=100, total=  12.4s
[CV] learning_rate=0.05, loss=huber, max_depth=10, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.05, loss=huber, max_depth=10, min_samples_split=2, n_estimators=100, total=  14.0s
[CV] learning_rate=0.05, loss=huber, max_depth=10, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=0.05, loss=huber, max_depth=10, min_samples_split=2, n_estimators=1000, total=  46.3s
[CV] learning_rate=0.05, loss=huber, max_depth=10, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=0.05, loss=huber, max_depth=10, min_samples_split=2, n_estimators=1000, total=  42.6s
[CV] learning_rate=0.05, loss=huber, max_depth=10, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=

[CV]  learning_rate=0.1, loss=ls, max_depth=3, min_samples_split=10, n_estimators=5000, total=  14.4s
[CV] learning_rate=0.1, loss=ls, max_depth=3, min_samples_split=10, n_estimators=5000 
[CV]  learning_rate=0.1, loss=ls, max_depth=3, min_samples_split=10, n_estimators=5000, total=  13.9s
[CV] learning_rate=0.1, loss=ls, max_depth=3, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.1, loss=ls, max_depth=3, min_samples_split=20, n_estimators=100, total=   0.4s
[CV] learning_rate=0.1, loss=ls, max_depth=3, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.1, loss=ls, max_depth=3, min_samples_split=20, n_estimators=100, total=   0.3s
[CV] learning_rate=0.1, loss=ls, max_depth=3, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.1, loss=ls, max_depth=3, min_samples_split=20, n_estimators=100, total=   0.4s
[CV] learning_rate=0.1, loss=ls, max_depth=3, min_samples_split=20, n_estimators=1000 
[CV]  learning_rate=0.1, loss=ls, max_depth=3, min_samples_sp

[CV]  learning_rate=0.1, loss=ls, max_depth=10, min_samples_split=2, n_estimators=5000, total=  37.0s
[CV] learning_rate=0.1, loss=ls, max_depth=10, min_samples_split=2, n_estimators=5000 
[CV]  learning_rate=0.1, loss=ls, max_depth=10, min_samples_split=2, n_estimators=5000, total=  37.1s
[CV] learning_rate=0.1, loss=ls, max_depth=10, min_samples_split=2, n_estimators=5000 
[CV]  learning_rate=0.1, loss=ls, max_depth=10, min_samples_split=2, n_estimators=5000, total=  37.3s
[CV] learning_rate=0.1, loss=ls, max_depth=10, min_samples_split=10, n_estimators=100 
[CV]  learning_rate=0.1, loss=ls, max_depth=10, min_samples_split=10, n_estimators=100, total=   1.2s
[CV] learning_rate=0.1, loss=ls, max_depth=10, min_samples_split=10, n_estimators=100 
[CV]  learning_rate=0.1, loss=ls, max_depth=10, min_samples_split=10, n_estimators=100, total=   1.2s
[CV] learning_rate=0.1, loss=ls, max_depth=10, min_samples_split=10, n_estimators=100 
[CV]  learning_rate=0.1, loss=ls, max_depth=10, min_sam

[CV]  learning_rate=0.1, loss=lad, max_depth=3, min_samples_split=20, n_estimators=1000, total=   4.6s
[CV] learning_rate=0.1, loss=lad, max_depth=3, min_samples_split=20, n_estimators=5000 
[CV]  learning_rate=0.1, loss=lad, max_depth=3, min_samples_split=20, n_estimators=5000, total=  19.8s
[CV] learning_rate=0.1, loss=lad, max_depth=3, min_samples_split=20, n_estimators=5000 
[CV]  learning_rate=0.1, loss=lad, max_depth=3, min_samples_split=20, n_estimators=5000, total=  19.5s
[CV] learning_rate=0.1, loss=lad, max_depth=3, min_samples_split=20, n_estimators=5000 
[CV]  learning_rate=0.1, loss=lad, max_depth=3, min_samples_split=20, n_estimators=5000, total=  21.2s
[CV] learning_rate=0.1, loss=lad, max_depth=5, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.1, loss=lad, max_depth=5, min_samples_split=2, n_estimators=100, total=   1.0s
[CV] learning_rate=0.1, loss=lad, max_depth=5, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.1, loss=lad, max_depth=5, min

[CV]  learning_rate=0.1, loss=lad, max_depth=10, min_samples_split=10, n_estimators=1000, total=  16.2s
[CV] learning_rate=0.1, loss=lad, max_depth=10, min_samples_split=10, n_estimators=1000 
[CV]  learning_rate=0.1, loss=lad, max_depth=10, min_samples_split=10, n_estimators=1000, total=  17.6s
[CV] learning_rate=0.1, loss=lad, max_depth=10, min_samples_split=10, n_estimators=5000 
[CV]  learning_rate=0.1, loss=lad, max_depth=10, min_samples_split=10, n_estimators=5000, total= 1.2min
[CV] learning_rate=0.1, loss=lad, max_depth=10, min_samples_split=10, n_estimators=5000 
[CV]  learning_rate=0.1, loss=lad, max_depth=10, min_samples_split=10, n_estimators=5000, total= 1.3min
[CV] learning_rate=0.1, loss=lad, max_depth=10, min_samples_split=10, n_estimators=5000 
[CV]  learning_rate=0.1, loss=lad, max_depth=10, min_samples_split=10, n_estimators=5000, total= 1.6min
[CV] learning_rate=0.1, loss=lad, max_depth=10, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.1, loss=lad, m

[CV]  learning_rate=0.1, loss=huber, max_depth=5, min_samples_split=2, n_estimators=100, total=   1.0s
[CV] learning_rate=0.1, loss=huber, max_depth=5, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=0.1, loss=huber, max_depth=5, min_samples_split=2, n_estimators=1000, total=   8.9s
[CV] learning_rate=0.1, loss=huber, max_depth=5, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=0.1, loss=huber, max_depth=5, min_samples_split=2, n_estimators=1000, total=   8.8s
[CV] learning_rate=0.1, loss=huber, max_depth=5, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=0.1, loss=huber, max_depth=5, min_samples_split=2, n_estimators=1000, total=   8.9s
[CV] learning_rate=0.1, loss=huber, max_depth=5, min_samples_split=2, n_estimators=5000 
[CV]  learning_rate=0.1, loss=huber, max_depth=5, min_samples_split=2, n_estimators=5000, total=  43.7s
[CV] learning_rate=0.1, loss=huber, max_depth=5, min_samples_split=2, n_estimators=5000 
[CV]  learning_rate=0.1, loss=huber,

[CV]  learning_rate=0.1, loss=huber, max_depth=10, min_samples_split=20, n_estimators=100, total=   3.0s
[CV] learning_rate=0.1, loss=huber, max_depth=10, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.1, loss=huber, max_depth=10, min_samples_split=20, n_estimators=100, total=   2.8s
[CV] learning_rate=0.1, loss=huber, max_depth=10, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.1, loss=huber, max_depth=10, min_samples_split=20, n_estimators=100, total=   2.9s
[CV] learning_rate=0.1, loss=huber, max_depth=10, min_samples_split=20, n_estimators=1000 
[CV]  learning_rate=0.1, loss=huber, max_depth=10, min_samples_split=20, n_estimators=1000, total=  14.5s
[CV] learning_rate=0.1, loss=huber, max_depth=10, min_samples_split=20, n_estimators=1000 
[CV]  learning_rate=0.1, loss=huber, max_depth=10, min_samples_split=20, n_estimators=1000, total=  14.3s
[CV] learning_rate=0.1, loss=huber, max_depth=10, min_samples_split=20, n_estimators=1000 
[CV]  learning_rate=

[CV]  learning_rate=0.3, loss=ls, max_depth=5, min_samples_split=2, n_estimators=5000, total=  17.8s
[CV] learning_rate=0.3, loss=ls, max_depth=5, min_samples_split=10, n_estimators=100 
[CV]  learning_rate=0.3, loss=ls, max_depth=5, min_samples_split=10, n_estimators=100, total=   0.4s
[CV] learning_rate=0.3, loss=ls, max_depth=5, min_samples_split=10, n_estimators=100 
[CV]  learning_rate=0.3, loss=ls, max_depth=5, min_samples_split=10, n_estimators=100, total=   0.4s
[CV] learning_rate=0.3, loss=ls, max_depth=5, min_samples_split=10, n_estimators=100 
[CV]  learning_rate=0.3, loss=ls, max_depth=5, min_samples_split=10, n_estimators=100, total=   0.4s
[CV] learning_rate=0.3, loss=ls, max_depth=5, min_samples_split=10, n_estimators=1000 
[CV]  learning_rate=0.3, loss=ls, max_depth=5, min_samples_split=10, n_estimators=1000, total=   3.4s
[CV] learning_rate=0.3, loss=ls, max_depth=5, min_samples_split=10, n_estimators=1000 
[CV]  learning_rate=0.3, loss=ls, max_depth=5, min_samples_spl

[CV]  learning_rate=0.3, loss=ls, max_depth=10, min_samples_split=20, n_estimators=5000, total=  19.2s
[CV] learning_rate=0.3, loss=ls, max_depth=10, min_samples_split=20, n_estimators=5000 
[CV]  learning_rate=0.3, loss=ls, max_depth=10, min_samples_split=20, n_estimators=5000, total=  19.0s
[CV] learning_rate=0.3, loss=lad, max_depth=3, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.3, loss=lad, max_depth=3, min_samples_split=2, n_estimators=100, total=   0.4s
[CV] learning_rate=0.3, loss=lad, max_depth=3, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.3, loss=lad, max_depth=3, min_samples_split=2, n_estimators=100, total=   0.4s
[CV] learning_rate=0.3, loss=lad, max_depth=3, min_samples_split=2, n_estimators=100 
[CV]  learning_rate=0.3, loss=lad, max_depth=3, min_samples_split=2, n_estimators=100, total=   0.4s
[CV] learning_rate=0.3, loss=lad, max_depth=3, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=0.3, loss=lad, max_depth=3, min_sample

[CV]  learning_rate=0.3, loss=lad, max_depth=5, min_samples_split=10, n_estimators=5000, total=  21.8s
[CV] learning_rate=0.3, loss=lad, max_depth=5, min_samples_split=10, n_estimators=5000 
[CV]  learning_rate=0.3, loss=lad, max_depth=5, min_samples_split=10, n_estimators=5000, total=  21.5s
[CV] learning_rate=0.3, loss=lad, max_depth=5, min_samples_split=10, n_estimators=5000 
[CV]  learning_rate=0.3, loss=lad, max_depth=5, min_samples_split=10, n_estimators=5000, total=  22.1s
[CV] learning_rate=0.3, loss=lad, max_depth=5, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.3, loss=lad, max_depth=5, min_samples_split=20, n_estimators=100, total=   0.6s
[CV] learning_rate=0.3, loss=lad, max_depth=5, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.3, loss=lad, max_depth=5, min_samples_split=20, n_estimators=100, total=   0.6s
[CV] learning_rate=0.3, loss=lad, max_depth=5, min_samples_split=20, n_estimators=100 
[CV]  learning_rate=0.3, loss=lad, max_depth=5, mi

[CV]  learning_rate=0.3, loss=huber, max_depth=3, min_samples_split=2, n_estimators=1000, total=   4.6s
[CV] learning_rate=0.3, loss=huber, max_depth=3, min_samples_split=2, n_estimators=1000 
[CV]  learning_rate=0.3, loss=huber, max_depth=3, min_samples_split=2, n_estimators=1000, total=   4.7s
[CV] learning_rate=0.3, loss=huber, max_depth=3, min_samples_split=2, n_estimators=5000 
[CV]  learning_rate=0.3, loss=huber, max_depth=3, min_samples_split=2, n_estimators=5000, total=  23.3s
[CV] learning_rate=0.3, loss=huber, max_depth=3, min_samples_split=2, n_estimators=5000 
[CV]  learning_rate=0.3, loss=huber, max_depth=3, min_samples_split=2, n_estimators=5000, total=  23.1s
[CV] learning_rate=0.3, loss=huber, max_depth=3, min_samples_split=2, n_estimators=5000 
[CV]  learning_rate=0.3, loss=huber, max_depth=3, min_samples_split=2, n_estimators=5000, total=  23.5s
[CV] learning_rate=0.3, loss=huber, max_depth=3, min_samples_split=10, n_estimators=100 
[CV]  learning_rate=0.3, loss=huber

[CV]  learning_rate=0.3, loss=huber, max_depth=5, min_samples_split=20, n_estimators=100, total=   0.7s
[CV] learning_rate=0.3, loss=huber, max_depth=5, min_samples_split=20, n_estimators=1000 
[CV]  learning_rate=0.3, loss=huber, max_depth=5, min_samples_split=20, n_estimators=1000, total=   6.7s
[CV] learning_rate=0.3, loss=huber, max_depth=5, min_samples_split=20, n_estimators=1000 
[CV]  learning_rate=0.3, loss=huber, max_depth=5, min_samples_split=20, n_estimators=1000, total=   6.6s
[CV] learning_rate=0.3, loss=huber, max_depth=5, min_samples_split=20, n_estimators=1000 
[CV]  learning_rate=0.3, loss=huber, max_depth=5, min_samples_split=20, n_estimators=1000, total=   6.7s
[CV] learning_rate=0.3, loss=huber, max_depth=5, min_samples_split=20, n_estimators=5000 
[CV]  learning_rate=0.3, loss=huber, max_depth=5, min_samples_split=20, n_estimators=5000, total=  33.1s
[CV] learning_rate=0.3, loss=huber, max_depth=5, min_samples_split=20, n_estimators=5000 
[CV]  learning_rate=0.3, l

[Parallel(n_jobs=1)]: Done 729 out of 729 | elapsed: 187.8min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=7,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'loss': ('ls', 'lad', 'huber'), 'learning_rate': [0.05, 0.1, 0.3], 'n_estimators': [100, 1000, 5000], 'min_samples_split': [2, 10, 20], 'max_depth': [3, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [25]:
# best_score_ is the mean cross-validated score of the winning candidate. The search
# was built with scoring=None, so it uses the estimator's default score (R^2 for
# regressors), not the MAE reported elsewhere in this notebook.
print(clf.best_params_, clf.best_score_)

{'learning_rate': 0.05, 'loss': 'huber', 'max_depth': 5, 'min_samples_split': 20, 'n_estimators': 100} 0.803616693861


In [65]:
# Persist the full search object and, separately, the refitted best estimator.
# The trailing ';' suppresses the list of written file names in the cell output.
joblib.dump(clf, './models/final_model_gridsearch.pkl')
joblib.dump(clf.best_estimator_, './models/final_model_best_estimator.pkl');

In [27]:
# Show the estimator refitted with the best hyper-parameters found by the search.
clf.best_estimator_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='huber', max_depth=5,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=20,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=7, subsample=1.0, verbose=0,
             warm_start=False)

In [62]:
# Re-run of the inspection cell above (same estimator repr); kept from a later execution.
clf.best_estimator_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='huber', max_depth=5,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=20,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=7, subsample=1.0, verbose=0,
             warm_start=False)

In [28]:
# NOTE: the grid search was fitted on all of features[cols8] with refit=True, so the
# best estimator has already seen every row scored here; this is an in-sample MAE,
# not an estimate of generalisation error.
print(mean_absolute_error(labels, clf.best_estimator_.predict(features[cols8])))

3.18673146368


In [63]:
# Re-run of the in-sample MAE check: the estimator was fitted on these same rows
# (grid search on all of features[cols8], refit=True), so this is a training error.
print(mean_absolute_error(labels, clf.best_estimator_.predict(features[cols8])))

3.18673146368


In [58]:
# Evaluate the grid-search winner's hyper-parameters on the cols4 feature set.
# Only the tuned arguments are passed: the pasted estimator repr also carried every
# default, including `presort` and `min_impurity_split`, which later scikit-learn
# releases no longer accept. All omitted arguments equal the library defaults.
mae = apply_predictor(
    GradientBoostingRegressor(
        learning_rate=0.05,
        loss='huber',
        max_depth=5,
        min_samples_split=20,
        n_estimators=100,
        random_state=7,
    ),
    features[cols4],
    labels,
)
print('MAE: ', mae)

  y = column_or_1d(y, warn=True)


MAE:  2.69307192868


In [53]:
# Hold out 20% of the cols8 rows, using the same test_size and seed as apply_predictor.
X_train, X_test, y_train, y_test = train_test_split(
    features[cols8],
    labels,
    test_size=0.2,
    random_state=0,
)

In [52]:
# NOTE: clf.best_estimator_ was refitted on the complete features[cols8] table, which
# includes the X_test rows, so this figure is not a true held-out error.
print(mean_absolute_error(y_test, clf.best_estimator_.predict(X_test)))

3.11414704769


In [46]:
from sklearn.model_selection import KFold

In [50]:
# Tuned hyper-parameters with 200 trees on the cols8 feature set.
# Arguments that merely restated library defaults were dropped from the pasted repr;
# two of them (`presort`, `min_impurity_split`) are rejected by later scikit-learn
# releases, and omitting them leaves the estimator unchanged.
mae = apply_predictor(
    GradientBoostingRegressor(
        learning_rate=0.05,
        loss='huber',
        max_depth=5,
        min_samples_split=20,
        n_estimators=200,
        random_state=7,
    ),
    features[cols8],
    labels,
)
print('MAE: ', mae)

  y = column_or_1d(y, warn=True)


MAE:  3.43777485824


In [67]:
# Tuned hyper-parameters with 500 trees on the cols6 feature set.
# Only non-default arguments are spelled out; the defaults previously pasted from the
# estimator repr (among them `presort` and `min_impurity_split`, removed in later
# scikit-learn releases) are left to the library.
mae = apply_predictor(
    GradientBoostingRegressor(
        learning_rate=0.05,
        loss='huber',
        max_depth=5,
        min_samples_split=20,
        n_estimators=500,
        random_state=7,
    ),
    features[cols6],
    labels,
)
print('MAE: ', mae)

  y = column_or_1d(y, warn=True)


MAE:  3.18377165944


In [55]:
# Narrow follow-up search on the training split only: compare 100 vs 200 trees with
# the remaining hyper-parameters fixed.
parameters = {
    'loss': ['huber'],
    'learning_rate': [0.05],
    'n_estimators': [100, 200],
    'min_samples_split': [10],
    'max_depth': [5],
}
predictor = GradientBoostingRegressor(random_state=7)
clf2 = GridSearchCV(predictor, parameters, verbose=2)

# y_train comes from a one-column frame; flatten it so each fit receives a 1-D target
# and the repeated column_or_1d DataConversionWarning goes away.
clf2.fit(X_train, np.ravel(y_train))

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=100 


  y = column_or_1d(y, warn=True)


[CV]  learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=100, total=   1.0s
[CV] learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
  y = column_or_1d(y, warn=True)


[CV]  learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=100, total=   1.0s
[CV] learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=100 


  y = column_or_1d(y, warn=True)


[CV]  learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=100, total=   0.9s
[CV] learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=200 


  y = column_or_1d(y, warn=True)


[CV]  learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=200, total=   1.6s
[CV] learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=200 


  y = column_or_1d(y, warn=True)


[CV]  learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=200, total=   1.6s
[CV] learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=200 


  y = column_or_1d(y, warn=True)


[CV]  learning_rate=0.05, loss=huber, max_depth=5, min_samples_split=10, n_estimators=200, total=   2.0s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    8.3s finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=7,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'loss': ['huber'], 'learning_rate': [0.05], 'n_estimators': [100, 200], 'min_samples_split': [10], 'max_depth': [5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [59]:
# Winning combination from the reduced grid.
clf2.best_params_

{'learning_rate': 0.05,
 'loss': 'huber',
 'max_depth': 5,
 'min_samples_split': 10,
 'n_estimators': 200}

In [61]:
# clf2 was fitted on X_train only, so this is a held-out MAE for the cols8 features.
print(mean_absolute_error(y_test, clf2.best_estimator_.predict(X_test)))

3.43853717343


### Simple Regression Model on important features

### Benchmark from the timeseries model

In [24]:
# Time-series model output, inner-joined with the prices so that only timestamps
# present in both tables are kept.
bottleneck_timeseries = (
    pd.read_csv('./bottleneck_features/bnf_timeseries.csv', index_col=0, parse_dates=True)
    .join(price_data, how='inner')
)

In [26]:
# MAE of the raw time-series prediction against the realised price. Arguments are given
# in the conventional (y_true, y_pred) roles; the metric is symmetric, so the value is
# unchanged.
print(mean_absolute_error(y_true=bottleneck_timeseries.DA_price,
                          y_pred=bottleneck_timeseries.timeseries_pred))

5.5289360522


### Benchmark regression with perfect information

In [34]:
# NOTE(review): this cell expects `actual` to contain a 'DA_price' column, but the
# visible cells never join prices onto `actual` (features.columns, built from it,
# lists no 'DA_price'). It appears to rely on state from another session; confirm.
regr = LinearRegression()

# `.values` replaces the deprecated DataFrame.as_matrix(); the matrices are built once
# and reused for fit, predict and scoring.
X_actual = actual.drop('DA_price', axis=1).values
y_actual = actual['DA_price'].values

regr.fit(X_actual, y_actual)

# Scored on the same rows used for fitting, so this is an in-sample MAE.
pred = regr.predict(X_actual)

print(mean_absolute_error(y_actual, pred))

3.85435578932


In [30]:
# NOTE(review): same code as the cell above, yet the recorded output differs
# (4.56 vs 3.85), so `actual` presumably held different data when this was executed.
# The 'DA_price' column it needs is not added to `actual` anywhere in the visible cells.
regr = LinearRegression()

# `.values` replaces the deprecated DataFrame.as_matrix().
X_actual = actual.drop('DA_price', axis=1).values
y_actual = actual['DA_price'].values

regr.fit(X_actual, y_actual)

# In-sample prediction: the model is scored on the rows it was fitted on.
pred = regr.predict(X_actual)

print(mean_absolute_error(y_actual, pred))

4.56425788451


### Using forecast inputs

In [31]:
# NOTE(review): expects a 'DA_price' column on `forecast`; the visible cells never add
# one, so this presumably depends on state from another session. Confirm before re-running.
regr = LinearRegression()

# `.values` replaces the deprecated DataFrame.as_matrix().
X_forecast = forecast.drop('DA_price', axis=1).values
y_forecast = forecast['DA_price'].values

regr.fit(X_forecast, y_forecast)

# In-sample prediction: scored on the same rows used for fitting.
pred = regr.predict(X_forecast)

print(mean_absolute_error(y_forecast, pred))

4.99793768595


### Using 'predictable' inputs

In [76]:
# NOTE(review): `train` and `test` are not defined in the visible cells; presumably
# they come from an earlier session. Confirm their origin before re-running.
actual_input_cols = ['load_true', 'solar', 'wind_offshore', 'wind_onshore',
                     'bottleneck_features']

features_train = train[actual_input_cols]
labels_train = train[['DA_price']]
features_test = test[actual_input_cols]

In [78]:
# Fit on the training rows and report MAE on the separate test rows.
regr = LinearRegression()

# `.values` replaces the deprecated DataFrame.as_matrix().
regr.fit(features_train.values, labels_train.values)

pred = regr.predict(features_test)

print(mean_absolute_error(test.DA_price, pred))

4.98114110312


In [72]:
# NOTE(review): `train_sub` and `test_sub` are not defined in the visible cells;
# presumably they come from an earlier session. Confirm their origin before re-running.
forecast_input_cols = ['load_forecast', 'solar_forecast', 'offshore_forecast',
                       'onshore_forecast', 'bottleneck_features']

features_train = train_sub[forecast_input_cols]
labels_train = train_sub[['DA_price']]
features_test = test_sub[forecast_input_cols]

In [74]:
# Fit on the forecast-based training rows and report MAE on the matching test rows.
regr = LinearRegression()

# `.values` replaces the deprecated DataFrame.as_matrix().
regr.fit(features_train.values, labels_train.values)

pred = regr.predict(features_test)

print(mean_absolute_error(test_sub.DA_price, pred))

4.72861605
