In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import datetime
import warnings
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' matplotlib style sheet to every figure in this notebook.
plt.style.use('ggplot')
# NOTE(review): this silences every Python warning for the rest of the session,
# including deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
def get_clean_data(df_original):
    """Turn a raw export into an hourly, log-transformed feature frame.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Raw export with a string column ``'datetime'`` and a numeric column
        ``'value'``. The timestamp is read by character position only:
        ``[:10]`` is the date (``%Y-%m-%d``), ``[11:13]`` the hour,
        ``[14:16]`` the minute and ``[:19]`` the full timestamp; anything
        after position 19 is ignored. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'date_hour'`` at hourly frequency, with the columns
        ``date, year, month, season, day, weekday, time, hour, minute, value``:

        * rows stamped 2017-01-01 00:00 or later are dropped;
        * for duplicated timestamps only the first row is kept;
        * ``weekday`` is ``'Wd'`` for Monday-Friday and ``'F'`` for
          Saturday/Sunday;
        * ``season`` is ``'summer'`` for months 4-9, otherwise ``'winter'``;
        * ``hour`` is ``'Peak'`` for hours 9-22, otherwise ``'off_peak'``;
        * ``value`` is ``log1p`` of the raw value;
        * hours missing from the input appear as all-NaN rows.
    """
    weekday_dict = {0: 'Wd', 1: 'Wd', 2: 'Wd', 3: 'Wd', 4: 'Wd', 5: 'F', 6: 'F'}
    output_columns = ['date', 'year', 'month', 'season', 'day',
                      'weekday', 'time', 'hour', 'minute', 'value']

    df = df_original.copy(deep=True)
    raw_text = df['datetime']
    # Parsed once; serves both the 'time' column and the index.
    timestamps = pd.to_datetime(raw_text.str[:19])

    df['date'] = pd.to_datetime(raw_text.str[:10], format='%Y-%m-%d')
    df['year'] = df['date'].dt.year
    df['time'] = timestamps.dt.time
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['hour'] = raw_text.str[11:13].astype(int)
    df['minute'] = raw_text.str[14:16].astype(int)
    # Series.map keeps the int -> label conversion explicit and column-local.
    df['weekday'] = df['date'].dt.dayofweek.map(weekday_dict)
    df['season'] = np.where(df['month'].isin(list(range(4, 10))), 'summer', 'winter')

    # date + time is exactly the timestamp parsed above, so it is used directly
    # as the index instead of rebuilding it row by row.
    df = df.set_index(timestamps.rename('date_hour'))
    df = df[df.index < pd.Timestamp('2017-01-01')]

    # .copy() makes the filtered selection an independent frame, so the column
    # assignments below never write into a view of `df`.
    clean_df = df[output_columns]
    clean_df = clean_df[~clean_df.index.duplicated()].copy()
    clean_df['hour'] = np.where(clean_df['hour'].isin(np.arange(9, 23)), 'Peak', 'off_peak')
    clean_df['value'] = np.log1p(clean_df['value'])

    # An offset object instead of the 'H' alias string, which recent pandas
    # deprecates; the resulting hourly grid is the same.
    return clean_df.asfreq(pd.offsets.Hour())

### DataFrame with secondary band price, spot price and demand

In [3]:
# Secondary regulation band price: read the raw export, clean it and
# expose the log-transformed series under the column name 'band'.
raw_band_price = pd.read_csv(
    'export_PrecioBandaDeRegulaciónSecundaria_2017-01-15_19-11.csv',
    sep=';',
    encoding='latin1',
)
band_price = get_clean_data(raw_band_price).rename(columns={'value': 'band'})

In [4]:
# Daily spot market price: read the raw export, keep only the rows whose
# 'geoid' equals 3, clean them and name the series 'spot'.
raw_spot_price = pd.read_csv(
    'export_PrecioMercadoSPOTDiario_2017-02-02_09-47.csv',
    sep=';',
    encoding='latin1',
)
raw_spot_price = raw_spot_price.loc[raw_spot_price['geoid'] == 3]
spot_price = get_clean_data(raw_spot_price).rename(columns={'value': 'spot'})

In [5]:
# Scheduled demand: read the raw export, clean it and name the series 'demand'.
raw_demand = pd.read_csv(
    'export_DemandaProgramada_2017-02-02_09-45.csv',
    sep=';',
    encoding='latin1',
)
demand = get_clean_data(raw_demand).rename(columns={'value': 'demand'})

In [6]:
# Left-join the spot price and the demand onto the band-price frame by their
# shared hourly index; the band-price rows define which timestamps are kept.
spot_band_demand = (
    band_price
    .join(spot_price[['spot']], how='left')
    .join(demand[['demand']], how='left')
)

In [7]:
# Keep the three categorical labels, the two explanatory series and the target.
spot_band_demand = spot_band_demand.loc[
    :, ['hour', 'weekday', 'season', 'spot', 'demand', 'band']
]

In [8]:
# One-hot encode each categorical label; drop_first=True drops the first level,
# leaving a single indicator column per two-level label.
for feature in ['hour', 'weekday', 'season']:
    indicator_frame = pd.get_dummies(spot_band_demand[feature], drop_first=True)
    spot_band_demand[indicator_frame.columns.tolist()] = indicator_frame

In [9]:
# Lagged copies of the explanatory series: 1 row and 24 rows back
# (the frame is hourly, so these are the previous hour and the previous day).
for feature in ['demand', 'spot']:
    for rows_back in (1, 24):
        spot_band_demand['{}-{}'.format(feature, rows_back)] = spot_band_demand[feature].shift(rows_back)
# The target only gets the 24-row lag.
spot_band_demand['band-24'] = spot_band_demand['band'].shift(24)
# Shifting leaves NaN in the leading rows of each lag column; drop those rows.
spot_band_demand = spot_band_demand.dropna(
    subset=['demand-1', 'spot-1', 'demand-24', 'spot-24', 'band-24']
)

### Preprocessing for Decision Tree

In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import tree as tree_m
from sklearn.tree import export_graphviz
import graphviz

In [12]:
# Features: every column except the raw categorical labels and the target.
X = spot_band_demand.drop(['hour', 'weekday', 'season', 'band'], axis='columns').values
# Target: the log-transformed band price.
Y = spot_band_demand.loc[:, 'band'].values

In [13]:
# Pairwise correlation between the model features (same columns as X).
spot_band_demand.drop(['hour', 'weekday', 'season', 'band'], axis='columns').corr()

Unnamed: 0,spot,demand,off_peak,Wd,winter,demand-1,demand-24,spot-1,spot-24,band-24
spot,1.0,0.331452,-0.2107577,0.1954997,-0.1118234,0.3083,0.244857,0.967238,0.7438,-0.295832
demand,0.331452,1.0,-0.6699812,0.3851586,0.0724036,0.950607,0.811914,0.311336,0.24019,-0.353952
off_peak,-0.210758,-0.669981,1.0,8.301321e-17,3.785815e-17,-0.633823,-0.66936,-0.192752,-0.209136,0.335134
Wd,0.1955,0.385159,8.301321e-17,1.0,-0.003754593,0.380531,0.048825,0.194077,0.012664,-0.054205
winter,-0.111823,0.072404,3.785815e-17,-0.003754593,1.0,0.072325,0.071415,-0.111743,-0.110365,0.089186
demand-1,0.3083,0.950607,-0.6338228,0.3805307,0.07232508,1.0,0.773877,0.331478,0.224937,-0.330597
demand-24,0.244857,0.811914,-0.6693597,0.04882523,0.07141492,0.773877,1.0,0.229392,0.333904,-0.452303
spot-1,0.967238,0.311336,-0.1927516,0.1940771,-0.1117432,0.331478,0.229392,1.0,0.736396,-0.286688
spot-24,0.7438,0.24019,-0.2091356,0.01266361,-0.1103646,0.224937,0.333904,0.736396,1.0,-0.436683
band-24,-0.295832,-0.353952,0.3351342,-0.05420461,0.08918638,-0.330597,-0.452303,-0.286688,-0.436683,1.0


In [14]:
# Feature names in the same column order as X; used to label importances and tree plots.
features_list = list(
    spot_band_demand.drop(['hour', 'weekday', 'season', 'band'], axis='columns').columns
)

In [55]:
# Hold out 20% of the rows as a test set; the split is random but reproducible (seed 1).
x_train, x_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.2,
    train_size=0.8,
    random_state=1,
)

### Randomized Search CV

In [16]:
# Candidate values sampled by the randomized search for the decision tree.
params = {
    'criterion': ['mse'],
    'splitter': ['best', 'random'],
    'max_features': ['auto', 'log2', 'sqrt'],
    'max_depth': np.arange(2, 10),
    'min_samples_split': np.linspace(50, 1000, 200).astype(int),
    'min_samples_leaf': np.linspace(200, 2500, 50).astype(int),
    'max_leaf_nodes': np.linspace(10, 180, 40).astype(int),
}

In [17]:
# Base estimator and 5-fold splitter for the search.
tree = DecisionTreeRegressor()
kf = KFold(n_splits=5)

# Randomized search over `params`: 350 sampled settings, scored by negative MSE
# on the pre-computed folds of the training split.
random_search = RandomizedSearchCV(
    tree,
    param_distributions=params,
    n_iter=350,
    scoring='neg_mean_squared_error',
    cv=list(kf.split(x_train, y_train)),
    n_jobs=-1,
    random_state=0,
)

In [18]:
# Run the randomized search on the training split; the trailing ';' suppresses the repr output.
random_search.fit(x_train, y_train);

In [21]:
# Parameter setting of the first candidate ranked 1 by the search.
best_params = random_search.best_params_
print(best_params)

{'min_samples_leaf': 293, 'splitter': 'best', 'max_features': 'auto', 'min_samples_split': 408, 'criterion': 'mse', 'max_leaf_nodes': 140, 'max_depth': 6}


In [22]:
# Configure the tree with the selected setting and a fixed seed.
tree.set_params(random_state=0, **best_params)

DecisionTreeRegressor(criterion='mse', max_depth=6, max_features='auto',
           max_leaf_nodes=140, min_impurity_split=1e-07,
           min_samples_leaf=293, min_samples_split=408,
           min_weight_fraction_leaf=0.0, presort=False, random_state=0,
           splitter='best')

In [23]:
#CV with kfold=5
# KFold does not shuffle unless shuffle=True, so the folds are contiguous and
# deterministic. random_state is therefore omitted: it has no effect without
# shuffling and recent scikit-learn raises ValueError when it is passed.
kf = KFold(n_splits=5)

CV_mse = list()
CV_mae = list()

for train_index, test_index in kf.split(x_train, y_train):
    tree.fit(x_train[train_index], y_train[train_index])
    y_pred = tree.predict(x_train[test_index])
    # The target is log1p-transformed, so np.exp yields value + 1; the constant
    # offset cancels in the errors, which are thus in original price units.
    CV_mae.append(mean_absolute_error(np.exp(y_train[test_index]), np.exp(y_pred)))
    CV_mse.append(mean_squared_error(np.exp(y_train[test_index]), np.exp(y_pred)))

# Mean over folds +- two standard deviations.
print('Mean absolute error: %0.4f +- %0.4f' %(np.mean(CV_mae), 2*np.std(CV_mae)))
print('Mean squared error: %0.4f +- %0.4f' %(np.mean(CV_mse), 2*np.std(CV_mse)))

Mean absolute error: 5.6736 +- 0.1224
Mean squared error: 74.6409 +- 7.3130


In [24]:
# Display the per-fold MAE and MSE lists currently held in CV_mae / CV_mse.
CV_mae, CV_mse

([5.6920443561673677,
  5.7568515011122958,
  5.652656678001561,
  5.6955584492812781,
  5.5710125094500595],
 [73.198639443948068,
  78.06729186196533,
  69.412998328704404,
  79.459508584056422,
  73.066234894199425])

In [25]:
# Refit the tree on the whole training split and report its in-sample error
# on the exponentiated scale.
tree.fit(x_train, y_train)
tree_train_true = np.exp(y_train)
tree_train_pred = np.exp(tree.predict(x_train))
print('MAE: ' + str(mean_absolute_error(tree_train_true, tree_train_pred)) + ', MSE: ' +
      str(mean_squared_error(tree_train_true, tree_train_pred)))

MAE: 5.4608076777, MSE: 69.8308737933


In [26]:
# Held-out test error on the exponentiated scale, followed by the
# feature importances of the fitted tree labelled with the feature names.
tree_test_true = np.exp(y_test)
tree_test_pred = np.exp(tree.predict(x_test))
print('MAE: ' + str(mean_absolute_error(tree_test_true, tree_test_pred)) + ', MSE: ' +
      str(mean_squared_error(tree_test_true, tree_test_pred)))
pd.DataFrame(tree.feature_importances_, index=features_list)

MAE: 5.45480276258, MSE: 69.0471404626


Unnamed: 0,0
spot,0.127965
demand,0.102587
off_peak,0.0
Wd,0.043122
winter,0.0
demand-1,0.0
demand-24,0.016356
spot-1,0.002283
spot-24,0.042762
band-24,0.664925


In [None]:
# Write the fitted tree to a DOT file, read it back and render it inline.
export_graphviz(
    tree,
    out_file="mytree.dot",
    max_depth=None,
    feature_names=features_list,
    label='none',
    filled=True,
    leaves_parallel=False,
    impurity=False,
    node_ids=False,
    proportion=False,
    rotate=False,
    rounded=True,
    special_characters=False,
)
with open("mytree.dot") as dot_file:
    dot_graph = dot_file.read()
graphviz.Source(dot_graph)

In [None]:
# Convert mytree.dot into a PDF.
# NOTE(review): left disabled; the call hard-codes an absolute, machine-specific
# Windows path and would need a relative path before it can be re-enabled.
#graphviz.render(engine='dot', format='pdf', filepath='C:\\Users\\Usuario\\Desktop\\Master_Thesis\\Secondary_reserve_TS\\mytree.dot')

### Random Forest Regressor

In [27]:
from sklearn.ensemble import RandomForestRegressor

In [28]:
# Random forest with default hyper-parameters; it is the base estimator of the
# randomized search below and is configured afterwards through set_params.
rfr = RandomForestRegressor()

In [29]:
# Candidate values sampled by the randomized search for the random forest.
params_forest = {
    'n_estimators': np.linspace(5, 200, 50).astype(int),
    'n_jobs': [-1],
    'criterion': ['mse'],
    'bootstrap': [True],
    'max_features': ['auto', 'log2', 'sqrt'],
    'max_depth': np.arange(2, 10),
    'min_samples_split': np.linspace(50, 1000, 200).astype(int),
    'min_samples_leaf': np.linspace(200, 2500, 50).astype(int),
    'max_leaf_nodes': np.linspace(10, 180, 40).astype(int),
}

In [30]:
# Unshuffled 5-fold splitter. random_state is omitted because it has no effect
# without shuffle=True and recent scikit-learn raises ValueError when it is
# passed; the resulting folds are unchanged.
kf = KFold(n_splits=5)

#Randomized Search CV for random forest
# 350 sampled settings from `params_forest`, scored by negative MSE on the
# pre-computed folds of the training split.
random_search = RandomizedSearchCV(rfr, param_distributions=params_forest, n_jobs=-1, n_iter=350,
                                scoring='neg_mean_squared_error', cv=list(kf.split(x_train,y_train)), random_state=0)

In [31]:
%%time
# Run the randomized search for the random forest on the training split and
# report the wall time of the cell.
random_search.fit(x_train, y_train);

Wall time: 10min 19s


RandomizedSearchCV(cv=[(array([ 4205,  4206, ..., 21022, 21023]), array([   0,    1, ..., 4203, 4204])), (array([    0,     1, ..., 21022, 21023]), array([4205, 4206, ..., 8408, 8409])), (array([    0,     1, ..., 21022, 21023]), array([ 8410,  8411, ..., 12613, 12614])), (array([    0,     1, ..., 21022, 21023]), array([12615, 12616, ..., 16818, 16819])), (array([    0,     1, ..., 16818, 16819]), array([16820, 16821, ..., 21022, 21023]))],
          error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=350, n_jobs=-1,
          param_distributions={'min_samples_leaf': array([ 200,  246,  293,  340,  387,

In [32]:
# Parameter setting of the first candidate ranked 1 by the forest search.
best_params = random_search.best_params_
print(best_params)

{'min_samples_leaf': 246, 'max_features': 'auto', 'min_samples_split': 350, 'n_jobs': -1, 'n_estimators': 104, 'max_leaf_nodes': 149, 'criterion': 'mse', 'bootstrap': True, 'max_depth': 5}


In [33]:
# Configure the forest with the selected setting and a fixed seed.
rfr.set_params(random_state=0, **best_params)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=149,
           min_impurity_split=1e-07, min_samples_leaf=246,
           min_samples_split=350, min_weight_fraction_leaf=0.0,
           n_estimators=104, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [39]:
#CV with kfold=5
# KFold does not shuffle unless shuffle=True, so the folds are contiguous and
# deterministic. random_state is therefore omitted: it has no effect without
# shuffling and recent scikit-learn raises ValueError when it is passed.
kf = KFold(n_splits=5)

CV_mse = list()
CV_mae = list()

for train_index, test_index in kf.split(x_train, y_train):
    rfr.fit(x_train[train_index], y_train[train_index])
    y_pred = rfr.predict(x_train[test_index])
    # The target is log1p-transformed, so np.exp yields value + 1; the constant
    # offset cancels in the errors, which are thus in original price units.
    CV_mae.append(mean_absolute_error(np.exp(y_train[test_index]), np.exp(y_pred)))
    CV_mse.append(mean_squared_error(np.exp(y_train[test_index]), np.exp(y_pred)))

# Mean over folds +- two standard deviations.
print('Mean absolute error: %0.4f +- %0.4f' %(np.mean(CV_mae), 2*np.std(CV_mae)))
print('Mean squared error: %0.4f +- %0.4f' %(np.mean(CV_mse), 2*np.std(CV_mse)))

Mean absolute error: 5.4841 +- 0.1262
Mean squared error: 71.7650 +- 8.0756


In [40]:
# Display the per-fold MAE and MSE lists currently held in CV_mae / CV_mse.
CV_mae, CV_mse

([5.4252327363260031,
  5.5584097078655006,
  5.4504525723945436,
  5.5625307562456312,
  5.4238679150526448],
 [67.83331129191798,
  74.280080064609237,
  66.981019322891441,
  77.833964937248254,
  71.896860910513013])

In [41]:
# Refit the forest on the whole training split; a cross-validation loop leaves
# the estimator fitted on its last fold only.
rfr.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=149,
           min_impurity_split=1e-07, min_samples_leaf=246,
           min_samples_split=350, min_weight_fraction_leaf=0.0,
           n_estimators=104, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [42]:
# In-sample error of the forest on the exponentiated scale.
forest_train_true = np.exp(y_train)
forest_train_pred = np.exp(rfr.predict(x_train))
print('Training MAE: ' + str(mean_absolute_error(forest_train_true, forest_train_pred)) +
      ', Training MSE: ' + str(mean_squared_error(forest_train_true, forest_train_pred)))

Training MAE: 5.36168003756, Training MSE: 68.4824772971


In [38]:
# Held-out test error on the exponentiated scale, followed by the
# feature importances of the fitted forest labelled with the feature names.
forest_test_true = np.exp(y_test)
forest_test_pred = np.exp(rfr.predict(x_test))
print('MAE: ' + str(mean_absolute_error(forest_test_true, forest_test_pred)) + ', MSE: ' +
      str(mean_squared_error(forest_test_true, forest_test_pred)))
pd.DataFrame(rfr.feature_importances_, index=features_list)

MAE: 5.29036110853, MSE: 66.8473291781


Unnamed: 0,0
spot,0.148761
demand,0.091712
off_peak,1e-05
Wd,0.033322
winter,4.4e-05
demand-1,0.000353
demand-24,0.016233
spot-1,0.010587
spot-24,0.0332
band-24,0.665779


In [None]:
# Write one tree of the fitted forest (index 12) to a DOT file, read it back
# and render it inline.
export_graphviz(
    rfr.estimators_[12],
    out_file="0.dot",
    max_depth=None,
    feature_names=features_list,
    label='none',
    filled=True,
    leaves_parallel=False,
    impurity=False,
    node_ids=False,
    proportion=False,
    rotate=False,
    rounded=True,
    special_characters=False,
)
with open("0.dot") as dot_file:
    dot_graph = dot_file.read()
graphviz.Source(dot_graph)

In [56]:
# Customised random forest, this is the final model
# Only the hyper-parameters that differ from a freshly constructed
# RandomForestRegressor are passed. The arguments dropped here (bootstrap,
# criterion, max_features, min_impurity_split, min_weight_fraction_leaf,
# oob_score, verbose, warm_start) merely restated the estimator defaults as
# printed in its repr; min_impurity_split in particular is deprecated and
# rejected by newer scikit-learn releases.
rfr = RandomForestRegressor(n_estimators=100, max_depth=5, max_leaf_nodes=150,
                            min_samples_leaf=250, min_samples_split=500,
                            n_jobs=-1, random_state=0)

In [57]:
#CV with kfold=5
# No fit on the full training split is done up front: every fold below refits
# the estimator from scratch, so such a fit would be discarded immediately.
# KFold does not shuffle unless shuffle=True, so the folds are contiguous and
# deterministic. random_state is therefore omitted: it has no effect without
# shuffling and recent scikit-learn raises ValueError when it is passed.
kf = KFold(n_splits=5)

CV_mse = list()
CV_mae = list()

for train_index, test_index in kf.split(x_train, y_train):
    rfr.fit(x_train[train_index], y_train[train_index])
    y_pred = rfr.predict(x_train[test_index])
    # The target is log1p-transformed, so np.exp yields value + 1; the constant
    # offset cancels in the errors, which are thus in original price units.
    CV_mae.append(mean_absolute_error(np.exp(y_train[test_index]), np.exp(y_pred)))
    CV_mse.append(mean_squared_error(np.exp(y_train[test_index]), np.exp(y_pred)))

# Mean over folds +- two standard deviations.
print('Mean absolute error: %0.4f +- %0.4f' %(np.mean(CV_mae), 2*np.std(CV_mae)))
print('Mean squared error: %0.4f +- %0.4f' %(np.mean(CV_mse), 2*np.std(CV_mse)))

Mean absolute error: 5.4495 +- 0.1220
Mean squared error: 71.6048 +- 6.4518


In [58]:
# Display the per-fold MAE and MSE lists currently held in CV_mae / CV_mse.
CV_mae, CV_mse

([5.4781733744442844,
  5.3826629287152663,
  5.4504303779802408,
  5.3885714917906915,
  5.5475508906005571],
 [74.030576720959331,
  72.729519789271833,
  68.621324143623909,
  67.089329293329243,
  75.553460691601856])

In [59]:
# Refit the final forest on the whole training split; the cross-validation loop
# leaves the estimator fitted on its last fold only.
rfr.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=150,
           min_impurity_split=1e-07, min_samples_leaf=250,
           min_samples_split=500, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [60]:
# In-sample error of the final forest on the exponentiated scale.
final_train_true = np.exp(y_train)
final_train_pred = np.exp(rfr.predict(x_train))
print('Training MAE: ' + str(mean_absolute_error(final_train_true, final_train_pred)) +
      ', Training MSE: ' + str(mean_squared_error(final_train_true, final_train_pred)))

Training MAE: 5.34723842403, Training MSE: 68.6166279102


In [61]:
# Held-out test error of the final forest on the exponentiated scale,
# followed by its feature importances labelled with the feature names.
final_test_true = np.exp(y_test)
final_test_pred = np.exp(rfr.predict(x_test))
print('MAE: ' + str(mean_absolute_error(final_test_true, final_test_pred)) + ', MSE: ' +
      str(mean_squared_error(final_test_true, final_test_pred)))
pd.DataFrame(rfr.feature_importances_, index=features_list)

MAE: 5.50882564202, MSE: 67.013501059


Unnamed: 0,0
spot,0.173307
demand,0.076485
off_peak,0.0
Wd,0.02899
winter,0.0
demand-1,0.001642
demand-24,0.015751
spot-1,0.009788
spot-24,0.037599
band-24,0.656438


In [62]:
# Fit the final forest on every available row and take the in-sample residuals.
# Y is the log-transformed band price, so the residuals are on that log scale.
rfr.fit(X, Y)
residuals = Y - rfr.predict(X)
# Dropping columns never changes the row index, so the frame's own index
# already lines up with the rows of X and Y.
res_index = spot_band_demand.index

In [63]:
# Store the residuals against their timestamps and export them as CSV.
residuals_random_forest = pd.DataFrame({'residuals': residuals}, index=res_index)
residuals_random_forest.to_csv('Residuals_from_random_forest.csv')