In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import datetime
import warnings
%matplotlib inline
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [2]:
def get_clean_data(df_original):
    weekday_dict = {
        0:'Wd', 1:'Wd', 2:'Wd', 3:'Wd', 4:'Wd', 5:'F', 6:'F' 
    }
    
    df = df_original.copy(deep=True)
    df['date'] =  pd.to_datetime(df['datetime'].apply(lambda x: x[:10]), format='%Y-%m-%d')
    df['year'] = df['date'].dt.year
    df['time'] = pd.to_datetime(df['datetime'].apply(lambda x: x[:19])).dt.time
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['hour'] = df['datetime'].apply(lambda x: x[11:13]).astype(int)
    df['minute'] = df['datetime'].apply(lambda x: x[14:16]).astype(int)
    df['weekday'] = df['date'].dt.dayofweek
    df.replace({'weekday':weekday_dict}, inplace=True)
    df['season'] = np.where(df['month'].isin(list(range(4,10))), 'summer', 'winter')
    df['date_hour'] = df.apply(lambda x: datetime.datetime.combine(x['date'], x['time']), axis=1)
    df.set_index('date_hour', inplace=True)
    df = df[df.index < '2017']
    clean_df = df[['date', 'year', 'month', 'season', 'day','weekday','time', 'hour', 'minute', 'value']]
    clean_df = clean_df[~clean_df.index.duplicated()]
    clean_df['hour'] = np.where(clean_df['hour'].isin(np.arange(9,23)), 'Peak', 'off_peak')
    #clean_df['value'] = np.log1p(clean_df['value'])
    clean_df_freq = clean_df.asfreq('H')
    
    return clean_df_freq

### df with secondary band, spot price and demand 

In [3]:
raw_band_price = pd.read_csv('export_PrecioBandaDeRegulaciónSecundaria_2017-01-15_19-11.csv', encoding='latin1', delimiter=';')
band_price = get_clean_data(raw_band_price)
band_price = band_price.rename(columns={'value':'band'})

In [4]:
raw_spot_price = pd.read_csv('export_PrecioMercadoSPOTDiario_2017-02-02_09-47.csv', encoding='latin1', delimiter=';')
raw_spot_price = raw_spot_price[raw_spot_price['geoid'] == 3]
spot_price = get_clean_data(raw_spot_price)
spot_price = spot_price.rename(columns={'value':'spot'})

In [5]:
raw_demand = pd.read_csv('export_DemandaProgramada_2017-02-02_09-45.csv', encoding='latin1', delimiter=';')
demand = get_clean_data(raw_demand)
demand = demand.rename(columns={'value':'demand'})

In [6]:
spot_band_demand = band_price.merge(spot_price[['spot']], how='left', left_index=True, right_index=True)
spot_band_demand = spot_band_demand.merge(demand[['demand']], how='left', left_index=True, right_index=True)

In [7]:
spot_band_demand = spot_band_demand[['hour', 'weekday', 'season', 'spot','demand', 'band']]

In [8]:
for feature in ['hour', 'weekday', 'season']:
    spot_band_demand[pd.get_dummies(spot_band_demand[feature], drop_first=True).columns.tolist()] = pd.get_dummies(spot_band_demand[feature], drop_first=True)

In [9]:
for feature in ['demand', 'spot']:
    spot_band_demand[feature + '-1'] = spot_band_demand[feature].shift(periods=+1)
    spot_band_demand[feature + '-24'] = spot_band_demand[feature].shift(periods=+24)
spot_band_demand['band-24'] = spot_band_demand['band'].shift(periods=24)
spot_band_demand.dropna(subset=['demand-1', 'spot-1', 'demand-24', 'spot-24', 'band-24'], inplace=True)

### Preprocessing for Decision Tree

In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import tree as tree_m
from sklearn.tree import export_graphviz
import graphviz

In [11]:
#Set features and output
X = spot_band_demand.drop(labels=['hour', 'weekday','season', 'band'], axis=1).values
Y = spot_band_demand['band'].values

In [12]:
spot_band_demand.drop(labels=['hour', 'weekday','season', 'band'], axis=1).corr()

Unnamed: 0,spot,demand,off_peak,Wd,winter,demand-1,demand-24,spot-1,spot-24,band-24
spot,1.0,0.439271,-0.2782497,0.2362048,-0.03455513,0.404751,0.329684,0.966815,0.77522,-0.293587
demand,0.439271,1.0,-0.6632193,0.391155,0.0820222,0.950713,0.811719,0.418458,0.330222,-0.304035
off_peak,-0.27825,-0.663219,1.0,8.301321e-17,3.785815e-17,-0.630145,-0.662662,-0.259829,-0.27738,0.298587
Wd,0.236205,0.391155,8.301321e-17,1.0,-0.003754593,0.386605,0.060692,0.234965,0.021821,-0.031532
winter,-0.034555,0.082022,3.785815e-17,-0.003754593,1.0,0.081945,0.081045,-0.034386,-0.029929,0.135055
demand-1,0.404751,0.950713,-0.630145,0.3866047,0.08194535,1.0,0.772684,0.439311,0.303557,-0.281814
demand-24,0.329684,0.811719,-0.6626618,0.06069245,0.08104539,0.772684,1.0,0.31328,0.440859,-0.40291
spot-1,0.966815,0.418458,-0.259829,0.2349651,-0.03438614,0.439311,0.31328,1.0,0.761822,-0.281421
spot-24,0.77522,0.330222,-0.2773799,0.02182063,-0.02992868,0.303557,0.440859,0.761822,1.0,-0.481855
band-24,-0.293587,-0.304035,0.2985872,-0.03153192,0.1350551,-0.281814,-0.40291,-0.281421,-0.481855,1.0


In [13]:
features_list = spot_band_demand.drop(labels=['hour', 'weekday','season', 'band'], axis=1).columns.tolist()

In [14]:
#Split data set into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, 
                                                    test_size=0.2, random_state=1)

### Randomized Search CV

In [15]:
params = {
    'criterion':['mse'],
    'splitter':['best','random'],
    'max_features':['auto','log2', 'sqrt'],
    'max_depth':np.arange(2,10), 'min_samples_split':np.linspace(50, 1000, 200).astype(int),
    'min_samples_leaf': np.linspace(200, 2500, 50).astype(int), 'max_leaf_nodes': np.linspace(10,180, 40).astype(int)
}

In [16]:
#Random search, set up tree and cross-validation
tree = DecisionTreeRegressor()
kf = KFold(n_splits=5)

#Randomized Search CV for tree
random_search = RandomizedSearchCV(tree, param_distributions=params, n_jobs=-1, n_iter=350,
                                scoring='neg_mean_squared_error', cv=list(kf.split(x_train,y_train)), random_state=0)

In [17]:
random_search.fit(x_train, y_train);

In [18]:
best_params = random_search.cv_results_['params'][np.flatnonzero(random_search.cv_results_['rank_test_score']==1)[0]]
print(best_params)

{'max_leaf_nodes': 140, 'splitter': 'best', 'max_features': 'auto', 'min_samples_leaf': 293, 'max_depth': 6, 'min_samples_split': 408, 'criterion': 'mse'}


In [19]:
tree.set_params(**best_params, random_state=0)

DecisionTreeRegressor(criterion='mse', max_depth=6, max_features='auto',
           max_leaf_nodes=140, min_impurity_split=1e-07,
           min_samples_leaf=293, min_samples_split=408,
           min_weight_fraction_leaf=0.0, presort=False, random_state=0,
           splitter='best')

In [20]:
#CV with kfold=5
kf = KFold(n_splits=5, random_state=0)

CV_mse = list()
CV_mae = list()

for train_index, test_index in kf.split(x_train, y_train):
    tree.fit(x_train[train_index], y_train[train_index])
    y_pred = tree.predict(x_train[test_index])
    CV_mae.append(mean_absolute_error(y_train[test_index], y_pred))
    CV_mse.append(mean_squared_error(y_train[test_index], y_pred))
    
print('Mean absolute error: %0.4f +- %0.4f' %(np.mean(CV_mae), 2*np.std(CV_mae)))
print('Mean squared error: %0.4f +- %0.4f' %(np.mean(CV_mse), 2*np.std(CV_mse)))

Mean absolute error: 5.6245 +- 0.1562
Mean squared error: 70.6660 +- 7.2539


In [21]:
CV_mae, CV_mse

([5.6483176367792698,
  5.6516878706406395,
  5.6524523246030265,
  5.4724808518472141,
  5.6973343740178857],
 [73.044985030447634,
  73.73185483890316,
  68.649118572286881,
  64.436846099798984,
  73.46695041432325])

In [22]:
tree.fit(x_train, y_train)
print('MAE: ' +str(mean_absolute_error(y_train, tree.predict(x_train)))+', MSE: ' +
      str(mean_squared_error(y_train, tree.predict(x_train))))

MAE: 5.4443461512, MSE: 66.5193109854


In [23]:
#Print test error and feature importance
print('MAE: ' +str(mean_absolute_error(y_test, tree.predict(x_test)))+', MSE: ' +
      str(mean_squared_error(y_test, tree.predict(x_test))))
pd.DataFrame(data=tree.feature_importances_, index=features_list)

MAE: 5.64976678096, MSE: 67.2140913528


Unnamed: 0,0
spot,0.257636
demand,0.041164
off_peak,0.002279
Wd,0.025763
winter,0.0
demand-1,0.0
demand-24,0.012536
spot-1,0.0
spot-24,0.068105
band-24,0.592516


In [None]:
#Plot tree with graphviz
export_graphviz(tree, out_file="mytree.dot", max_depth=None, feature_names=features_list, label='none',
               filled=True, leaves_parallel=False, impurity=False, node_ids=False, proportion=False, rotate=False,
               rounded=True, special_characters=False)
with open("mytree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

In [None]:
#Convert tree.dot into pdf
#graphviz.render(engine='dot', format='pdf', filepath='C:\\Users\\Usuario\\Desktop\\Master_Thesis\\Secondary_reserve_TS\\mytree.dot')

### Random forest Regressor

In [24]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
rfr = RandomForestRegressor()

In [26]:
params_forest = {
    'n_estimators':np.linspace(5, 200, 50).astype(int),
    'n_jobs':[-1],
    'criterion':['mse'], 'bootstrap':[True],
    'max_features':['auto','log2', 'sqrt'],
    'max_depth':np.arange(2,10), 'min_samples_split':np.linspace(50, 1000, 200).astype(int),
    'min_samples_leaf': np.linspace(200, 2500, 50).astype(int), 'max_leaf_nodes': np.linspace(10,180, 40).astype(int)
}

In [27]:
kf = KFold(n_splits=5, random_state=0)

#Randomized Search CV for random forest
random_search = RandomizedSearchCV(rfr, param_distributions=params_forest, n_jobs=-1, n_iter=350,
                                scoring='neg_mean_squared_error', cv=list(kf.split(x_train,y_train)), random_state=0)

In [28]:
%%time
random_search.fit(x_train, y_train);

Wall time: 9min 54s


RandomizedSearchCV(cv=[(array([ 4205,  4206, ..., 21022, 21023]), array([   0,    1, ..., 4203, 4204])), (array([    0,     1, ..., 21022, 21023]), array([4205, 4206, ..., 8408, 8409])), (array([    0,     1, ..., 21022, 21023]), array([ 8410,  8411, ..., 12613, 12614])), (array([    0,     1, ..., 21022, 21023]), array([12615, 12616, ..., 16818, 16819])), (array([    0,     1, ..., 16818, 16819]), array([16820, 16821, ..., 21022, 21023]))],
          error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=350, n_jobs=-1,
          param_distributions={'max_leaf_nodes': array([ 10,  14,  18,  23,  27,  31,  

In [29]:
best_params = random_search.cv_results_['params'][np.flatnonzero(random_search.cv_results_['rank_test_score']==1)[0]]
print(best_params)

{'max_leaf_nodes': 149, 'n_estimators': 104, 'max_features': 'auto', 'min_samples_leaf': 246, 'max_depth': 5, 'criterion': 'mse', 'n_jobs': -1, 'min_samples_split': 350, 'bootstrap': True}


In [30]:
rfr.set_params(**best_params, random_state=0)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=149,
           min_impurity_split=1e-07, min_samples_leaf=246,
           min_samples_split=350, min_weight_fraction_leaf=0.0,
           n_estimators=104, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [31]:
#CV with kfold=5
kf = KFold(n_splits=5, random_state=0)

CV_mse = list()
CV_mae = list()

for train_index, test_index in kf.split(x_train, y_train):
    rfr.fit(x_train[train_index], y_train[train_index])
    y_pred = rfr.predict(x_train[test_index])
    CV_mae.append(mean_absolute_error(y_train[test_index], y_pred))
    CV_mse.append(mean_squared_error(y_train[test_index], y_pred))
    
print('Mean absolute error: %0.4f +- %0.4f' %(np.mean(CV_mae), 2*np.std(CV_mae)))
print('Mean squared error: %0.4f +- %0.4f' %(np.mean(CV_mse), 2*np.std(CV_mse)))

Mean absolute error: 5.4739 +- 0.1651
Mean squared error: 67.3838 +- 7.8926


In [32]:
CV_mae, CV_mse

([5.4726774861400829,
  5.5215645700447498,
  5.4848111785928646,
  5.3227372319662898,
  5.5677634135732141],
 [69.395101258852904,
  71.169284744034428,
  63.797995491156577,
  61.577724703944952,
  70.978659644128783])

In [33]:
rfr.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=149,
           min_impurity_split=1e-07, min_samples_leaf=246,
           min_samples_split=350, min_weight_fraction_leaf=0.0,
           n_estimators=104, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [34]:
print('Training MAE: ' +str(mean_absolute_error(y_train, rfr.predict(x_train)))+', Training MSE: ' +
      str(mean_squared_error(y_train, rfr.predict(x_train))))

Training MAE: 5.32773830882, Training MSE: 63.7361267729


In [35]:
#Print test error and feature importance
print('MAE: ' +str(mean_absolute_error(y_test, rfr.predict(x_test)))+', MSE: ' +
      str(mean_squared_error(y_test, rfr.predict(x_test))))
pd.DataFrame(data=rfr.feature_importances_, index=features_list)

MAE: 5.49424270586, MSE: 62.9732373884


Unnamed: 0,0
spot,0.234905
demand,0.055386
off_peak,0.000537
Wd,0.023064
winter,0.000131
demand-1,0.000225
demand-24,0.013748
spot-1,0.01336
spot-24,0.050829
band-24,0.607816


In [None]:
export_graphviz(rfr.estimators_[12], out_file="0.dot", max_depth=None, feature_names=features_list, label='none',
               filled=True, leaves_parallel=False, impurity=False, node_ids=False, proportion=False, rotate=False,
               rounded=True, special_characters=False)
with open("0.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

In [46]:
# Customised random forest, this is the final model
rfr = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=150,
           min_impurity_split=1e-07, min_samples_leaf=250,
           min_samples_split=500, min_weight_fraction_leaf=0.0,
           n_estimators=120, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [47]:
rfr.fit(x_train, y_train)
#CV with kfold=5
kf = KFold(n_splits=5, random_state=0)

CV_mse = list()
CV_mae = list()

for train_index, test_index in kf.split(x_train, y_train):
    rfr.fit(x_train[train_index], y_train[train_index])
    y_pred = rfr.predict(x_train[test_index])
    CV_mae.append(mean_absolute_error(y_train[test_index], y_pred))
    CV_mse.append(mean_squared_error(y_train[test_index], y_pred))
    
print('Mean absolute error: %0.4f +- %0.4f' %(np.mean(CV_mae), 2*np.std(CV_mae)))
print('Mean squared error: %0.4f +- %0.4f' %(np.mean(CV_mse), 2*np.std(CV_mse)))

Mean absolute error: 5.4830 +- 0.1651
Mean squared error: 67.5827 +- 7.8584


In [48]:
CV_mae, CV_mse

([5.4826489746880469,
  5.5281288971704861,
  5.493963013059604,
  5.3319063831053208,
  5.578293252628189],
 [69.563844884928187,
  71.285205383970947,
  64.022371862281034,
  61.797239773952853,
  71.244932156114672])

In [49]:
rfr.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=150,
           min_impurity_split=1e-07, min_samples_leaf=250,
           min_samples_split=500, min_weight_fraction_leaf=0.0,
           n_estimators=120, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [50]:
print('Training MAE: ' +str(mean_absolute_error(y_train, rfr.predict(x_train)))+', Training MSE: ' +
      str(mean_squared_error(y_train, rfr.predict(x_train))))

Training MAE: 5.33193635007, Training MSE: 63.8359932336


In [51]:
print('MAE: ' +str(mean_absolute_error(y_test, rfr.predict(x_test)))+', MSE: ' +
      str(mean_squared_error(y_test, rfr.predict(x_test))))
pd.DataFrame(data=rfr.feature_importances_, index=features_list)

MAE: 5.49698288024, MSE: 63.042333512


Unnamed: 0,0
spot,0.236223
demand,0.055719
off_peak,0.000657
Wd,0.022769
winter,0.000117
demand-1,0.000242
demand-24,0.013768
spot-1,0.012356
spot-24,0.050332
band-24,0.607816


In [57]:
#Transform y_true and y_pred from last model into log scale for the residual fitting
rfr.fit(X, Y)
Y_log = np.log1p(Y)
Y_pred_log = np.log1p(rfr.predict(X))
residuals = Y_log - Y_pred_log
res_index = spot_band_demand.drop(labels=['hour', 'weekday','season', 'band'], axis=1).index
residuals_random_forest = pd.DataFrame(data=residuals, index=res_index, columns=['residuals'])
residuals_random_forest.to_csv('Residuals_from_random_forest.csv')