In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import datetime
import warnings
%matplotlib inline
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [220]:
def get_clean_data(df_original):
    weekday_dict = {
        0:'Wd', 1:'Wd', 2:'Wd', 3:'Wd', 4:'Wd', 5:'F', 6:'F' 
    }
    
    df = df_original.copy(deep=True)
    df['date'] =  pd.to_datetime(df['datetime'].apply(lambda x: x[:10]), format='%Y-%m-%d')
    df['year'] = df['date'].dt.year
    df['time'] = pd.to_datetime(df['datetime'].apply(lambda x: x[:19])).dt.time
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['hour'] = df['datetime'].apply(lambda x: x[11:13]).astype(int)
    df['minute'] = df['datetime'].apply(lambda x: x[14:16]).astype(int)
    df['weekday'] = df['date'].dt.dayofweek
    df.replace({'weekday':weekday_dict}, inplace=True)
    df['season'] = np.where(df['month'].isin(list(range(4,10))), 'summer', 'winter')
    df['date_hour'] = df.apply(lambda x: datetime.datetime.combine(x['date'], x['time']), axis=1)
    df.set_index('date_hour', inplace=True)
    df = df[df.index < '2017']
    clean_df = df[['date', 'year', 'month', 'season', 'day','weekday','time', 'hour', 'minute', 'value']]
    clean_df = clean_df[~clean_df.index.duplicated()]
    
    return clean_df

### df with secondary band and spot price 

In [221]:
raw_band_price = pd.read_csv('export_PrecioBandaDeRegulaciónSecundaria_2017-01-15_19-11.csv', encoding='latin1', delimiter=';')
band_price = get_clean_data(raw_band_price)
band_price = band_price.rename(columns={'value':'band'})

In [222]:
raw_spot_price = pd.read_csv('export_PrecioMercadoSPOTDiario_2017-02-02_09-47.csv', encoding='latin1', delimiter=';')
raw_spot_price = raw_spot_price[raw_spot_price['geoid'] == 3]
spot_price = get_clean_data(raw_spot_price)
spot_price = spot_price.rename(columns={'value':'spot'})

In [223]:
#Merge both df
spot_band = band_price.merge(spot_price[['spot']], how='left', left_index=True, right_index=True)

In [224]:
#Clean df in order to train a model, output: band price
spot_band = spot_band[['hour', 'weekday', 'season', 'spot', 'band']]

In [225]:
#Create dummy variables
for feature in ['hour', 'weekday', 'season']:
    spot_band[pd.get_dummies(spot_band[feature], drop_first=False).columns.tolist()] = pd.get_dummies(spot_band[feature], drop_first=False)

In [226]:
spot_band['spot-1'] = spot_band['spot'].shift()
spot_band.dropna(subset=['spot-1'], inplace=True)

### df with secondary band, spot price and demand

In [331]:
raw_demand = pd.read_csv('export_DemandaProgramada_2017-02-02_09-45.csv', encoding='latin1', delimiter=';')
demand = get_clean_data(raw_demand)
demand = demand.rename(columns={'value':'demand'})

In [332]:
spot_band_demand = band_price.merge(spot_price[['spot']], how='left', left_index=True, right_index=True)
spot_band_demand = spot_band_demand.merge(demand[['demand']], how='left', left_index=True, right_index=True)

In [333]:
spot_band_demand = spot_band_demand[['hour', 'weekday', 'season', 'spot','demand', 'band']]

In [334]:
for feature in ['hour', 'weekday', 'season']:
    spot_band_demand[pd.get_dummies(spot_band_demand[feature], drop_first=False).columns.tolist()] = pd.get_dummies(spot_band_demand[feature], drop_first=False)

In [335]:
for feature in ['spot', 'demand']:
    spot_band_demand[feature +'-1'] = spot_band_demand[feature].shift(periods=1)
    spot_band_demand[feature + '+1'] = spot_band_demand[feature].shift(periods=-1)
spot_band_demand.dropna(subset=['spot-1','demand-1', 'spot+1','demand+1'], inplace=True)

### Decision tree

In [337]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [382]:
#Set features and output
X = spot_band_demand.drop(labels=['hour', 'weekday','season', 'band'], axis=1).values
Y = spot_band_demand['band'].values

In [383]:
#Split data set into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, 
                                                    test_size=0.2, random_state=100)

In [390]:
#Decision Tree Regressor
tree = DecisionTreeRegressor(max_depth=10, max_features=None, 
                             min_samples_split=300, min_samples_leaf=150,
                            random_state=100)

In [391]:
#CV with kfold=5
kf = KFold(n_splits=5, random_state=100)

CV_mae = list()
CV_mse = list()

for train_index, test_index in kf.split(x_train, y_train):
    tree.fit(x_train[train_index], y_train[train_index])
    y_pred = tree.predict(x_train[test_index])
    CV_mae.append(mean_absolute_error(y_train[test_index], y_pred))
    CV_mse.append(mean_squared_error(y_train[test_index], y_pred))
    
print('Mean absolute error: %0.4f +- %0.4f' %(np.mean(CV_mae), 2*np.std(CV_mae)))
print('Mean squared error: %0.4f +- %0.4f' %(np.mean(CV_mse), 2*np.std(CV_mse)))

Mean absolute error: 6.7853 +- 0.2283
Mean squared error: 92.2666 +- 12.5171


In [392]:
CV_mae, CV_mse

([6.6305950709744108,
  6.7962333122337117,
  6.9526161801355713,
  6.6934196889940827,
  6.8535642814880005],
 [82.34412268059657,
  92.921280725695553,
  101.36930170667839,
  89.69495604741445,
  95.003580502116279])

In [393]:
tree.fit(x_train, y_train);

In [394]:
print(mean_absolute_error(y_test, tree.predict(x_test)), mean_squared_error(y_test, tree.predict(x_test)))

6.65957294675 85.6635178404


In [395]:
Y.mean()

19.421253279592381

In [396]:
Y.std()

12.182203667830006

In [397]:
X.shape

(26299, 34)

### Randomized Search CV

In [398]:
params = {
    'criterion':['mse'],
    'splitter':['best','random'],
    'max_features':['auto','log2', 'sqrt'],
    'max_depth':np.arange(2,21), 'min_samples_split':np.linspace(50, 1000, 200).astype(int),
    'min_samples_leaf': np.linspace(200, 2500, 50).astype(int), 'max_leaf_nodes': np.linspace(10,180, 40).astype(int)
}

In [399]:
#Random search, set up tree and cross-validation
tree = DecisionTreeRegressor()
kf = KFold(n_splits=5)

#Randomized Search CV for tree
random_search = RandomizedSearchCV(tree, param_distributions=params, n_jobs=-1, n_iter=350,
                                scoring='neg_mean_squared_error', cv=list(kf.split(x_train,y_train)), random_state=0)

In [400]:
random_search.fit(x_train, y_train);

In [401]:
best_params = random_search.cv_results_['params'][np.flatnonzero(random_search.cv_results_['rank_test_score']==1)[0]]
print(best_params)

{'max_depth': 19, 'min_samples_leaf': 246, 'splitter': 'best', 'max_features': 'auto', 'criterion': 'mse', 'min_samples_split': 732, 'max_leaf_nodes': 127}


In [402]:
tree.set_params(**best_params, random_state=0)

DecisionTreeRegressor(criterion='mse', max_depth=19, max_features='auto',
           max_leaf_nodes=127, min_impurity_split=1e-07,
           min_samples_leaf=246, min_samples_split=732,
           min_weight_fraction_leaf=0.0, presort=False, random_state=0,
           splitter='best')

In [403]:
#CV with kfold=5
kf = KFold(n_splits=5, random_state=0)

CV_mse = list()
CV_mae = list()

for train_index, test_index in kf.split(x_train, y_train):
    tree.fit(x_train[train_index], y_train[train_index])
    y_pred = tree.predict(x_train[test_index])
    CV_mae.append(mean_absolute_error(y_train[test_index], y_pred))
    CV_mse.append(mean_squared_error(y_train[test_index], y_pred))
    
print('Mean absolute error: %0.4f +- %0.4f' %(np.mean(CV_mae), 2*np.std(CV_mae)))
print('Mean squared error: %0.4f +- %0.4f' %(np.mean(CV_mse), 2*np.std(CV_mse)))

Mean absolute error: 6.8973 +- 0.2842
Mean squared error: 94.3758 +- 14.8353


In [404]:
CV_mae, CV_mse

([6.6686824600242902,
  6.9694991654932652,
  7.1028287694093581,
  6.868990414410332,
  6.8764406489753318],
 [82.342407707676827,
  95.529808244021353,
  105.5611420855721,
  92.841946160598852,
  95.60367584306951])

In [405]:
tree.fit(x_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=19, max_features='auto',
           max_leaf_nodes=127, min_impurity_split=1e-07,
           min_samples_leaf=246, min_samples_split=732,
           min_weight_fraction_leaf=0.0, presort=False, random_state=0,
           splitter='best')

In [406]:
print(mean_absolute_error(y_test, tree.predict(x_test)), mean_squared_error(y_test, tree.predict(x_test)))

6.72953793214 86.2856923665
