In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import datetime
import warnings
%matplotlib inline
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [2]:
def get_clean_data(df_original, cutoff='2017'):
    """Return a timestamp-indexed copy of a raw export with calendar features.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Raw export. Must contain a string column ``'datetime'`` that starts
        with ``YYYY-MM-DD`` followed by a separator and ``HH:MM:SS``; only the
        first 19 characters are used, anything after them is discarded.
        Must also contain a ``'value'`` column. The input is not modified.
    cutoff : str, datetime-like or None, default '2017'
        Rows whose timestamp is not strictly earlier than ``cutoff`` are
        dropped. ``None`` disables the filter.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date_hour`` (the parsed timestamp) with the columns
        ``date, year, month, season, day, weekday, time, hour, minute, value``.
        ``weekday`` holds one-letter codes (L, M, X, J, V, S, D for
        Monday..Sunday) and ``season`` is ``'summer'`` for months 4-9 and
        ``'winter'`` otherwise.

    Raises
    ------
    KeyError
        If ``'datetime'`` or ``'value'`` is missing from ``df_original``.
    """
    weekday_dict = {
        0:'L', 1:'M', 2:'X', 3:'J', 4:'V', 5:'S', 6:'D'
    }

    df = df_original.copy(deep=True)
    stamp_text = df['datetime']

    # Vectorised slicing instead of per-element lambdas.
    df['date'] = pd.to_datetime(stamp_text.str[:10], format='%Y-%m-%d')
    # Parsing the first 19 characters once yields both the time of day and the
    # full timestamp; this replaces a row-wise date/time combine that was slow
    # and broke on empty frames.
    date_hour = pd.to_datetime(stamp_text.str[:19])

    df['year'] = df['date'].dt.year
    df['time'] = date_hour.dt.time
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['hour'] = stamp_text.str[11:13].astype(int)
    df['minute'] = stamp_text.str[14:16].astype(int)
    df['weekday'] = df['date'].dt.dayofweek.map(weekday_dict)
    df['season'] = np.where(df['month'].isin(list(range(4, 10))), 'summer', 'winter')

    df['date_hour'] = date_hour
    df = df.set_index('date_hour')
    if cutoff is not None:
        df = df[df.index < cutoff]

    clean_df = df[['date', 'year', 'month', 'season', 'day', 'weekday', 'time', 'hour', 'minute', 'value']]

    return clean_df

### DataFrame with secondary regulation band price and spot price

In [137]:
# Secondary regulation band price export: semicolon-separated, Latin-1 encoded
raw_band_price = pd.read_csv(
    'export_PrecioBandaDeRegulaciónSecundaria_2017-01-15_19-11.csv',
    delimiter=';',
    encoding='latin1',
)
# Add the calendar columns, then name the measurement after the series it holds
band_price = get_clean_data(raw_band_price).rename(columns={'value': 'band'})

In [138]:
# Daily spot market price export: semicolon-separated, Latin-1 encoded
raw_spot_price = pd.read_csv(
    'export_PrecioMercadoSPOTDiario_2017-02-02_09-47.csv',
    delimiter=';',
    encoding='latin1',
)
# Keep only the rows with geoid 3
# NOTE(review): the meaning of geoid 3 is not visible here - confirm which area it selects
raw_spot_price = raw_spot_price.loc[raw_spot_price['geoid'] == 3]
# Add the calendar columns, then name the measurement after the series it holds
spot_price = get_clean_data(raw_spot_price).rename(columns={'value': 'spot'})

In [139]:
# Attach the spot price to every band-price row by matching timestamps (index);
# band rows without a matching spot timestamp are kept with a missing 'spot'.
spot_band = band_price.join(spot_price[['spot']], how='left')

In [140]:
# Keep only the modelling columns: three categorical features, the spot price
# and the target (band price) as the last column.
spot_band = spot_band.loc[:, ['hour', 'weekday', 'season', 'spot', 'band']]

In [141]:
#Create dummy variables
# One indicator column per category of each feature, labelled by the category
# value itself and appended to spot_band; the source columns are left in place.
for feature in ['hour', 'weekday', 'season']:
    # Encode once and reuse the result for both the column labels and the values
    dummies = pd.get_dummies(spot_band[feature], drop_first=False)
    spot_band[dummies.columns.tolist()] = dummies

In [142]:
# Sanity check: (rows, columns) after the dummy columns were appended
spot_band.shape

(26310, 38)

### Decision tree

In [143]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [144]:
#Set features and output
X = spot_band.drop(labels=['hour', 'weekday','season', 'band'], axis=1).values
Y = spot_band['band'].values

In [145]:
#Split data set into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, 
                                                    test_size=0.2, random_state=100)

In [146]:
#Decision Tree Regressor
tree = DecisionTreeRegressor(max_depth=10, max_features=None, 
                             min_samples_split=100, min_samples_leaf=30,
                            random_state=100)

In [147]:
#CV with kfold=5
kf = KFold(n_splits=5, random_state=100)

CV_score = list()

for train_index, test_index in kf.split(x_train, y_train):
    tree.fit(x_train[train_index], y_train[train_index])
    y_pred = tree.predict(x_train[test_index])
    CV_score.append(mean_absolute_error(y_train[test_index], y_pred))
    
print('Mean absolute error: %0.4f +- %0.4f' %(np.mean(CV_score), 2*np.std(CV_score)))

Mean absolute error: 7.0603 +- 0.1448


In [148]:
# Per-fold mean absolute errors collected by the cross-validation loop
CV_score

[7.0760788747042849,
 6.968445006526732,
 7.0909565356062521,
 6.9949343989738617,
 7.1712983133783501]

In [149]:
# Refit on the whole training split; the trailing ';' hides the estimator repr
tree.fit(x_train, y_train);

In [150]:
# Mean absolute error of the fitted tree on the held-out test split
mean_absolute_error(y_test, tree.predict(x_test))

7.0437857518838625

In [151]:
# Mean of the target values over the whole data set
Y.mean()

19.426196883314329

In [152]:
# Standard deviation of the target values (NumPy default, ddof=0)
Y.std()

12.185915236473567

In [153]:
# Feature matrix dimensions: (rows, features)
X.shape

(26310, 34)

### Randomized Search CV

In [154]:
# Search space for the randomized hyper-parameter search. Every entry is a
# finite list/array of candidate values to sample from.
# NOTE(review): newer scikit-learn releases rename 'mse' to 'squared_error' and
# no longer accept max_features='auto' - confirm against the installed version.
params = {
    'criterion': ['mse'],
    'splitter': ['best', 'random'],
    'max_features': ['auto', 'log2', 'sqrt'],
    # depths 2..20
    'max_depth': np.arange(2, 21),
    # 200 evenly spaced values from 50 to 1000, truncated to integers
    'min_samples_split': np.linspace(50, 1000, 200).astype(int),
    # 90 evenly spaced values from 20 to 750, truncated to integers
    'min_samples_leaf': np.linspace(20, 750, 90).astype(int),
    # 40 evenly spaced values from 10 to 200, truncated to integers
    'max_leaf_nodes': np.linspace(10, 200, 40).astype(int),
}

In [163]:
#Random search, set up tree and cross-validation
tree = DecisionTreeRegressor()
kf = KFold(n_splits=5)

#Randomized Search CV for tree
random_search = RandomizedSearchCV(tree, param_distributions=params, n_jobs=-1, n_iter=350,
                                scoring='neg_mean_squared_error', cv=list(kf.split(x_train,y_train)), random_state=0)

In [164]:
# Run the randomized search on the training split; ';' hides the returned repr
random_search.fit(x_train, y_train);

In [157]:
# Parameters of the top-ranked candidate (the first one when several tie),
# as exposed directly by the fitted search object.
best_params = random_search.best_params_
print(best_params)

{'criterion': 'mse', 'splitter': 'best', 'max_features': 'auto', 'min_samples_split': 517, 'max_leaf_nodes': 190, 'min_samples_leaf': 52, 'max_depth': 19}


In [158]:
# Configure the existing `tree` estimator with the best parameters found and a fixed seed
tree.set_params(**best_params, random_state=0)

DecisionTreeRegressor(criterion='mse', max_depth=19, max_features='auto',
           max_leaf_nodes=190, min_impurity_split=1e-07,
           min_samples_leaf=52, min_samples_split=517,
           min_weight_fraction_leaf=0.0, presort=False, random_state=0,
           splitter='best')

In [159]:
#CV with kfold=5
kf = KFold(n_splits=5, random_state=0)

CV_score = list()

for train_index, test_index in kf.split(x_train, y_train):
    tree.fit(x_train[train_index], y_train[train_index])
    y_pred = tree.predict(x_train[test_index])
    CV_score.append(mean_absolute_error(y_train[test_index], y_pred))
    
print('Mean absolute error: %0.4f +- %0.4f' %(np.mean(CV_score), 2*np.std(CV_score)))

Mean absolute error: 6.9652 +- 0.1544


In [160]:
# Per-fold mean absolute errors collected by the cross-validation loop
CV_score

[6.9609829925957998,
 6.8628414784267928,
 6.9789459911506242,
 6.9254587710559115,
 7.097593817280746]

In [161]:
# Refit the tree on the whole training split
tree.fit(x_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=19, max_features='auto',
           max_leaf_nodes=190, min_impurity_split=1e-07,
           min_samples_leaf=52, min_samples_split=517,
           min_weight_fraction_leaf=0.0, presort=False, random_state=0,
           splitter='best')

In [162]:
# Mean absolute error of the fitted tree on the held-out test split
mean_absolute_error(y_test, tree.predict(x_test))

6.8962953575577091