In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import datetime
import warnings
%matplotlib inline
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [2]:
def get_clean_data(df_original):
    weekday_dict = {
        0:'L', 1:'M', 2:'X', 3:'J', 4:'V', 5:'S', 6:'D' 
    }
    
    df = df_original.copy(deep=True)
    df['date'] =  pd.to_datetime(df['datetime'].apply(lambda x: x[:10]), format='%Y-%m-%d')
    df['year'] = df['date'].dt.year
    df['time'] = pd.to_datetime(df['datetime'].apply(lambda x: x[:19])).dt.time
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['hour'] = df['datetime'].apply(lambda x: x[11:13]).astype(int)
    df['minute'] = df['datetime'].apply(lambda x: x[14:16]).astype(int)
    df['weekday'] = df['date'].dt.dayofweek
    df.replace({'weekday':weekday_dict}, inplace=True)
    df['season'] = np.where(df['month'].isin(list(range(4,10))), 'summer', 'winter')
    df['date_hour'] = df.apply(lambda x: datetime.datetime.combine(x['date'], x['time']), axis=1)
    df.set_index('date_hour', inplace=True)
    df = df[df.index < '2017']
    clean_df = df[['date', 'year', 'month', 'season', 'day','weekday','time', 'hour', 'minute', 'value']]
    
    return clean_df

### df with secondary band and spot price 

In [3]:
raw_band_price = pd.read_csv('export_PrecioBandaDeRegulaciónSecundaria_2017-01-15_19-11.csv', encoding='latin1', delimiter=';')
band_price = get_clean_data(raw_band_price)
band_price = band_price.rename(columns={'value':'band'})

In [4]:
raw_spot_price = pd.read_csv('export_PrecioMercadoSPOTDiario_2017-02-02_09-47.csv', encoding='latin1', delimiter=';')
raw_spot_price = raw_spot_price[raw_spot_price['geoid'] == 3]
spot_price = get_clean_data(raw_spot_price)
spot_price = spot_price.rename(columns={'value':'spot'})

In [5]:
#Merge both df
spot_band = band_price.merge(spot_price[['spot']], how='left', left_index=True, right_index=True)

In [6]:
#Clean df in order to train a model, output: band price
spot_band = spot_band[['hour', 'weekday', 'season', 'spot', 'band']]

In [7]:
#Create dummy variables
for feature in ['hour', 'weekday', 'season']:
    spot_band[pd.get_dummies(spot_band[feature], drop_first=False).columns.tolist()] = pd.get_dummies(spot_band[feature], drop_first=False)

In [8]:
spot_band.shape

(26310, 38)

### Decision tree

In [39]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [40]:
#Set features and output
X = spot_band.drop(labels=['hour', 'weekday', 'season', 'band'], axis=1).values
Y = spot_band['band'].values

In [90]:
#Split data set into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, 
                                                    test_size=0.2, random_state=2)

In [91]:
#Decision Tree Regressor
tree = DecisionTreeRegressor(max_depth=10, max_features=None, 
                             min_samples_split=100, min_samples_leaf=30,
                            random_state=2)

In [92]:
#CV with kfold=5
kf = KFold(n_splits=5, random_state=2)

CV_score = []

for train_index, test_index in kf.split(x_train, y_train):
    tree.fit(x_train[train_index], y_train[train_index])
    y_pred = tree.predict(x_train[test_index])
    CV_score.append(mean_absolute_error(y_train[test_index], y_pred))

In [93]:
CV_score

[6.9614567827321219,
 7.0650964680730128,
 7.0417715573662587,
 7.0015665104649409,
 6.9707350944098048]

In [94]:
tree.fit(x_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=30, min_samples_split=100,
           min_weight_fraction_leaf=0.0, presort=False, random_state=2,
           splitter='best')

In [95]:
mean_absolute_error(y_test, tree.predict(x_test))

7.1085057547198396

In [78]:
y_test

array([ 19.32,   4.7 ,  14.2 , ...,   9.9 ,  20.69,  33.17])

In [79]:
tree.predict(x_test)

array([ 20.40021417,  12.56954142,  14.47195228, ...,  18.86791667,
        14.03406417,  30.09921053])