## DayTrade System

In [33]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mpdates
import mplfinance as mpf
from mplfinance.original_flavor import candlestick_ohlc

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

### Constants and Parameters

In [34]:
dfColumns = ['date','open', 'high', 'low','close','tickvol','volume','spread']
newColumns = ['open', 'high', 'low', 'close']
col15 = ['open','d-5_high','d-5_low','d-4_high','d-4_low','d-3_high','d-3_low','d-2_high','d-2_low','d-1_high','d-1_low','EMA_high','EMA_low','UB_close','LB_close']
col25 = ['open','d-5_open','d-5_high','d-5_low','d-5_close','d-4_open','d-4_high','d-4_low','d-4_close','d-3_open','d-3_high','d-3_low','d-3_close','d-2_open','d-2_high','d-2_low','d-2_close','d-1_open','d-1_high','d-1_low','d-1_close','EMA_high' ,'EMA_low','UB_close','LB_close']
DAYS = 5
TEST_BEGIN = '2021.04.06'
TEST_END = '2021.06.01'
TRAIN_BEGIN = '2020.03.31'
TRAIN_END = '2021.05.31'
TIMES = 10

### Auxiliary functions

In [35]:
def calculateEMA(df, n, columns):
    """
    returns an n period exponential moving average for
    the time series s

    pd.Dataframe df: is the original dataframe - it needs to contain a 'Close' named column
    int          n: is the number of periods to consider in the calculus

    returns a dataframe with a new column containing the MME
    """
    
    k  = 2 / (n+1)
    for column in columns:
        EMAcolumn = 'EMA_'+column

        df[EMAcolumn] = df[column]
        df[EMAcolumn].iloc[n-1] = df[column].iloc[0:n-2].mean()

        for i in range(n, len(df)):
            df[EMAcolumn].iloc[i] = (df[column].iloc[i] - df[EMAcolumn].iloc[i-1])*k + df[EMAcolumn].iloc[i-1]
            #MME = (Close[i] - MME[i-1])*k + MME[i-1] 
     
    #The following function makes all the hardwork. However I'll keep it commented, once I've already implemented the calculation
    #f['pandasEMA'] = df['close'].ewm(span=5, adjust=False).mean()
    
    return df

def calculateBB(df, n, columns):
    ma = df['close'].rolling(window=n).mean()
    std = df['close'].rolling(window=n).std() 
    
    for column in columns:
        df['UB_'+column] = ma + (std * 2)
        df['LB_'+column] = ma - (std * 2)
    
    return df

def removeColumns(df, columns):
    df = df.drop(columns, axis=1)
    return df

def addPreviousDays(df, days, columns):
    auxDf = df
    for day in range(1,days+1):
        for column in columns:
            colName = 'd-'+str(day)+'_'+column
            df[colName] = auxDf[column].shift(day)
    return df

def getPeriod(df, begin, end, resetIndex = False):
    """
    returns the df in the chosen interval
    
    Object begin: Start date forrmated as 'yyyy.mm.dd'
    Object   end: end date 'yyyy.mm.dd'

    returns a dataframe with the historic of the selected period
    """
    indexBegin = df[df['date']==begin].index[0]
    indexEnd = df[df['date']==end].index[0]
    
    if (resetIndex):
        return df[(df.index >= indexBegin) & (df.index <= indexEnd)].reset_index(drop=True)
    else: 
        return df[(df.index >= indexBegin) & (df.index <= indexEnd)]
    
def mape(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    return np.mean(np.abs((actual - pred) / actual)) * 100, np.std(np.abs((actual - pred) / actual)) * 100

def mae(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    return np.mean(np.abs(actual - pred)), np.std(np.abs(actual - pred))

def Normalize(df):
    date = df['date']
    maxVal = df[['high', 'low']].max()
    normalized = removeColumns(df,['date'])/(removeColumns(df,['date']).max()*2)
    
    
    df.loc[:, df.columns != 'date'] = normalized
    
    return df, maxVal

def createDate(day, month, year):
    if(day >= 10):
        day = str(day)
    else:
        day = '0' + str(day)
    
    if(month < 10):
        month = '0' + str(month)
    else:
        month = str(month)
        
    year = str(year)
    date = year + '.' + month + '.' + day
    
    return date

def getDate(date):
    
    split = date.split('.')
    
    year = int(split[0])
    month = int(split[1])
    day = int(split[2])
    
    return year, month, day
 
def printResult(error, column):
    print(column+': '+str(error[0].round(2))+' +- '+str(error[1].round(2)) + ' %')

### Opening the dataset

In [72]:
df = pd.read_csv('Data/PETR4_Daily.csv', sep = "\t")
df.columns = dfColumns
df = removeColumns(df, ['tickvol', 'volume','spread'])

df2 = pd.read_csv('Data/VALE3_Daily.csv', sep = "\t")

df2.columns = dfColumns
df2 = removeColumns(df2, ['tickvol', 'volume','spread'])

df 
df.tail(290)

Unnamed: 0,date,open,high,low,close
1049,2020.03.31,13.16,14.07,13.16,13.54
1050,2020.04.01,13.01,14.08,12.89,13.84
1051,2020.04.02,14.90,16.01,14.52,15.01
1052,2020.04.03,15.77,15.83,14.44,14.84
1053,2020.04.06,15.53,15.58,14.69,15.26
...,...,...,...,...,...
1334,2021.05.26,25.91,26.18,25.65,26.09
1335,2021.05.27,26.08,26.25,25.77,25.91
1336,2021.05.28,26.20,27.02,26.15,26.99
1337,2021.05.31,26.96,27.20,26.78,26.87


In [73]:
#PETR4
df = calculateEMA(df,DAYS,newColumns)
df = calculateBB(df,DAYS,newColumns)
df = addPreviousDays(df,DAYS, newColumns)

#VALE3
df2 = calculateEMA(df2,DAYS,newColumns)
df2 = calculateBB(df2,DAYS,newColumns)
df2 = addPreviousDays(df2,DAYS, newColumns)

In [38]:
##df = getPeriod(df, '2020.03.02','2021.04.30', True)
##initialDf = getPeriod(df, '2020.03.02','2021.04.30', True)

In [74]:
initialDf_PETR4 = getPeriod(df, '2019.01.02','2021.06.01', True)
initialDf_VALE3 = getPeriod(df2, '2020.01.02','2021.06.01', True)

In [75]:
initialDf_PETR4

Unnamed: 0,date,open,high,low,close,EMA_open,EMA_high,EMA_low,EMA_close,UB_open,...,d-3_low,d-3_close,d-4_open,d-4_high,d-4_low,d-4_close,d-5_open,d-5_high,d-5_low,d-5_close
0,2019.01.02,21.18,22.73,20.93,22.60,20.609175,21.499077,20.380957,21.271656,23.121863,...,19.18,20.37,19.55,20.08,19.41,19.69,20.38,20.39,19.50,19.63
1,2019.01.03,22.51,23.32,22.36,23.16,21.242783,22.106051,21.040638,21.901104,24.121289,...,19.99,20.36,19.34,20.37,19.18,20.37,19.55,20.08,19.41,19.69
2,2019.01.04,23.35,23.43,22.99,23.22,21.945189,22.547367,21.690425,22.340736,24.635035,...,20.74,21.31,20.22,20.65,19.99,20.36,19.34,20.37,19.18,20.37
3,2019.01.07,23.35,24.35,23.20,23.59,22.413459,23.148245,22.193617,22.757157,24.561307,...,20.93,22.60,20.77,21.45,20.74,21.31,20.22,20.65,19.99,20.36
4,2019.01.08,23.86,23.88,23.27,23.45,22.895640,23.392163,22.552411,22.988105,23.963289,...,22.36,23.16,21.18,22.73,20.93,22.60,20.77,21.45,20.74,21.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,2021.05.26,25.91,26.18,25.65,26.09,26.114791,26.354219,25.760657,26.042263,26.470349,...,25.94,25.95,26.15,26.28,25.72,25.93,25.85,26.35,25.80,26.15
595,2021.05.27,26.08,26.25,25.77,25.91,26.103194,26.319479,25.763771,25.998175,26.471798,...,25.99,26.39,26.08,26.31,25.94,25.95,26.15,26.28,25.72,25.93
596,2021.05.28,26.20,27.02,26.15,26.99,26.135462,26.552986,25.892514,26.328784,27.180120,...,25.78,25.84,26.28,26.52,25.99,26.39,26.08,26.31,25.94,25.95
597,2021.05.31,26.96,27.20,26.78,26.87,26.410308,26.768657,26.188343,26.509189,27.435810,...,25.65,26.09,26.48,26.62,25.78,25.84,26.28,26.52,25.99,26.39


#### Experimento 1: Benchmark

In [41]:
def exp01(df):
    df = df[['date','open', 'high', 'low', 'close']]
    
    df = df[['date','open', 'high', 'low', 'close']]

    minaux = df['low'].copy()
    maxaux = df['high'].copy()

    df['maxonedaylag'] = df['high'].shift(1)
    df['minonedaylag'] = df['low'].shift(1)

    df['maxSma5'] = df['high'].rolling(window=5).mean()
    df['minSma5'] = df['low'].rolling(window=5).mean()

    df['maxSma10'] = df['high'].rolling(window=10).mean()
    df['minSma10'] = df['low'].rolling(window=10).mean()

    df['maxSma20'] = df['high'].rolling(window=20).mean()
    df['minSma20'] = df['low'].rolling(window=20).mean()
    
    df = getPeriod(df, TEST_BEGIN, TEST_END)
    
    return df

def printExp01(df, stock):
    print(stock)
    
    print("1-day lag: ")
    printResult(mape(df['high'],df['maxonedaylag']), 'High')
    printResult(mape(df['low'],df['minonedaylag']), 'Low')
    #print (mae(df['high'],df['maxonedaylag']))
    #print (mae(df['low'],df['minonedaylag']))

    print("SMA5: ")
    printResult(mape(df['high'],df['maxSma5']), 'High')
    printResult(mape(df['low'],df['minSma5']), 'Low')
    #print (mae(df['high'],df['maxSma5']))
    #print (mae(df['low'],df['minSma5']))

    print("SMA10: ")
    printResult(mape(df['high'],df['maxSma10']), 'High')
    printResult(mape(df['low'],df['minSma10']), 'Low')
    #print (mae(df['high'],df['maxSma10']))
    #print (mae(df['low'],df['minSma10']))

    print("SMA20: ")
    printResult(mape(df['high'],df['maxSma20']), 'High')
    printResult(mape(df['low'],df['minSma20']), 'Low')
    #print (mae(df['high'],df['maxSma20']))
    #print (mae(df['low'],df['minSma20']))
    
aux = getPeriod(initialDf_PETR4, TRAIN_BEGIN, TEST_END)
exp1 = exp01(aux)

printExp01(exp1, 'PETR4')

aux = getPeriod(initialDf_VALE3, TRAIN_BEGIN, TEST_END)
exp1 = exp01(aux)

printExp01(exp1, 'VALE3')

PETR4
1-day lag: 
High: 1.35 +- 1.25 %
Low: 1.2 +- 1.12 %
SMA5: 
High: 1.32 +- 1.13 %
Low: 1.26 +- 1.17 %
SMA10: 
High: 2.02 +- 1.7 %
Low: 1.96 +- 1.83 %
SMA20: 
High: 3.24 +- 2.28 %
Low: 3.33 +- 2.45 %
VALE3
1-day lag: 
High: 1.31 +- 0.87 %
Low: 1.3 +- 0.94 %
SMA5: 
High: 1.69 +- 1.15 %
Low: 1.63 +- 0.99 %
SMA10: 
High: 2.54 +- 1.6 %
Low: 2.43 +- 1.53 %
SMA20: 
High: 4.14 +- 2.27 %
Low: 4.08 +- 2.3 %


#### Experimento 02: Rodando MLP para testes

In [64]:
def exp02(df, stock, inputColumns = []):
    df = getPeriod(df, TRAIN_BEGIN, TEST_END, True)
    
    df_train = getPeriod(df, TRAIN_BEGIN, TRAIN_END)
    df_test = getPeriod(df, TEST_BEGIN, TEST_END)

    if(inputColumns == []):
        x_train = df_train.drop(['date', 'high', 'low', 'close'], axis=1)
        y_train = df_train[['high','low']]

        x_test = df_test.drop(['date', 'high', 'low', 'close'], axis=1)
        y_test = df_test[['high','low']]
    
    else:
        x_train = df_train[inputColumns]
        y_train = df_train[['high','low']]

        x_test = df_test[inputColumns]
        y_test = df_test[['high','low']]
    

    MLP = MLPRegressor(hidden_layer_sizes = (8), #sqrt(33*2) according to the article
                       momentum = 0.8,
                       max_iter = 100000,
                       activation = 'relu',
                       learning_rate = 'constant',
                       learning_rate_init = 0.01)

    MLP.fit(x_train, y_train)
    y_pred = MLP.predict(x_test)
    
    errorMax = mape(y_test['high'],y_pred[:,0])
    errorMin = mape(y_test['low'],y_pred[:,1])
    #printResult(mape(y_test['high'],y_pred[:,0]), 'High')
    #printResult(mape(y_test['low'],y_pred[:,1]), 'Low')
    #print (mae(y_test['high'],y_pred[:,0]))
    #print (mae(y_test['low'],y_pred[:,1]))
    
    return errorMax, errorMin

def test30timesExp02(df,stock, phrase, columnInput = []):
    errorMax = []
    errorMin = []
    
    for i in range(1, TIMES+1):
        if(columnInput == []):
            errorMax_petr4, errorMin_petr4 = exp02(df, stock)
        else: 
            errorMax_petr4, errorMin_petr4 = exp02(df, stock, columnInput)
        
        errorMax.append(errorMax_petr4[0])
        errorMin.append(errorMin_petr4[0])
    
    errorMax = np.array(errorMax)
    errorMin = np.array(errorMin)
    print(phrase)
    printResult((errorMax.mean(),errorMax.std()), 'High')
    printResult((errorMin.mean(),errorMin.std()), 'Low')
    
    
test30timesExp02(initialDf_PETR4, 'PETR4', 'PETR4 15 Variables', col15) 
test30timesExp02(initialDf_PETR4, 'PETR4', 'PETR4 25 Variables', col25) 
test30timesExp02(initialDf_PETR4, 'PETR4', 'PETR4 33 Variables') 
print('\n')
test30timesExp02(initialDf_VALE3, 'VALE3', 'VALE3 15 Variables', col15) 
test30timesExp02(initialDf_VALE3, 'VALE3', 'VALE3 25 Variables', col25) 
test30timesExp02(initialDf_VALE3, 'VALE3', 'VALE3 33 Variables') 

PETR4 15 Variables
High: 5.3 +- 3.87 %
Low: 5.71 +- 3.46 %
PETR4 25 Variables
High: 3.95 +- 2.09 %
Low: 5.33 +- 3.39 %
PETR4 33 Variables
High: 3.9 +- 1.82 %
Low: 5.83 +- 2.18 %


VALE3 15 Variables
High: 15.49 +- 14.65 %
Low: 11.71 +- 10.87 %
VALE3 25 Variables
High: 10.13 +- 6.45 %
Low: 8.69 +- 5.42 %
VALE3 33 Variables
High: 14.1 +- 11.44 %
Low: 15.36 +- 11.25 %


#### Experimento 03: Reproduzindo o experimento realizado por Martinez

In [71]:
def exp03(df, stock, inputColumns = []):
    aux, maxVal = Normalize(df)
    df = aux
    
    df_train = getPeriod(df, TRAIN_BEGIN, TRAIN_END)
    df_test = getPeriod(df, TEST_BEGIN, TEST_END)
    
    
    if(inputColumns == []):
        x_train = df_train.drop(['date', 'high', 'low', 'close'], axis=1)
        y_train = df_train[['high','low']]

        x_test = df_test.drop(['date', 'high', 'low', 'close'], axis=1)
        y_test = df_test[['high','low']]
    else:
        x_train = df_train[inputColumns]
        y_train = df_train[['high','low']]

        x_test = df_test[inputColumns]
        y_test = df_test[['high','low']]
    
    MLP = MLPRegressor(hidden_layer_sizes = (8), #sqrt(33*2) according to the article
                   momentum = 0.8,
                   max_iter = 100000,
                   activation = 'logistic',
                   learning_rate = 'constant',
                   learning_rate_init = 0.01)

    MLP.fit(x_train, y_train)
    y_pred = MLP.predict(x_test)
    
    y_predFix = y_pred
    y_predFix[:,0] = y_predFix[:,0]*maxVal['high']*2
    y_predFix[:,1] = y_predFix[:,1]*maxVal['low']*2

    y_testFix = y_test*maxVal*2
    
    #print(stock)
    
    errorMax = mape(y_testFix['high'],y_predFix[:,0])
    errorMin = mape(y_testFix['low'],y_predFix[:,1])
    ##printResult(mape(y_testFix['high'],y_predFix[:,0]), 'High')
    ##printResult(mape(y_testFix['low'],y_predFix[:,1]), 'Low')
    #print (mae(y_testFix['high'],y_predFix[:,0]))
    #print (mae(y_testFix['low'],y_predFix[:,1]))
    
    return errorMax, errorMin

def test30timesExp03(df,stock, phrase, columnInput = []):
    errorMax = []
    errorMin = []
    
    for i in range(1, TIMES+1):
        if(columnInput == []):
            errorMax_petr4, errorMin_petr4 = exp03(df, stock)
        else: 
            errorMax_petr4, errorMin_petr4 = exp03(df, stock, columnInput)
        
        errorMax.append(errorMax_petr4[0])
        errorMin.append(errorMin_petr4[0])
    
    errorMax = np.array(errorMax)
    errorMin = np.array(errorMin)
    print(phrase)
    printResult((errorMax.mean(),errorMax.std()), 'High')
    printResult((errorMin.mean(),errorMin.std()), 'Low')
    
test30timesExp03(initialDf_PETR4, 'PETR4', 'PETR4 15 Variables', col15) 
test30timesExp03(initialDf_PETR4, 'PETR4', 'PETR4 25 Variables', col25) 
test30timesExp03(initialDf_PETR4, 'PETR4', 'PETR4 33 Variables') 
print('\n')
test30timesExp03(initialDf_VALE3, 'VALE3', 'VALE3 15 Variables', col15) 
test30timesExp03(initialDf_VALE3, 'VALE3', 'VALE3 25 Variables', col25) 
test30timesExp03(initialDf_VALE3, 'VALE3', 'VALE3 33 Variables') 

PETR4 15 Variables
High: 7.1 +- 4.33 %
Low: 8.99 +- 4.73 %
PETR4 25 Variables
High: 8.41 +- 7.1 %
Low: 7.13 +- 3.52 %
PETR4 33 Variables
High: 6.09 +- 4.58 %
Low: 6.54 +- 4.42 %


VALE3 15 Variables
High: 8.48 +- 9.85 %
Low: 16.22 +- 7.98 %
VALE3 25 Variables
High: 6.54 +- 3.6 %
Low: 8.77 +- 6.09 %
VALE3 33 Variables
High: 9.86 +- 6.99 %
Low: 10.83 +- 5.02 %


#### USANDO MINMAXSCALER

In [24]:
import pandas as pd
from sklearn import preprocessing

df3 = getPeriod(initialDf, '2020.03.02','2021.04.30', True)

def Normalize2(df, columns):
    date = df['date']

    x = removeColumns(df,['date']).values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    normalized = pd.DataFrame(x_scaled)
    normalized.columns = columns[1:]
    
    df.loc[:, df.columns != 'date'] = normalized
    df.columns = columns
    
    return df

df3 = Normalize2(df3, df3.columns)
df3

NameError: name 'initialDf' is not defined

In [None]:
df_train = getPeriod(df3, '2020.03.02','2021.03.31')
df_test = getPeriod(df3, '2021.04.01','2021.04.30')

x_train = df_train.drop(['date', 'high', 'low', 'close'], axis=1)
y_train = df_train[['high','low']]

x_test = df_test.drop(['date', 'high', 'low', 'close'], axis=1)
y_test = df_test[['high','low']]
MLP = MLPRegressor(hidden_layer_sizes = (8), #sqrt(33*2) according to the article
                   momentum = 0.8,
                   max_iter = 100000,
                   activation = 'relu',
                   learning_rate = 'constant',
                   learning_rate_init = 0.01)

MLP.fit(x_train, y_train)
y_pred = MLP.predict(x_test)

print (mape(y_test['high'],y_pred[:,0]))
print (mape(y_test['low'],y_pred[:,1]))
print (mae(y_test['high'],y_pred[:,0]))
print (mae(y_test['low'],y_pred[:,1]))

#### Experimento 04: Utilizando Random Forest

In [89]:
def exp04(df, stock, inputColumns = []):
    #df = getPeriod(df, '2020.03.02','2021.04.30', True)
    
    #print(df)
    df_train = getPeriod(df, TRAIN_BEGIN, TRAIN_END)
    df_test = getPeriod(df, TEST_BEGIN, TEST_END)
    
    if(inputColumns == []):
        x_train = df_train.drop(['date', 'high', 'low', 'close'], axis=1)
        y_train = df_train[['high','low']]

        x_test = df_test.drop(['date', 'high', 'low', 'close'], axis=1)
        y_test = df_test[['high','low']]
    else:
        x_train = df_train[inputColumns]
        y_train = df_train[['high','low']]

        x_test = df_test[inputColumns]
        y_test = df_test[['high','low']]

    rf = RandomForestRegressor(n_estimators = 500)
    rf.fit(x_train, y_train);
    y_pred = rf.predict(x_test)
    
    errorMax = mape(y_test['high'],y_pred[:,0])
    errorMin = mape(y_test['low'],y_pred[:,1])
    
    #print (mape(y_test['high'],y_pred[:,0]))
    #print (mape(y_test['low'],y_pred[:,1]))
    #print (mae(y_test['high'],y_pred[:,0]))
    #print (mae(y_test['low'],y_pred[:,1]))
    
    return errorMax, errorMin

def test30timesExp04(df,stock, phrase, columnInput = []):
    errorMax = []
    errorMin = []
    
    for i in range(1, TIMES+1):
        if(columnInput == []):
            errorMax_petr4, errorMin_petr4 = exp04(df, stock)
        else: 
            errorMax_petr4, errorMin_petr4 = exp04(df, stock, columnInput)
            
        errorMax.append(errorMax_petr4[0])
        errorMin.append(errorMin_petr4[0])
    
    errorMax = np.array(errorMax)
    errorMin = np.array(errorMin)
    print(phrase)
    printResult((errorMax.mean(),errorMax.std()), 'High')
    printResult((errorMin.mean(),errorMin.std()), 'Low')
    
test30timesExp04(initialDf_PETR4, 'PETR4', 'PETR4 15 Variables', col15) 
test30timesExp04(initialDf_PETR4, 'PETR4', 'PETR4 25 Variables', col25) 
test30timesExp04(initialDf_PETR4, 'PETR4', 'PETR4 33 Variables') 
print('\n')
test30timesExp04(initialDf_VALE3, 'VALE3', 'VALE3 15 Variables', col15) 
test30timesExp04(initialDf_VALE3, 'VALE3', 'VALE3 25 Variables', col25) 
test30timesExp04(initialDf_VALE3, 'VALE3', 'VALE3 33 Variables')     

PETR4 15 Variables
High: 0.34 +- 0.01 %
Low: 0.25 +- 0.0 %
PETR4 25 Variables
High: 0.35 +- 0.01 %
Low: 0.26 +- 0.0 %
PETR4 33 Variables
High: 0.33 +- 0.01 %
Low: 0.24 +- 0.01 %


VALE3 15 Variables
High: 0.38 +- 0.01 %
Low: 0.31 +- 0.01 %
VALE3 25 Variables
High: 0.38 +- 0.01 %
Low: 0.31 +- 0.01 %
VALE3 33 Variables
High: 0.38 +- 0.01 %
Low: 0.32 +- 0.01 %


In [None]:
df4 = getPeriod(initialDf, '2020.03.02','2021.04.30', True)

df_train = getPeriod(df4, '2020.03.02','2021.03.31')
df_test = getPeriod(df4, '2021.04.01','2021.04.30')

x_train = df_train.drop(['date', 'high', 'low', 'close'], axis=1)
y_train = df_train[['high','low']]

x_test = df_test.drop(['date', 'high', 'low', 'close'], axis=1)
y_test = df_test[['high','low']]

rf = RandomForestRegressor(n_estimators = 1000)
rf.fit(x_train, y_train);
y_pred = rf.predict(x_test)

print (mape(y_test['high'],y_pred[:,0]))
print (mape(y_test['low'],y_pred[:,1]))
print (mae(y_test['high'],y_pred[:,0]))
print (mae(y_test['low'],y_pred[:,1]))

In [None]:
importance = rf.feature_importances_

for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))

importance

from matplotlib import pyplot as plt
plt.barh(x_train.columns, rf.feature_importances_)

In [95]:
def exp041(df, stock, inputColumns = []):
    #df = getPeriod(df, '2020.03.02','2021.04.30', True)
    
    #print(df)
    df_train = getPeriod(df, TRAIN_BEGIN, TRAIN_END)
    df_test = getPeriod(df, TEST_BEGIN, TEST_END)
    
    if(inputColumns == []):
        x_train = df_train.drop(['date', 'high', 'low', 'close'], axis=1)
        y_train = df_train[['high','low']]

        x_test = df_test.drop(['date', 'high', 'low', 'close'], axis=1)
        y_test = df_test[['high','low']]
    else:
        x_train = df_train[inputColumns]
        y_train = df_train[['high','low']]

        x_test = df_test[inputColumns]
        y_test = df_test[['high','low']]

    for i in range(1, 5):    
        rf = RandomForestRegressor(n_estimators = 1)
        rf.fit(x_train, y_train);
        y_pred = rf.predict(x_test)
        
        print(i)
        errorMax = mape(y_test['high'],y_pred[:,0])
        errorMin = mape(y_test['low'],y_pred[:,1])
        printResult(errorMax, 'High')
        printResult(errorMin, 'Low')
    
    importance = rf.feature_importances_
    
    featureImp = []
    for feat, importance in zip(x_train.columns, rf.feature_importances_):  
        temp = [feat, importance*100]
        featureImp.append(temp)

    fT_df = pd.DataFrame(featureImp, columns = ['Feature', 'Importance'])
    print (fT_df.sort_values('Importance', ascending = False))
    
    #print (mape(y_test['high'],y_pred[:,0]))
    #print (mape(y_test['low'],y_pred[:,1]))
    #print (mae(y_test['high'],y_pred[:,0]))
    #print (mae(y_test['low'],y_pred[:,1]))
    
    return errorMax, errorMin, y_pred, y_test

a,b,c,d = exp041(initialDf_VALE3, 'PETR4')

1
High: 0.83 +- 1.27 %
Low: 0.74 +- 1.09 %
2
High: 0.69 +- 1.1 %
Low: 0.49 +- 0.73 %
3
High: 0.53 +- 0.89 %
Low: 0.33 +- 0.55 %
4
High: 0.6 +- 1.05 %
Low: 0.47 +- 0.72 %
      Feature  Importance
30   d-5_high   82.851569
0        open   10.889458
3     EMA_low    4.249056
15    d-1_low    0.589814
2    EMA_high    0.440272
21   d-3_open    0.295597
11   UB_close    0.267926
5     UB_open    0.149985
16  d-1_close    0.075970
20  d-2_close    0.051237
4   EMA_close    0.049326
10     LB_low    0.027121
27    d-4_low    0.021520
13   d-1_open    0.004948
18   d-2_high    0.004122
1    EMA_open    0.003880
6     LB_open    0.003543
32  d-5_close    0.003331
28  d-4_close    0.002825
9      UB_low    0.002745
26   d-4_high    0.002615
22   d-3_high    0.002195
7     UB_high    0.002188
8     LB_high    0.001822
19    d-2_low    0.001554
31    d-5_low    0.001527
17   d-2_open    0.001272
14   d-1_high    0.000719
23    d-3_low    0.000696
25   d-4_open    0.000508
12   LB_close    0.00025

In [54]:
printResult(a, 'High')
printResult(b, 'Low')

High: 0.33 +- 0.35 %
Low: 0.25 +- 0.19 %


array([[23.57602, 23.08437],
       [23.53759, 23.06663],
       [23.36198, 22.77046],
       [23.09886, 22.61874],
       [23.48019, 22.91561],
       [23.39123, 22.92486],
       [23.72485, 23.1257 ],
       [23.79454, 23.05702],
       [23.33554, 22.83646],
       [24.34258, 22.77913],
       [24.70237, 23.87352],
       [24.26341, 23.57463],
       [24.06663, 23.47461],
       [24.06325, 23.49523],
       [24.09761, 23.16473],
       [24.07802, 23.32339],
       [24.19125, 23.39426],
       [23.99261, 23.2978 ],
       [23.89043, 23.30035],
       [23.81564, 23.04458],
       [23.92901, 23.0976 ],
       [23.88446, 23.32756],
       [24.33141, 23.38929],
       [25.11351, 24.52913],
       [25.11369, 24.29094],
       [25.437  , 24.65228],
       [25.18157, 24.483  ],
       [26.18158, 25.37831],
       [26.67897, 26.1214 ],
       [26.82763, 26.21882],
       [26.41154, 25.79991],
       [26.34962, 25.77762],
       [26.32001, 25.87301],
       [26.60243, 25.97075],
       [26.678

In [53]:
d

Unnamed: 0,high,low
559,23.58,23.13
560,23.55,23.15
561,23.26,22.61
562,23.01,22.61
563,23.58,23.01
564,23.33,22.92
565,23.78,23.2
566,23.83,23.02
567,23.24,22.81
568,24.89,22.77


In [59]:
initialDf_PETR4

Unnamed: 0,date,open,high,low,close,EMA_open,EMA_high,EMA_low,EMA_close,UB_open,...,d-3_low,d-3_close,d-4_open,d-4_high,d-4_low,d-4_close,d-5_open,d-5_high,d-5_low,d-5_close
0,2019.01.02,21.18,22.73,20.93,22.60,20.609175,21.499077,20.380957,21.271656,23.121863,...,19.18,20.37,19.55,20.08,19.41,19.69,20.38,20.39,19.50,19.63
1,2019.01.03,22.51,23.32,22.36,23.16,21.242783,22.106051,21.040638,21.901104,24.121289,...,19.99,20.36,19.34,20.37,19.18,20.37,19.55,20.08,19.41,19.69
2,2019.01.04,23.35,23.43,22.99,23.22,21.945189,22.547367,21.690425,22.340736,24.635035,...,20.74,21.31,20.22,20.65,19.99,20.36,19.34,20.37,19.18,20.37
3,2019.01.07,23.35,24.35,23.20,23.59,22.413459,23.148245,22.193617,22.757157,24.561307,...,20.93,22.60,20.77,21.45,20.74,21.31,20.22,20.65,19.99,20.36
4,2019.01.08,23.86,23.88,23.27,23.45,22.895640,23.392163,22.552411,22.988105,23.963289,...,22.36,23.16,21.18,22.73,20.93,22.60,20.77,21.45,20.74,21.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,2021.05.26,25.91,26.18,25.65,26.09,26.114791,26.354219,25.760657,26.042263,26.470349,...,25.94,25.95,26.15,26.28,25.72,25.93,25.85,26.35,25.80,26.15
595,2021.05.27,26.08,26.25,25.77,25.91,26.103194,26.319479,25.763771,25.998175,26.471798,...,25.99,26.39,26.08,26.31,25.94,25.95,26.15,26.28,25.72,25.93
596,2021.05.28,26.20,27.02,26.15,26.99,26.135462,26.552986,25.892514,26.328784,27.180120,...,25.78,25.84,26.28,26.52,25.99,26.39,26.08,26.31,25.94,25.95
597,2021.05.31,26.96,27.20,26.78,26.87,26.410308,26.768657,26.188343,26.509189,27.435810,...,25.65,26.09,26.48,26.62,25.78,25.84,26.28,26.52,25.99,26.39
