In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from tqdm import tqdm
from ztools.ntf import telegram
import time
import pickle

from statsmodels.tsa.api import SimpleExpSmoothing, Holt, ExponentialSmoothing
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.vector_ar.var_model import VAR
import statsmodels.api as sm

from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC

  from numpy.core.umath_tests import inner1d


In [2]:
def min_time(times):
    """Format a duration given in seconds as a ``minutes:seconds`` string.

    Args:
        times: Elapsed time in seconds (int or float).

    Returns:
        A string such as ``'5:07'``: whole minutes (not wrapped at 60), then
        the remaining whole seconds zero-padded to two digits.
    """
    whole_minutes, leftover_seconds = divmod(times, 60)
    return f'{int(whole_minutes)}:{int(leftover_seconds):02d}'

In [3]:
# Column layout of the results table, grouped by the kind of value stored.
cols = (
    ['ES_4', 'ES_7', 'ES_12']                 # Holt-Winters forecasters (seasonal periods 4, 7, 12)
    + ['SARIMAX']                             # SARIMAX forecaster
    + ['SES_0.6', 'SES_0.9', 'SES_1.2']       # simple exponential smoothing forecasters
    + ['DP', 'DP_U', 'DP_D']                  # spread of all / upward / downward forecasts
    + ['NUM_U', 'NUM_D']                      # number of upward / downward forecasts
    + ['DIST_U', 'DIST_D']                    # distance of mean upward / downward forecast from the last value
    + ['TARGET']                              # label column
)
# Empty frame carrying only the column layout.
data = pd.DataFrame(columns=cols)
# Uncomment to (re)create an empty results file on disk:
# data.to_csv('tests_data_brl.csv', index = False)

In [4]:
# Load the previously accumulated results table from disk; this rebinds `data`,
# replacing the empty frame created in the preceding cell.
data = pd.read_csv('tests_data_brl.csv')

In [5]:
print('Iniciando procedimento.')
start = time.time()
print('Lendo arquivo de dados.')
df = pd.read_csv('data_brl.csv')

# run_past=True  -> evaluate `samples` hold-out sizes beyond the rows already stored in `data`.
# run_past=False -> ("update" mode) evaluate hold-out sizes 1..samples.
run_past = False
update = not run_past

last_samples = len(data)
samples = 20

# One list per output column; every loop iteration appends exactly one value to each list.
hist = {col: [] for col in cols}

if run_past:
    do_list = list(range(last_samples + 1, last_samples + samples + 1))
else:
    do_list = list(range(1, samples + 1))


def _one_step_forecasts(series):
    """Fit every forecaster on `series` and return its one-step-ahead forecast.

    Args:
        series: pandas Series holding the training part of ``df['val']``.

    Returns:
        dict mapping the forecaster's column name to its forecast (float) for
        the observation immediately after `series`, in column order.
    """
    values = np.asarray(series)
    forecasts = {}

    # Additive Holt-Winters with three different seasonal periods.
    for sp in (4, 7, 12):
        fit = ExponentialSmoothing(values, seasonal_periods=sp, trend='add', seasonal='add').fit()
        forecasts[f'ES_{sp}'] = float(np.asarray(fit.forecast(1))[0])

    # SARIMAX is fitted on the pandas Series itself, as before.
    fit = sm.tsa.statespace.SARIMAX(series, order=(3, 1, 4), seasonal_order=(0, 1, 1, 7)).fit()
    forecasts['SARIMAX'] = float(np.asarray(fit.forecast(1))[0])

    # Simple exponential smoothing with fixed (non-optimised) smoothing levels.
    # NOTE(review): 1.2 is above 1 — confirm this smoothing level is intended.
    for sl in (0.6, 0.9, 1.2):
        fit = SimpleExpSmoothing(values).fit(smoothing_level=sl, optimized=False)
        forecasts[f'SES_{sl}'] = float(np.asarray(fit.forecast(1))[0])

    return forecasts


def _std_or_nan(values):
    """Standard deviation of `values`, or NaN for an empty list (no RuntimeWarning)."""
    return np.std(values) if values else np.nan


def _mean_or_nan(values):
    """Mean of `values`, or NaN for an empty list (no RuntimeWarning)."""
    return np.mean(values) if values else np.nan


print('Gerando dados de previsões.')
for done, n_test in enumerate(do_list, start=1):
    # The last `n_test` observations are held out; only the first of them is
    # compared with the forecasts, so a single forecast step is enough.
    n = len(df) - n_test - 1
    last_obs = df['val'].iloc[n]
    next_obs = df['val'].iloc[n + 1]
    change = round((next_obs - last_obs) / last_obs, 4)
    trg = 1 if change > 0 else 0

    forecasts = _one_step_forecasts(df['val'].iloc[:-n_test])

    preds = []
    ups = []
    downs = []
    for name, p in forecasts.items():
        preds.append(p)
        # Each forecaster column stores 1 when it predicts a rise over the last value, else 0.
        if p > last_obs:
            ups.append(p)
            hist[name].append(1)
        else:
            downs.append(p)
            hist[name].append(0)

    hist['DP'].append(np.std(preds))
    hist['DP_U'].append(_std_or_nan(ups))
    hist['DP_D'].append(_std_or_nan(downs))
    hist['NUM_U'].append(len(ups))
    hist['NUM_D'].append(len(downs))
    hist['DIST_U'].append(abs(_mean_or_nan(ups) - last_obs))
    hist['DIST_D'].append(abs(_mean_or_nan(downs) - last_obs))
    hist['TARGET'].append(trg)

    # Format the percentage directly; multiplying after round() printed values
    # such as 55.00000000000001%.
    print(f'{100 * done / len(do_list):.1f}%')

print('Dados de previsão gerados.')
end = time.time()
delay = end - start
hours, remainder = divmod(int(delay), 3600)
minutes, seconds = divmod(remainder, 60)
# Minutes and seconds are zero-padded so the elapsed time reads as H:MM:SS.
print(f'Tempo de processamento: {hours}:{minutes:02d}:{seconds:02d}')
telegram(msg = 'True Model - Geração de dados concluída.')

Iniciando procedimento.
Lendo arquivo de dados.
Gerando dados de previsões.


  loc = initial_p <= lb
  loc = initial_p >= ub


5.0%




10.0%




15.0%




20.0%




25.0%




30.0%




35.0%




40.0%




45.0%




50.0%




55.00000000000001%




60.0%




65.0%




70.0%




75.0%




80.0%




85.0%




90.0%




95.0%




100.0%
Dados de previsão gerados.
Tempo de processamento: 0:7:32


In [6]:
print('Atualizando planilha de dados de dados.')
# Append the freshly generated rows to the stored table, renumber the index,
# then drop rows that are exact duplicates of an earlier row.
new_rows = pd.DataFrame(hist)
data = pd.concat([data, new_rows], ignore_index = True).drop_duplicates()
print('Baixando planilha.')
# Overwrite the results file with the combined table (without the index column).
data.to_csv('tests_data_brl.csv', index = False)
print('Planilha atualizada.')

Atualizando planilha de dados de dados.
Baixando planilha.
Planilha atualizada.


In [7]:
print('Preparando dados para modelagem.')
# Missing values become 0 before the integer cast below.
data = data.fillna(0)
# Forecaster direction flags and the label are cast to int.
int_cols = ['ES_4', 'ES_7', 'ES_12', 'SARIMAX', 'SES_0.6', 'SES_0.9', 'SES_1.2', 'TARGET']
data[int_cols] = data[int_cols].astype(int)
# Features are every column except the label; labels are a plain array.
train = data.drop('TARGET', axis = 1)
targets = data['TARGET'].values
print('Dados preparados.')

Preparando dados para modelagem.
Dados preparados.


In [8]:
# Exhaustive hyper-parameter search for a GradientBoostingClassifier on (train, targets).
print('Iniciando modelagem.')
start = time.time()
clf = GradientBoostingClassifier()
model_name = 'GradientBoosting'
print(f'Modelo selecionado: {model_name}.')
# The number of folds must be chosen carefully.
n_folds = 5
print(f'Validação cruzada - Número de grupos: {n_folds}.')
print('Definindo grade de parâmetros.')
# 6 * 12 * 9 * 1 * 4 * 2 = 5184 candidate combinations, each fitted once per fold.
parameter_grid = {
    'learning_rate': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9],
    'max_depth': list(range(4, 16)),
    'n_estimators': list(range(40, 361, 40)),
    'criterion': ['friedman_mse'],
    'min_samples_split': list(range(60, 151, 30)),
    'max_features': [None, 'auto'],
}
# An integer `cv` lets GridSearchCV build the stratified folds itself for a
# classifier, so this cell no longer depends on the legacy
# StratifiedKFold(y, n_folds=...) call signature.
grid_search = GridSearchCV(clf, param_grid=parameter_grid, cv=n_folds, verbose=0)
print('Iniciando busca de melhores parâmetros.')
grid_search.fit(train, targets)
print('Melhores parâmetros encontrados.')
found = time.time()
times = min_time(found - start)
best_score = round(grid_search.best_score_, 4)
print(f'Melhor pontuação: {best_score}')
print(f'Melhores parâmetros:')
for param, val in grid_search.best_params_.items():
    # String values are quoted so the printed lines can be pasted as keyword arguments.
    if isinstance(val, str):
        val = f'"{val}"'
    print(f'\t{param}: {val},')
print(f'Tempo de processo: {times} min')
telegram(msg = f'Busca de parâmetros concluída. Melhor pontuação: {best_score}. Tempo de execução: {times} min')

Iniciando modelagem.
Modelo selecionado: GradientBoosting.
Validação cruzada - Número de grupos: 5.
Definindo grade de parâmetros.
Iniciando busca de melhores parâmetros.
Melhores parâmetros encontrados.
Melhor pontuação: 0.5298
Melhores parâmetros:
	criterion: "friedman_mse",
	learning_rate: 0.3,
	max_depth: 12,
	max_features: None,
	min_samples_split: 60,
	n_estimators: 40,
Tempo de processo: 127:36 min


In [9]:
# Sends the same notification text as the last line of the grid-search cell
# again; it depends on `grid_search` and `times` still being defined from that cell.
telegram(msg = f'Busca de parâmetros concluída. Melhor puntuação: {round(grid_search.best_score_, 4)}. Tempo de execução: {times} min')

In [9]:
print('Criando modelo com parâmetros selecionados.')
# Hyper-parameters are written out explicitly, so this cell does not need a
# `grid_search` object in memory.
clf = GradientBoostingClassifier(
    criterion = 'friedman_mse',
    learning_rate = 0.3,
    max_depth = 12,
    max_features = None,
    min_samples_split = 60,
    n_estimators = 40
)
print('Ajustando modelo aos dados.')
clf.fit(train, targets)
print('Salvando modelo.')
# The number of training rows is used as the version tag in the file name.
nvers = len(train)
# The context manager guarantees the file is flushed and closed, even if
# pickling fails; the bare open() call previously left the handle unclosed.
with open(f'gradient_brl_{nvers}s.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)
print('Modelagem concluida.')

Criando modelo com parâmetros selecionados.
Ajustando modelo aos dados.
Salvando modelo.
Modelagem concluida.


In [10]:
# Exhaustive hyper-parameter search for a RandomForestClassifier on (train, targets).
print('Iniciando modelagem.')
start = time.time()
clf = RandomForestClassifier()
model_name = 'RandomForest'
print(f'Modelo selecionado: {model_name}.')
# The number of folds must be chosen carefully.
n_folds = 4
print(f'Validação cruzada - Número de grupos: {n_folds}.')
print('Definindo grade de parâmetros.')
# 12 * 5 * 2 = 120 candidate combinations, each fitted once per fold.
parameter_grid = {
    'max_depth': list(range(4, 16)),
    'n_estimators': list(range(200, 361, 40)),
    'max_features': [None, 'auto'],
}
# An integer `cv` lets GridSearchCV build the stratified folds itself for a
# classifier, so this cell no longer depends on the legacy
# StratifiedKFold(y, n_folds=...) call signature.
grid_search = GridSearchCV(clf, param_grid=parameter_grid, cv=n_folds, verbose=0)
print('Iniciando busca de melhores parâmetros.')
grid_search.fit(train, targets)
print('Melhores parâmetros encontrados.')
found = time.time()
times = min_time(found - start)
best_score = round(grid_search.best_score_, 4)
print(f'Melhor pontuação: {best_score}')
print(f'Melhores parâmetros:')
for param, val in grid_search.best_params_.items():
    # String values are quoted so the printed lines can be pasted as keyword arguments.
    if isinstance(val, str):
        val = f'"{val}"'
    print(f'\t{param}: {val},')
print(f'Tempo de processo: {times} min')
telegram(msg = f'Busca de parâmetros concluída. Melhor pontuação: {best_score}. Tempo de execução: {times} min')

Iniciando modelagem.
Modelo selecionado: RandomForest.
Validação cruzada - Número de grupos: 4.
Definindo grade de parâmetros.
Iniciando busca de melhores parâmetros.
Melhores parâmetros encontrados.
Melhor pontuação: 0.5325
Melhores parâmetros:
	max_depth: 9,
	max_features: None,
	n_estimators: 200,
Tempo de processo: 5:31 min


In [11]:
print('Criando modelo com parâmetros selecionados.')
# Hyper-parameters are written out explicitly, so this cell does not need a
# `grid_search` object in memory.
clf = RandomForestClassifier(
    max_depth = 9,
    max_features = None,
    n_estimators = 200
)
print('Ajustando modelo aos dados.')
clf.fit(train, targets)
print('Salvando modelo.')
# The number of training rows is used as the version tag in the file name.
nvers = len(train)
# The context manager guarantees the file is flushed and closed, even if
# pickling fails; the bare open() call previously left the handle unclosed.
with open(f'randomforest_brl_{nvers}s.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)
print('Modelagem concluida.')

Criando modelo com parâmetros selecionados.
Ajustando modelo aos dados.
Salvando modelo.
Modelagem concluida.


In [57]:
# Exhaustive hyper-parameter search for an ExtraTreesClassifier on (train, targets).
start = time.time()
clf = ExtraTreesClassifier()
# 11 * 14 * 2 = 308 candidate combinations, each fitted once per fold.
parameter_grid = {
    'max_depth': list(range(5, 16)),
    'n_estimators': list(range(50, 320, 20)),
    'criterion': ['gini', 'entropy'],
}
# The number of folds must be chosen carefully. An integer `cv` lets
# GridSearchCV build the stratified folds itself for a classifier, so this cell
# no longer depends on the legacy StratifiedKFold(y, n_folds=...) call signature.
grid_search = GridSearchCV(clf, param_grid=parameter_grid, cv=5, verbose=0)
grid_search.fit(train, targets)
found = time.time()
# The helper is named `min_time`; the previous `min_times` call raised NameError.
times = min_time(found - start)
print(f'Best score: {grid_search.best_score_}')
print(f'Best parameters: {grid_search.best_params_}')
print(f'Tempo de processo: {times} min')

Fitting 5 folds for each of 308 candidates, totalling 1540 fits


[Parallel(n_jobs=1)]: Done 1540 out of 1540 | elapsed:  5.0min finished


Best score: 0.4
Best parameters: {'criterion': 'gini', 'max_depth': 5, 'n_estimators': 110}
Time: {spmin:spsec} min


In [12]:
from keras import models
from keras import layers

# Fully connected classifier over the 14 feature columns, ending in a 2-way softmax.
model = models.Sequential()
# NOTE(review): the input layer has a single unit, so all 14 features pass
# through one ReLU before the network widens — confirm this is intended.
model.add(layers.Dense(1, activation='relu', input_shape = (14,)))
# Hidden ReLU layers, added in order.
for units in (8, 32, 128, 256, 64, 16):
    model.add(layers.Dense(units, activation='relu'))
# Output layer: one probability per class.
model.add(layers.Dense(2, activation='softmax'))

# Sparse categorical cross-entropy takes the integer labels in `targets` directly.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the full table; no validation data is passed to fit().
history = model.fit(
    train,
    targets,
    epochs=600,
    batch_size=20,
)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78

In [13]:
# Write the network architecture to a JSON file tagged with the `nvers` version.
model_json = model.to_json()
with open(f'sequential_brl_{nvers}s.json', "w") as architecture_file:
    architecture_file.write(model_json)
# Write the trained weights to a separate HDF5 file with the same version tag.
model.save_weights(f'sequential_brl_{nvers}s.h5')

In [14]:
# Display the version tag used in the saved model file names (set to len(train) when the models were saved).
nvers

755