In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
from data.data_loader import DataLoader
from utils.data_preprocessing import DataPreprocessor
from models.lstm_model import LSTMModel
from backtesting.backtester import Backtester


In [3]:
symbol = "BTC-USD"
start_date = "2020-01-01"
interval = "1d"

# Carregar os dados de hora em hora ou diário
data_loader = DataLoader(symbol=symbol, start_date=start_date, interval=interval)
data = data_loader.load_data()

Dados carregados do arquivo data\BTC-USD_2020-01-01_to_2024-10-20_1d.csv.


In [4]:
# mostrar os dados
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01,7194.892090,7254.330566,7174.944336,7200.174316,7200.174316,18565664997
2020-01-02,7202.551270,7212.155273,6935.270020,6985.470215,6985.470215,20802083465
2020-01-03,6984.428711,7413.715332,6914.996094,7344.884277,7344.884277,28111481032
2020-01-04,7345.375488,7427.385742,7309.514160,7410.656738,7410.656738,18444271275
2020-01-05,7410.451660,7544.497070,7400.535645,7411.317383,7411.317383,19725074095
...,...,...,...,...,...,...
2024-10-15,66050.367188,67881.679688,64809.195312,67041.109375,67041.109375,48863870879
2024-10-16,67042.460938,68375.289062,66758.726562,67612.718750,67612.718750,38195189534
2024-10-17,67617.078125,67912.210938,66647.390625,67399.835938,67399.835938,32790898511
2024-10-18,67419.109375,68969.750000,67177.820312,68418.789062,68418.789062,36857165014


In [5]:
# Verificar se há valores não numéricos ou inconsistentes
print("Verificando a presença de valores não numéricos...")
print(data.dtypes)  # Verificar os tipos de dados das colunas

Verificando a presença de valores não numéricos...
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object


In [6]:
data.isnull().sum()  # Verificar se há valores nulos
# data.head(20)  # Verificar se há valores inconsistentes

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [7]:
data.head(20)  # Verificar se há valores inconsistentes

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01,7194.89209,7254.330566,7174.944336,7200.174316,7200.174316,18565664997
2020-01-02,7202.55127,7212.155273,6935.27002,6985.470215,6985.470215,20802083465
2020-01-03,6984.428711,7413.715332,6914.996094,7344.884277,7344.884277,28111481032
2020-01-04,7345.375488,7427.385742,7309.51416,7410.656738,7410.656738,18444271275
2020-01-05,7410.45166,7544.49707,7400.535645,7411.317383,7411.317383,19725074095
2020-01-06,7410.452148,7781.867188,7409.292969,7769.219238,7769.219238,23276261598
2020-01-07,7768.682129,8178.21582,7768.227539,8163.692383,8163.692383,28767291327
2020-01-08,8161.935547,8396.738281,7956.774414,8079.862793,8079.862793,31672559265
2020-01-09,8082.295898,8082.295898,7842.403809,7879.071289,7879.071289,24045990466
2020-01-10,7878.307617,8166.554199,7726.774902,8166.554199,8166.554199,28714583844


In [8]:
# Remover caracteres indesejados e converter as colunas de preços para numérico
# Substituir quaisquer pontos, vírgulas ou caracteres especiais que possam estar nos dados
for col in ['Open', 'High', 'Low', 'Close', 'Adj Close']:
    data[col] = data[col].replace({',': '', '\.': ''}, regex=True)

  data[col] = data[col].replace({',': '', '\.': ''}, regex=True)


In [9]:
data.head(20) 

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01,7194.89209,7254.330566,7174.944336,7200.174316,7200.174316,18565664997
2020-01-02,7202.55127,7212.155273,6935.27002,6985.470215,6985.470215,20802083465
2020-01-03,6984.428711,7413.715332,6914.996094,7344.884277,7344.884277,28111481032
2020-01-04,7345.375488,7427.385742,7309.51416,7410.656738,7410.656738,18444271275
2020-01-05,7410.45166,7544.49707,7400.535645,7411.317383,7411.317383,19725074095
2020-01-06,7410.452148,7781.867188,7409.292969,7769.219238,7769.219238,23276261598
2020-01-07,7768.682129,8178.21582,7768.227539,8163.692383,8163.692383,28767291327
2020-01-08,8161.935547,8396.738281,7956.774414,8079.862793,8079.862793,31672559265
2020-01-09,8082.295898,8082.295898,7842.403809,7879.071289,7879.071289,24045990466
2020-01-10,7878.307617,8166.554199,7726.774902,8166.554199,8166.554199,28714583844


In [10]:
# Converter colunas de preços para float
cols_to_convert = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
for col in cols_to_convert:
    data[col] = pd.to_numeric(data[col], errors='coerce')  # Forçar conversão para numérico, erros para NaN


In [11]:
data.head(20) 

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01,7194.89209,7254.330566,7174.944336,7200.174316,7200.174316,18565664997
2020-01-02,7202.55127,7212.155273,6935.27002,6985.470215,6985.470215,20802083465
2020-01-03,6984.428711,7413.715332,6914.996094,7344.884277,7344.884277,28111481032
2020-01-04,7345.375488,7427.385742,7309.51416,7410.656738,7410.656738,18444271275
2020-01-05,7410.45166,7544.49707,7400.535645,7411.317383,7411.317383,19725074095
2020-01-06,7410.452148,7781.867188,7409.292969,7769.219238,7769.219238,23276261598
2020-01-07,7768.682129,8178.21582,7768.227539,8163.692383,8163.692383,28767291327
2020-01-08,8161.935547,8396.738281,7956.774414,8079.862793,8079.862793,31672559265
2020-01-09,8082.295898,8082.295898,7842.403809,7879.071289,7879.071289,24045990466
2020-01-10,7878.307617,8166.554199,7726.774902,8166.554199,8166.554199,28714583844


In [12]:
# Verificar se há valores não numéricos ou inconsistentes
print("Verificando a presença de valores não numéricos...")
print(data.dtypes)  # Verificar os tipos de dados das colunas

Verificando a presença de valores não numéricos...
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object


In [13]:
preprocessor = DataPreprocessor()
X, Y, dates, scaler = preprocessor.preprocess(data)

# Verificar se o número de amostras está consistente entre X, Y e dates
if len(X) != len(Y) or len(X) != len(dates):
    print(
        f"Inconsistência encontrada: X tem {len(X)} amostras, Y tem {len(Y)} amostras, e dates tem {len(dates)}."
    )

Iniciando o preprocessamento dos dados...
Preprocessamento concluído.
