In [34]:
import kagglehub

path = kagglehub.dataset_download("imranbukhari/comprehensive-bnbusd-1m-data")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/imranbukhari/comprehensive-bnbusd-1m-data?dataset_version_number=22...


100%|████████████████████████████████████████| 311M/311M [00:55<00:00, 5.87MB/s]

Extracting files...





Path to dataset files: /Users/reefu/.cache/kagglehub/datasets/imranbukhari/comprehensive-bnbusd-1m-data/versions/22


In [15]:
import pandas as pd
import numpy as np
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("/Users/reefu/.cache/kagglehub/datasets/imranbukhari/comprehensive-bnbusd-1m-data/versions/22/BNBUSD_1m_Binance.csv")

df.sample(5)

Unnamed: 0,Open time,Open,High,Low,Close,Volume,Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume,Ignore
3380787,2024-04-16 15:14:00,531.8,531.8,530.1,530.2,399.471,2024-04-16 15:14:59.999,211965.5144,483.0,114.243,60611.6565,0.0
1930961,2021-07-14 11:38:00,302.52,302.61,301.78,301.89,2941.3436,2021-07-14 11:38:59.999,888653.071529,636.0,886.1401,267781.24886,0.0
1917273,2021-07-04 23:30:00,308.44,308.74,308.33,308.64,386.3261,2021-07-04 23:30:59.999,119247.140955,177.0,301.3769,93028.622955,0.0
2226960,2022-02-05 07:27:00,424.7,424.7,424.2,424.5,307.907,2022-02-05 07:27:59.999,130673.9911,257.0,119.968,50915.896,0.0
3036275,2023-08-21 09:22:00,215.2,215.2,215.0,215.0,235.945,2023-08-21 09:22:59.999,50752.5491,63.0,21.656,4658.9895,0.0


In [3]:
df['Open time'] = pd.to_datetime(df['Open time'])

In [4]:
features = ["Open", "High", "Low", "Close", "Volume", "Number of trades"]
df = df[["Open time"] + features]

In [5]:
df.set_index('Open time', inplace=True)

In [6]:
df = df.resample('D').agg({
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum',
    'Number of trades': 'sum'
})

In [7]:
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

In [8]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Number of trades
Open time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-11-06,1.3e-05,0.000281,0.0,0.000108,0.002115,1.5e-05
2017-11-07,0.000108,0.000282,0.001425,0.000414,0.002787,0.0
2017-11-08,0.000414,0.000522,0.001711,0.000667,0.013387,0.000283
2017-11-09,0.000667,0.000787,0.00192,0.000668,0.010391,0.000107
2017-11-10,0.000668,0.00056,0.001511,0.000298,0.020217,0.000203


In [9]:
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i : i + seq_length])
        y.append(data[i + seq_length, -1])
    return np.array(X), np.array(y)

seq_length = 25  
data = df.values
X, y = create_sequences(data, seq_length)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [14]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [20]:
def get_data_loader(X, y, batch_size):
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
    
    dataset = TensorDataset(X_tensor, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [21]:
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(BiLSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.dropout(out[:, -1, :])
        out = self.fc(out)
        return out

In [22]:
def objective(trial):
    # Tune hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.1, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    epochs = trial.suggest_int("epochs", 50, 150, step=50)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)

    # Get DataLoader
    train_loader = get_data_loader(X_train, y_train, batch_size)

    # Initialize Model
    model = BiLSTMModel(input_size=6, hidden_size=128, num_layers=2, dropout=dropout)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training Loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
    
    return total_loss / len(train_loader)

In [23]:
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=50)

[I 2025-03-19 05:58:04,086] A new study created in memory with name: no-name-16b87f30-bc06-492c-97b4-4ed0f5dc825c
[I 2025-03-19 06:00:26,739] Trial 0 finished with value: 0.02697115042246878 and parameters: {'learning_rate': 0.06757196147013436, 'batch_size': 32, 'epochs': 100, 'dropout': 0.30846635521314913}. Best is trial 0 with value: 0.02697115042246878.
[I 2025-03-19 06:02:29,063] Trial 1 finished with value: 0.002142420987023369 and parameters: {'learning_rate': 0.0003017858348195838, 'batch_size': 128, 'epochs': 150, 'dropout': 0.2792094778754688}. Best is trial 1 with value: 0.002142420987023369.
[I 2025-03-19 06:03:20,367] Trial 2 finished with value: 0.00237863479626619 and parameters: {'learning_rate': 0.005620720580428703, 'batch_size': 64, 'epochs': 50, 'dropout': 0.26641497321559876}. Best is trial 1 with value: 0.002142420987023369.
[I 2025-03-19 06:04:12,731] Trial 3 finished with value: 0.002546184495176104 and parameters: {'learning_rate': 0.005690231163062163, 'batch

In [24]:
best_params = study.best_params
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'learning_rate': 0.010750156717437719, 'batch_size': 128, 'epochs': 150, 'dropout': 0.20131669209387182}


In [25]:
best_model = BiLSTMModel(input_size=6, hidden_size=128, num_layers=2, dropout=best_params["dropout"])
best_optimizer = optim.Adam(best_model.parameters(), lr=best_params["learning_rate"])
criterion = nn.MSELoss()
train_loader = get_data_loader(X_train, y_train, best_params["batch_size"])

In [26]:
for epoch in range(best_params["epochs"]):
    best_model.train()
    for X_batch, y_batch in train_loader:
        best_optimizer.zero_grad()
        predictions = best_model(X_batch)
        loss = criterion(predictions, y_batch)
        loss.backward()
        best_optimizer.step()

KeyboardInterrupt: 