# N-BEATS with TDA Features - Crypto Price Prediction

Based on: [Enhancing financial time series forecasting through topological data analysis](https://link.springer.com/article/10.1007/s00521-024-10787-x)

## 1. Setup & GPU Check

In [None]:
# Check GPU
!nvidia-smi

In [None]:
# Install dependencies
!pip install torch pandas scikit-learn ripser matplotlib tqdm -q

In [None]:
# Clone repository
!git clone https://github.com/hiseongmin/-binance-data-collector.git
%cd -binance-data-collector

In [None]:
import torch

# Report the installed PyTorch build and, when CUDA is usable, the first GPU.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; show it in decimal gigabytes.
    print(f"VRAM: {gpu_props.total_memory / 1e9:.1f} GB")

## 2. Collect Data (Optional - skip if the data already exists)

In [None]:
# Collect fresh data (takes ~3 min)
# Skip if you already have data
!pip install requests -q
!python main.py --symbol BTCUSDT --days 365 --market futures

In [None]:
# Check data
!ls -la data/

## 3. Import Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from tqdm import tqdm

# Local imports
from ml.features.tda_features import TDAFeatureExtractor
from ml.models.nbeats import NBeatsWithTDA

## 4. Configuration

In [None]:
# Experiment settings shared by every cell below.
CONFIG = {
    # Input CSV of 5-minute candles.
    'data_path': 'data/BTCUSDT_perp_5m.csv',
    # Window sizes, counted in 5-minute bars.
    'lookback': 96,       # 96 bars * 5 min = 8 hours of history
    'horizon': 12,        # 12 bars * 5 min = 1 hour forecast
    'tda_window': 50,     # bars per TDA window
    # Optimisation.
    'batch_size': 128,
    'epochs': 100,
    'lr': 1e-3,
    # Network size.
    'hidden_size': 256,
    'n_stacks': 2,
    'n_blocks': 3,
}

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

## 5. Load & Preprocess Data

In [None]:
# Read the candle CSV named in CONFIG and summarise what was loaded.
print("Loading data...")
csv_path = CONFIG['data_path']
df = pd.read_csv(csv_path)
open_times = df['open_time']
print(f"Data shape: {df.shape}")
print(f"Date range: {open_times.iloc[0]} ~ {open_times.iloc[-1]}")
# Last expression of the cell: the notebook renders the first rows.
df.head()

In [None]:
# Extract and normalize prices
prices = df['close'].values.astype(np.float32)

# Fit the scaler on the training span only. Fitting on the whole series would
# let the mean/std of the validation and test periods leak into the training
# inputs. The 0.7 fraction mirrors the 70/15/15 chronological split used when
# the datasets are created further down; keep the two in sync.
train_cutoff = max(int(0.7 * len(prices)), 1)
price_scaler = StandardScaler()
price_scaler.fit(prices[:train_cutoff].reshape(-1, 1))

# Apply the training-period statistics to the full series.
prices_norm = price_scaler.transform(prices.reshape(-1, 1)).flatten()

print(f"Price range: ${prices.min():.2f} ~ ${prices.max():.2f}")
print(f"Normalized range: {prices_norm.min():.2f} ~ {prices_norm.max():.2f}")

## 6. Extract TDA Features

In [None]:
%%time
print("Extracting TDA features (this may take a few minutes)...")

tda_extractor = TDAFeatureExtractor(window_size=CONFIG['tda_window'])
# NOTE(review): assumes extract_features returns a 2-D array with one row per
# complete window, i.e. (len(prices_norm) - tda_window + 1, n_features) - confirm
# against ml/features/tda_features.py.
raw_tda = np.asarray(tda_extractor.extract_features(prices_norm), dtype=np.float32)

# Normalize. The scaler is fit on real feature rows from the training span only:
#  * the zero rows used for padding are excluded so they cannot skew mean/std;
#  * rows from the validation/test periods are excluded to avoid leakage.
# Row j of raw_tda belongs to price index j + n_pad; 0.7 mirrors the 70/15/15
# chronological split used when the datasets are created further down.
n_pad = CONFIG['tda_window'] - 1
train_rows = max(int(0.7 * len(prices_norm)) - n_pad, 1)
tda_scaler = StandardScaler()
tda_scaler.fit(raw_tda[:train_rows])
scaled_tda = tda_scaler.transform(raw_tda)

# Pad to match length. Padding happens after scaling, so the filler rows sit at
# 0, the training mean in standardized units. The column count is taken from
# the extractor output instead of being hard-coded.
padding = np.zeros((n_pad, scaled_tda.shape[1]), dtype=np.float32)
tda_features = np.vstack([padding, scaled_tda]).astype(np.float32)

print(f"TDA features shape: {tda_features.shape}")
print(f"Features: [Entropy, Amplitude, NumPoints]")

In [None]:
# Plot a 2000-step excerpt of each TDA feature column on stacked, x-aligned axes.
fig, axes = plt.subplots(3, 1, figsize=(14, 8), sharex=True)

sample_range = slice(1000, 3000)

# (y-axis label, extra line styling) for feature columns 0, 1 and 2.
panels = [
    ('Entropy', {}),
    ('Amplitude', {'color': 'orange'}),
    ('Num Points', {'color': 'green'}),
]
for column, (axis, (label, line_style)) in enumerate(zip(axes, panels)):
    axis.plot(tda_features[sample_range, column], alpha=0.7, **line_style)
    axis.set_ylabel(label)

axes[0].set_title('TDA Features Over Time')
axes[2].set_xlabel('Time')

plt.tight_layout()
plt.show()

## 7. Create Dataset

In [None]:
def create_sequences(prices, tda, lookback, horizon):
    """Build supervised (history, TDA, target) samples from aligned series.

    For every forecast origin ``i`` one sample is produced:

    * ``X``: ``prices[i - lookback:i]`` - the ``lookback`` values before ``i``
    * ``Y``: ``prices[i:i + horizon]`` - the ``horizon`` values starting at ``i``
    * ``T``: ``tda[i - 1]`` - the TDA entry at the last observed step

    Args:
        prices: 1-D sequence of (normalized) prices.
        tda: Sequence indexed like ``prices`` holding the TDA features.
        lookback: Number of past steps in each input window.
        horizon: Number of future steps in each target window.

    Returns:
        Tuple ``(X, Y, T)`` of numpy arrays with one row per sample. When the
        series is too short for a single sample the arrays are empty but keep
        their trailing dimensions, e.g. ``X.shape == (0, lookback)``.
    """
    prices = np.asarray(prices)
    tda = np.asarray(tda)

    # The last usable origin is len(prices) - horizon: its target window ends
    # exactly at the end of the series. range() excludes its stop value, hence
    # the + 1 (without it the final valid sample is silently dropped).
    origins = range(lookback, len(prices) - horizon + 1)

    if len(origins) == 0:
        # np.array([]) would collapse to shape (0,); keep well-formed shapes so
        # downstream slicing and tensor conversion still work.
        return (
            np.empty((0, lookback), dtype=prices.dtype),
            np.empty((0, horizon), dtype=prices.dtype),
            np.empty((0,) + tda.shape[1:], dtype=tda.dtype),
        )

    X = np.array([prices[i - lookback:i] for i in origins])
    Y = np.array([prices[i:i + horizon] for i in origins])
    T = np.array([tda[i - 1] for i in origins])
    return X, Y, T

# Window the normalized prices and TDA features into model-ready samples.
sequences = create_sequences(
    prices=prices_norm,
    tda=tda_features,
    lookback=CONFIG['lookback'],
    horizon=CONFIG['horizon'],
)
X, Y, T = sequences

print(f"X shape: {X.shape}")
print(f"Y shape: {Y.shape}")
print(f"T shape: {T.shape}")

In [None]:
# Chronological 70/15/15 split: the samples are cut at two boundaries, never shuffled.
n = len(X)
train_end = int(0.7 * n)
val_end = int(0.85 * n)
boundaries = [train_end, val_end]

# np.split at the two boundaries yields the same three consecutive slices
# for each of the aligned arrays.
X_train, X_val, X_test = np.split(X, boundaries)
Y_train, Y_val, Y_test = np.split(Y, boundaries)
T_train, T_val, T_test = np.split(T, boundaries)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

In [None]:
# Wrap each split in a DataLoader that yields (price window, TDA row, target) batches.
def _make_loader(windows, tda_rows, targets, shuffle=False):
    """Return a DataLoader over float tensors built from the three aligned arrays."""
    dataset = TensorDataset(
        torch.FloatTensor(windows),
        torch.FloatTensor(tda_rows),
        torch.FloatTensor(targets),
    )
    return DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=shuffle)


# Only the training loader is shuffled; validation and test keep their order.
train_loader = _make_loader(X_train, T_train, Y_train, shuffle=True)
val_loader = _make_loader(X_val, T_val, Y_val)
test_loader = _make_loader(X_test, T_test, Y_test)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

## 8. Create Model

In [None]:
# Instantiate the TDA-enabled N-BEATS variant from the shared CONFIG and move it to `device`.
architecture_keys = ('lookback', 'horizon', 'n_stacks', 'n_blocks', 'hidden_size')
model_kwargs = {key: CONFIG[key] for key in architecture_keys}
model = NBeatsWithTDA(**model_kwargs, use_tda=True)
model = model.to(device)

# Total number of scalar parameters across all tensors.
n_params = 0
for param in model.parameters():
    n_params += param.numel()
print(f"Model parameters: {n_params:,}")
print(model)

## 9. Training

In [None]:
# Loss, optimizer and LR schedule for the TDA model.
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CONFIG['lr'],
    weight_decay=1e-5,
)
# Halve the learning rate once the monitored loss has not improved for 10 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=10,
)

# Per-epoch history plus the best validation loss seen so far.
train_losses, val_losses = [], []
best_val_loss = float('inf')

In [None]:
# Training loop: one optimisation pass and one validation pass per epoch.
# Epoch losses are per-sample averages (each batch mean is weighted by its
# batch size), so a short final batch does not get over-weighted as it would
# in a plain mean of batch means.
for epoch in range(CONFIG['epochs']):
    # Train
    model.train()
    train_loss = 0.0
    n_train = 0
    for x, t, y in train_loader:
        x, t, y = x.to(device), t.to(device), y.to(device)

        optimizer.zero_grad()
        pred = model(x, t)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        batch_size = x.size(0)
        train_loss += loss.item() * batch_size
        n_train += batch_size

    train_loss /= n_train
    train_losses.append(train_loss)

    # Validation (no gradients, eval mode)
    model.eval()
    val_loss = 0.0
    n_val = 0
    with torch.no_grad():
        for x, t, y in val_loader:
            x, t, y = x.to(device), t.to(device), y.to(device)
            pred = model(x, t)
            batch_size = x.size(0)
            val_loss += criterion(pred, y).item() * batch_size
            n_val += batch_size

    val_loss /= n_val
    val_losses.append(val_loss)

    # The plateau scheduler monitors the validation loss.
    scheduler.step(val_loss)

    # Save best model: checkpoint whenever validation improves.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pt')

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{CONFIG['epochs']} - Train: {train_loss:.6f}, Val: {val_loss:.6f}")

In [None]:
# Draw the per-epoch training and validation loss curves on one axis.
fig, ax = plt.subplots(figsize=(10, 4))
for history, label in ((train_losses, 'Train'), (val_losses, 'Validation')):
    ax.plot(history, label=label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training Progress')
ax.legend()
ax.grid(True)
plt.show()

## 10. Evaluation

In [None]:
# Load best model. map_location remaps the saved tensors onto the active
# device, so a checkpoint written on a GPU also loads on a CPU-only runtime.
best_state = torch.load('best_model.pt', map_location=device)
model.load_state_dict(best_state)
model.eval()

# Predict on test set, collecting predictions and targets batch by batch.
all_preds = []
all_targets = []

with torch.no_grad():
    for x, t, y in test_loader:
        x, t = x.to(device), t.to(device)
        pred = model(x, t)
        # Move predictions back to host memory; targets never left the CPU.
        all_preds.append(pred.cpu().numpy())
        all_targets.append(y.numpy())

preds = np.concatenate(all_preds)
targets = np.concatenate(all_targets)

print(f"Predictions shape: {preds.shape}")
print(f"Targets shape: {targets.shape}")

In [None]:
# Error metrics over every horizon step, in normalized price units.
residuals = preds - targets
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(residuals))

# Direction accuracy: does the sign of (last step - first step) of the
# predicted window agree with that of the actual window?
pred_dir = np.sign(preds[:, -1] - preds[:, 0])
true_dir = np.sign(targets[:, -1] - targets[:, 0])
dir_acc = np.mean(pred_dir == true_dir) * 100

rule = "=" * 40
print(rule)
print("TEST RESULTS")
print(rule)
print(f"MSE:  {mse:.6f}")
print(f"RMSE: {rmse:.6f}")
print(f"MAE:  {mae:.6f}")
print(f"Direction Accuracy: {dir_acc:.2f}%")
print(rule)

In [None]:
# 2x2 diagnostic figure for the test-set predictions; also written to results.png.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_series, ax_scatter), (ax_errors, ax_horizon) = axes

# 1. Predictions vs Actual (first step) for the first n_show samples
n_show = 300
ax_series.plot(targets[:n_show, 0], label='Actual', alpha=0.7)
ax_series.plot(preds[:n_show, 0], label='Predicted', alpha=0.7)
ax_series.set_title('Predictions vs Actual (1-step ahead)')
ax_series.legend()
ax_series.grid(True)

# 2. Scatter plot with the y = x reference line spanning the target range
target_lo, target_hi = targets.min(), targets.max()
ax_scatter.scatter(targets[:, 0], preds[:, 0], alpha=0.3, s=5)
ax_scatter.plot([target_lo, target_hi], [target_lo, target_hi], 'r--')
ax_scatter.set_xlabel('Actual')
ax_scatter.set_ylabel('Predicted')
ax_scatter.set_title('Scatter Plot')
ax_scatter.grid(True)

# 3. Error distribution of the first-step forecast
errors = preds[:, 0] - targets[:, 0]
ax_errors.hist(errors, bins=50, edgecolor='black', alpha=0.7)
ax_errors.axvline(0, color='r', linestyle='--')
ax_errors.set_xlabel('Prediction Error')
ax_errors.set_title('Error Distribution')
ax_errors.grid(True)

# 4. Full horizon prediction example for a single test sample
sample_idx = 100
steps = range(CONFIG['horizon'])
ax_horizon.plot(steps, targets[sample_idx], 'b-o', label='Actual')
ax_horizon.plot(steps, preds[sample_idx], 'r-o', label='Predicted')
ax_horizon.set_xlabel('Steps Ahead (5min each)')
ax_horizon.set_ylabel('Price (normalized)')
ax_horizon.set_title(f'Full Horizon Prediction (Sample {sample_idx})')
ax_horizon.legend()
ax_horizon.grid(True)

plt.tight_layout()
plt.savefig('results.png', dpi=150)
plt.show()

## 11. Compare with Baseline (No TDA)

In [None]:
# Train baseline model without TDA.
# To keep the comparison fair, the baseline gets the same training recipe as
# the TDA model: AdamW with weight decay, a plateau LR schedule driven by the
# validation loss, and evaluation from the best-validation checkpoint rather
# than from whatever the weights happen to be after the final epoch.
print("Training baseline model (without TDA)...")

baseline_model = NBeatsWithTDA(
    lookback=CONFIG['lookback'],
    horizon=CONFIG['horizon'],
    n_stacks=CONFIG['n_stacks'],
    n_blocks=CONFIG['n_blocks'],
    hidden_size=CONFIG['hidden_size'],
    use_tda=False  # No TDA
).to(device)

baseline_optimizer = torch.optim.AdamW(
    baseline_model.parameters(), lr=CONFIG['lr'], weight_decay=1e-5
)
baseline_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    baseline_optimizer, mode='min', factor=0.5, patience=10
)
baseline_best_val = float('inf')

for epoch in range(CONFIG['epochs']):
    # Train (the TDA tensor in each batch is ignored by the baseline)
    baseline_model.train()
    for x, _tda, y in train_loader:
        x, y = x.to(device), y.to(device)
        baseline_optimizer.zero_grad()
        pred = baseline_model(x, None)
        loss = criterion(pred, y)
        loss.backward()
        baseline_optimizer.step()

    # Validation: per-sample average loss over the validation set
    baseline_model.eval()
    val_total = 0.0
    val_count = 0
    with torch.no_grad():
        for x, _tda, y in val_loader:
            x, y = x.to(device), y.to(device)
            pred = baseline_model(x, None)
            val_total += criterion(pred, y).item() * x.size(0)
            val_count += x.size(0)
    baseline_val = val_total / val_count

    baseline_scheduler.step(baseline_val)

    # Checkpoint whenever validation improves
    if baseline_val < baseline_best_val:
        baseline_best_val = baseline_val
        torch.save(baseline_model.state_dict(), 'best_baseline_model.pt')

    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1}/{CONFIG['epochs']}")

# Restore the best-validation weights so later evaluation uses them.
baseline_model.load_state_dict(
    torch.load('best_baseline_model.pt', map_location=device)
)

print("Baseline training complete!")

In [None]:
# Compare results: score the baseline on the test set and print it next to
# the TDA model's metrics.
baseline_model.eval()
baseline_preds = []

with torch.no_grad():
    for x, t, y in test_loader:
        x = x.to(device)
        pred = baseline_model(x, None)
        baseline_preds.append(pred.cpu().numpy())

baseline_preds = np.concatenate(baseline_preds)

# Metrics (same definitions as used for the TDA model)
baseline_mse = np.mean((baseline_preds - targets) ** 2)
baseline_dir = np.sign(baseline_preds[:, -1] - baseline_preds[:, 0])
baseline_dir_acc = np.mean(baseline_dir == true_dir) * 100

# Attach the percent sign BEFORE padding. Formatting as `{x:<15.2f}%` pads the
# number to 15 characters first and only then appends '%', which detaches the
# sign from its value and pushes the next column out of alignment.
tda_dir_cell = f"{dir_acc:.2f}%"
baseline_dir_cell = f"{baseline_dir_acc:.2f}%"

print("\n" + "=" * 50)
print("COMPARISON: TDA vs No-TDA")
print("=" * 50)
print(f"{'Metric':<20} {'With TDA':<15} {'Without TDA':<15}")
print("-" * 50)
print(f"{'MSE':<20} {mse:<15.6f} {baseline_mse:<15.6f}")
print(f"{'Direction Acc':<20} {tda_dir_cell:<15} {baseline_dir_cell:<15}")
print("=" * 50)
print(f"\nTDA Improvement: {baseline_mse - mse:.6f} MSE, {dir_acc - baseline_dir_acc:.2f}% Direction")

## 12. Save Model

In [None]:
# Bundle the current model weights with the config and both scalers' fitted
# statistics, so inputs can be normalized the same way at inference time.
bundle_path = 'nbeats_tda_complete.pt'
bundle = {
    'model_state_dict': model.state_dict(),
    'config': CONFIG,
    'price_scaler_mean': price_scaler.mean_,
    'price_scaler_scale': price_scaler.scale_,
    'tda_scaler_mean': tda_scaler.mean_,
    'tda_scaler_scale': tda_scaler.scale_,
}
torch.save(bundle, bundle_path)

print("Model saved to nbeats_tda_complete.pt")