## Imports and random seeds

In [5]:

import pandas as pd
import numpy as np
import torch
import random
import pickle
import os
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [6]:

# Seed every random number generator the notebook relies on (NumPy, PyTorch,
# Python's `random`) with one shared value so repeated runs draw the same
# numbers.
SEED = 42
for seed_rng in (np.random.seed, torch.manual_seed, random.seed):
    seed_rng(SEED)


## Extract Features

In [8]:
# Build (or reload) the preprocessed dataset.
#
# Produces, in either branch:
#   ticker_data  - dict: ticker -> {'features': float32 array, 'close': float32 array}
#   samples      - list of (ticker, i) pairs; i is the exclusive end row of a window
#   FEATURE_COLS - names of the engineered feature columns, in column order
#   window_size / horizon - windowing parameters used to build `samples`
CHECKPOINT_FILE = 'dataset_checkpoint.pkl'

if os.path.exists(CHECKPOINT_FILE):
    print(f"Loading preprocessed dataset from {CHECKPOINT_FILE}...")
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # checkpoint file that this notebook wrote itself.
    with open(CHECKPOINT_FILE, 'rb') as f:
        checkpoint = pickle.load(f)

    ticker_data = checkpoint['ticker_data']
    samples = checkpoint['samples']
    FEATURE_COLS = checkpoint['feature_cols']
    # Restore the windowing parameters too, so both branches define the same
    # set of names. The defaults match the values used when building from
    # scratch below.
    window_size = checkpoint.get('window_size', 60)
    horizon = checkpoint.get('horizon', 30)

    print(f"✓ Loaded {len(samples):,} samples from {len(ticker_data)} tickers")
    print(f"✓ Features: {FEATURE_COLS}")

else:
    print("No checkpoint found. Processing data from scratch...")

    # Load data
    print("Loading CSV...")
    df = pd.read_csv('../data/interim/train_clean_2010.csv')

    # pct_change() and shift(1) operate on row order within each ticker, so
    # the rows must be in chronological order BEFORE the features are built.
    df = df.sort_values(['ticker', 'date']).reset_index(drop=True)

    # Extract Features
    print("Engineering features...")
    FEATURE_COLS = []
    by_ticker = df.groupby('ticker')

    # Simple one-row return of the close price.
    df['return_1d'] = by_ticker['close'].pct_change()
    FEATURE_COLS.append('return_1d')

    # Log return against the previous row's close of the same ticker.
    df['log_return'] = np.log(df['close'] / by_ticker['close'].shift(1))
    FEATURE_COLS.append('log_return')

    # Per-ticker z-score of volume, using built-in group aggregations instead
    # of a Python lambda per group (same result, both use sample std).
    # NOTE(review): mean/std are taken over each ticker's full history, so
    # every row sees statistics from later rows. Confirm this look-ahead is
    # acceptable for the modelling goal.
    volume_mean = by_ticker['volume'].transform('mean')
    volume_std = by_ticker['volume'].transform('std')
    df['volume_z'] = (df['volume'] - volume_mean) / (volume_std + 1e-6)
    FEATURE_COLS.append('volume_z')

    # High-low range relative to the close.
    df['hl_range'] = (df['high'] - df['low']) / df['close']
    FEATURE_COLS.append('hl_range')

    # Zero prices produce +/-inf in the ratio features; dropna() would keep
    # those rows, so turn them into NaN first.
    df[FEATURE_COLS] = df[FEATURE_COLS].replace([np.inf, -np.inf], np.nan)
    df = df.dropna().reset_index(drop=True)
    print(f"Data shape after cleaning: {df.shape}")

    # Convert to numpy arrays for FAST indexing
    print("Converting to numpy arrays...")
    ticker_data = {}
    samples = []
    window_size = 60
    horizon = 30

    for ticker, group in tqdm(df.groupby('ticker'), desc="Processing tickers"):
        group = group.sort_values('date').reset_index(drop=True)
        n = len(group)

        # Too short to hold even one window plus its look-ahead horizon.
        if n < window_size + horizon:
            continue

        # Store as numpy arrays so per-sample access is plain array slicing.
        ticker_data[ticker] = {
            'features': group[FEATURE_COLS].values.astype(np.float32),
            'close': group['close'].values.astype(np.float32)
        }

        # Every i in [window_size, n - horizon) has a full window before it
        # and a row at i + horizon after it.
        samples.extend((ticker, i) for i in range(window_size, n - horizon))

    print(f"✓ Processed {len(samples):,} samples from {len(ticker_data)} tickers")

    # Save checkpoint
    print(f"Saving checkpoint to {CHECKPOINT_FILE}...")
    checkpoint = {
        'ticker_data': ticker_data,
        'samples': samples,
        'feature_cols': FEATURE_COLS,
        'window_size': window_size,
        'horizon': horizon
    }
    with open(CHECKPOINT_FILE, 'wb') as f:
        pickle.dump(checkpoint, f)
    print("✓ Checkpoint saved!")


No checkpoint found. Processing data from scratch...
Loading CSV...
Engineering features...
Data shape after cleaning: (12525922, 13)
Converting to numpy arrays...


Processing tickers: 100%|██████████| 5000/5000 [00:12<00:00, 404.39it/s] 


✓ Processed 12,075,922 samples from 5000 tickers
Saving checkpoint to dataset_checkpoint.pkl...
✓ Checkpoint saved!


## Sliding window + label generation

In [9]:
from torch.utils.data import Dataset
import torch
import numpy as np
class FastStockDataset(Dataset):
    """Map-style dataset that slices windows out of per-ticker NumPy arrays.

    Args:
        ticker_data: mapping ticker -> dict with a 'features' array and a
            'close' array, both indexed by row position.
        samples: sequence of (ticker, i) pairs, where i is the exclusive end
            row of the feature window.
        window_size: number of rows in each feature window.
        horizon: offset added to i to locate the future close used for the
            label.
    """

    def __init__(self, ticker_data, samples, window_size=60, horizon=30):
        self.ticker_data, self.samples = ticker_data, samples
        self.window_size, self.horizon = window_size, horizon

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (window, label) tensors for the idx-th (ticker, i) pair."""
        ticker, end = self.samples[idx]
        arrays = self.ticker_data[ticker]

        # Rows [end - window_size, end); from_numpy shares memory with the
        # stored array rather than copying it.
        window = arrays['features'][end - self.window_size:end]

        # Label is 1.0 when the close at row end + horizon is strictly above
        # the close of the last row inside the window (row end - 1).
        # NOTE(review): those two rows are horizon + 1 apart - confirm that
        # is the intended look-ahead.
        closes = arrays['close']
        went_up = closes[end + self.horizon] > closes[end - 1]
        label = torch.tensor(float(went_up), dtype=torch.float32)

        return torch.from_numpy(window), label

class StockSequenceDataset(Dataset):
    """DataFrame-backed dataset that builds each window on access.

    Args:
        df: frame with at least 'ticker', 'date', 'close' and every column
            named in feature_cols.
        feature_cols: list of column names used as model inputs.
        window_size: number of rows in each feature window.
        horizon: offset added to the window end to locate the future close.
    """

    def __init__(self, df, feature_cols, window_size=60, horizon=30):
        self.feature_cols = feature_cols
        self.window_size = window_size
        self.horizon = horizon

        self.ticker_groups = {}  # ticker -> date-sorted frame with a fresh 0..n-1 index
        self.samples = []        # (ticker, exclusive window end row)

        min_rows = window_size + horizon
        for ticker, rows in df.groupby('ticker'):
            ordered = rows.sort_values('date').reset_index(drop=True)
            n_rows = len(ordered)

            # Keep only tickers long enough for one window plus the horizon.
            if n_rows >= min_rows:
                self.ticker_groups[ticker] = ordered
                self.samples.extend(
                    (ticker, end) for end in range(window_size, n_rows - horizon)
                )

        print(f"Dataset initialized: {len(self.samples)} samples from {len(self.ticker_groups)} tickers")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (window, label) float32 tensors for the idx-th sample."""
        ticker, end = self.samples[idx]
        frame = self.ticker_groups[ticker]

        # Feature rows [end - window_size, end), selected by position.
        start = end - self.window_size
        window = frame[self.feature_cols].iloc[start:end].to_numpy()

        # 1.0 when the close at row end + horizon is strictly above the close
        # of the last row in the window, else 0.0.
        closes = frame['close']
        label = float(closes.iloc[end + self.horizon] > closes.iloc[end - 1])

        return (
            torch.tensor(window, dtype=torch.float32),
            torch.tensor(label, dtype=torch.float32),
        )

In [11]:
# Wrap the preprocessed per-ticker arrays and the (ticker, i) sample index in
# the NumPy-backed dataset. The window and horizon literals must match the
# values that were used to generate `samples`.
dataset = FastStockDataset(
    ticker_data=ticker_data,
    samples=samples,
    window_size=60,
    horizon=30,
)


In [None]:
import torch
import time
from torch.utils.data import DataLoader

# Data-pipeline smoke test: times single-sample access, batch loading with
# and without worker processes, reports the GPU, and runs one optimisation
# step on a throwaway linear model. Each step catches its own errors so the
# later steps still run.
import traceback
from torch import nn

print("=" * 60)
print("DIAGNOSTIC TEST")
print("=" * 60)

# Step 1: single sample speed
print("\n1. Testing single sample access...")
start = time.perf_counter()
try:
    x, y = dataset[0]
    elapsed = time.perf_counter() - start
    print(f"   ✓ Single sample: {elapsed:.4f}s")
    if elapsed > 0.01:
        # Message now states the same threshold the check uses.
        print("   ⚠ WARNING: Too slow! Should be <0.01s")
    print(f"   Shape: {x.shape}")
except Exception as e:
    print(f"   ✗ ERROR: {e}")

# Step 2: DataLoader with NO workers (everything runs in this process)
print("\n2. Testing DataLoader with num_workers=0...")
test_loader = DataLoader(dataset, batch_size=128, num_workers=0)
start = time.perf_counter()
try:
    xb, yb = next(iter(test_loader))
    elapsed = time.perf_counter() - start
    print(f"   ✓ First batch (no workers): {elapsed:.2f}s")
    print(f"   Shapes: {xb.shape}, {yb.shape}")
except Exception as e:
    print(f"   ✗ ERROR: {e}")

# Step 3: DataLoader WITH workers
# NOTE(review): the recorded output of this cell stops at this step. If the
# kernel runs on a platform that starts workers with "spawn" (e.g. Windows),
# a Dataset class defined inside the notebook may not be importable by the
# workers - confirm, and consider moving FastStockDataset into a .py module.
print("\n3. Testing DataLoader with num_workers=4...")
test_loader = DataLoader(
    dataset,
    batch_size=128,
    num_workers=4,
    persistent_workers=True,
    prefetch_factor=2
)
start = time.perf_counter()
try:
    # Keep ONE iterator: calling iter() again would start a new pass and
    # hand back a first batch instead of the second one.
    batch_iter = iter(test_loader)
    xb, yb = next(batch_iter)
    elapsed = time.perf_counter() - start
    print(f"   ✓ First batch (4 workers): {elapsed:.2f}s")

    # Try second batch
    start = time.perf_counter()
    xb, yb = next(batch_iter)
    elapsed = time.perf_counter() - start
    print(f"   ✓ Second batch: {elapsed:.2f}s")
except Exception as e:
    print(f"   ✗ ERROR: {e}")
    traceback.print_exc()

# Step 4: GPU availability
print("\n4. GPU Check...")
print(f"   CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Step 5: simple training step
# NOTE: this rebinds the notebook-level names `model`, `criterion` and
# `optimizer`; re-run the training setup cell afterwards.
print("\n5. Testing single training step...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
start = time.perf_counter()
try:
    test_loader = DataLoader(dataset, batch_size=128, num_workers=0)
    xb, yb = next(iter(test_loader))

    # Each window is flattened below, so the linear layer needs one input
    # per element of a single sample, not one per feature column.
    in_features = xb[0].numel()
    model = nn.Linear(in_features, 1).to(device)  # Simple model
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters())

    start = time.perf_counter()
    xb_flat = xb.reshape(xb.shape[0], -1).to(device)
    yb = yb.to(device)

    optimizer.zero_grad()
    # squeeze(1) keeps the batch dimension even for a batch of one.
    logits = model(xb_flat).squeeze(1)
    loss = criterion(logits, yb)
    loss.backward()
    optimizer.step()

    elapsed = time.perf_counter() - start
    print(f"   ✓ Training step: {elapsed:.4f}s")
    print(f"   Loss: {loss.item():.4f}")
except Exception as e:
    print(f"   ✗ ERROR: {e}")
    traceback.print_exc()

print("\n" + "=" * 60)
print("DIAGNOSTIC COMPLETE")
print("=" * 60)

DIAGNOSTIC TEST

1. Testing single sample access...
   ✓ Single sample: 0.0335s
   Shape: torch.Size([60, 4])

2. Testing DataLoader with num_workers=0...
   ✓ First batch (no workers): 0.05s
   Shapes: torch.Size([128, 60, 4]), torch.Size([128])

3. Testing DataLoader with num_workers=4...


In [8]:
# Optimized DataLoader settings for RTX 3060
# Training DataLoader. The settings below were chosen by the author for an
# RTX 3060; they are gathered in one dict so they can be read and tuned in a
# single place.
loader_config = dict(
    batch_size=4096,          # large batches to keep the GPU busy
    shuffle=True,             # new sample order every epoch
    num_workers=4,            # worker processes that assemble batches
    pin_memory=True,          # page-locked host memory for faster host-to-GPU copies
    persistent_workers=True,  # keep workers alive between epochs
    prefetch_factor=3,        # batches queued per worker (4 x 3 = 12 in flight)
)
loader = DataLoader(dataset, **loader_config)

print(f"DataLoader ready with {len(loader)} batches")

DataLoader ready with 2949 batches


In [14]:
# Pull one batch from the training loader and show the tensor shapes.
sample_batch = next(iter(loader))
xb, yb = sample_batch
print(xb.shape, yb.shape)

torch.Size([8192, 60, 4]) torch.Size([8192])


# GRU

In [9]:
class GRUClassifier(nn.Module):
    """Stacked GRU followed by a two-layer head that emits one logit per sequence.

    Args:
        num_features: size of the last dimension of the input.
        hidden_size: GRU hidden state width.
        num_layers: number of stacked GRU layers.
        gru_dropout: dropout applied between stacked GRU layers. Ignored
            when num_layers == 1, because there is no layer boundary to
            apply it to and PyTorch warns about a non-zero value there.
        head_dropout: dropout applied after the hidden ReLU of the head.

    The module attribute names (gru, fc1, relu, dropout, fc2) are kept so
    previously saved state dicts still load.
    """

    def __init__(self, num_features, hidden_size=256, num_layers=2,
                 gru_dropout=0.2, head_dropout=0.3):
        super().__init__()
        self.gru = nn.GRU(
            input_size=num_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,  # inputs are (batch, seq, feature)
            dropout=gru_dropout if num_layers > 1 else 0.0,
        )
        self.fc1 = nn.Linear(hidden_size, 128)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(head_dropout)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        """Map a (batch, seq, num_features) tensor to a (batch,) tensor of logits."""
        # h holds the final hidden state of every layer; h[-1] is the top layer.
        _, h = self.gru(x)
        x = self.fc1(h[-1])
        x = self.relu(x)
        x = self.dropout(x)
        # squeeze(1) drops only the trailing singleton, so a batch of one
        # still returns shape (1,).
        return self.fc2(x).squeeze(1)

# Create bigger model

## SETUP TRAINING

In [10]:
# Training setup: choose the device, build the model, and create the loss,
# optimizer and gradient scaler used by the training loop.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

num_features = len(FEATURE_COLS)

model = GRUClassifier(num_features, hidden_size=256, num_layers=2).to(device)

# Let cuDNN pick kernels by benchmarking.
torch.backends.cudnn.benchmark = True

# Mixed precision is only meaningful on the GPU. Disable the scaler
# explicitly on the CPU fallback instead of relying on PyTorch to warn and
# turn it off.
use_amp = device.type == "cuda"
scaler = torch.amp.GradScaler('cuda', enabled=use_amp)

# The model emits raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

Using device: cuda


## Training loop

In [None]:
# Training loop with mixed precision on the GPU.
EPOCHS = 1
# Autocast follows the actual device instead of a hard-coded 'cuda', so the
# CPU fallback runs in full precision without a warning on every batch.
amp_enabled = device.type == "cuda"

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(loader, desc=f"Epoch {epoch+1}/{EPOCHS}")

    for batch_idx, (xb, yb) in enumerate(progress_bar):
        # non_blocking lets the copy overlap with compute when the source
        # batch is in pinned memory.
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)  # Faster than zero_grad()

        # Mixed precision forward pass and loss
        with torch.amp.autocast(device_type=device.type, enabled=amp_enabled):
            logits = model(xb)
            loss = criterion(logits, yb)

        # Scale the loss before backward; scaler.step unscales the gradients
        # and skips the update if they contain inf/NaN.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Read the loss once per batch (each .item() waits for the GPU).
        batch_loss = loss.item()
        total_loss += batch_loss

        # Update progress bar every 10 batches
        if batch_idx % 10 == 0:
            progress_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}: Average Loss = {avg_loss:.4f}")

Epoch 1/1:   0%|          | 0/2949 [00:00<?, ?it/s]

## Accuracy check

In [None]:
# Accuracy check and model export.
# NOTE(review): `loader` is the shuffled training loader, so the accuracy
# printed below is measured on the data the model was trained on, not on a
# held-out set. Add a validation split before reading it as generalisation.
print("\nEvaluating model...")
model.eval()
correct = 0
total = 0
pred_chunks = []   # one int array per batch
label_chunks = []  # one float array per batch
amp_enabled = device.type == "cuda"

with torch.no_grad():
    for xb, yb in tqdm(loader, desc="Evaluating"):
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)

        # Same autocast API as the training loop; torch.cuda.amp.autocast()
        # is deprecated.
        with torch.amp.autocast(device_type=device.type, enabled=amp_enabled):
            logits = model(xb)

        # Probability above 0.5 counts as the positive class.
        preds = (torch.sigmoid(logits) > 0.5).long()

        correct += (preds == yb.long()).sum().item()
        total += yb.size(0)

        # Append whole arrays; extending a Python list element by element
        # would create one Python object per sample.
        pred_chunks.append(preds.cpu().numpy())
        label_chunks.append(yb.cpu().numpy())

all_preds = np.concatenate(pred_chunks)
all_labels = np.concatenate(label_chunks)

accuracy = correct / total
print(f"\nFinal Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Class balance check: share of labels equal to 1. Accumulate in float64 so
# the mean over many float32 labels stays exact.
print(f"Positive samples: {all_labels.mean(dtype=np.float64):.2%}")

# ============================================================
# SAVE MODEL
# ============================================================
# Only the weights (state_dict) are written; loading them requires
# constructing GRUClassifier with the same sizes first.
torch.save(model.state_dict(), 'gru_model.pth')
print("\nModel saved to 'gru_model.pth'")