In [1]:
# --- Cell setup: imports, project path, and RNG seeding -------------------
import random
import sys, os
# Absolute path of the project checkout; appended to sys.path so the
# `src.*` package imports below resolve. This must run before those imports.
PROJECT_ROOT = r"D:\Stock_trend_project"
sys.path.append(PROJECT_ROOT)
# NOTE(review): repeats the `import random` at the top of this cell.
import random

import numpy as np
import torch
from torch.utils.data import DataLoader
from typing import List, Tuple

# Project-local dataset class; build_samples() assigns its `ticker_data`
# class attribute.
from src.data.stock_dataset import StockDataset

# Seed the NumPy, PyTorch and stdlib random number generators with the same
# fixed value.
np.random.seed(42)
torch.manual_seed(42)
random.seed(42)
# NOTE(review): these two imports sit after the seeding calls; they are used
# by build_samples() below (tqdm progress bar, pandas CSV loading).
from tqdm import tqdm
import pandas as pd
def build_samples(
    window_size=60,
    horizon=30,
    csv_path=r'D:\Stock_trend_project\data\processed\new_stocks_features2.csv',
):
    """Build (ticker, row_index, label, date) samples from the features CSV.

    The CSV is grouped by ``ticker`` and each group is sorted by ``date``.
    For every anchor row ``i`` with ``window_size <= i < n - horizon`` a
    sample is emitted when no row in the inclusive span
    ``[i - window_size + 1, i + horizon]`` has ``missing_days > 0``.
    The label is 1 when ``close[i + horizon] > close[i]`` and 0 otherwise.

    Side effect: ``StockDataset.ticker_data`` is replaced with a dict mapping
    each sufficiently long ticker to its float32 feature matrix (rows in date
    order, columns in ``FEATURE_COLS`` order).

    Args:
        window_size: Number of rows in the input window ending at the anchor.
        horizon: Number of rows to look ahead when computing the label.
        csv_path: Location of the processed features CSV. It must contain the
            columns ``ticker``, ``date``, ``close``, ``missing_days`` and every
            name in ``FEATURE_COLS``.

    Returns:
        A list of ``(ticker, i, label, date)`` tuples, where ``i`` is the
        anchor row index within the ticker's date-sorted group.

    Raises:
        ValueError: If ``window_size`` or ``horizon`` is smaller than 1.
    """
    if window_size < 1 or horizon < 1:
        raise ValueError(
            f"window_size and horizon must be >= 1, got {window_size} and {horizon}"
        )

    print("Loading CSV...")
    df = pd.read_csv(csv_path)

    # 25 model input columns, grouped by theme. The MI figures are carried
    # over from the original annotations.
    FEATURE_COLS = [
        # Price features
        'daily_return',
        'high_low_ratio',

        # Moving-average based
        'price_to_MA5',
        'price_to_MA20',
        'price_to_MA60',
        'MA_60_slope',

        # Volatility
        'volatility_20',
        'RSI_14',
        'parkinson_volatility',

        # Critical features
        'recent_high_20',
        'distance_from_high',
        'low_to_close_ratio',
        'price_position_20',
        'max_drawdown_20',
        'downside_deviation_10',

        # Temporal
        'month_sin',
        'month_cos',
        'is_up_day',

        # Volume/price index
        'PVT_cumsum',           # MI = 0.0426
        'MOBV',                 # MI = 0.0209

        # Directional movement
        'MTM',                  # MI = 0.0127

        # Overbought / oversold
        'ADTM',                 # MI = 0.0104

        # Energy & volatility
        'PSY',                  # MI = 0.0085
        'VHF',                  # MI = 0.0088

        # Stochastic
        'K',                    # MI = 0.0083
    ]

    ticker_data = {}
    samples = []

    for ticker, group in tqdm(df.groupby('ticker'), desc="Processing tickers"):
        group = group.sort_values('date').reset_index(drop=True)
        n = len(group)

        # Too short to hold even one window plus its look-ahead.
        if n < window_size + horizon:
            continue

        close = group['close'].values.astype(np.float32)

        # Compare before any narrowing cast: casting to int8 first would wrap
        # large gap counts (e.g. 128 -> -128, 256 -> 0) and hide real gaps.
        bad = group["missing_days"].values > 0
        # Prefix sums let each span be checked with a single subtraction.
        bad_cumsum = np.cumsum(bad, dtype=np.int64)

        ticker_data[ticker] = group[FEATURE_COLS].values.astype(np.float32)
        dates = group['date'].values

        # Candidate anchor rows; may be empty when n == window_size + horizon.
        anchors = np.arange(window_size, n - horizon)
        # Flagged rows inside [anchor - window_size + 1, anchor + horizon].
        # anchor - window_size is always >= 0, so the lookup never underflows.
        flagged = bad_cumsum[anchors + horizon] - bad_cumsum[anchors - window_size]
        valid = anchors[flagged == 0]

        labels = (close[valid + horizon] > close[valid]).astype(np.int64)

        # .tolist() yields plain Python ints, matching the per-row loop this
        # replaces.
        samples.extend(
            (ticker, i, label, date)
            for i, label, date in zip(valid.tolist(), labels.tolist(), dates[valid])
        )

    StockDataset.ticker_data = ticker_data
    print(f"✓ Processed {len(samples):,} samples from {len(ticker_data)} tickers")
    return samples


def split_samples_time_based(
        samples: List[Tuple[str, int, int, object]],
        train_ratio: float = 0.7,
        val_ratio: float = 0.15,
):
    """Split samples chronologically into train / validation / test lists.

    Samples are ordered by their fourth element (the date) with a stable
    sort, then cut by position: the first ``int(n * train_ratio)`` go to
    train, the next ``int(n * val_ratio)`` to validation, the rest to test.

    Because the cut is positional, samples sharing a date at a boundary can
    land in different partitions, and no gap is left between partitions.

    Args:
        samples: Tuples whose index 3 is a mutually comparable date value.
        train_ratio: Fraction of samples assigned to the training split.
        val_ratio: Fraction of samples assigned to the validation split.

    Returns:
        ``(train_samples, val_samples, test_samples)`` as three lists.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1, which would otherwise silently produce empty or
            truncated partitions.
    """
    # Small tolerance so float sums such as 0.85 + 0.15 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    samples_sorted = sorted(samples, key=lambda x: x[3])

    n = len(samples_sorted)
    n_train = int(n * train_ratio)
    val_end = n_train + int(n * val_ratio)

    train_samples = samples_sorted[:n_train]
    val_samples = samples_sorted[n_train:val_end]
    test_samples = samples_sorted[val_end:]
    return train_samples, val_samples, test_samples

In [2]:
# --- Cell setup: project imports, training config and device --------------
# Same project root as the first cell. `sys` is not imported in this cell;
# it is available only because the first cell ran `import sys, os`.
PROJECT_ROOT = r"D:\Stock_trend_project"
sys.path.append(PROJECT_ROOT)
from typing import Dict, List, Optional
# NOTE(review): this import rebinds `build_samples` and
# `split_samples_time_based`, which the first cell also defines inline. After
# this cell runs, calls resolve to the module versions, so edits to the inline
# definitions have no effect. Presumably the module holds the same code; confirm.
from src.data.make_torch_datasets import build_samples, split_samples_time_based
from src.data.stock_dataset import StockDataset
from src.models.gru_model import GRUModel
import torch
from torch import nn
from torch.utils.data import DataLoader
# Named hyper-parameter presets; only DEEP_NETWORK is used in this cell.
from src.configs.training_config import BASELINE, BIDIRECTIONAL_STRONG, DEEP_NETWORK, FAST_EXPERIMENTAL, FIRST_CONFIG

# Active preset. Later cells read window_size, batch_size, hidden_size,
# num_layers, dropout, bidirectional and learning_rate from it.
CFG = DEEP_NETWORK

# Use the GPU when CUDA is available, otherwise the CPU. run_epoch() and
# train_model() read this module-level name.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def run_epoch(
    model: nn.Module,
    loader: DataLoader,
    criterion: nn.Module,
    train: bool = True,
    optimizer: Optional[torch.optim.Optimizer] = None,
) -> Dict[str, float]:
    """Run one pass over ``loader`` and return the mean loss and accuracy.

    In training mode the model is put in ``train()`` mode and ``optimizer``
    is stepped after every batch. Otherwise the model is put in ``eval()``
    mode and gradients are disabled. Predictions are ``sigmoid(logits) >= 0.5``.

    Args:
        model: Module mapping a batch ``X`` to one logit per example (its
            output must have the same shape as ``y``).
        loader: Iterable yielding ``(X, y)`` batches.
        criterion: Loss taking ``(logits, y)`` with ``y`` cast to float.
        train: Whether to update the model's parameters.
        optimizer: Optimizer to step; required when ``train`` is True.

    Returns:
        ``{"loss": sample-weighted mean loss, "acc": fraction correct}``.

    Raises:
        ValueError: If ``train`` is True but no optimizer is given, or if
            ``loader`` yields no samples.
    """
    # Fail before any compute rather than with an AttributeError on
    # `optimizer.zero_grad()` after the first forward pass.
    if train and optimizer is None:
        raise ValueError("run_epoch(train=True) requires an optimizer")

    # Equivalent to model.train() / model.eval().
    model.train(train)

    # Send batches to wherever the model's parameters live, so the call does
    # not depend on the model having been moved to the module-level `device`.
    # A parameter-less module falls back to that global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    running_loss = 0.0
    running_correct = 0
    running_total = 0

    with torch.set_grad_enabled(train):
        for X, y in loader:
            X = X.to(target_device)
            y = y.to(target_device).float()
            batch_size = y.size(0)

            logits = model(X)
            loss = criterion(logits, y)

            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Weight by batch size so a smaller final batch does not skew
            # the epoch mean.
            running_loss += loss.item() * batch_size

            preds = (torch.sigmoid(logits) >= 0.5).float()
            running_correct += (preds == y).sum().item()
            running_total += batch_size

    if running_total == 0:
        raise ValueError("run_epoch received a loader that yielded no samples")

    return {
        "loss": running_loss / running_total,
        "acc": running_correct / running_total,
    }


def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    num_epochs: int,
    config: object,
):
    """Train ``model`` and restore the weights with the lowest validation loss.

    The model is moved to the module-level ``device``, then trained for
    ``num_epochs`` epochs with ``BCEWithLogitsLoss`` and Adam. After each
    epoch it is evaluated on ``val_loader``; the weights from the epoch with
    the lowest validation loss are loaded back before returning.

    Args:
        model: Network producing one logit per example.
        train_loader: Batches used for optimisation.
        val_loader: Batches used for model selection.
        num_epochs: Number of epochs to run.
        config: Object exposing a ``learning_rate`` attribute.

    Returns:
        ``(model, history)`` where ``history`` maps ``"train_loss"``,
        ``"train_acc"``, ``"val_loss"`` and ``"val_acc"`` to per-epoch lists.
    """
    import copy  # local import: keeps this block self-contained

    model.to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    history: Dict[str, List[float]] = {
        "train_loss": [],
        "train_acc": [],
        "val_loss": [],
        "val_acc": [],
    }

    best_val_loss = float("inf")
    best_state_dict = None

    for epoch in range(1, num_epochs + 1):
        train_metrics = run_epoch(
            model=model,
            loader=train_loader,
            criterion=criterion,
            train=True,
            optimizer=optimizer,
        )

        val_metrics = run_epoch(
            model=model,
            loader=val_loader,
            criterion=criterion,
            train=False,
            optimizer=None,
        )

        history["train_loss"].append(train_metrics["loss"])
        history["train_acc"].append(train_metrics["acc"])
        history["val_loss"].append(val_metrics["loss"])
        history["val_acc"].append(val_metrics["acc"])

        if val_metrics["loss"] < best_val_loss:
            best_val_loss = val_metrics["loss"]
            # state_dict() hands back references to the live parameter
            # tensors, which later optimizer steps overwrite in place.
            # Deep-copy so this snapshot really holds this epoch's weights;
            # without it the final load_state_dict() restores nothing.
            best_state_dict = copy.deepcopy(model.state_dict())

        print(
            f"Epoch {epoch:03d} | "
            f"train_loss={train_metrics['loss']:.4f}  "
            f"train_acc={train_metrics['acc']:.4f}  "
            f"val_loss={val_metrics['loss']:.4f}  "
            f"val_acc={val_metrics['acc']:.4f}"
        )

    # None only when no epoch ran or validation loss never compared below
    # infinity (e.g. NaN); the final weights are kept in that case.
    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    return model, history



In [3]:
    # Re-seed PyTorch so this cell starts from a fixed RNG state.
    torch.manual_seed(42)

    # Build every labelled sample, then cut them chronologically.
    samples = build_samples(window_size=CFG.window_size)
    train_s, val_s, test_s = split_samples_time_based(samples)

    # One dataset per partition, created in train -> val -> test order.
    train_ds, val_ds, test_ds = (
        StockDataset(part, window_size=CFG.window_size, horizon=30)
        for part in (train_s, val_s, test_s)
    )

    # Only the training loader shuffles; validation and test keep their order.
    train_loader, val_loader, test_loader = (
        DataLoader(dataset, batch_size=CFG.batch_size, shuffle=do_shuffle)
        for dataset, do_shuffle in (
            (train_ds, True),
            (val_ds, False),
            (test_ds, False),
        )
    )


Loading CSV...


Processing tickers: 100%|██████████| 4925/4925 [00:17<00:00, 284.46it/s]


✓ Processed 11,070,798 samples from 4923 tickers


In [None]:

    # Pull a single batch only to read the feature dimension from axis 2 of X.
    # NOTE(review): y_batch is not used in the visible cells. Because
    # train_loader shuffles, this extra draw presumably advances the RNG
    # before the model is initialised; confirm if exact reproducibility matters.
    X_batch, y_batch = next(iter(train_loader))
    input_size = X_batch.shape[2]

    # Architecture hyper-parameters come from the active CFG preset.
    model = GRUModel(
        input_size=input_size,
        hidden_size=CFG.hidden_size,
        num_layers=CFG.num_layers,
        dropout=CFG.dropout,
        bidirectional=CFG.bidirectional,
    )

    print("Starting training...")
    # NOTE(review): the epoch count is hard-coded here rather than read from
    # CFG, and test_loader is never evaluated in the visible cells.
    model, history = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        num_epochs=4,
        config=CFG,
    )

    print("Training finished.")

Starting training...
Epoch 001 | train_loss=0.6799  train_acc=0.5708  val_loss=0.7158  val_acc=0.4431
Epoch 002 | train_loss=0.6760  train_acc=0.5741  val_loss=0.7128  val_acc=0.4438
Epoch 003 | train_loss=0.6748  train_acc=0.5756  val_loss=0.7219  val_acc=0.4348
Epoch 004 | train_loss=0.6736  train_acc=0.5777  val_loss=0.7243  val_acc=0.4448
