# Data Cleaning & Feature Extraction

In [1]:
import pandas as pd
import numpy as np


def verify_columns_and_types(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize column names and coerce the core columns to usable dtypes.

    Column names are stripped, whitespace runs become ``_``, non-word
    characters are removed and the result is lower-cased.  ``date`` is parsed
    to datetime, the price columns and ``volume`` to numbers (unparseable
    values become NaT/NaN), and ``ticker`` to a stripped string.

    The frame is modified in place and the same object is returned.

    Raises:
        ValueError: if any required column is absent after renaming.
    """
    cleaned_names = df.columns.str.strip()
    cleaned_names = cleaned_names.str.replace(r"\s+", "_", regex=True)
    cleaned_names = cleaned_names.str.replace(r"[^\w]", "", regex=True)
    df.columns = cleaned_names.str.lower()

    required_cols = [
        "date", "ticker", "open", "high", "low",
        "close", "volume", "dividends", "stock_splits",
    ]
    missing = [name for name in required_cols if name not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    df["date"] = pd.to_datetime(df["date"], errors="coerce")

    # Prices first, then volume: all get the same lenient numeric coercion.
    for numeric_col in ("open", "high", "low", "close", "volume"):
        df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

    df["ticker"] = df["ticker"].astype(str).str.strip()
    return df

def handle_missing_values(
        df: pd.DataFrame,
        price_cols=("open", "high", "low", "close"),
        max_na_fraction=0.10,
) -> pd.DataFrame:
    """Trim, filter and fill missing price values ticker by ticker.

    For every ticker (rows sorted by ``date``):

    1. leading and trailing rows with any missing price are removed;
    2. the ticker is discarded entirely if nothing is left, or if the
       fraction of missing price cells in the trimmed range exceeds
       ``max_na_fraction``;
    3. remaining interior holes are forward-filled, then back-filled.

    Args:
        df: frame with at least ``ticker``, ``date`` and ``price_cols``.
        price_cols: columns that must be present for a row to be complete.
        max_na_fraction: maximum tolerated share of missing price cells.

    Returns:
        A new frame ordered by ticker, then date, keeping the original index
        labels and every input column.  The input frame is not modified.
    """
    price_cols = list(price_cols)

    def process_ticker(g: pd.DataFrame):
        """Return the cleaned rows of one ticker, or None to discard it."""
        g = g.sort_values("date")

        complete = g[price_cols].notna().all(axis=1).to_numpy()
        if not complete.any():
            return None

        # Positions of the first and last fully populated rows: one slice
        # replaces the row-at-a-time trimming loops.
        first = int(complete.argmax())
        stop = len(complete) - int(complete[::-1].argmax())
        g = g.iloc[first:stop].copy()  # copy: we assign into it below

        na_fraction = g[price_cols].isna().mean().mean()
        if na_fraction > max_na_fraction:
            return None

        g[price_cols] = g[price_cols].ffill().bfill()
        return g

    # Explicit iteration instead of groupby.apply: the output keeps the
    # ``ticker`` column and a deterministic order on every pandas version.
    kept = []
    for _, group in df.groupby("ticker"):
        cleaned = process_ticker(group)
        if cleaned is not None:
            kept.append(cleaned)

    if not kept:
        return df.iloc[0:0].copy()
    return pd.concat(kept)


def drop_ticker_date_duplicates(
    df: pd.DataFrame,
    max_duplicates_per_ticker: int = 10
) -> pd.DataFrame:
    """Remove duplicated (ticker, date) rows and overly duplicated tickers.

    A ticker is dropped entirely when the number of rows that belong to
    duplicated (ticker, date) pairs exceeds ``max_duplicates_per_ticker``.
    For the remaining tickers only the first row of each pair is kept.
    """
    pair_sizes = df.groupby(["ticker", "date"]).size()

    # Rows involved in duplicated pairs, summed per ticker.
    duplicated_rows = pair_sizes[pair_sizes > 1].groupby(level="ticker").sum()
    too_duplicated = duplicated_rows.index[duplicated_rows > max_duplicates_per_ticker]

    survivors = df.loc[~df["ticker"].isin(too_duplicated)]
    return survivors.drop_duplicates(subset=["ticker", "date"], keep="first")

def remove_invalid_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows whose OHLCV values are mutually consistent.

    A row survives when open and close are non-zero, ``high >= low``,
    open lies within ``[low, high]`` and volume is strictly positive.
    """
    nonzero_prices = df["open"].ne(0) & df["close"].ne(0)
    ordered_range = df["high"].ge(df["low"])
    open_inside_range = df["open"].between(df["low"], df["high"])
    traded = df["volume"].gt(0)

    keep = nonzero_prices & ordered_range & open_inside_range & traded
    return df[keep]

def filter_by_start_date(df: pd.DataFrame, start_date: str) -> pd.DataFrame:
    """Return the rows whose ``date`` is on or after ``start_date``."""
    on_or_after = df["date"] >= start_date
    return df[on_or_after]

def remove_corrupted_tickers_df(
    df: pd.DataFrame,
    price_col: str = "close",
    iqr_factor: float = 1.5,
    threshold: float = 25.0,
) -> pd.DataFrame:
    """Drop tickers whose returns contain too many IQR outliers.

    Per-ticker simple returns of ``price_col`` are computed, each return is
    flagged as an outlier when it lies outside
    ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]`` of that ticker's own
    returns, and tickers whose outlier percentage exceeds ``threshold`` are
    removed.

    Args:
        df: frame with ``ticker``, ``date`` and ``price_col``.
        price_col: column used to compute returns.
        iqr_factor: width of the outlier fence in IQR units.
        threshold: maximum tolerated outlier share, in percent.

    Returns:
        A new frame sorted by ticker and date, containing only the retained
        tickers plus two helper columns: ``return`` and ``return_is_outlier``.
        The input frame is not modified.
    """
    # sort_values returns a new frame, so the columns added below never touch
    # the caller's data.
    df = df.sort_values(["ticker", "date"])
    df["return"] = df.groupby("ticker")[price_col].pct_change()

    returns_by_ticker = df.groupby("ticker")["return"]
    # Series.quantile ignores NaN; a ticker without any valid return gets NaN
    # fences, which makes every comparison below False.
    q1 = returns_by_ticker.transform(lambda s: s.quantile(0.25))
    q3 = returns_by_ticker.transform(lambda s: s.quantile(0.75))
    iqr = q3 - q1
    lower = q1 - iqr_factor * iqr
    upper = q3 + iqr_factor * iqr

    # Comparisons against NaN are False, so missing returns are never flagged.
    df["return_is_outlier"] = (df["return"] < lower) | (df["return"] > upper)

    n_valid = returns_by_ticker.count()
    n_outliers = df.groupby("ticker")["return_is_outlier"].sum()
    outliers_ratio = n_outliers / n_valid * 100  # 0/0 -> NaN -> ticker kept

    bad_tickers = outliers_ratio.index[outliers_ratio > threshold]
    return df[~df["ticker"].isin(bad_tickers)]



def filter_after_2010_df(df: pd.DataFrame) -> pd.DataFrame:
    """Keep rows dated 2010-01-01 or later (delegates to filter_by_start_date)."""
    return filter_by_start_date(df, "2010-01-01")

def remove_global_gaps(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``missing_days`` column that ignores market-wide calendar gaps.

    ``missing_days`` is the number of calendar days skipped between a row and
    the previous row of the same ticker.  Dates on which at least 80% of the
    rows show such a gap are treated as gaps shared by the whole market, and
    their ``missing_days`` is reset to 0.  No rows are removed; the result is
    a new frame sorted by ticker and date.
    """
    out = df.sort_values(["ticker", "date"])

    # Calendar distance to the previous observation of the same ticker.
    out["prev_date"] = out.groupby("ticker")["date"].shift(1)
    out["gap_days"] = (out["date"] - out["prev_date"]).dt.days
    out["missing_days"] = out["gap_days"].sub(1).fillna(0).astype(int)

    # Share of rows per date that follow a gap.
    has_gap = out["missing_days"] >= 1
    rows_per_date = out.groupby("date").size()
    gapped_rows_per_date = out[has_gap].groupby("date").size()
    gap_share = (gapped_rows_per_date / rows_per_date).dropna()
    market_wide_dates = gap_share[gap_share >= 0.8].index

    out = out.drop(columns=["prev_date", "gap_days"])
    out.loc[out["date"].isin(market_wide_dates), "missing_days"] = 0
    return out


def engineer_features(df):
    """Compute per-ticker technical-indicator features from OHLCV rows.

    Expects at least ``ticker``, ``date``, ``open``, ``high``, ``low``,
    ``close`` and ``volume``.  Rows are sorted by ticker and date, every
    rolling/shifted quantity is computed within a ticker, intermediate and
    raw input columns are dropped, float64 columns are cast to float32 and
    rows that still contain NaN (indicator warm-up rows) are removed.

    Returns:
        A new frame with ``date``, ``ticker``, ``close``, the engineered
        feature columns and any other input column not in the drop list.
    """
    df = df.sort_values(['ticker', 'date']).reset_index(drop=True)
    # ``grouped`` wraps this same frame object, so columns added to ``df``
    # below can still be selected through it.
    grouped = df.groupby('ticker')

    # ========================================================================
    # TARGET VARIABLE
    # ========================================================================
    # Both columns are removed again in the cleanup step below.
    df['close_30d_future'] = grouped['close'].shift(-30)
    df['target'] = (df['close_30d_future'] > df['close']).astype(int)

    # ------------------------------
    # Price Features
    # ------------------------------
    df['daily_return'] = grouped['close'].pct_change()
    df['high_low_ratio'] = (df['high'] - df['low']) / df['close']

    # ------------------------------
    # Moving Averages
    # ------------------------------
    df['MA_5'] = grouped['close'].transform(lambda x: x.rolling(5, min_periods=1).mean())
    df['MA_20'] = grouped['close'].transform(lambda x: x.rolling(20, min_periods=1).mean())
    df['MA_60'] = grouped['close'].transform(lambda x: x.rolling(60, min_periods=1).mean())

    # ------------------------------
    # MA-Based Features
    # ------------------------------
    df['price_to_MA5'] = (df['close'] - df['MA_5']) / (df['MA_5'] + 1e-8)
    df['price_to_MA20'] = (df['close'] - df['MA_20']) / (df['MA_20'] + 1e-8)
    df['price_to_MA60'] = (df['close'] - df['MA_60']) / (df['MA_60'] + 1e-8)
    df['MA_60_slope'] = grouped['MA_60'].pct_change(30)

    # ------------------------------
    # Volatility Features
    # ------------------------------
    df['volatility_20'] = grouped['daily_return'].transform(
        lambda x: x.rolling(20, min_periods=1).std()
    )

    def calculate_rsi(series, period=14):
        """Simple-moving-average RSI of ``series`` over ``period`` rows."""
        delta = series.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=period, min_periods=1).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=period, min_periods=1).mean()
        rs = gain / (loss + 1e-8)
        return 100 - (100 / (1 + rs))

    df['RSI_14'] = grouped['close'].transform(lambda x: calculate_rsi(x, 14))

    # Parkinson volatility.  A row-wise series plus a per-ticker transform
    # always yields one value per row; groupby.apply can return a differently
    # shaped object when the frame holds a single ticker.
    log_range_sq = np.log(df['high'] / (df['low'] + 1e-8)) ** 2
    rolling_log_range_sq = log_range_sq.groupby(df['ticker']).transform(
        lambda x: x.rolling(10, min_periods=1).mean()
    )
    df['parkinson_volatility'] = np.sqrt(1 / (4 * np.log(2)) * rolling_log_range_sq)

    # ------------------------------
    # Support/Resistance & Risk
    # ------------------------------
    df['recent_high_20'] = grouped['high'].transform(lambda x: x.rolling(20, min_periods=1).max())
    df['recent_low_20'] = grouped['low'].transform(lambda x: x.rolling(20, min_periods=1).min())
    df['distance_from_high'] = (df['close'] - df['recent_high_20']) / (df['recent_high_20'] + 1e-8)
    df['low_to_close_ratio'] = df['recent_low_20'] / (df['close'] + 1e-8)
    df['price_position_20'] = (
            (df['close'] - df['recent_low_20']) /
            (df['recent_high_20'] - df['recent_low_20'] + 1e-8)
    )

    def max_drawdown(series, window):
        """Worst drawdown from the rolling maximum within ``window`` rows."""
        roll_max = series.rolling(window, min_periods=1).max()
        drawdown = (series - roll_max) / (roll_max + 1e-8)
        return drawdown.rolling(window, min_periods=1).min()

    df['max_drawdown_20'] = grouped['close'].transform(lambda x: max_drawdown(x, 20))
    df['downside_deviation_10'] = grouped['daily_return'].transform(
        lambda x: x.where(x < 0, 0).rolling(10, min_periods=1).std()
    )

    # ------------------------------
    # Temporal
    # ------------------------------
    df['month_sin'] = np.sin(2 * np.pi * df['date'].dt.month / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['date'].dt.month / 12)
    df['is_up_day'] = (df['daily_return'] > 0).astype(int)

    # ------------------------------
    # Volume Price Index
    # ------------------------------
    # The per-ticker close-to-close change is exactly ``daily_return``.
    price_change = df['daily_return']
    df['PVT'] = (price_change * df['volume']).fillna(0)
    df['PVT_cumsum'] = grouped['PVT'].transform(lambda x: x.cumsum())

    df['MOBV_signal'] = np.where(price_change > 0, df['volume'],
                                 np.where(price_change < 0, -df['volume'], 0))
    df['MOBV'] = grouped['MOBV_signal'].transform(lambda x: x.cumsum())

    # ------------------------------
    # Directional Movement
    # ------------------------------
    df['MTM'] = df['close'] - grouped['close'].shift(12)

    # ------------------------------
    # OverBought & OverSold
    # ------------------------------
    prev_open = grouped['open'].shift(1)
    df['DTM'] = np.where(df['open'] <= prev_open,
                         0,
                         np.maximum(df['high'] - df['open'], df['open'] - prev_open))
    df['DBM'] = np.where(df['open'] >= prev_open,
                         0,
                         np.maximum(df['open'] - df['low'], df['open'] - prev_open))
    df['DTM_sum'] = grouped['DTM'].transform(lambda x: x.rolling(23, min_periods=1).sum())
    df['DBM_sum'] = grouped['DBM'].transform(lambda x: x.rolling(23, min_periods=1).sum())
    df['ADTM'] = (df['DTM_sum'] - df['DBM_sum']) / (df['DTM_sum'] + df['DBM_sum'] + 1e-8)

    # ------------------------------
    # Energy & Volatility
    # ------------------------------
    df['PSY'] = grouped['is_up_day'].transform(lambda x: x.rolling(12, min_periods=1).mean()) * 100

    df['highest_close'] = grouped['close'].transform(lambda x: x.rolling(28, min_periods=1).max())
    df['lowest_close'] = grouped['close'].transform(lambda x: x.rolling(28, min_periods=1).min())
    df['close_diff_sum'] = grouped['close'].transform(lambda x: x.diff().abs().rolling(28, min_periods=1).sum())
    df['VHF'] = (df['highest_close'] - df['lowest_close']) / (df['close_diff_sum'] + 1e-8)

    # ------------------------------
    # Stochastic
    # ------------------------------
    df['lowest_low_9'] = grouped['low'].transform(lambda x: x.rolling(9, min_periods=1).min())
    df['highest_high_9'] = grouped['high'].transform(lambda x: x.rolling(9, min_periods=1).max())
    df['K'] = ((df['close'] - df['lowest_low_9']) / (df['highest_high_9'] - df['lowest_low_9'] + 1e-8)) * 100

    # ------------------------------
    # Cleanup: intermediates, raw inputs and helper columns from earlier
    # pipeline steps.  Columns that are not present are skipped.
    # ------------------------------
    drop_columns = [
        'MA_5', 'MA_20', 'MA_60',
        'price_change', 'PVT', 'MOBV_signal',
        'DTM', 'DBM', 'DTM_sum', 'DBM_sum',
        'highest_close', 'lowest_close', 'close_diff_sum',
        'lowest_low_9', 'highest_high_9', 'recent_low_20',
        'close_30d_future', 'target'
        , 'low', 'high', 'open', 'volume', 'dividends',
        'stock_splits', 'return', 'return_is_outlier']
    # Must happen before dropna(): ``close_30d_future`` is NaN on the last 30
    # rows of every ticker and would otherwise remove them.
    df = df.drop(columns=drop_columns, errors='ignore')
    float_cols = df.select_dtypes(include=['float64']).columns
    df[float_cols] = df[float_cols].astype(np.float32)

    df = df.dropna().reset_index(drop=True)
    return df

def clean_and_build_features(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Run the raw-to-features pipeline on a raw price table.

    Steps, in order: column/dtype normalization, restriction to dates from
    2010-01-01, missing-value handling, removal of inconsistent rows,
    (ticker, date) de-duplication, ``missing_days`` computation, removal of
    tickers with too many return outliers, and feature engineering.

    Note: the first step renames and converts columns of ``df_raw`` in place,
    so the caller's frame is modified.
    """
    df = verify_columns_and_types(df_raw)
    df = filter_after_2010_df(df)
    df = handle_missing_values(df)
    df = remove_invalid_rows(df)
    df = drop_ticker_date_duplicates(df)
    # Adds the ``missing_days`` column; no rows are removed by this step.
    df = remove_global_gaps(df)
    df = remove_corrupted_tickers_df(
        df,
        price_col="close",
        iqr_factor=1.5,
        threshold=25.0,
    )
    df = df.sort_values(['ticker', 'date']).reset_index(drop=True)
    df = engineer_features(df)
    return df


In [1]:
import pandas as pd
# One-off preprocessing, kept for reference: build the feature table from the
# raw training CSV and cache it on disk.
# NOTE(review): the cached file written here is 'data.csv' while the load
# below reads 'data_2.csv' - confirm which file is the intended input.
# df = clean_and_build_features(pd.read_csv('data/raw/train.csv'))
# df.to_csv('data/processed/data.csv')
# Load the cached feature table.  No date parsing is requested here, so the
# 'date' column presumably arrives as strings - verify before relying on
# datetime operations.
df = pd.read_csv('data/processed/data_2.csv')

# Model

In [2]:
import torch
import torch.nn as nn
class GRUModel(nn.Module):
    """GRU encoder followed by a small MLP head producing one logit per sample.

    The final hidden state of the last GRU layer (both directions
    concatenated when bidirectional) is batch-normalized and passed through
    three linear layers (-> 16 -> 4 -> 1) with ReLU and dropout in between.
    """

    def __init__(
            self,
            input_size: int,
            hidden_size: int = 128,
            num_layers: int = 2,
            dropout: float = 0.3,
            bidirectional: bool = True,
    ):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # Width of the vector handed to the head: one hidden state per direction.
        self.num_directions = 2 if bidirectional else 1
        self.actual_hidden_size = hidden_size * self.num_directions

        # Inter-layer dropout is only passed to the GRU when it is stacked.
        recurrent_dropout = dropout if num_layers > 1 else 0
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=recurrent_dropout,
            bidirectional=bidirectional,
        )

        self.batch_norm = nn.BatchNorm1d(self.actual_hidden_size)

        self.fc1 = nn.Linear(self.actual_hidden_size, 16)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.fc2 = nn.Linear(16, 4)
        self.dropout2 = nn.Dropout(dropout)

        self.fc3 = nn.Linear(4, 1)
        self._init_weights()

    def _init_weights(self):
        """Xavier for input/linear weights, orthogonal for recurrent, zero biases."""
        for param_name, parameter in self.named_parameters():
            values = parameter.data
            if "weight_ih" in param_name:
                nn.init.xavier_uniform_(values)
                continue
            if "weight_hh" in param_name:
                nn.init.orthogonal_(values)
                continue
            if "bias" in param_name:
                values.zero_()
                continue
            if param_name.startswith("fc") and "weight" in param_name:
                nn.init.xavier_uniform_(values)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one logit per sample for a batch-first input sequence."""
        _, final_hidden = self.gru(x)

        if self.bidirectional:
            # Last two entries: forward and backward state of the top layer.
            summary = torch.cat((final_hidden[-2, :, :], final_hidden[-1, :, :]), dim=1)
        else:
            summary = final_hidden[-1, :, :]

        hidden = self.batch_norm(summary)
        hidden = self.dropout1(self.relu(self.fc1(hidden)))
        hidden = self.dropout2(self.relu(self.fc2(hidden)))
        logits = self.fc3(hidden)

        return logits.squeeze(-1)

# Dataset

In [3]:
import torch
from torch.utils.data import Dataset
class StockDataset(Dataset):
    """Serves fixed-length feature windows and their labels.

    ``samples`` holds ``(ticker, end_row, label, date)`` tuples and
    ``ticker_data`` maps a ticker to its 2-D feature array; each item is the
    ``window_size`` rows ending at ``end_row`` plus the label as float32.
    """

    def __init__(self, samples, window_size=60, horizon=30, ticker_data=None):
        self.samples, self.ticker_data = samples, ticker_data
        self.window_size, self.horizon = window_size, horizon

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        ticker, end_row, label, _sample_date = self.samples[idx]
        start_row = end_row - self.window_size + 1
        # Copy so the tensor does not share memory with the stored array.
        window = self.ticker_data[ticker][start_row:end_row + 1].copy()
        return torch.from_numpy(window), torch.tensor(label, dtype=torch.float32)

# Splitting Data & Normalization

In [4]:
from typing import Tuple
from sklearn.preprocessing import RobustScaler, StandardScaler, MaxAbsScaler
def split_dataframe_by_date(
    df: pd.DataFrame,
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split rows chronologically into train / validation / test frames.

    Rows are sorted by ``date`` and cut by row count: the first
    ``train_ratio`` share goes to train, the next ``val_ratio`` share to
    validation and the rest to test.  Each part is an independent copy.
    """
    ordered = df.sort_values("date").reset_index(drop=True)

    total = len(ordered)
    first_cut = int(total * train_ratio)
    second_cut = int(total * (train_ratio + val_ratio))

    bounds = ((0, first_cut), (first_cut, second_cut), (second_cut, total))
    train_part, val_part, test_part = (
        ordered.iloc[start:stop].copy() for start, stop in bounds
    )
    return train_part, val_part, test_part

# Mode 1
# Column groups used by normalize_df_mode1 / normalize_df_sc1: each list is
# fitted and transformed with its own scaler.
# Features left untouched in mode 1 (this list is not referenced by the code
# visible in this file).
no_need_scaling = [
    "is_up_day",
    "month_sin",
    "month_cos",
    "price_position_20",
]

# Scaled with RobustScaler.
robust_scaling_features = [
    "distance_from_high",
    "downside_deviation_10",
    "high_low_ratio",
    "low_to_close_ratio",
    "max_drawdown_20",
    "parkinson_volatility",
    "recent_high_20",
    "volatility_20",
    "VHF",
    "MOBV",
    "PVT_cumsum",
]

# Scaled with StandardScaler.
zscore_features = [
    "ADTM",
    "daily_return",
    "MA_60_slope",
    "MTM",
    "price_to_MA5",
    "price_to_MA20",
    "price_to_MA60",
    "PSY",
    "RSI_14",
]
# Scaled with a separate StandardScaler instance.
standard_scaler_features = ["K"]


# Mode 2
# Column groups used by normalize_df_mode2 / normalize_df_sc2.  The "_plus_q"
# groups are first clipped to the training 1st/99th percentiles.
# StandardScaler only.
z_only = ["ADTM", "K", "price_position_20", "RSI_14"]
# Quantile clipping, then StandardScaler.
z_plus_q = [
    "daily_return", "MA_60_slope", "MOBV", "MTM",
    "price_to_MA5", "price_to_MA20", "price_to_MA60",
    "PVT_cumsum", "recent_high_20"
]
# RobustScaler only.
robust_only = ["max_drawdown_20", "VHF"]
# Quantile clipping, then RobustScaler.
robust_plus_q = [
    "distance_from_high", "downside_deviation_10",
    "high_low_ratio", "low_to_close_ratio", "parkinson_volatility"
]
# RobustScaler followed by StandardScaler.
robust_plus_z = ["volatility_20"]
# MaxAbsScaler.
max_abs_only = ["PSY"]
# Left untouched in mode 2 (not referenced by the code visible in this file).
no_scaling_mode2 = ["is_up_day", "month_sin", "month_cos"]

def normalize_df(df_train, df_val, df_test, norm_mode="norm1"):
    """Normalize the three splits with the scheme selected by ``norm_mode``.

    ``"norm1"`` and ``"norm2"`` dispatch to the corresponding mode function;
    any other value returns the inputs unchanged.
    """
    if norm_mode not in ("norm1", "norm2"):
        return df_train, df_val, df_test

    normalizer = normalize_df_mode1 if norm_mode == "norm1" else normalize_df_mode2
    return normalizer(df_train, df_val, df_test)


def normalize_df_mode1(df_train, df_val, df_test):
    """Fit the mode-1 scalers on the training split and apply them to all splits."""
    fitted = {}
    fitted["robust"] = RobustScaler().fit(df_train[robust_scaling_features])
    fitted["z"] = StandardScaler().fit(df_train[zscore_features])
    fitted["std"] = StandardScaler().fit(df_train[standard_scaler_features])

    # Same fitted scalers for every split, so validation/test statistics
    # never influence the transformation.
    return tuple(
        normalize_df_sc1(split, fitted) for split in (df_train, df_val, df_test)
    )


def normalize_df_sc1(df, scalers):
    """Return a copy of ``df`` with each mode-1 column group scaled.

    ``scalers`` must hold fitted scalers under the keys ``"robust"``, ``"z"``
    and ``"std"``.
    """
    scaled = df.copy()
    column_groups = (
        ("robust", robust_scaling_features),
        ("z", zscore_features),
        ("std", standard_scaler_features),
    )
    for scaler_key, columns in column_groups:
        # Always transform the original values, never already-scaled ones.
        scaled[columns] = scalers[scaler_key].transform(df[columns])
    return scaled


def get_q_limits(df, columns):
    """Map each column to its (1st percentile, 99th percentile) pair."""
    return {
        name: (df[name].quantile(0.01), df[name].quantile(0.99))
        for name in columns
    }


def apply_clipping(df, limits):
    """Return a copy of ``df`` with listed columns clipped to their limits.

    ``limits`` maps column name to ``(low, high)``; columns that are not in
    ``df`` are ignored.
    """
    clipped = df.copy()
    present = [name for name in limits if name in clipped.columns]
    for name in present:
        lower_bound, upper_bound = limits[name]
        clipped[name] = clipped[name].clip(lower=lower_bound, upper=upper_bound)
    return clipped


def normalize_df_mode2(df_train, df_val, df_test):
    """Clip, fit on the training split, and scale all three splits (mode 2).

    Quantile limits and every scaler are derived from the training split
    only and then applied unchanged to validation and test.
    """
    clip_columns = z_plus_q + robust_plus_q
    limits = get_q_limits(df_train, clip_columns)

    train_c, val_c, test_c = (
        apply_clipping(split, limits) for split in (df_train, df_val, df_test)
    )

    scalers = {}
    scalers["z"] = StandardScaler().fit(train_c[z_only + z_plus_q])
    scalers["robust"] = RobustScaler().fit(train_c[robust_only + robust_plus_q])
    scalers["max_abs"] = MaxAbsScaler().fit(train_c[max_abs_only])
    scalers["vol_robust"] = RobustScaler().fit(train_c[robust_plus_z])

    # Second stage for the robust-then-z group: fit the z-scaler on the
    # robust-scaled training values.
    robust_scaled_train = scalers["vol_robust"].transform(train_c[robust_plus_z])
    scalers["vol_z"] = StandardScaler().fit(robust_scaled_train)

    return tuple(
        normalize_df_sc2(split, scalers) for split in (train_c, val_c, test_c)
    )

def normalize_df_sc2(df_in, scalers):
    """Return a copy of ``df_in`` with each mode-2 column group scaled.

    ``scalers`` must hold fitted scalers under ``"z"``, ``"robust"``,
    ``"max_abs"``, ``"vol_robust"`` and ``"vol_z"``.
    """
    scaled = df_in.copy()

    single_stage_groups = (
        ("z", z_only + z_plus_q),                   # Z-Score
        ("robust", robust_only + robust_plus_q),    # Robust
        ("max_abs", max_abs_only),                  # Max Abs
    )
    for scaler_key, columns in single_stage_groups:
        scaled[columns] = scalers[scaler_key].transform(df_in[columns])

    # Volatility: robust scaling first, z-scoring of that result second.
    robust_stage = scalers["vol_robust"].transform(df_in[robust_plus_z])
    scaled[robust_plus_z] = scalers["vol_z"].transform(robust_stage)

    return scaled

# Sampling

In [5]:
import random
from typing import Dict, List
import numpy as np
import torch
from tqdm import tqdm

# Seed NumPy, PyTorch and Python's random module for repeatable runs.
np.random.seed(42)
torch.manual_seed(42)
random.seed(42)
# Model input columns.  The order here is the column order of the feature
# arrays built in build_samples_from_df and in the prediction cell.
FEATURE_COLS = [
        "daily_return",
        "high_low_ratio",
        "price_to_MA5",
        "price_to_MA20",
        "price_to_MA60",
        "MA_60_slope",
        "volatility_20",
        "RSI_14",
        "parkinson_volatility",
        "recent_high_20",
        "distance_from_high",
        "low_to_close_ratio",
        "price_position_20",
        "max_drawdown_20",
        "downside_deviation_10",
        "month_sin",
        "month_cos",
        "is_up_day",
        "PVT_cumsum",
        "MOBV",
        "MTM",
        "ADTM",
        "PSY",
        "VHF",
        "K",
    ]
def build_samples_from_df(
    df: pd.DataFrame,
    window_size: int = 60,
    horizon: int = 30,
) -> Tuple[List, Dict]:
    """Enumerate gap-free training windows and their labels per ticker.

    For each ticker with at least ``window_size + horizon`` rows, every row
    ``i`` that has a full look-back window and a full look-ahead horizon
    yields one sample, unless a row flagged with ``missing_days > 0`` falls
    anywhere between the window start and ``i + horizon`` (inclusive).

    Args:
        df: frame with ``ticker``, ``date``, ``close``, ``missing_days`` and
            the ``FEATURE_COLS`` columns.
        window_size: number of rows in the model input window.
        horizon: number of rows ahead used for the label.

    Returns:
        ``(samples, ticker_data)`` where ``samples`` is a list of
        ``(ticker, i, label, date)`` tuples (label is 1 when the close
        ``horizon`` rows ahead is higher than the close at ``i``) and
        ``ticker_data`` maps each processed ticker to its float32 feature
        array.
    """
    ticker_data = {}
    samples = []

    for ticker, group in tqdm(df.groupby("ticker"), desc="Building samples"):
        group = group.sort_values("date").reset_index(drop=True)
        n = len(group)

        if n < window_size + horizon:
            continue

        close = group["close"].values.astype(np.float32)

        # Test the flag in the column's own dtype.  Casting to int8 first
        # wraps large values (e.g. 200 -> -56, 256 -> 0), which would hide
        # long gaps from the ``> 0`` check.
        bad = (group["missing_days"].values > 0).astype(np.int64)
        # gap_prefix[k] = number of flagged rows among the first k rows, so a
        # range count is a single subtraction with no special case at 0.
        gap_prefix = np.concatenate(([0], np.cumsum(bad)))

        # Store feature data
        ticker_data[ticker] = group[FEATURE_COLS].values.astype(np.float32)

        dates = group["date"].values
        for i in range(window_size - 1, n - horizon):
            seq_start = i - window_size + 1
            seq_end = i + horizon

            # NOTE(review): the flag on ``seq_start`` itself describes the gap
            # before the window; it is still counted here, as before.
            if gap_prefix[seq_end + 1] - gap_prefix[seq_start] > 0:
                continue

            label = 1 if close[i + horizon] > close[i] else 0
            date = dates[i]
            samples.append((ticker, i, label, date))

    return samples, ticker_data


    

# Training Configs

In [None]:

from dataclasses import dataclass

@dataclass
class TrainingConfig:
    """Hyperparameters describing one training run."""

    # Run label; train_loop embeds it in the checkpoint and plot file names.
    name: str = "default"
    # Number of rows in each input window.
    window_size: int = 60
    # Model
    # Architecture tag (not read by the code visible in this file).
    model_type: str = "gru"   
    hidden_size: int = 32
    num_layers: int = 2
    bidirectional: bool = False
    dropout: float = 0.4

    # Training
    batch_size: int = 64
    learning_rate: float = 3e-4
    weight_decay: float = 1e-4
    # NOTE(review): train_loop always builds torch.optim.Adam and does not
    # read this field.
    optimizer: str = "Adam"

# Visualization

In [7]:
import matplotlib.pyplot as plt

def plot_training_curves(history, filename="training_curves.png"):
    """Plot train/validation loss and accuracy per epoch, save and show.

    ``history`` must provide equally long lists under ``train_loss``,
    ``val_loss``, ``train_acc`` and ``val_acc``.  The figure is written to
    ``filename`` and then displayed.
    """
    n_epochs = len(history["train_loss"])
    epochs = range(1, n_epochs + 1)

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # (title / y-label, train key, train legend, val key, val legend)
    panels = (
        ("Loss", "train_loss", "Train Loss", "val_loss", "Val Loss"),
        ("Accuracy", "train_acc", "Train Acc", "val_acc", "Val Acc"),
    )
    for ax, (title, train_key, train_label, val_key, val_label) in zip(axes, panels):
        ax.plot(epochs, history[train_key], label=train_label)
        ax.plot(epochs, history[val_key], label=val_label)
        ax.set_title(title)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(title)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.savefig(filename, dpi=150, bbox_inches="tight")
    plt.show()



# Training

In [None]:
from typing import Dict, List, Optional
import os
from datetime import datetime

from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import DataLoader

# Use the GPU when one is available; mixed precision is enabled only on CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_MIXED_PRECISION = torch.cuda.is_available()
if USE_MIXED_PRECISION:
    # Let cuDNN benchmark and pick kernels for the observed input shapes.
    torch.backends.cudnn.benchmark = True
print(device, USE_MIXED_PRECISION)


def run_epoch(
    model: nn.Module,
    loader: DataLoader,
    criterion: nn.Module,
    train: bool = True,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scaler: Optional[torch.amp.GradScaler] = None,
) -> Dict[str, float]:
    """Run one pass over ``loader`` and return mean loss and accuracy.

    In training mode the model is updated after every batch (through the
    gradient scaler when mixed precision is enabled); in evaluation mode
    gradients are disabled and nothing is updated.  Accuracy thresholds the
    sigmoid of the logits at 0.5.

    Returns:
        ``{"loss": sample-weighted mean loss, "acc": fraction correct}``.
    """
    if train:
        model.train()
    else:
        model.eval()
    grad_mode = torch.enable_grad() if train else torch.no_grad()
    use_amp = USE_MIXED_PRECISION and train

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    progress_bar = tqdm(loader, desc="train" if train else "val")

    with grad_mode:
        for step, (features, targets) in enumerate(progress_bar):
            features = features.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True).float()

            if train:
                optimizer.zero_grad()

            if use_amp:
                with torch.amp.autocast("cuda"):
                    logits = model(features)
                    loss = criterion(logits, targets)

                # Scaled backward/step/update sequence for mixed precision.
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                logits = model(features)
                loss = criterion(logits, targets)

                if train:
                    loss.backward()
                    optimizer.step()

            batch_size = targets.size(0)
            batch_loss = loss.item()
            # Weight by batch size so a short final batch is averaged correctly.
            loss_sum += batch_loss * batch_size
            if step % 10 == 0:
                progress_bar.set_postfix({"loss": f"{batch_loss:.4f}"})

            predictions = (torch.sigmoid(logits) >= 0.5).float()
            n_correct += (predictions == targets).sum().item()
            n_seen += batch_size

    return {"loss": loss_sum / n_seen, "acc": n_correct / n_seen}


def train_loop(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    num_epochs: int,
    cfg: TrainingConfig,
    optimizer: Optional[torch.optim.Optimizer] = None,
    history: Optional[Dict[str, List[float]]] = None,
    best_val_loss: float = float("inf"),
    start_epoch: int = 0,
):
    """Train for ``num_epochs``, restore the best weights and save a checkpoint.

    After every epoch the validation loss is compared with the best value so
    far, and a snapshot of the weights is kept on improvement.  When training
    ends the best snapshot (if any) is loaded back into the model, the
    training curves are plotted and a checkpoint is written to the current
    working directory.

    Args:
        model: network to train; moved to the global ``device``.
        train_loader / val_loader: batches of ``(X, y)``.
        num_epochs: number of epochs to run in this call.
        cfg: run configuration (learning rate, weight decay, name).
        optimizer: optimizer to continue with; Adam is created when omitted.
        history: metric lists to extend when resuming.
        best_val_loss: best validation loss seen before this call.
        start_epoch: number of epochs already completed (for numbering).

    Returns:
        ``(model, history)``.
    """
    # Local import keeps the block self-contained without touching the
    # notebook's import cell.
    import copy

    model.to(device)
    scaler = torch.amp.GradScaler("cuda") if USE_MIXED_PRECISION else None

    criterion = nn.BCEWithLogitsLoss()
    if optimizer is None:
        optimizer = torch.optim.Adam(
            model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay
        )

    if history is None:
        history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

    best_state_dict = None

    for epoch in range(start_epoch + 1, start_epoch + num_epochs + 1):
        train_metrics = run_epoch(
            model=model,
            loader=train_loader,
            criterion=criterion,
            train=True,
            optimizer=optimizer,
            scaler=scaler,
        )
        val_metrics = run_epoch(
            model=model,
            loader=val_loader,
            criterion=criterion,
            train=False,
            optimizer=None,
            scaler=None,
        )

        history["train_loss"].append(train_metrics["loss"])
        history["train_acc"].append(train_metrics["acc"])
        history["val_loss"].append(val_metrics["loss"])
        history["val_acc"].append(val_metrics["acc"])

        if val_metrics["loss"] < best_val_loss:
            best_val_loss = val_metrics["loss"]
            # state_dict() hands back references to the live tensors, which
            # later epochs keep updating in place.  Deep-copy so the snapshot
            # really holds this epoch's weights.
            best_state_dict = copy.deepcopy(model.state_dict())

        print(
            f"Epoch {epoch:03d} | "
            f"train_loss={train_metrics['loss']:.4f}  "
            f"train_acc={train_metrics['acc']:.4f}  "
            f"val_loss={val_metrics['loss']:.4f}  "
            f"val_acc={val_metrics['acc']:.4f}"
        )

    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    config_name = getattr(cfg, "name", "unnamed")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    prefix = f"gru_{config_name}_{timestamp}"
    filename = f"{prefix}.pt"
    plot_name = f"{prefix}.png"
    plot_training_curves(history, plot_name)
    torch.save(
        {
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "config": cfg.__dict__,
            "history": history,
            "best_val_loss": best_val_loss,
            "use_mixed_precision": USE_MIXED_PRECISION,
        },
        filename,
    )
    print(f"\nSaved checkpoint to: {filename}")
    print(f"\nSaved Plot to: {plot_name}")

    return model, history

# Main

### Training Configs

In [None]:
# Configuration for this run; fields not listed keep the TrainingConfig defaults.
cfg  = TrainingConfig(
    name="fifth",
    model_type="gru",
    hidden_size=128,
    num_layers=2,
    bidirectional=False,
    dropout=0.3,
    batch_size=512,
    learning_rate=3e-4,
    window_size=60,
)

### Splitting Data

In [None]:
# Chronological 70% / 15% / 15% split by row count.
train_df, val_df, test_df = split_dataframe_by_date(
df, train_ratio=0.7, val_ratio=0.15
)

### Normalizing

In [None]:
# Mode-1 scaling: scalers are fitted on train_df and applied to all three splits.
train_df,val_df,test_df =  normalize_df(train_df, val_df,test_df,'norm1')

### Sampling

In [None]:
from torch.utils.data import DataLoader
import gc

# Build (ticker, row, label, date) samples and per-ticker feature arrays for
# the training and validation splits.  Each split frame is deleted as soon as
# its arrays exist and the collector is run to release the memory.
train_samples, train_ticker_data = build_samples_from_df(
    train_df, window_size=cfg.window_size, horizon=30
)
del train_df
gc.collect()
val_samples, val_ticker_data = build_samples_from_df(
    val_df, window_size=cfg.window_size, horizon=30
)
del val_df
gc.collect()


### Datasets & DataLoaders

In [None]:

# Wrap the sample lists and feature arrays in datasets.
train_ds = StockDataset(
    train_samples,
    window_size=cfg.window_size,
    horizon=30,
    ticker_data=train_ticker_data,
)
val_ds = StockDataset(
    val_samples, window_size=cfg.window_size, horizon=30, ticker_data=val_ticker_data
)

# Force the "spawn" start method for worker processes.  With the worker
# options below commented out, loading runs in the main process.
import torch.multiprocessing as mp
mp.set_start_method("spawn", force=True)

# workers = 2
# Training batches are shuffled; validation batches keep their order.
train_loader = DataLoader(
    train_ds,
    batch_size=cfg.batch_size,
    shuffle=True,
    # num_workers=workers,
    # pin_memory=True,
    # prefetch_factor=2,
    # persistent_workers=True,
)
val_loader = DataLoader(
    val_ds,
    batch_size=cfg.batch_size,
    shuffle=False,
    # num_workers=workers,
    # persistent_workers=True,
    # pin_memory=True,
    # prefetch_factor=2,
)

### Training Loop

In [None]:
# Infer the number of input features from the last dimension of one sample.
x0, _ = train_ds[0]
input_size = x0.shape[-1]
model = GRUModel(
    input_size=input_size,
    hidden_size=cfg.hidden_size,
    num_layers=cfg.num_layers,
    dropout=cfg.dropout,
    bidirectional=cfg.bidirectional,
).to(device)
# Train for 5 epochs; train_loop also saves the checkpoint and the curves.
model, history = train_loop(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            num_epochs=5,
            cfg=cfg,
        )

# Predicting

In [None]:
print("\nLoading test data...")
test_df = pd.read_csv('data/raw/test.csv')
test_df['Date'] = pd.to_datetime(test_df['Date'])

# The model was trained on features scaled with statistics of the training
# split, so inference inputs need the same transformation.  Re-create that
# training slice from `df` and scale the whole history with scalers fitted on
# it.  The split ratios and the normalization mode must stay in sync with the
# "Splitting Data" and "Normalizing" cells.
train_reference, _, _ = split_dataframe_by_date(df, train_ratio=0.7, val_ratio=0.15)
# The middle argument is only a one-row placeholder; its output is discarded.
_, _, inference_df = normalize_df(
    train_reference, train_reference.iloc[:1], df, 'norm1'
)
del train_reference

ticker_dict = {}
for ticker, group in tqdm(inference_df.groupby('ticker'), desc="Processing tickers"):
    group = group.sort_values('date').reset_index(drop=True)
    ticker_dict[ticker] = {
        # Parsed once per ticker instead of once per test row.
        'dates': pd.to_datetime(group['date'].values),
        'features': group[FEATURE_COLS].values.astype(np.float32)
    }

model.eval()
# Same window length as the one the model was trained with.
window_size = cfg.window_size
predictions = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Predicting"):
    ticker_id = row['ID']
    target_date = pd.Timestamp(row['Date'])

    # Unknown ticker: fall back to predicting class 1.
    if ticker_id not in ticker_dict:
        predictions.append(1)
        continue

    ticker_data = ticker_dict[ticker_id]
    dates = ticker_data['dates']
    features = ticker_data['features']

    # Only rows strictly before the target date may be used.
    mask = dates < target_date
    if mask.sum() < window_size:
        # Not enough history for a full window: same fallback.
        predictions.append(1)
        continue

    valid_indices = np.where(mask)[0]

    last_window = valid_indices[-window_size:]

    X = features[last_window].copy()

    X_tensor = torch.from_numpy(X).unsqueeze(0).to(device)

    with torch.no_grad():
        # Autocast only where mixed precision is in use (CUDA available).
        with torch.amp.autocast('cuda', enabled=USE_MIXED_PRECISION):
            logits = model(X_tensor)

        prob = torch.sigmoid(logits).item()
        pred = 1 if prob > 0.5 else 0

    predictions.append(pred)

submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Pred': predictions
})

submission.to_csv('submission.csv', index=False)
