# Data Load & Clean (Run Once, Do Not Edit)

In [None]:

import pandas as pd


def verify_columns_and_types(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise column names, check required columns and coerce dtypes.

    Column names are stripped, whitespace runs are replaced by ``_``,
    remaining non-word characters are removed and the result is lower-cased
    (e.g. ``"Stock Splits"`` becomes ``"stock_splits"``).

    Args:
        df: Raw price table. It is not modified; a copy is returned.

    Returns:
        A new DataFrame with normalised column names where ``date`` is
        datetime (unparseable values become ``NaT``), ``open``/``high``/
        ``low``/``close``/``volume`` are numeric (unparseable values become
        ``NaN``) and ``ticker`` is a stripped string.

    Raises:
        ValueError: If any required column is missing after normalisation.
    """
    # Work on a copy so the caller's frame keeps its original headers and
    # dtypes instead of being silently rewritten.
    df = df.copy()

    df.columns = (
        df.columns
        .str.strip()
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"[^\w]", "", regex=True)
        .str.lower()
    )
    required_cols = ["date", "ticker", "open", "high", "low", "close", "volume", "dividends", "stock_splits"]

    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    df["date"] = pd.to_datetime(df["date"], errors="coerce")

    # Prices and volume share the same "coerce bad values to NaN" rule.
    for c in ("open", "high", "low", "close", "volume"):
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df["ticker"] = df["ticker"].astype(str).str.strip()

    return df


def handle_missing_values(
        df: pd.DataFrame,
        price_cols=("open", "high", "low", "close"),
        max_na_fraction=0.10,
) -> pd.DataFrame:
    """Trim, filter and fill missing prices separately for every ticker.

    For each ticker (rows ordered by ``date``):

    1. Leading and trailing rows with a missing value in any of
       ``price_cols`` are dropped.
    2. If the remaining mean fraction of missing price cells exceeds
       ``max_na_fraction`` the whole ticker is dropped.
    3. Otherwise the remaining gaps are forward-filled (then back-filled).

    Args:
        df: Table with at least ``ticker``, ``date`` and ``price_cols``.
            It is not modified.
        price_cols: Columns inspected and filled.
        max_na_fraction: Maximum tolerated fraction of missing price cells
            per ticker after trimming.

    Returns:
        A new DataFrame holding the surviving rows, grouped by ticker and
        ordered by date inside each ticker. All input columns (including
        ``ticker``) are kept.
    """
    price_cols = list(price_cols)

    def process_ticker(g: pd.DataFrame) -> pd.DataFrame:
        g = g.sort_values("date")

        # A row is "complete" when none of its price cells is missing.
        complete = g[price_cols].notna().all(axis=1).to_numpy()
        if not complete.any():
            return g.iloc[0:0]

        # Keep the span between the first and the last complete row.
        first = int(complete.argmax())
        last = len(complete) - int(complete[::-1].argmax())
        # Copy so the fill below never writes into a slice of the input.
        g = g.iloc[first:last].copy()

        na_fraction = g[price_cols].isna().mean().mean()
        if na_fraction > max_na_fraction:
            return g.iloc[0:0]

        g[price_cols] = g[price_cols].ffill().bfill()
        return g

    # Iterate over the groups explicitly instead of ``groupby.apply``: apply's
    # handling of the grouping column differs across pandas versions and can
    # drop ``ticker`` from the result.
    kept = [process_ticker(g) for _, g in df.groupby("ticker")]
    kept = [g for g in kept if not g.empty]
    if not kept:
        return df.iloc[0:0].copy()
    return pd.concat(kept)


def drop_ticker_date_duplicates(
        df: pd.DataFrame,
        max_duplicates_per_ticker: int = 10
) -> pd.DataFrame:
    """Drop heavily duplicated tickers, then de-duplicate the remainder.

    For every ticker, the rows that belong to a repeated ``(ticker, date)``
    pair are counted. Tickers whose count exceeds
    ``max_duplicates_per_ticker`` are removed entirely; for the remaining
    tickers only the first row of each ``(ticker, date)`` pair is kept.
    """
    key = ["ticker", "date"]

    # Number of rows sharing each (ticker, date) pair.
    rows_per_pair = df.groupby(key).size()
    repeated_pairs = rows_per_pair[rows_per_pair > 1]

    # Total rows involved in repeated pairs, per ticker.
    dup_rows_per_ticker = repeated_pairs.groupby(level="ticker").sum()
    over_limit = dup_rows_per_ticker > max_duplicates_per_ticker
    noisy_tickers = dup_rows_per_ticker[over_limit].index

    survivors = df[~df["ticker"].isin(noisy_tickers)]
    return survivors.drop_duplicates(subset=key, keep="first")


def remove_invalid_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` restricted to rows that pass basic sanity checks.

    A row is kept when ``open`` and ``close`` are both non-zero,
    ``high >= low``, ``open`` lies inside ``[low, high]`` and ``volume`` is
    strictly positive.
    """
    nonzero_prices = df["open"].ne(0) & df["close"].ne(0)
    high_not_below_low = df["high"].ge(df["low"])
    open_within_range = df["open"].ge(df["low"]) & df["open"].le(df["high"])
    has_volume = df["volume"].gt(0)

    keep = nonzero_prices & high_not_below_low & open_within_range & has_volume
    return df.loc[keep].copy()


def filter_by_start_date(df: pd.DataFrame, start_date: str) -> pd.DataFrame:
    """Keep only the rows whose ``date`` is on or after ``start_date``."""
    on_or_after = df["date"] >= start_date
    return df.loc[on_or_after]


def remove_corrupted_tickers_df(
        df: pd.DataFrame,
        price_col: str = "close",
        iqr_factor: float = 1.5,
        threshold: float = 25.0,
) -> tuple[pd.DataFrame, list[str]]:
    """Drop tickers whose share of outlier returns exceeds ``threshold`` percent.

    Returns and outlier flags are computed internally, per ticker: a return
    is an outlier when it falls outside
    ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]`` of that ticker's own
    non-missing returns.

    Args:
        df: Table with ``ticker``, ``date`` and ``price_col``. Not modified.
        price_col: Column used to compute percentage returns.
        iqr_factor: Multiplier applied to the inter-quartile range.
        threshold: Maximum tolerated outlier share, in percent.

    Returns:
        ``(df_cleaned, bad_tickers)`` where ``df_cleaned`` is sorted by
        ticker and date, carries the extra columns ``return`` and
        ``return_is_outlier`` and excludes the removed tickers, and
        ``bad_tickers`` lists the removed tickers.
    """
    df = df.sort_values(["ticker", "date"])

    df["return"] = (
        df.groupby("ticker")[price_col]
        .pct_change()
    )

    # Per-ticker quartiles computed in one pass and mapped back onto the rows.
    # This avoids ``groupby.apply``, whose handling of the grouping column
    # differs across pandas versions and can drop ``ticker`` from the result.
    returns_by_ticker = df.groupby("ticker")["return"]
    q1 = returns_by_ticker.quantile(0.25)
    q3 = returns_by_ticker.quantile(0.75)
    iqr = q3 - q1
    lower = df["ticker"].map(q1 - iqr_factor * iqr)
    upper = df["ticker"].map(q3 + iqr_factor * iqr)

    # Comparisons against NaN are False, so missing returns (and tickers
    # without any valid return) are never flagged.
    df["return_is_outlier"] = (df["return"] < lower) | (df["return"] > upper)

    summary = (
        df
        .groupby("ticker")
        .agg(
            n_rows=("return", "count"),
            n_outliers=("return_is_outlier", "sum"),
        )
    )
    summary["outliers_ratio"] = summary["n_outliers"] / summary["n_rows"] * 100

    bad_tickers = summary[summary["outliers_ratio"] > threshold].index.tolist()

    df_cleaned = df[~df["ticker"].isin(bad_tickers)].copy()

    return df_cleaned, bad_tickers


def run_basic_clean_df(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Run the basic cleaning steps on the raw table, in a fixed order.

    Order: column/type verification, start-date filter, missing-value
    handling, invalid-row removal, duplicate removal, gap annotation.
    """
    steps = (
        verify_columns_and_types,
        filter_after_2010_df,
        handle_missing_values,
        remove_invalid_rows,
        drop_ticker_date_duplicates,
        remove_global_gaps,
    )
    frame = df_raw
    for step in steps:
        frame = step(frame)
    return frame


def filter_after_2010_df(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows dated 2010-01-01 or later."""
    return filter_by_start_date(df, start_date="2010-01-01")


def remove_global_gaps(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``missing_days`` column that ignores gaps shared by most tickers.

    ``missing_days`` is the number of calendar days skipped between a row
    and the previous row of the same ticker (0 for a ticker's first row).
    A date on which at least 80% of the rows show such a gap is treated as
    a global gap, and ``missing_days`` is reset to 0 for every row on it.
    The result is a copy sorted by ticker and date.
    """
    ordered = df.sort_values(["ticker", "date"]).copy()

    # Calendar distance to the previous observation of the same ticker.
    ordered["prev_date"] = ordered.groupby("ticker")["date"].shift(1)
    ordered["gap_days"] = (ordered["date"] - ordered["prev_date"]).dt.days
    ordered["missing_days"] = ordered["gap_days"].sub(1).fillna(0).astype(int)

    # Share of rows per date that sit right after a gap.
    has_gap = ordered["missing_days"] >= 1
    gapped_rows_per_date = ordered[has_gap].groupby("date").size()
    rows_per_date = ordered.groupby("date").size()
    gap_share = (gapped_rows_per_date / rows_per_date).dropna()
    shared_gap_dates = gap_share[gap_share >= 0.8].index  # index holds dates

    ordered = ordered.drop(columns=["prev_date", "gap_days"])
    on_shared_gap = ordered["date"].isin(shared_gap_dates)
    ordered.loc[on_shared_gap, "missing_days"] = 0
    return ordered


import os

# Load the raw training table and run the basic cleaning pipeline on it.
df = run_basic_clean_df(
    pd.read_csv('/kaggle/input/predicting-stock-trends-rise-or-fall/train.csv')
)

# Drop tickers whose share of outlier close-to-close returns is above 25%.
df, removed_tickers = remove_corrupted_tickers_df(
    df, price_col="close", iqr_factor=1.5, threshold=25.0
)

# Persist the cleaned table as the interim dataset.
final_interim_path = "/kaggle/data/interim/"
os.makedirs(final_interim_path, exist_ok=True)
df.to_csv(os.path.join(final_interim_path, 'data.csv'), index=False)

# Feature Extraction (Run Once, Do Not Edit)

# Generating Samples (Do Not Edit)

In [None]:
import random

import numpy as np
import torch
from typing import List, Tuple, Dict, Optional
from tqdm import tqdm
import pandas as pd

# Seed every random number generator used by the notebook with the same value.
_SEED = 42
random.seed(_SEED)
np.random.seed(_SEED)
torch.manual_seed(_SEED)


def build_samples(window_size=60, feature_cols=None, horizon=30):
    """Build (ticker, index, label, date) samples from the processed table.

    The processed CSV is read from ``/kaggle/data/processed/data.csv``. For
    every ticker with at least ``window_size + horizon`` rows, each anchor
    row ``i`` yields one sample unless a row with ``missing_days > 0`` lies
    between the first row of its input window and the target row
    ``i + horizon`` (inclusive).

    Side effect: ``StockDataset.ticker_data`` is replaced by a dict mapping
    each processed ticker to its float32 feature matrix.

    Args:
        window_size: Number of rows in the input window ending at ``i``.
        feature_cols: Feature column names; ``None`` means no columns.
        horizon: Number of rows between the anchor and the target row.

    Returns:
        A list of ``(ticker, i, label, date)`` tuples where ``label`` is 1
        when ``close[i + horizon] > close[i]`` and 0 otherwise.
    """
    if feature_cols is None:
        feature_cols = []

    df = pd.read_csv('/kaggle/data/processed/data.csv')

    ticker_data = {}
    samples = []

    for ticker, group in tqdm(df.groupby('ticker'), desc="Processing tickers"):
        group = group.sort_values('date').reset_index(drop=True)
        n = len(group)

        if n < window_size + horizon:
            continue

        close = group['close'].values.astype(np.float32)

        # Compare on the column's own dtype. Narrowing to int8 first would
        # wrap gaps of 128+ days to non-positive values and hide them.
        bad = (group["missing_days"].values > 0).astype(np.int32)
        # gap_prefix[k] = number of gap rows among the first k rows, so rows
        # a..b (inclusive) contain a gap iff gap_prefix[b + 1] > gap_prefix[a].
        gap_prefix = np.concatenate(([0], np.cumsum(bad)))

        ticker_data[ticker] = group[feature_cols].values.astype(np.float32)

        dates = group['date'].values
        # NOTE(review): the size guard above admits n == window_size + horizon,
        # yet this range starts at window_size and then yields nothing for that
        # case; starting at window_size - 1 looks intended. Confirm before changing.
        for i in range(window_size, n - horizon):
            seq_start = i - window_size + 1
            seq_end = i + horizon

            if gap_prefix[seq_end + 1] - gap_prefix[seq_start] > 0:
                continue

            label = 1 if close[i + horizon] > close[i] else 0

            samples.append((ticker, i, label, dates[i]))
    StockDataset.ticker_data = ticker_data
    print(f"✓ Processed {len(samples):,} samples from {len(ticker_data)} tickers")
    return samples


def split_samples_time_based(
        samples: List[Tuple[str, int, int, object]],
        train_ratio: float = 0.7,
        val_ratio: float = 0.15,
):
    """Split samples chronologically into train, validation and test lists.

    Samples are ordered by their date (fourth tuple element). The first
    ``train_ratio`` share goes to training, the next ``val_ratio`` share to
    validation and whatever remains to test.
    """
    chronological = sorted(samples, key=lambda sample: sample[3])

    total = len(chronological)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    return (
        chronological[:train_end],
        chronological[train_end:val_end],
        chronological[val_end:],
    )


# Dataset (Do Not Edit)

In [None]:
import torch
from torch.utils.data import Dataset


class StockDataset(Dataset):
    """Dataset that serves fixed-length feature windows for sample tuples.

    Feature matrices are looked up in the class-level ``ticker_data`` dict,
    which is shared by all instances and filled elsewhere in the notebook.
    """

    # ticker -> 2-D feature array, shared by every dataset instance.
    ticker_data = {}

    def __init__(self, samples, window_size=60, horizon=30):
        self.samples = samples
        self.window_size = window_size
        self.horizon = horizon

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(window, label)`` for the sample at position ``idx``."""
        ticker, anchor, label, _date = self.samples[idx]
        features = StockDataset.ticker_data[ticker]

        # The window covers the window_size rows ending at (and including) anchor.
        start = anchor - self.window_size + 1
        stop = anchor + 1
        window = features[start:stop].copy()

        target = torch.tensor(label, dtype=torch.float32)
        return torch.from_numpy(window), target


# Model

In [None]:


import torch
import torch.nn as nn


class GRUModel(nn.Module):
    """GRU encoder followed by a three-layer MLP head producing one logit.

    The final hidden state of the last GRU layer (both directions
    concatenated when ``bidirectional``) is passed through
    ``fc1 -> ReLU -> fc2 -> ReLU -> fc3``.
    """

    def __init__(
            self,
            input_size: int,
            hidden_size: int = 128,
            num_layers: int = 2,
            bidirectional: bool = False,
    ):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        if bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        # Width of the vector handed to the MLP head.
        self.actual_hidden_size = self.num_directions * hidden_size

        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )

        self.fc1 = nn.Linear(self.actual_hidden_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

        self._init_weights()

    def _init_weights(self):
        """Initialise parameters by name; the first matching rule wins."""
        for name, param in self.named_parameters():
            values = param.data
            if "weight_ih" in name:
                # Input-to-hidden GRU weights.
                nn.init.xavier_uniform_(values)
                continue
            if "weight_hh" in name:
                # Hidden-to-hidden GRU weights.
                nn.init.orthogonal_(values)
                continue
            if "bias" in name:
                values.zero_()
                continue
            if name.startswith("fc") and "weight" in name:
                nn.init.xavier_uniform_(values)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, input_size)
        _, h_n = self.gru(x)
        # h_n: (num_layers*num_directions, batch, hidden_size)

        context = h_n[-1, :, :]
        if self.bidirectional:
            # Last layer: forward state is second to last, backward state is last.
            context = torch.cat([h_n[-2, :, :], context], dim=1)  # (batch, 2*hidden)

        hidden = self.relu(self.fc1(context))
        hidden = self.relu(self.fc2(hidden))
        logits = self.fc3(hidden)

        return logits.squeeze(-1)  # (batch,)




# Training Setup

#### GPU configs (Do not Edit)

In [None]:

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"\nUsing device: {device}")
# torch.backends.cudnn.benchmark = True


#### create datasets   **Editable**

In [None]:
from torch.utils.data import DataLoader
import torch

window_size = 60  # editable
horizon = 30  # Do not edit this
feature = []  # editable
samples = build_samples(window_size, feature, horizon)

# Chronological split, then one dataset per split.
train_s, val_s, test_s = split_samples_time_based(samples)
train_ds, val_ds, test_ds = (
    StockDataset(part, window_size=window_size, horizon=horizon)
    for part in (train_s, val_s, test_s)
)

batch_size = 64

# Loader settings shared by all three splits; only shuffling differs.
_loader_options = dict(
    batch_size=batch_size,
    num_workers=4,  # can be edited , depending on hardware
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=3,
)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_options)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_options)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_options)

INPUT_SIZE = len(feature)
SEQ_LEN = window_size
BATCH_SIZE = batch_size




#### model config

In [None]:
# Two-layer bidirectional GRU classifier, moved to the selected device.
model = GRUModel(
    input_size=INPUT_SIZE,
    hidden_size=128,
    num_layers=2,
    bidirectional=True,
)
model = model.to(device)

#### training loop

In [None]:
# Module-level optimizer and loss for the model; the loss works on raw logits.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
loss_fun = nn.BCEWithLogitsLoss()

##### Training Functions

In [None]:
def run_epoch(
    model: nn.Module,
    loader: DataLoader,
    criterion: nn.Module,
    train: bool = True,
    optimizer: Optional[torch.optim.Optimizer] = None,
) -> Dict[str, float]:
    """Run one pass over ``loader`` and return the mean loss and accuracy.

    Batches are moved to the module-level ``device``. In training mode
    gradients are enabled and ``optimizer`` is stepped after every batch; in
    evaluation mode the model is put in ``eval()`` and gradients are off.

    Args:
        model: Model returning one logit per sample.
        loader: Iterable of ``(X, y)`` batches.
        criterion: Loss applied to ``(logits, y)``.
        train: Whether to update the model.
        optimizer: Optimizer to step; required when ``train`` is True.

    Returns:
        ``{"loss": ..., "acc": ...}`` averaged over all samples seen, where a
        prediction is positive when ``sigmoid(logit) >= 0.5``.

    Raises:
        ValueError: If ``train`` is True without an optimizer, or if
            ``loader`` yields no samples.
    """
    if train and optimizer is None:
        # Fail before touching any batch instead of crashing on None later.
        raise ValueError("optimizer is required when train=True")

    if train:
        model.train()
        context = torch.enable_grad()
    else:
        model.eval()
        context = torch.no_grad()

    running_loss = 0.0
    running_correct = 0
    running_total = 0

    with context:
        for X, y in loader:
            X = X.to(device, non_blocking=True)          # (batch, seq_len, input_size)
            y = y.to(device, non_blocking=True).float()  # (batch,)

            logits = model(X)         # (batch,)
            loss = criterion(logits, y)

            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_size = y.size(0)
            # Weight by batch size so a short last batch does not skew the mean.
            running_loss += loss.item() * batch_size

            preds = (torch.sigmoid(logits) >= 0.5).float()
            running_correct += (preds == y).sum().item()
            running_total += batch_size

    if running_total == 0:
        # Replaces an opaque ZeroDivisionError with an actionable message.
        raise ValueError("loader yielded no samples; cannot compute epoch metrics")

    return {
        "loss": running_loss / running_total,
        "acc": running_correct / running_total,
    }


def train_model(
    model: nn.Module,
    num_epochs: int,
    lr: float = 1e-3,
    train_loader: Optional[DataLoader] = None,
    val_loader: Optional[DataLoader] = None,
):
    """Train ``model`` with Adam and BCE-with-logits, keeping the best weights.

    After every epoch the model is evaluated on the validation loader; the
    weights with the lowest validation loss are restored before returning.

    Args:
        model: Model to train in place.
        num_epochs: Number of epochs to run.
        lr: Adam learning rate.
        train_loader: Training batches. When omitted, the module-level
            ``train_loader`` is used.
        val_loader: Validation batches. When omitted, the module-level
            ``val_loader`` is used.

    Returns:
        ``(model, history)`` where ``history`` maps ``train_loss``,
        ``train_acc``, ``val_loss`` and ``val_acc`` to per-epoch lists.

    Raises:
        ValueError: If a loader is neither passed nor defined at module level.
    """
    # Fall back to the module-level loaders so older call sites that pass
    # only (model, num_epochs, lr) keep working.
    module_scope = globals()
    if train_loader is None:
        train_loader = module_scope.get("train_loader")
    if val_loader is None:
        val_loader = module_scope.get("val_loader")
    if train_loader is None or val_loader is None:
        raise ValueError("train_loader and val_loader must be provided")

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    history: Dict[str, List[float]] = {
        "train_loss": [],
        "train_acc": [],
        "val_loss": [],
        "val_acc": [],
    }

    best_val_loss = float("inf")
    best_state_dict = None

    for epoch in range(1, num_epochs + 1):
        train_metrics = run_epoch(
            model=model,
            loader=train_loader,
            criterion=criterion,
            train=True,
            optimizer=optimizer,
        )

        val_metrics = run_epoch(
            model=model,
            loader=val_loader,
            criterion=criterion,
            train=False,
            optimizer=None,
        )

        history["train_loss"].append(train_metrics["loss"])
        history["train_acc"].append(train_metrics["acc"])
        history["val_loss"].append(val_metrics["loss"])
        history["val_acc"].append(val_metrics["acc"])

        if val_metrics["loss"] < best_val_loss:
            best_val_loss = val_metrics["loss"]
            # state_dict() returns references to the live tensors, so clone
            # them; otherwise later epochs overwrite the "best" snapshot.
            best_state_dict = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }

        print(
            f"Epoch {epoch:03d} | "
            f"train_loss={train_metrics['loss']:.4f}  "
            f"train_acc={train_metrics['acc']:.4f}  "
            f"val_loss={val_metrics['loss']:.4f}  "
            f"val_acc={val_metrics['acc']:.4f}"
        )

    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    return model, history
# Train for 30 epochs with a learning rate of 1e-2 on the loaders built above.
model, history = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=30,
    lr=1e-2,
)

##### Testing

In [None]:
# Evaluate the trained model on the held-out test split.
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []

# Mixed precision only applies on CUDA; on CPU autocast is switched off.
use_amp = device.type == "cuda"

with torch.no_grad():
    # Iterate the test loader (the bare name `loader` is not defined here).
    for xb, yb in tqdm(test_loader, desc="Evaluating"):
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)

        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            logits = model(xb)

        # Same decision rule as run_epoch: positive when probability >= 0.5.
        preds = (torch.sigmoid(logits) >= 0.5).long()
        correct += (preds == yb.long()).sum().item()
        total += yb.size(0)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(yb.cpu().numpy())

# An empty test split gives NaN rather than a ZeroDivisionError.
accuracy = correct / total if total else float("nan")