In [1]:
import numpy as np
import pandas as pd
import methods.selenium_patch as selenium_patch
from tqdm import tqdm
from methods.scraper import *
from methods.model_methods import *

In [2]:
tickers = pd.read_csv("../data/tickers/simple_tickers.csv")["Ticker"].to_list()

In [6]:
data = []
for ticker in tqdm(tickers, smoothing=0):
    try:
        data.append(get_data(ticker, frequency="quarterly"))
    except:
        pass

100%|██████████| 83/83 [10:11<00:00,  7.37s/it]


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
import numpy as np
import copy
import time

# Optional experiment tracking
try:
    import wandb
except ImportError:
    wandb = None

# Train one model per dataframe in `data`, then aggregate metrics + predictions.

target_col = "Future Change%"
drop_cols = ["Ticker", "Close Price", "Future Change%"]

# Hyperparameters
HIDDEN_SIZE = 30
NUM_LAYERS = 2
DROPOUT = 0.3
EPOCHS = 25
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-4
WINDOW_SIZE = 7

# Weights & Biases settings
USE_WANDB = True
WANDB_PROJECT = "Stock Price Predictor"
WANDB_ENTITY = "victor-vangkilde-university-of-copenhagen"
WANDB_RUN_NAME = f"lstm-run-{int(time.time())}"


class StockLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob):
        super(StockLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout_prob if num_layers > 1 else 0.0,
        )
        self.fc_1 = nn.Linear(hidden_size, 16)
        self.relu = nn.ReLU()
        self.fc_2 = nn.Linear(16, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # last time step
        out = self.fc_1(out)
        out = self.relu(out)
        out = self.fc_2(out)
        return out


def create_sliding_dataset(X_scaled, y_scaled, indices, seq_len):
    xs = []
    ys = []

    for i in indices:
        if i + seq_len <= len(X_scaled):
            window = X_scaled[i : i + seq_len]
            window = window[::-1]  # old -> new
            xs.append(window)
            ys.append(y_scaled[i])

    if not xs:
        return None, None

    return (
        torch.tensor(np.array(xs), dtype=torch.float32),
        torch.tensor(np.array(ys), dtype=torch.float32),
    )


def train_single_dataframe(df, dataset_idx, wandb_run=None):
    # Basic checks
    if df is None or not isinstance(df, pd.DataFrame) or df.empty:
        return None, f"Dataset {dataset_idx}: skipped (empty or invalid dataframe)."

    required = {"Future Change%"}
    if not required.issubset(set(df.columns)):
        return None, f"Dataset {dataset_idx}: skipped (missing required columns)."

    local_drop_cols = [c for c in drop_cols if c in df.columns]
    feature_df = df.drop(local_drop_cols, axis=1)

    if feature_df.shape[1] == 0:
        return None, f"Dataset {dataset_idx}: skipped (no usable features after dropping columns)."

    n_rows = len(df)
    if n_rows < 8:
        return None, f"Dataset {dataset_idx}: skipped (not enough rows: {n_rows})."

    # Time-based split: 0 is prediction row (newest)
    idx_pred = [0]
    n_val = max(2, int(0.2 * n_rows))
    n_val = min(n_val, n_rows - 3)  # keep enough train rows

    if n_val < 2:
        return None, f"Dataset {dataset_idx}: skipped (not enough rows for validation)."

    idx_val = list(range(1, 1 + n_val))
    idx_train = list(range(1 + n_val, n_rows))

    if len(idx_train) < 2:
        return None, f"Dataset {dataset_idx}: skipped (not enough training rows)."

    X = feature_df.values
    y = df[[target_col]].values

    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    X_train_raw = X[idx_train]
    y_train_raw = y[idx_train]

    scaler_X.fit(X_train_raw)
    scaler_y.fit(y_train_raw)

    X_scaled = scaler_X.transform(X)
    y_scaled = scaler_y.transform(y)

    window_sizes = [WINDOW_SIZE]
    best_window_loss = float("inf")
    best_window_r2 = -float("inf")
    best_window_size = -1
    best_model_wts = None

    ticker_value = df.iloc[0]["Ticker"] if "Ticker" in df.columns else f"dataset_{dataset_idx}"

    for w_size in window_sizes:
        X_train_t, y_train_t = create_sliding_dataset(X_scaled, y_scaled, idx_train, w_size)
        X_val_t, y_val_t = create_sliding_dataset(X_scaled, y_scaled, idx_val, w_size)

        if X_train_t is None or X_val_t is None or y_train_t is None or y_val_t is None or len(X_val_t) < 2:
            continue

        input_features = X_train_t.shape[2]
        torch.manual_seed(42)
        model = StockLSTM(
            input_features,
            hidden_size=HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            output_size=1,
            dropout_prob=DROPOUT,
        )

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min", patience=500, factor=0.5)

        best_val_loss_local = float("inf")
        best_val_r2_local = -float("inf")
        best_wts_local = copy.deepcopy(model.state_dict())

        for _ in range(EPOCHS):
            model.train()
            optimizer.zero_grad()
            train_out = model(X_train_t)
            train_loss = criterion(train_out, y_train_t)
            train_loss.backward()
            optimizer.step()

            model.eval()
            with torch.no_grad():
                val_out = model(X_val_t)
                val_loss = criterion(val_out, y_val_t).item()

                y_true = y_val_t.detach().cpu().numpy().reshape(-1)
                y_pred = val_out.detach().cpu().numpy().reshape(-1)
                val_r2 = r2_score(y_true, y_pred) if len(y_true) >= 2 else np.nan

                if val_loss < best_val_loss_local:
                    best_val_loss_local = val_loss
                    best_val_r2_local = val_r2
                    best_wts_local = copy.deepcopy(model.state_dict())

            scheduler.step(val_loss)

        if wandb_run is not None:
            wandb_run.log(
                {
                    "dataset_idx": dataset_idx,
                    "ticker": str(ticker_value),
                    "window_size": w_size,
                    "window_best_val_loss": float(best_val_loss_local),
                    "window_best_val_r2": float(best_val_r2_local) if not np.isnan(best_val_r2_local) else np.nan,
                    "window_last_train_loss": float(train_loss.item()),
                }
            )

        if best_val_loss_local < best_window_loss:
            best_window_loss = best_val_loss_local
            best_window_r2 = best_val_r2_local
            best_window_size = w_size
            best_model_wts = best_wts_local

    if best_model_wts is None:
        return None, f"Dataset {dataset_idx}: skipped (no valid window size found)."

    # Predict first row (index 0) with best model/window
    X_pred_t, _ = create_sliding_dataset(X_scaled, y_scaled, idx_pred, best_window_size)
    if X_pred_t is None:
        return None, f"Dataset {dataset_idx}: skipped (could not build prediction window)."

    input_features = X_pred_t.shape[2]
    model = StockLSTM(
        input_features,
        hidden_size=HIDDEN_SIZE,
        num_layers=NUM_LAYERS,
        output_size=1,
        dropout_prob=DROPOUT,
    )
    model.load_state_dict(best_model_wts)
    model.eval()

    with torch.no_grad():
        pred_out = model(X_pred_t)
        pred_scaled = pred_out.detach().cpu().numpy()
        pred = scaler_y.inverse_transform(pred_scaled)[0][0]

    result = {
        "dataset_idx": dataset_idx,
        "ticker": ticker_value,
        "best_val_loss": best_window_loss,
        "best_val_r2": best_window_r2,
        "best_window_size": best_window_size,
        "prediction_first_row": float(pred),
    }

    if wandb_run is not None:
        wandb_run.log(
            {
                "dataset_best_val_loss": float(best_window_loss),
                "dataset_best_val_r2": float(best_window_r2) if not np.isnan(best_window_r2) else np.nan,
                "dataset_best_window_size": int(best_window_size),
                "dataset_prediction_first_row": float(pred),
                "dataset_idx": dataset_idx,
                "ticker": str(ticker_value),
            }
        )

    return result, f"Dataset {dataset_idx}: done (best loss={best_window_loss:.6f}, best R2={best_window_r2:.4f}, pred={pred:.4f})."


wandb_run = None
if USE_WANDB:
    if wandb is None:
        print("wandb is not installed. Install it with: pip install wandb")
    else:
        try:
            wandb_run = wandb.init(
                project=WANDB_PROJECT,
                entity=WANDB_ENTITY,
                name=WANDB_RUN_NAME,
                config={
                    "hidden_size": HIDDEN_SIZE,
                    "num_layers": NUM_LAYERS,
                    "dropout": DROPOUT,
                    "epochs": EPOCHS,
                    "learning_rate": LEARNING_RATE,
                    "weight_decay": WEIGHT_DECAY,
                    "window_size": WINDOW_SIZE,
                    "target_col": target_col,
                    "num_datasets": len(data),
                },
            )
            print(f"wandb tracking enabled: project={WANDB_PROJECT}, run={WANDB_RUN_NAME}")
        except Exception as e:
            print(f"wandb init failed, continuing without tracking: {e}")
            wandb_run = None

all_results = []
logs = []
start = time.time()

for idx, df in tqdm(enumerate(data), smoothing=0):
    result, msg = train_single_dataframe(df, idx, wandb_run=wandb_run)
    logs.append(msg)
    if result is not None:
        all_results.append(result)

print("\n".join(logs))

if all_results:
    results_df = pd.DataFrame(all_results)

    avg_val_loss = results_df["best_val_loss"].mean()
    avg_val_r2 = results_df["best_val_r2"].mean(skipna=True)

    print("\n===== Aggregate Metrics =====")
    print(f"Models trained: {len(results_df)} / {len(data)}")
    print(f"Average best validation loss: {avg_val_loss:.6f}")
    print(f"Average best validation R^2: {avg_val_r2:.4f}")

    print("\n===== Predictions for First Row (per dataset) =====")
    display(results_df[["dataset_idx", "ticker", "prediction_first_row", "best_val_loss", "best_val_r2", "best_window_size"]])

    if wandb_run is not None:
        wandb_run.log(
            {
                "aggregate_models_trained": int(len(results_df)),
                "aggregate_total_datasets": int(len(data)),
                "aggregate_avg_best_val_loss": float(avg_val_loss),
                "aggregate_avg_best_val_r2": float(avg_val_r2) if not np.isnan(avg_val_r2) else np.nan,
            }
        )
        pred_table = wandb.Table(dataframe=results_df)
        wandb_run.log({"predictions_table": pred_table})
else:
    print("No models were successfully trained.")

runtime_seconds = time.time() - start
print(f"\nTotal runtime: {runtime_seconds:.2f} seconds")

if wandb_run is not None:
    wandb_run.log({"runtime_seconds": float(runtime_seconds)})
    wandb_run.finish()

[34m[1mwandb[0m: Currently logged in as: [33mvictor-vangkilde[0m ([33mvictor-vangkilde-university-of-copenhagen[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


wandb tracking enabled: project=Stock Price Predictor, run=lstm-run-1770976977


73it [10:09,  8.35s/it]

Dataset 0: done (best loss=0.019361, best R2=0.0419, pred=-7.1289).
Dataset 1: done (best loss=0.016635, best R2=-0.0129, pred=0.7651).
Dataset 2: done (best loss=0.020135, best R2=0.0674, pred=7.1752).
Dataset 3: done (best loss=0.029357, best R2=-0.0178, pred=15.9843).
Dataset 4: done (best loss=0.145250, best R2=-0.0135, pred=7.4421).
Dataset 5: done (best loss=0.415643, best R2=-0.2622, pred=-4.3195).
Dataset 6: done (best loss=0.057379, best R2=-4.8958, pred=-20.5093).
Dataset 7: done (best loss=0.095895, best R2=0.0037, pred=10.2186).
Dataset 8: done (best loss=0.140859, best R2=-0.0268, pred=9.4024).
Dataset 9: done (best loss=0.044117, best R2=-3.5699, pred=0.6647).
Dataset 10: done (best loss=1.012897, best R2=-1.9955, pred=-13.4894).
Dataset 11: done (best loss=0.016038, best R2=-0.0967, pred=3.0056).
Dataset 12: done (best loss=0.125790, best R2=-0.0189, pred=7.1701).
Dataset 13: done (best loss=0.309757, best R2=-45.7277, pred=12.0267).
Dataset 14: done (best loss=0.092389,




Unnamed: 0,dataset_idx,ticker,prediction_first_row,best_val_loss,best_val_r2,best_window_size
0,0,MATAS.CO,-7.128919,0.019361,0.041862,3
1,1,TRIFOR.CO,0.765103,0.016635,-0.012902,12
2,2,RHM.DE,7.175158,0.020135,0.067357,8
3,3,SAAB-B.ST,15.984280,0.029357,-0.017802,8
4,4,KOG.OL,7.442135,0.145250,-0.013501,13
...,...,...,...,...,...,...
65,68,MTHH.CO,-17.410130,0.159252,-2.608117,14
66,69,AOJ-B.CO,0.338953,0.009272,-1.092447,13
67,70,LMT,8.877452,0.081335,-0.058603,11
68,71,RTX,12.126162,0.002356,-0.251122,15



Total runtime: 611.82 seconds


0,1
aggregate_avg_best_val_loss,▁
aggregate_avg_best_val_r2,▁
aggregate_models_trained,▁
aggregate_total_datasets,▁
dataset_best_val_loss,▁▂▁▁▂▁▂▂▂▄▁▂▁▁▁▂▁▂▁▂▁▁▅▁▁▃▁▄▃▁▁█▁▁▂▂▁▁▁▁
dataset_best_val_r2,████▇████▁█▅██████████████▇█▇███████████
dataset_best_window_size,▆▄▄▆▇▃▄▃▂▄▅▅▇▆▅▃▆█▃▅▁▃▃▂█▆▇▅▅▇▇▅▇▄▆▇▁▆▅▇
dataset_idx,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇██
dataset_prediction_first_row,▅▆█▄▁▅▃▇▆▅▄▄▃▆▃▁▅█▆▄▆▆▄▅▆▇▆▇▅▄▄▂▄▄▄▅▄▂▅▆
runtime_seconds,▁

0,1
aggregate_avg_best_val_loss,0.18152
aggregate_avg_best_val_r2,-9.32083
aggregate_models_trained,70
aggregate_total_datasets,73
dataset_best_val_loss,0.10125
dataset_best_val_r2,-0.09049
dataset_best_window_size,14
dataset_idx,72
dataset_prediction_first_row,4.17173
runtime_seconds,611.82175
