# Stock Prediction Capstone - Single Notebook

This notebook aggregates all functions from the src modules.

Run cells top-to-bottom to load functions, then use evaluate/predict sections.

## src/config.py

Project config constants for tickers, date splits, and minimum history.

In [None]:
# --- src/config.py ---
# Default ticker symbols. Judging by the names these are the stock being
# modelled and the market proxy -- confirm against the evaluate/predict callers.
DEFAULT_TARGET = "AAPL"
DEFAULT_MARKET = "QQQ"
# No sector ETF is configured. get_sector_series() falls back to a peer-built
# sector series when its sector_etf argument is falsy (presumably this value
# is what gets passed there -- verify against the caller).
DEFAULT_SECTOR_ETF = None

# A small peer basket; filtered by availability in data/ at runtime.
DEFAULT_PEERS = [
    "MSFT",
    "AMZN",
    "GOOGL",
    "GOOG",
    "META",
    "NFLX",
    "NVDA",
    "INTC",
    "ADBE",
    "CSCO",
    "ORCL",
    "CRM",
    "TSLA",
    "AVGO",
    "QCOM",
    "AMD",
]

# Chronological, non-overlapping date splits as ISO "YYYY-MM-DD" strings:
# train = calendar 2018, validation = calendar 2019, test = Q1 2020.
TRAIN_START = "2018-01-01"
TRAIN_END = "2018-12-31"
VAL_START = "2019-01-01"
VAL_END = "2019-12-31"
TEST_START = "2020-01-01"
TEST_END = "2020-03-31"

# Same value as the default of make_feature_frame(min_history_days=...), which
# drops that many leading rows before rolling features are trusted.
MIN_HISTORY_DAYS = 70


## src/utils.py

Utility helpers for loading CSVs, computing returns, metrics, and JSON IO.

In [None]:
# --- src/utils.py ---
import json
import math
from pathlib import Path

import numpy as np
import pandas as pd


In [None]:
def find_ticker_path(data_dir, ticker):
    """Locate the CSV file for ``ticker`` inside ``data_dir``.

    File stems are compared case-insensitively. When several files differ
    only by case, the one whose stem equals ``ticker`` exactly wins;
    otherwise the first match in sorted path order is returned, so the result
    does not depend on the order in which the filesystem lists the directory.

    Args:
        data_dir: Directory (str or Path) holding ``<TICKER>.csv`` files.
        ticker: Ticker symbol to look up.

    Returns:
        The matching ``Path``, or ``None`` when no CSV matches.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    data_dir = Path(data_dir)
    if not data_dir.exists():
        raise FileNotFoundError(f"data dir not found: {data_dir}")

    # Normalise the requested symbol once instead of once per directory entry.
    wanted_upper = ticker.upper()
    wanted_lower = ticker.lower()
    matches = sorted(
        p
        for p in data_dir.glob("*.csv")
        if p.stem.upper() == wanted_upper or p.stem.lower() == wanted_lower
    )
    if not matches:
        return None

    # An exact-case stem is the least surprising pick when several files match.
    for candidate in matches:
        if candidate.stem == ticker:
            return candidate
    return matches[0]


In [None]:
def load_ticker_csv(data_dir, ticker):
    """Read a ticker's CSV into a frame ordered by its ``Date`` column.

    Args:
        data_dir: Directory searched by ``find_ticker_path``.
        ticker: Ticker symbol whose CSV should be loaded.

    Returns:
        DataFrame with ``Date`` parsed as datetimes, sorted ascending by
        ``Date`` and carrying a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If no CSV for ``ticker`` exists in ``data_dir``.
    """
    csv_path = find_ticker_path(data_dir, ticker)
    if csv_path is None:
        raise FileNotFoundError(f"ticker csv not found for {ticker} in {data_dir}")
    raw = pd.read_csv(csv_path, parse_dates=["Date"])
    return raw.sort_values("Date").reset_index(drop=True)


In [None]:
def compute_returns(df):
    """Return a copy of ``df`` with a simple-return column added.

    ``Adj Close`` is coerced to numeric (unparseable entries become NaN) and
    ``return`` holds its row-over-row percentage change. The input frame is
    left untouched.
    """
    adj_close = pd.to_numeric(df["Adj Close"], errors="coerce")
    result = df.copy()
    result["Adj Close"] = adj_close
    result["return"] = adj_close.pct_change()
    return result


In [None]:
def to_date_index(df):
    """Return a copy of ``df`` indexed by its ``Date`` column, sorted by date."""
    return df.copy().set_index("Date").sort_index()


In [None]:
def date_mask(df, start, end):
    """Boolean mask of rows whose index lies in ``[start, end]`` (inclusive).

    ``start`` and ``end`` are parsed with ``pd.to_datetime`` and compared
    against ``df.index``.
    """
    lower = pd.to_datetime(start)
    upper = pd.to_datetime(end)
    on_or_after = df.index >= lower
    on_or_before = df.index <= upper
    return on_or_after & on_or_before


In [None]:
def safe_log(series):
    """Natural log of ``series`` with zeros mapped to NaN instead of -inf."""
    without_zeros = series.replace(0, np.nan)
    return np.log(without_zeros)


In [None]:
def mae(y_true, y_pred):
    """Mean absolute error between ``y_true`` and ``y_pred`` as a Python float."""
    abs_error = np.abs(y_true - y_pred)
    return float(np.mean(abs_error))


In [None]:
def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred`` as a Python float."""
    squared_error = (y_true - y_pred) ** 2
    mean_squared_error = np.mean(squared_error)
    return float(math.sqrt(mean_squared_error))


In [None]:
def sign_accuracy(y_true, y_pred):
    """Fraction of positions where actual and predicted values share a sign.

    Signs come from ``np.sign``, so an exact zero only matches another zero.
    """
    true_sign = np.sign(np.array(y_true))
    pred_sign = np.sign(np.array(y_pred))
    hits = true_sign == pred_sign
    return float(np.mean(hits))


In [None]:
def save_json(path, payload):
    """Write ``payload`` to ``path`` as indented JSON.

    The payload is serialized completely before the destination is opened, so
    a payload that cannot be serialized raises without truncating or
    corrupting a file that already exists at ``path``.

    Args:
        path: Destination file (str or Path); missing parent directories are
            created.
        payload: JSON-serializable object.

    Raises:
        TypeError: If ``payload`` contains objects ``json`` cannot encode.
    """
    path = Path(path)
    # json.dumps escapes non-ASCII characters by default, so the text is
    # always representable in the ASCII encoding used below.
    text = json.dumps(payload, indent=2)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="ascii") as f:
        f.write(text)


In [None]:
def load_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is decoded as UTF-8 (a superset of ASCII), and a leading
    byte-order mark is skipped if present. Files written by ``save_json``
    still load unchanged, and JSON produced or edited by other tools no longer
    fails with ``UnicodeDecodeError`` on non-ASCII text.

    Args:
        path: JSON file to read (str or Path).

    Returns:
        The decoded Python object.
    """
    with Path(path).open("r", encoding="utf-8-sig") as f:
        return json.load(f)


## src/sector.py

Sector construction utilities: peer selection, sector returns, and breadth.

In [None]:
# --- src/sector.py ---
def available_tickers(data_dir):
    """List the upper-cased stems of every ``*.csv`` in ``data_dir``, sorted."""
    csv_files = Path(data_dir).glob("*.csv")
    return sorted(csv_file.stem.upper() for csv_file in csv_files)

In [None]:
def build_peer_sector(data_dir, target_ticker, peer_candidates, train_start, train_end, k=10):
    """Pick the ``k`` candidates most correlated with the target in training.

    Each candidate's daily returns are aligned with the target's returns
    inside ``[train_start, train_end]``. Candidates are skipped when they are
    the target itself, have no CSV, overlap on fewer than 30 dates, or yield
    an undefined correlation.

    Args:
        data_dir: Directory holding the ticker CSVs.
        target_ticker: Symbol whose peers are being selected.
        peer_candidates: Iterable of candidate symbols.
        train_start: First date (inclusive) of the correlation window.
        train_end: Last date (inclusive) of the correlation window.
        k: Maximum number of peers to return.

    Returns:
        Candidate symbols ordered from highest to lowest correlation,
        truncated to ``k`` entries.

    Raises:
        ValueError: If no candidate survives the filters.
    """
    target_frame = to_date_index(compute_returns(load_ticker_csv(data_dir, target_ticker)))
    target_train = target_frame.loc[train_start:train_end]["return"].dropna()

    target_key = target_ticker.upper()
    scored = []  # (ticker, correlation) pairs in candidate order
    for ticker in peer_candidates:
        if ticker.upper() == target_key:
            continue
        try:
            peer_frame = compute_returns(load_ticker_csv(data_dir, ticker))
        except FileNotFoundError:
            # Candidates without data are simply not eligible.
            continue
        peer_returns = to_date_index(peer_frame)["return"]
        # The inner join restricts the peer to the target's training dates.
        overlap = pd.concat([target_train, peer_returns], axis=1, join="inner").dropna()
        if len(overlap) < 30:
            continue
        corr = overlap.iloc[:, 0].corr(overlap.iloc[:, 1])
        if pd.isna(corr):
            continue
        scored.append((ticker, corr))

    if not scored:
        raise ValueError("no peers found for sector construction")

    tickers = [ticker for ticker, _ in scored]
    correlations = [corr for _, corr in scored]
    ranking = np.argsort(correlations)[::-1]
    return [tickers[i] for i in ranking[:k]]


In [None]:
def sector_return_from_peers(data_dir, peers):
    """Equal-weight average of the peers' daily returns.

    Peer return series are outer-joined on date; each date's value is the
    mean over the peers that have a return that day (NaNs are skipped).

    Returns:
        Single-column DataFrame named ``return``, indexed by date.
    """
    per_ticker = [
        to_date_index(compute_returns(load_ticker_csv(data_dir, ticker)))["return"].rename(ticker)
        for ticker in peers
    ]
    wide = pd.concat(per_ticker, axis=1, join="outer")
    return wide.mean(axis=1, skipna=True).to_frame("return")


In [None]:
def get_sector_series(data_dir, target_ticker, sector_etf, peer_candidates, train_start, train_end, k=10):
    """Build the sector return series from an ETF or, failing that, from peers.

    If ``sector_etf`` is truthy and its CSV exists, the ETF's returns are
    used. Otherwise the sector is the average return of the top-``k`` peers
    chosen by ``build_peer_sector``.

    Returns:
        Tuple ``(sec_df, peers, etf)`` where ``sec_df`` has one column named
        ``sec_return``. With an ETF, ``peers`` is ``[]`` and ``etf`` is the ETF
        symbol; with peers, ``peers`` is the selected list and ``etf`` is
        ``None``.
    """
    etf_frame = None
    if sector_etf:
        try:
            etf_frame = compute_returns(load_ticker_csv(data_dir, sector_etf))
        except FileNotFoundError:
            # No ETF data on disk: fall through to the peer basket.
            etf_frame = None

    if etf_frame is not None:
        etf_returns = to_date_index(etf_frame)[["return"]]
        return etf_returns.rename(columns={"return": "sec_return"}), [], sector_etf

    peers = build_peer_sector(
        data_dir=data_dir,
        target_ticker=target_ticker,
        peer_candidates=peer_candidates,
        train_start=train_start,
        train_end=train_end,
        k=k,
    )
    peer_sector = sector_return_from_peers(data_dir, peers)
    return peer_sector.rename(columns={"return": "sec_return"}), peers, None


In [None]:
def compute_breadth(data_dir, peers):
    """Share of peers with a positive return on each date.

    Peer return series are outer-joined on date. For every date the breadth
    is ``(# peers with return > 0) / (# peers with a return that day)``.
    Peers with no observation on a date (not yet listed, data gap, or the
    first row of their history) are left out of the denominator instead of
    being counted as decliners, matching how ``sector_return_from_peers``
    skips missing values. Dates on which no peer has a return yield NaN.

    Args:
        data_dir: Directory holding the ticker CSVs.
        peers: Non-empty list of peer symbols.

    Returns:
        Single-column DataFrame named ``breadth``, indexed by date.

    Raises:
        ValueError: If ``peers`` is empty.
        FileNotFoundError: If a peer's CSV is missing.
    """
    if not peers:
        raise ValueError("breadth peers list is empty")
    returns = [
        to_date_index(compute_returns(load_ticker_csv(data_dir, ticker)))["return"].rename(ticker)
        for ticker in peers
    ]
    combined = pd.concat(returns, axis=1, join="outer")

    advancing = combined.gt(0).sum(axis=1)
    reporting = combined.notna().sum(axis=1)
    # Mask zero counts so "no data that day" becomes NaN, not a fake 0% breadth.
    breadth = advancing / reporting.where(reporting > 0)
    return breadth.to_frame("breadth")


## src/features.py

Feature engineering for target, market, sector, and calendar signals.

In [None]:
# --- src/features.py ---
def _rolling_zscore(series, window):
    mean = series.rolling(window).mean()
    std = series.rolling(window).std()
    # Avoid division by zero by adding small epsilon
    std = std.replace(0, np.nan)
    return (series - mean) / std

In [None]:
def _add_return_lags(df, col, prefix, max_lag=5):
    out = {}
    for i in range(max_lag + 1):
        name = f"{prefix}lag_{i}"
        out[name] = df[col].shift(i)
    return out


In [None]:
def _add_rolling_stats(df, col, prefix, windows):
    out = {}
    for w in windows:
        out[f"{prefix}roll_mean_{w}"] = df[col].rolling(w).mean()
        out[f"{prefix}roll_vol_{w}"] = df[col].rolling(w).std()
    return out


In [None]:
def _compute_expanding_betas(y, x1, x2, min_obs=30):
    n = len(y)
    betas = np.full((n, 3), np.nan)
    resid = np.full(n, np.nan)

    sum1 = sum2 = sum11 = sum22 = sum12 = 0.0
    sumy = sum1y = sum2y = 0.0
    count = 0

    for i in range(n):
        yi = y[i]
        x1i = x1[i]
        x2i = x2[i]
        if np.isnan(yi) or np.isnan(x1i) or np.isnan(x2i):
            betas[i] = np.array([np.nan, np.nan, np.nan])
            resid[i] = np.nan
            continue
        count += 1
        sum1 += x1i
        sum2 += x2i
        sum11 += x1i * x1i
        sum22 += x2i * x2i
        sum12 += x1i * x2i
        sumy += yi
        sum1y += x1i * yi
        sum2y += x2i * yi

        if count < min_obs:
            betas[i] = np.array([np.nan, np.nan, np.nan])
            resid[i] = np.nan
            continue

        xtx = np.array(
            [
                [count, sum1, sum2],
                [sum1, sum11, sum12],
                [sum2, sum12, sum22],
            ]
        )
        xty = np.array([sumy, sum1y, sum2y])
        try:
            coeffs = np.linalg.solve(xtx, xty)
        except np.linalg.LinAlgError:
            coeffs = np.array([np.nan, np.nan, np.nan])
        betas[i] = coeffs
        resid[i] = yi - (coeffs[0] + coeffs[1] * x1i + coeffs[2] * x2i)

    return betas, resid


In [None]:
def make_feature_frame(target_df, market_ret, sector_ret, breadth, min_history_days=70, drop_target_na=True):
    """Assemble the per-date feature matrix together with the prediction target.

    Args:
        target_df: Frame with a ``Date`` column plus ``Adj Close``, ``High``,
            ``Low``, ``Close``, ``Volume`` and ``return`` columns.
        market_ret: Frame whose first column is renamed to ``mkt_return``
            and left-joined onto the target's dates.
        sector_ret: Frame whose first column is renamed to ``sec_return``
            and left-joined onto the target's dates.
        breadth: Frame whose first column is renamed to ``breadth`` and
            left-joined onto the target's dates.
        min_history_days: Number of leading rows (by position) to discard;
            a falsy value keeps them all.
        drop_target_na: When true, rows without a target are dropped too.

    Returns:
        DataFrame indexed by date containing the engineered features (with
        ``cal_dow`` / ``cal_month`` one-hot encoded), a ``target`` column holding
        the next row's return, and an ``adj_close`` column. Rows with a NaN in
        any column other than ``target`` are removed.
    """
    target = to_date_index(target_df)
    # Normalise the first column name of each auxiliary frame before joining.
    market_ret = market_ret.rename(columns={market_ret.columns[0]: "mkt_return"})
    sector_ret = sector_ret.rename(columns={sector_ret.columns[0]: "sec_return"})
    breadth = breadth.rename(columns={breadth.columns[0]: "breadth"})

    # Left joins keep exactly the target's dates; missing auxiliary data becomes NaN.
    base = target[["Adj Close", "High", "Low", "Close", "Volume", "return"]].copy()
    base = base.join(market_ret, how="left").join(sector_ret, how="left").join(breadth, how="left")

    # Insertion order of this dict defines the column order of the result.
    features = {}

    # Own stock features
    features.update(_add_return_lags(base, "return", "own_", max_lag=5))
    features.update(_add_rolling_stats(base, "return", "own_", [5, 20, 63]))

    # High-low range relative to the close; a zero close becomes NaN to avoid
    # division by zero (though Close should never be 0 for valid stock data).
    close = base["Close"].replace(0, np.nan)
    hlc = (base["High"] - base["Low"]) / close
    features["own_hlc"] = hlc
    features["own_hlc_z20"] = _rolling_zscore(hlc, 20)
    features["own_hlc_z63"] = _rolling_zscore(hlc, 63)

    # Relative gap between the adjusted close and its simple moving averages.
    adj = base["Adj Close"]
    for w in [5, 20, 63]:
        sma = adj.rolling(w).mean()
        # Avoid division by zero (though sma should never be 0 for valid stock data)
        sma = sma.replace(0, np.nan)
        features[f"own_sma_gap_{w}"] = (adj / sma) - 1.0

    # Log volume (zero volume -> NaN via safe_log), its z-score and changes.
    log_vol = safe_log(base["Volume"])
    features["own_log_vol"] = log_vol
    features["own_log_vol_z21"] = _rolling_zscore(log_vol, 21)
    features["own_log_vol_delta_5"] = log_vol - log_vol.shift(5)
    features["own_log_vol_delta_20"] = log_vol - log_vol.shift(20)

    # Market features
    # NOTE(review): the lag helper starts at lag 0, so mkt_lag_0 carries the
    # same values as mkt_return (likewise sec_lag_0 / sec_return below).
    features["mkt_return"] = base["mkt_return"]
    features.update(_add_return_lags(base, "mkt_return", "mkt_", max_lag=5))
    features.update(_add_rolling_stats(base, "mkt_return", "mkt_", [20, 63]))

    # Sector features
    features["sec_return"] = base["sec_return"]
    features.update(_add_return_lags(base, "sec_return", "sec_", max_lag=5))
    features.update(_add_rolling_stats(base, "sec_return", "sec_", [20, 63]))

    # Breadth
    features["breadth"] = base["breadth"]
    for i in range(1, 6):
        features[f"breadth_lag_{i}"] = base["breadth"].shift(i)

    # Factor betas and residuals: expanding regression of the stock's return
    # on the market and sector returns, reported once 30 valid rows exist.
    y = base["return"].to_numpy()
    x1 = base["mkt_return"].to_numpy()
    x2 = base["sec_return"].to_numpy()
    betas, resid = _compute_expanding_betas(y, x1, x2, min_obs=30)
    features["factor_alpha"] = betas[:, 0]
    features["factor_beta_mkt"] = betas[:, 1]
    features["factor_beta_sec"] = betas[:, 2]
    features["factor_resid"] = resid

    # Calendar
    features["cal_dow"] = base.index.dayofweek
    features["cal_month"] = base.index.month

    feature_df = pd.DataFrame(features, index=base.index)
    # One-hot encode the calendar fields; only values present in the data get
    # a column, so the column set depends on the dates covered.
    feature_df = pd.get_dummies(feature_df, columns=["cal_dow", "cal_month"], drop_first=False)

    # Target is the following row's return, so the last row has no target.
    feature_df["target"] = base["return"].shift(-1)
    feature_df["adj_close"] = base["Adj Close"]

    # Drop early rows with insufficient history (positional, not date-based)
    if min_history_days:
        feature_df = feature_df.iloc[min_history_days:]

    # Every column except the target must be present; this includes adj_close.
    feature_cols = [c for c in feature_df.columns if c != "target"]
    feature_df = feature_df.dropna(subset=feature_cols)
    if drop_target_na:
        feature_df = feature_df.dropna(subset=["target"])
    return feature_df


In [None]:
def feature_groups(columns):
    """Bucket feature column names into named groups by prefix.

    Returns a dict with keys ``own``, ``market``, ``sector`` and ``calendar``.
    Breadth columns are filed under ``market`` and factor columns under
    ``sector``; names matching no prefix are left out.
    """

    def with_prefix(prefix):
        return [name for name in columns if name.startswith(prefix)]

    own_cols = with_prefix("own_")
    market_cols = with_prefix("mkt_")
    sector_cols = with_prefix("sec_")
    breadth_cols = with_prefix("breadth")
    factor_cols = with_prefix("factor_")
    calendar_cols = with_prefix("cal_")

    groups = {}
    groups["own"] = own_cols
    groups["market"] = market_cols + breadth_cols
    groups["sector"] = sector_cols + factor_cols
    groups["calendar"] = calendar_cols
    return groups


## src/models.py

Model builders for base learners and the stacking meta-model.

In [None]:
# --- src/models.py ---
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [None]:
def build_base_models(random_state=42):
    """Create the unfitted base learners, keyed by display name.

    Returns a dict with, in order: ``"ElasticNet"`` (standard-scaled
    ElasticNet pipeline), ``"RandomForest"`` and ``"GBRT"``. All three share
    ``random_state``.
    """
    linear_steps = [
        ("scaler", StandardScaler()),
        ("model", ElasticNet(alpha=0.001, l1_ratio=0.5, max_iter=5000, random_state=random_state)),
    ]
    forest_params = {
        "n_estimators": 300,
        "max_depth": 6,
        "min_samples_leaf": 10,
        "random_state": random_state,
        "n_jobs": -1,
    }
    boosting_params = {
        "n_estimators": 300,
        "learning_rate": 0.05,
        "max_depth": 3,
        "random_state": random_state,
    }

    models = {}
    models["ElasticNet"] = Pipeline(linear_steps)
    models["RandomForest"] = RandomForestRegressor(**forest_params)
    models["GBRT"] = GradientBoostingRegressor(**boosting_params)
    return models


In [None]:
def build_meta_model(random_state=42):
    """Create the unfitted meta learner: a standard-scaled ElasticNet pipeline."""
    scaler = StandardScaler()
    regressor = ElasticNet(alpha=0.001, l1_ratio=0.5, max_iter=5000, random_state=random_state)
    steps = [("scaler", scaler), ("model", regressor)]
    return Pipeline(steps)


## src/stack.py

Stacking helpers to generate out-of-fold predictions for the meta model.

In [None]:
# --- src/stack.py ---
from sklearn.model_selection import TimeSeriesSplit


In [None]:
def oof_predictions(base_models, X_train, y_train, X_val, y_val, n_splits=4):
    """Out-of-fold base-model predictions over the validation block.

    The validation rows are split with ``TimeSeriesSplit``. For each split
    every model is refit on the full training set plus that split's
    validation training rows, then predicts the split's held-out rows.

    Args:
        base_models: Dict mapping name to an estimator with ``fit``/``predict``.
            The estimators are fitted in place.
        X_train: Training feature array.
        y_train: Training target array.
        X_val: Validation feature array (indexed positionally).
        y_val: Validation target array (indexed positionally).
        n_splits: Number of time-series splits.

    Returns:
        Dict mapping each model name to an array of ``len(y_val)`` predictions.
        Rows that never fall in a held-out fold remain NaN.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    n_val = len(y_val)
    oof = {}
    for name in base_models:
        oof[name] = np.full(n_val, np.nan)

    for fit_rows, holdout_rows in splitter.split(X_val):
        # Grow the training set with the validation rows that precede the fold.
        X_fit = np.vstack([X_train, X_val[fit_rows]])
        y_fit = np.concatenate([y_train, y_val[fit_rows]])
        X_holdout = X_val[holdout_rows]

        for name, model in base_models.items():
            model.fit(X_fit, y_fit)
            fold_pred = model.predict(X_holdout)
            oof[name][holdout_rows] = fold_pred

    return oof


In [None]:
def make_meta_matrix(pred_dict):
    """Stack per-model prediction vectors into the meta-model's input matrix.

    Every key except ``"target"`` contributes one column, in dict order.

    Returns:
        Tuple ``(Z, names)``: the 2-D array and the matching column names.
    """
    names = [key for key in pred_dict if key != "target"]
    columns = [pred_dict[key] for key in names]
    return np.column_stack(columns), names


## src/evaluate.py

Training and evaluation pipeline that builds features, fits models, and writes artifacts.

In [None]:
# --- src/evaluate.py ---
import joblib
from sklearn.linear_model import ElasticNet

In [None]:
def _decision_labels(r_hat, theta):
    labels = []
    for r in r_hat:
        if r >= theta:
            labels.append("Buy")
        elif r <= -theta:
            labels.append("Sell")
        else:
            labels.append("Hold")
    return labels


In [None]:
def _tune_threshold(y_true, y_pred):
    """Pick the decision threshold that best matches realised return signs.

    Ten evenly spaced thresholds from 0.002 to 0.02 are tried. For each one
    the predictions are labelled with ``_decision_labels``, mapped to
    +1 / -1 / 0 (Buy / Sell / Hold) and compared with ``np.sign(y_true)``.

    Returns:
        The threshold with the highest match rate, or ``None`` if no
        threshold produced a comparable score.
    """
    sign_of_label = {"Buy": 1, "Sell": -1}
    true_sign = np.sign(y_true)

    best_theta = None
    best_score = -1
    for theta in np.linspace(0.002, 0.02, 10):
        labels = _decision_labels(y_pred, theta)
        # Map labels to sign for rough alignment; "Hold" counts as 0.
        pred_sign = np.array([sign_of_label.get(label, 0) for label in labels])
        score = np.mean(pred_sign == true_sign)
        # Prefer larger theta when score ties to keep neutral zone wide.
        if score > best_score or (score == best_score and theta > best_theta):
            best_theta = theta
            best_score = score
    return best_theta


In [None]:
def _metrics_table(name, y_true, y_pred):
    """Build one results-table row for a model.

    Returns a dict with keys ``Model``, ``MAE``, ``RMSE`` and ``SignAcc``;
    the three metrics are rounded to six decimals.
    """
    metric_fns = {"MAE": mae, "RMSE": rmse, "SignAcc": sign_accuracy}
    row = {"Model": name}
    for label, metric in metric_fns.items():
        row[label] = round(metric(y_true, y_pred), 6)
    return row


## src/predict.py

Single-date prediction routine that loads artifacts and emits a JSON decision payload.

In [None]:
# --- src/predict.py ---
# Note: The main() function from predict.py is designed for command-line use.
# For notebook usage, you can call the individual functions directly or create a wrapper.