In [67]:
# Crack-Spread Nowcaster
# A lightweight AI/ML project applying financial feature engineering & classification to commodity markets

import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve
)

plt.style.use("seaborn-v0_8-darkgrid")


In [68]:
def load_yahoo(symbols=("CL=F", "RB=F", "HO=F"), start="2014-01-01") -> pd.DataFrame:
    """
    Download WTI (CL), RBOB (RB), and ULSD/Heating Oil (HO) futures from Yahoo Finance.
    Returns a clean DataFrame with columns: CL, RB, HO.
    Defensive against yfinance returning a DataFrame for 'Close'.
    """
    data = []
    tickers = ["CL", "RB", "HO"]

    for symbol, ticker in zip(symbols, tickers):
        df = yf.download(symbol, start=start, progress=False, auto_adjust=True)

        # Get the Close column robustly as a Series (never DataFrame)
        close = df.loc[:, ["Close"]].squeeze("columns")
        if isinstance(close, pd.DataFrame):
            # ultra-defensive fallback (shouldn't happen, but just in case)
            close = close.iloc[:, 0]

        close.name = ticker  # set the Series name (this is valid for Series)
        data.append(close)

    out = pd.concat(data, axis=1)
    return out.dropna().sort_index()




In [69]:
def compute_crack(df: pd.DataFrame) -> pd.Series:
    """
    3-2-1 crack spread: 2*RB + HO - 3*CL
    Measures refinery margin per barrel.
    """
    crack = 2 * df["RB"] + df["HO"] - 3 * df["CL"]
    crack = pd.Series(crack.values, index=df.index, name="CRACK")
    return crack


def make_features(prices: pd.DataFrame, windows=(5, 10, 20)) -> pd.DataFrame:
    """
    Build interpretable financial features for CL, RB, HO and CRACK:
      - daily returns
      - momentum (% change over lookbacks)
      - z-scores (standardized deviations)
      - day-of-week & month
    """
    feats = pd.DataFrame(index=prices.index)

    # daily returns
    for col in prices.columns:
        feats[f"{col}_ret1"] = prices[col].pct_change(1)

    # crack spread
    crack = compute_crack(prices)
    feats["CRACK"] = crack
    feats["CRACK_ret1"] = crack.pct_change(1)

    # rolling momentum and z-scores
    for w in windows:
        for col in ["CL", "RB", "HO", "CRACK"]:
            s = prices[col] if col in prices.columns else crack
            feats[f"{col}_mom_{w}"] = s.pct_change(w)
            roll = s.rolling(w)
            feats[f"{col}_z_{w}"] = (s - roll.mean()) / (roll.std() + 1e-9)

    # calendar features
    feats["dow"] = feats.index.dayofweek
    feats["month"] = feats.index.month

    return feats.dropna()


In [70]:
def make_labels(crack_series: pd.Series) -> pd.Series:
    """Binary label: 1 if crack(t+1) > crack(t), else 0."""
    return (crack_series.shift(-1) > crack_series).astype(int).rename("y")


def time_split(idx, train_end="2022-12-31"):
    """Split by date (no shuffle)."""
    idx = pd.to_datetime(idx)
    train_mask = idx <= pd.to_datetime(train_end)
    test_mask  = idx >  pd.to_datetime(train_end)
    return train_mask, test_mask


In [71]:
def train_and_eval(X: pd.DataFrame, y: pd.Series, train_end="2022-12-31"):
    train_mask, test_mask = time_split(X.index, train_end=train_end)
    X_train, y_train = X.loc[train_mask], y.loc[train_mask]
    X_test,  y_test  = X.loc[test_mask],  y.loc[test_mask]

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000))
    ])
    pipe.fit(X_train, y_train)

    prob = pipe.predict_proba(X_test)[:, 1]
    pred = (prob >= 0.5).astype(int)

    metrics = {
        "accuracy":  float(accuracy_score(y_test, pred)),
        "precision": float(precision_score(y_test, pred)),
        "recall":    float(recall_score(y_test, pred)),
        "auc":       float(roc_auc_score(y_test, prob)),
    }

    fpr, tpr, _ = roc_curve(y_test, prob)
    crack_test = X.loc[test_mask, "CRACK"]
    crack_ret  = crack_test.pct_change(1).fillna(0.0)
    equity     = (1 + (pred * crack_ret)).cumprod()

    return pipe, metrics, (fpr, tpr), equity, test_mask


In [72]:
def save_artifacts(metrics, roc_data, equity, outdir_fig="figures", outdir_rep="reports"):
    os.makedirs(outdir_fig, exist_ok=True)
    os.makedirs(outdir_rep, exist_ok=True)

    fpr, tpr = roc_data
    auc = metrics["auc"]

    # ROC curve
    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
    plt.plot([0, 1], [0, 1], "--")
    plt.title("ROC — 3-2-1 Crack Nowcaster")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(outdir_fig, "roc_curve.png"), dpi=200)
    plt.close()

    # Equity curve
    plt.figure()
    equity.plot()
    plt.title("Toy Strategy Equity Curve (No Costs)")
    plt.xlabel("Date")
    plt.ylabel("Equity")
    plt.tight_layout()
    plt.savefig(os.path.join(outdir_fig, "equity_curve.png"), dpi=200)
    plt.close()

    # Save metrics JSON
    with open(os.path.join(outdir_rep, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    print("✅ Saved:")
    print(f"   {os.path.join(outdir_fig, 'roc_curve.png')}")
    print(f"   {os.path.join(outdir_fig, 'equity_curve.png')}")
    print(f"   {os.path.join(outdir_rep, 'metrics.json')}")


In [73]:
# Step 1: Fetch and preview data
prices = load_yahoo(start="2014-01-01")
display(prices.tail())
print("Prices shape:", prices.shape)
print(type(prices["CL"]))   # should be pandas.core.series.Series



# Step 2: Build features and labels
X = make_features(prices, windows=(5, 10, 20))
y = make_labels(X["CRACK"])
Xy = X.join(y).dropna()
y = Xy.pop("y")
X = Xy
print("Feature matrix:", X.shape, "| Labels:", y.shape)

# Step 3: Train + evaluate
model, metrics, roc_data, equity, test_mask = train_and_eval(X, y, train_end="2022-12-31")

# Step 4: Print metrics
print("\nTest metrics")
for k, v in metrics.items():
    print(f"{k.capitalize():<10}: {v:.3f}")

# Step 5: Save results (for GitHub)
save_artifacts(metrics, roc_data, equity)


Unnamed: 0_level_0,CL,RB,HO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-10-08,62.549999,1.9095,2.2913
2025-10-09,61.509998,1.8826,2.2803
2025-10-10,58.900002,1.8204,2.2044
2025-10-13,59.490002,1.8438,2.2497
2025-10-14,58.419998,1.8233,2.2186


Prices shape: (2964, 3)
<class 'pandas.core.series.Series'>
Feature matrix: (2944, 31) | Labels: (2944,)

Test metrics
Accuracy  : 0.540
Precision : 0.541
Recall    : 0.443
Auc       : 0.539
✅ Saved:
   figures/roc_curve.png
   figures/equity_curve.png
   reports/metrics.json
