# Plan cd5ec794-e628-42a0-90ea-76f1d62bd1a7

This notebook was generated automatically from Plan JSON v1.1.
It follows the declared dataset, model, and configuration using a
deterministic CPU-only workflow.

In [None]:
import json
import os
import random
import sys
from pathlib import Path

import numpy as np

try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False

# Output artifacts of this run: a JSON-lines event stream and a metrics summary,
# both resolved relative to the current working directory.
EVENTS_PATH = Path("events.jsonl")
METRICS_PATH = Path("metrics.json")

# Delete files left behind by an earlier run so this run starts from scratch
# (the event log is opened in append mode further down).
for _stale_path in (EVENTS_PATH, METRICS_PATH):
    if _stale_path.exists():
        _stale_path.unlink()

def log_event(event_type: str, payload: dict) -> None:
    """Append one event record to the JSON-lines file at ``EVENTS_PATH``.

    Each call writes a single line holding a JSON object made of
    ``"event": event_type`` followed by the entries of ``payload``. Because the
    payload is merged last, a payload entry keyed ``"event"`` replaces the
    event type.

    Args:
        event_type: Value stored under the ``"event"`` key.
        payload: Extra JSON-serializable fields merged into the record.
    """
    # Make sure the directory holding the log exists before appending to it.
    EVENTS_PATH.parent.mkdir(parents=True, exist_ok=True)
    with EVENTS_PATH.open("a", encoding="utf-8") as stream:
        record = {"event": event_type}
        record.update(payload)
        stream.write(json.dumps(record) + "\n")

def seed_everything(seed: int) -> None:
    """Seed the Python, NumPy and (when installed) PyTorch random generators.

    Args:
        seed: Value handed to every generator.

    Raises:
        RuntimeError: If PyTorch is installed and reports an available CUDA
            device.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    # Nothing more to configure when PyTorch could not be imported.
    if not TORCH_AVAILABLE:
        return

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        raise RuntimeError("E_GPU_REQUESTED: CUDA devices are not permitted during runs")

    # Pin the cuDNN flags: deterministic on, benchmark off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Fixed seed for this run; applied to the global generators right away.
SEED = 42
seed_everything(SEED)
log_event("stage_update", {"stage": "seed_check", "seed": SEED})

# Echo the run context into the cell output.
print("Notebook generated for Plan cd5ec794-e628-42a0-90ea-76f1d62bd1a7")
print("Python version:", sys.version)
print("Seed set to", SEED)
if not TORCH_AVAILABLE:
    print("Torch not installed (not required for this plan)")
else:
    print("Torch version:", torch.__version__)

In [None]:
from io import BytesIO
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os
import pandas as pd
import requests

In [None]:
# Dataset: penaltyshootouts (Uploaded with paper - Supabase Storage)
log_event("stage_update", {"stage": "dataset_load", "dataset": "penaltyshootouts"})

# The sandbox injects a signed Supabase URL at runtime (24-hour expiration).
dataset_url = os.environ.get("DATASET_URL")
if not dataset_url:
    raise ValueError("DATASET_URL environment variable not set. Cannot download uploaded dataset.")

# Only the first 50 characters of the URL are written to the event log.
url_preview = dataset_url[:50]
log_event("info", {"message": f"Downloading dataset from Supabase: {url_preview}..."})

# Fetch the file in a single request; the 300 s (5 minute) timeout leaves room
# for large datasets. Any HTTP error status aborts the run here.
response = requests.get(dataset_url, timeout=300)
response.raise_for_status()

# Parse the downloaded bytes as an Excel workbook straight from memory.
workbook_bytes = BytesIO(response.content)
df = pd.read_excel(workbook_bytes)

row_count = len(df)
log_event("metric_update", {"metric": "dataset_rows", "value": row_count})

# Detect the target column: the first of these common names that exists in the
# frame wins (the order of the tuple is the priority order).
candidate_names = ("Win", "win", "target", "label", "class", "y", "Target", "Label")
target_column = next(
    (name for name in candidate_names if name in df.columns),
    None,
)

if target_column is None:
    # None of the conventional names is present: fall back to the last column.
    target_column = df.columns[-1]
    log_event("warning", {"message": f"No standard target column found. Using last column: {target_column}"})

# Separate target (as a NumPy array) from the feature frame.
y = df[target_column].values
X_df = df.drop(columns=[target_column])

# Drop high-cardinality string columns (team names, competition names, etc.).
# An object-dtype column survives only if it has at most 50 unique values;
# non-object columns are never dropped here.
for name in X_df.columns:
    column = X_df[name]
    if column.dtype != 'object' or column.nunique() <= 50:
        continue
    X_df = X_df.drop(columns=[name])
    log_event("info", {"message": f"Dropped high-cardinality column: {name}"})

# Label-encode the remaining string columns. Values are cast to str first, and
# each fitted encoder is kept under its column name.
label_encoders = {}
remaining_text_columns = [name for name in X_df.columns if X_df[name].dtype == 'object']
for name in remaining_text_columns:
    encoder = LabelEncoder()
    X_df[name] = encoder.fit_transform(X_df[name].astype(str))
    label_encoders[name] = encoder

# Hand the feature matrix to scikit-learn as a NumPy array.
X = X_df.values

# Subsample for CPU budget (only if dataset is large).
# The MAX_TRAIN_SAMPLES environment variable overrides the default cap of 5000 rows.
MAX_SAMPLES = int(os.getenv("MAX_TRAIN_SAMPLES", "5000"))

# Capture the size before X is rebound to the subsample; reading len(X)
# afterwards would report MAX_SAMPLES as the "before" count in the log.
rows_before_subsample = len(X)
if rows_before_subsample > MAX_SAMPLES:
    # A dedicated RandomState seeded with SEED keeps the row selection reproducible;
    # replace=False means no row is picked twice.
    indices = np.random.RandomState(SEED).choice(rows_before_subsample, MAX_SAMPLES, replace=False)
    X, y = X[indices], y[indices]
    # "\u2192" is a right arrow, written as an escape so the message cannot be
    # garbled if this file is re-saved in a different encoding.
    log_event("info", {"message": f"Subsampled {rows_before_subsample} \u2192 {MAX_SAMPLES} rows"})

# Split train/test: 20% of the rows are held out, shuffled with the fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Report the size of the data actually used (after any subsampling).
log_event("metric_update", {"metric": "dataset_samples", "value": len(X)})
log_event("metric_update", {"metric": "dataset_features", "value": X.shape[1]})

In [None]:
log_event("stage_update", {"stage": "model_build", "model": "simple baseline model"})

# Baseline classifier settings. max(100, 5 * 10) evaluates to 100 iterations.
# NOTE(review): the 5 * 10 term presumably comes from the plan configuration - confirm.
model_settings = {
    "max_iter": max(100, 5 * 10),
    "solver": "lbfgs",
    "random_state": SEED,
}
model = LogisticRegression(**model_settings)

log_event("stage_update", {"stage": "train"})
model.fit(X_train, y_train)

log_event("stage_update", {"stage": "evaluate"})
y_pred = model.predict(X_test)

# Score the held-out predictions. Precision and recall use scikit-learn's
# default averaging, with zero_division=0 passed explicitly.
accuracy = float(accuracy_score(y_test, y_pred))
precision = float(precision_score(y_test, y_pred, zero_division=0))
recall = float(recall_score(y_test, y_pred, zero_division=0))

# Target accuracy for this plan; the gap is positive when the model beats it.
GOAL_VALUE = 0.650000
metrics = {"accuracy": accuracy, "precision": precision, "recall": recall}
if GOAL_VALUE is not None:
    metrics["accuracy_gap"] = accuracy - GOAL_VALUE

# Serialize once, then persist the same text and echo it to the cell output.
metrics_report = json.dumps({"metrics": metrics}, indent=2)
METRICS_PATH.write_text(metrics_report, encoding="utf-8")
print(metrics_report)

log_event("metric_update", {"metric": "accuracy", "value": accuracy})
if len(y_pred):
    # Record the first prediction as a sample.
    first_label = int(y_pred[0])
    log_event("sample_pred", {"label": first_label, "stage": "evaluate"})
log_event("stage_update", {"stage": "complete"})