# Plan 0d4f0ff4-730f-47dc-a387-7943806fe990

This notebook was generated automatically from Plan JSON v1.1.
It follows the declared dataset, model, and configuration using a
deterministic CPU-only workflow.

In [None]:
import json
        import os
        import random
        import sys
        from pathlib import Path

        import numpy as np

        try:
            import torch
            TORCH_AVAILABLE = True
        except ImportError:
            TORCH_AVAILABLE = False

        # Artifacts produced by this run, relative to the working directory.
        EVENTS_PATH = Path("events.jsonl")
        METRICS_PATH = Path("metrics.json")

        # Remove leftovers from an earlier run so this run starts from a clean
        # slate (the event log is opened in append mode by log_event).
        for _stale_artifact in (EVENTS_PATH, METRICS_PATH):
            if _stale_artifact.exists():
                _stale_artifact.unlink()

        def log_event(event_type: str, payload: dict) -> None:
            """Append one event record to the JSONL event log.

            The record is a single JSON object containing ``"event": event_type``
            followed by the entries of ``payload`` (so a ``payload`` key named
            ``"event"`` overrides the event type). It is written as one line to
            ``EVENTS_PATH``; the parent directory is created if it is missing.

            Args:
                event_type: Value stored under the ``"event"`` key.
                payload: Extra JSON-serialisable fields merged into the record.
            """
            EVENTS_PATH.parent.mkdir(parents=True, exist_ok=True)
            with EVENTS_PATH.open("a", encoding="utf-8") as stream:
                # The terminator must be the "\n" escape sequence: a raw line
                # break inside the quotes is an unterminated string literal.
                stream.write(json.dumps({"event": event_type, **payload}) + "\n")

        def seed_everything(seed: int) -> None:
            """Seed every random number generator this notebook relies on.

            Seeds Python's ``random`` module and NumPy unconditionally. When
            torch was importable it also seeds torch, refuses to continue if a
            CUDA device is visible, and switches cuDNN to deterministic,
            non-benchmarking mode.

            Args:
                seed: Seed value applied to all generators.

            Raises:
                RuntimeError: If torch is installed and reports an available
                    CUDA device.
            """
            random.seed(seed)
            np.random.seed(seed)

            # Nothing more to do when torch is absent.
            if not TORCH_AVAILABLE:
                return

            torch.manual_seed(seed)
            if torch.cuda.is_available():
                raise RuntimeError("E_GPU_REQUESTED: CUDA devices are not permitted during runs")

            cudnn_backend = torch.backends.cudnn
            cudnn_backend.deterministic = True
            cudnn_backend.benchmark = False

        # Fixed seed for the whole run; applied before any other work happens.
        SEED = 42
        seed_everything(SEED)

        _seed_event = {"stage": "seed_check", "seed": SEED}
        log_event("stage_update", _seed_event)

        # Environment banner for the run log.
        print("Notebook generated for Plan 0d4f0ff4-730f-47dc-a387-7943806fe990")
        print("Python version:", sys.version)
        print("Seed set to", SEED)
        if not TORCH_AVAILABLE:
            print("Torch not installed (not required for this plan)")
        else:
            print("Torch version:", torch.__version__)

In [None]:
# Dataset: sst2 (HuggingFace - cached download)
#
# This cell loads GLUE/SST-2, turns the sentences into bag-of-words count
# vectors, optionally subsamples them for the CPU budget, and produces
# X_train / X_test / y_train / y_test for the modelling cell.

# `load_dataset`, `CountVectorizer` and `train_test_split` are called below but
# are not imported anywhere in the notebook; import them here so the cell runs.
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

CACHE_DIR = os.getenv("DATASET_CACHE_DIR", "./data/cache")
# NOTE(review): OFFLINE_MODE is parsed here but nothing in this cell consumes
# it; kept because later code may read it.
OFFLINE_MODE = os.getenv("OFFLINE_MODE", "false").lower() == "true"

log_event("stage_update", {"stage": "dataset_load", "dataset": "sst2"})

# Load with caching (downloads only if not cached)
dataset = load_dataset(
    "glue", "sst2",
    cache_dir=CACHE_DIR,
    download_mode="reuse_dataset_if_exists",  # Reuse cache if available
)

# Extract split: prefer "train"; otherwise fall back to the first split that
# the dataset actually provides instead of failing on a missing key.
split_name = "train" if "train" in dataset else next(iter(dataset))
train_data = dataset[split_name]

# Convert to sklearn-compatible format
# Phase 2: Simple bag-of-words (Phase 3 will add real NLP models)

# Detect text field (common field names, first match wins)
text_field = next(
    (name for name in ("sentence", "text", "content", "review") if name in train_data.features),
    None,
)
if text_field is None:
    raise ValueError(f"Could not find text field in dataset. Available fields: {list(train_data.features.keys())}")

# Detect label field
label_field = "label" if "label" in train_data.features else list(train_data.features.keys())[1]

# Extract texts and labels by column rather than iterating row by row.
texts = list(train_data[text_field])
labels = list(train_data[label_field])

# Vectorize text (bag-of-words for sklearn compatibility).
# CountVectorizer is deterministic and accepts no `random_state` argument;
# passing one raises TypeError.
MAX_FEATURES = int(os.getenv("MAX_BOW_FEATURES", "1000"))
vectorizer = CountVectorizer(max_features=MAX_FEATURES)
X_sparse = vectorizer.fit_transform(texts)
y = np.array(labels)

# Subsample for CPU budget. Rows are selected while the matrix is still sparse
# so only the retained rows are ever densified; the selected rows are the same
# ones a dense-then-subsample approach would keep.
MAX_SAMPLES = int(os.getenv("MAX_TRAIN_SAMPLES", "5000"))
n_rows = X_sparse.shape[0]
if n_rows > MAX_SAMPLES:
    indices = np.random.RandomState(SEED).choice(n_rows, MAX_SAMPLES, replace=False)
    X_sparse, y = X_sparse[indices], y[indices]
X = X_sparse.toarray()

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

log_event("metric_update", {"metric": "dataset_samples", "value": len(X)})

In [None]:
# Model cell: fit a logistic-regression classifier on the bag-of-words
# features, evaluate it on the held-out split, and persist/log the metrics.

# `LogisticRegression` and the metric functions are used below but are not
# imported anywhere in the notebook; import them here so the cell runs.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

# NOTE(review): the logged model name says "Convolutional Neural Network" while
# the estimator built below is LogisticRegression. The string is left unchanged
# because log consumers may match on it - confirm which name is expected.
log_event("stage_update", {"stage": "model_build", "model": "Convolutional Neural Network"})
model = LogisticRegression(
    max_iter=max(100, 15 * 10),
    solver="lbfgs",
    random_state=SEED,
)

log_event("stage_update", {"stage": "train"})
model.fit(X_train, y_train)

log_event("stage_update", {"stage": "evaluate"})
y_pred = model.predict(X_test)
accuracy = float(accuracy_score(y_test, y_pred))
precision = float(precision_score(y_test, y_pred, zero_division=0))
recall = float(recall_score(y_test, y_pred, zero_division=0))

metrics = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
}
GOAL_VALUE = 87.200000
if GOAL_VALUE is not None:
    # accuracy_score yields a fraction in [0, 1]; a goal above 1 can only be a
    # percentage, so bring it onto the same scale before taking the difference.
    goal_fraction = GOAL_VALUE / 100.0 if GOAL_VALUE > 1.0 else GOAL_VALUE
    metrics["accuracy_gap"] = accuracy - goal_fraction

METRICS_PATH.write_text(json.dumps({"metrics": metrics}, indent=2), encoding="utf-8")
print(json.dumps({"metrics": metrics}, indent=2))
log_event("metric_update", {"metric": "accuracy", "value": accuracy})
# Record the first prediction as a sample, when there is one.
if len(y_pred) > 0:
    log_event("sample_pred", {"label": int(y_pred[0]), "stage": "evaluate"})
log_event("stage_update", {"stage": "complete"})