# Kaggle Test Notebook: LLM Agent Pipeline (Pre-WAN)

This notebook validates that the LLM-only agent pipeline is working before feeding prompts into WAN video generation.

It will:
1. Configure OpenRouter credentials safely
2. Call the LLM directly for a seed prompt smoke test
3. Run `llm_pipeline.py`
4. Validate generated artifacts
5. Export results in an embedded (vectorized) form

## 1) Install dependencies

In [None]:
# Upgrade pip, then quietly (-q) install the third-party packages this notebook uses.
# openai is imported by the smoke-test and pipeline cells; scikit-learn and pandas
# are imported by the embedding-export cell. requests is not imported by any cell
# in this notebook -- presumably llm_pipeline.py depends on it (TODO confirm).
!pip -q install --upgrade pip
!pip -q install openai requests scikit-learn pandas

## 2) Clone repository (if needed) and set working directory

In [None]:
import os
from pathlib import Path

WORK_DIR = Path("/kaggle/working")
CLONE_DIR = WORK_DIR / "tonmoy99_create-RL-Brain"

if not CLONE_DIR.exists():
    !git clone https://github.com/Tonmoy221/tonmoy99_create-RL-Brain.git "{CLONE_DIR.name}"

search_roots = [
    WORK_DIR / "tonmoy99_Vedio-Gen",
    CLONE_DIR,
]

matches = []
for root in search_roots:
    if root.exists():
        matches.extend(root.rglob("llm_pipeline.py"))

if not matches:
    raise FileNotFoundError(
        "llm_pipeline.py not found under /kaggle/working. "
        "Expected in your project folder after clone."
    )

# Prefer the intended project folder if multiple matches exist.
matches = sorted(
    matches,
    key=lambda p: (
        "tonmoy99_Vedio-Gen" not in str(p.parent),
        len(str(p.parent)),
    ),
)

script_path = matches[0]
repo_path = script_path.parent

os.chdir(repo_path)
print("Current working directory:", os.getcwd())
print("Using llm_pipeline.py:", script_path)
print("llm_pipeline.py exists:", Path("llm_pipeline.py").exists())

## 3) Configure API credentials (safe way)
Use Kaggle Secrets for `OPENROUTER_API_KEY` instead of hardcoding keys in notebook cells.

In [None]:
from kaggle_secrets import UserSecretsClient

# Read the OpenRouter key from Kaggle Secrets so it never appears in a cell.
secrets = UserSecretsClient()
OPENROUTER_API_KEY = secrets.get_secret("OPENROUTER_API_KEY")

if not OPENROUTER_API_KEY:
    raise RuntimeError("Missing Kaggle secret OPENROUTER_API_KEY")

# Publish the settings under the OPENAI_* variable names that the smoke-test
# and pipeline cells read back from the environment.
os.environ.update(
    {
        "OPENAI_API_KEY": OPENROUTER_API_KEY,
        "OPENAI_BASE_URL": "https://openrouter.ai/api/v1",
        "OPENAI_HTTP_REFERER": "https://www.kaggle.com",
        "OPENAI_X_TITLE": "LLM Agent Pipeline Test",
    }
)

print("Environment configured for OpenRouter through OpenAI-compatible client.")

## 4) Direct LLM smoke test (OpenRouter)
This verifies that the API and model are reachable, and generates a cinematic seed prompt for the pipeline run.

In [None]:
from openai import OpenAI

# One direct chat-completion call against the configured endpoint. Its reply
# becomes `seed_prompt`, which the pipeline cell passes to run_llm_pipeline.
smoke_client = OpenAI(
    base_url=os.environ["OPENAI_BASE_URL"],
    api_key=os.environ["OPENAI_API_KEY"],
)

smoke_completion = smoke_client.chat.completions.create(
    extra_headers={
        "HTTP-Referer": os.environ.get("OPENAI_HTTP_REFERER", "https://www.kaggle.com"),
        "X-Title": os.environ.get("OPENAI_X_TITLE", "LLM Agent Pipeline Test"),
    },
    model="google/gemma-3n-e4b-it:free",
    messages=[
        {
            "role": "user",
            "content": "Generate one cinematic seed prompt for a 4-scene short film with strong character and location continuity.",
        }
    ],
)

# A reachable endpoint can still answer with no choices or with a null/blank
# message; fail here with a clear error instead of an AttributeError on
# `.strip()` or an empty seed silently flowing into the pipeline.
if not smoke_completion.choices:
    raise RuntimeError("Smoke test failed: the model returned no choices")

seed_prompt = (smoke_completion.choices[0].message.content or "").strip()
if not seed_prompt:
    raise RuntimeError("Smoke test failed: the model returned an empty seed prompt")

print("Generated seed prompt:\n")
print(seed_prompt)

## 5) Run the LLM Agent Pipeline (no WAN)
Runs your standalone `llm_pipeline.py` and writes artifacts.

In [None]:
import os
import sys
import traceback
from pathlib import Path

# The pipeline module is expected in the current working directory.
repo_root = Path.cwd()
script_path = repo_root / "llm_pipeline.py"
if not script_path.exists():
    raise FileNotFoundError(f"Cannot run pipeline: missing {script_path}")

# Make the working directory importable so `import llm_pipeline` can succeed.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

try:
    import llm_pipeline as lp
    from openai import OpenAI

    def _openai_compat_call(self, system_prompt: str, user_prompt: str) -> str:
        """Send one chat completion using settings taken from the environment.

        Installed below as a replacement for ``LLMDirectorAgent._call_openai``.
        Reads OPENAI_API_KEY (required), OPENAI_BASE_URL, OPENAI_HTTP_REFERER
        and OPENAI_X_TITLE (all optional), and uses ``self.model_name`` as the
        model.

        Returns the content of the first choice, or "" when that content is
        empty/None. Raises RuntimeError when OPENAI_API_KEY is unset or blank;
        any API error other than the handled fallback case propagates.
        """
        key = os.getenv("OPENAI_API_KEY", "").strip()
        if not key:
            raise RuntimeError("Missing OPENAI_API_KEY environment variable")

        # Only pass base_url when one is configured, so the client default applies otherwise.
        endpoint = os.getenv("OPENAI_BASE_URL", "").strip()
        client = OpenAI(api_key=key, base_url=endpoint) if endpoint else OpenAI(api_key=key)

        # Optional attribution headers: (header name, environment variable).
        headers = {}
        for header_name, env_name in (
            ("HTTP-Referer", "OPENAI_HTTP_REFERER"),
            ("X-Title", "OPENAI_X_TITLE"),
        ):
            header_value = os.getenv(env_name, "").strip()
            if header_value:
                headers[header_name] = header_value

        def _ask(messages):
            # Single request path shared by the normal and the fallback attempt.
            reply = client.chat.completions.create(
                model=self.model_name,
                temperature=0.4,
                messages=messages,
                extra_headers=headers or None,
            )
            return reply.choices[0].message.content or ""

        # Try normal system+user first; fallback to user-only for models/providers
        # that reject developer/system instructions (e.g. some free Gemma endpoints).
        try:
            return _ask(
                [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ]
            )
        except Exception as first_exc:
            # Anything except the known "no system prompt" rejection is re-raised untouched.
            if "developer instruction is not enabled" not in str(first_exc).lower():
                raise

            # Fold the system instructions into a single user message and retry once.
            merged_user_prompt = (
                "Follow these instructions exactly:\n"
                f"{system_prompt}\n\n"
                "User request:\n"
                f"{user_prompt}"
            )
            return _ask([{"role": "user", "content": merged_user_prompt}])

    # Runtime monkey patch for compatibility in Kaggle clone.
    lp.LLMDirectorAgent._call_openai = _openai_compat_call

    report_out = lp.run_llm_pipeline(
        seed_prompt=seed_prompt,
        output_root=".",
        provider="openai",
        model_name="google/gemma-3n-e4b-it:free",
        resume=False,
    )
    print("Pipeline completed successfully.")
    print("Report path:", report_out)
except Exception as pipeline_error:
    # Show a short summary plus the full traceback, then re-raise so the cell is marked failed.
    print("Pipeline failed with exception:", pipeline_error)
    print("Detailed traceback:\n")
    traceback.print_exc()
    raise

## 6) Validate pipeline artifacts and pass/fail status

In [None]:
import json
from pathlib import Path

# Artifacts the pipeline run is expected to have written, relative to the
# current working directory.
creative_doc_path = Path("story_bible/llm_only/creative_document_llm.json")
scene_prompts_path = Path("output/llm_only/scene_prompts_llm.json")
report_path = Path("output/llm_only/llm_pipeline_report.json")
memory_path = Path("memory_llm/state_llm.json")

required_paths = [creative_doc_path, scene_prompts_path, report_path, memory_path]
missing = [str(p) for p in required_paths if not p.exists()]

if missing:
    raise FileNotFoundError(f"Missing expected artifacts: {missing}")

creative_doc = json.loads(creative_doc_path.read_text(encoding="utf-8"))
scene_prompts = json.loads(scene_prompts_path.read_text(encoding="utf-8"))
report = json.loads(report_path.read_text(encoding="utf-8"))
memory_state = json.loads(memory_path.read_text(encoding="utf-8"))

scene_count = len(creative_doc.get("scenes", []))
# `or 0.0` also covers an explicit null in the report, which float() rejects.
mean_score = float(report.get("mean_critique_score") or 0.0)
continuity_count = len(memory_state.get("continuity_log", []))

# Files that exist but carry no scenes/prompts are a failed run, not a pass.
failures = []
if scene_count == 0:
    failures.append("creative document contains no scenes")
if not scene_prompts:
    failures.append("scene prompts file is empty")

if failures:
    print("FAIL: " + "; ".join(failures))
else:
    print("PASS: LLM pipeline artifacts generated")
print("scene_count:", scene_count)
print("mean_critique_score:", round(mean_score, 4))
print("continuity_log_count:", continuity_count)
print("report:", report_path)

# Raise only after the metrics are printed so they stay visible for debugging.
if failures:
    raise ValueError(f"LLM pipeline validation failed: {failures}")

## 7) Embedded result export (vector form)
Convert the generated prompts into numeric TF-IDF vectors and save them as an artifact before running WAN.

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the scene prompts with TF-IDF and write one record per scene.
# Assumes scene_prompts (loaded by the validation cell) is a list of dicts --
# TODO confirm against llm_pipeline.py's output format.

# Only scenes with a non-empty prompt can be vectorized. Remember their
# positions so each vector is attached to the scene it was computed from;
# indexing the vectors by raw scene position would shift every embedding
# after a promptless scene onto the wrong row.
prompt_positions = [
    position for position, item in enumerate(scene_prompts) if item.get("prompt")
]
if not prompt_positions:
    raise ValueError("No prompts found in scene_prompts_llm.json")

texts = [scene_prompts[position]["prompt"] for position in prompt_positions]

vectorizer = TfidfVectorizer(max_features=64)
embeddings = vectorizer.fit_transform(texts).toarray()

# Row k of `embeddings` belongs to the scene at prompt_positions[k].
vector_by_position = dict(zip(prompt_positions, embeddings.tolist()))

embedded_rows = [
    {
        "scene_id": row.get("scene_id"),
        "prompt": row.get("prompt", ""),
        # Tolerate a missing or null "critique" entry.
        "critique_score": (row.get("critique") or {}).get("score", 0.0),
        # Scenes without a prompt keep an empty embedding.
        "embedding": vector_by_position.get(position, []),
    }
    for position, row in enumerate(scene_prompts)
]

embedded_path = Path("output/llm_only/scene_prompts_embedded.json")
embedded_path.parent.mkdir(parents=True, exist_ok=True)
embedded_path.write_text(json.dumps(embedded_rows, indent=2, ensure_ascii=False), encoding="utf-8")

# Compact summary table: one row per scene with its score and vector length.
preview = pd.DataFrame(
    {
        "scene_id": [r["scene_id"] for r in embedded_rows],
        "critique_score": [r["critique_score"] for r in embedded_rows],
        "embedding_dim": [len(r["embedding"]) for r in embedded_rows],
    }
)

print("Embedded artifact saved:", embedded_path)
display(preview)

## 8) Ready-for-WAN checklist
If all checks pass, use `output/llm_only/scene_prompts_llm.json` or `output/llm_only/scene_prompts_embedded.json` as input planning artifacts for WAN generation.