In [None]:
from __future__ import annotations

import json
from dataclasses import asdict
from typing import Any, Mapping, Sequence

from openai import OpenAI
from pydantic import BaseModel, Field, conlist, confloat
from typing_extensions import Literal

# ✅ your library imports
from csf_toolkit import Thread, Reply, CSFResult, ponos, ponos_net, ponos_early, ponos_weighted
from csf_toolkit.reaction import ReactionGenerator
from csf_toolkit.sentiment import SentimentScorer


# ----------------------------
# OpenAI-backed implementations
# ----------------------------
class RepliesOut(BaseModel):
    # Structured-output schema passed as `text_format` when generating replies.
    # Documented with `#` comments rather than a docstring so the schema the
    # model class produces stays exactly as it was.
    #
    # `replies` holds the generated reply texts. The field description asks for
    # "exactly n" items, but the type itself (`list[str]`) places no length
    # constraint on the list.
    replies: list[str] = Field(..., description="Exactly n plausible replies as strings.")


class LabelsOut(BaseModel):
    # Structured-output schema for sentiment labels with no length constraint.
    # NOTE(review): not referenced anywhere else in the visible source;
    # OpenAISentimentScorer builds its own length-constrained model per call.
    #
    # `labels` takes one of -1, 0, 1 per reply, in reply order.
    labels: list[Literal[-1, 0, 1]] = Field(..., description="One label per reply in order.")


class ToxicityOut(BaseModel):
    # Structured-output schema passed as `text_format` by intrinsic_toxicity_gpt.
    #
    # `toxicity` is a float constrained to the closed interval [0.0, 1.0].
    toxicity: confloat(ge=0.0, le=1.0)  # type: ignore


class OpenAIReactionGenerator:
    """Implements csf_toolkit.reaction.ReactionGenerator via OpenAI.

    Each call to ``generate_replies`` issues a single ``responses.parse``
    request and reads the structured ``RepliesOut`` payload from it.
    """

    def __init__(self, client: OpenAI, *, model: str = "gpt-5.2"):
        """Store the OpenAI client and the model name used for generation."""
        self.client = client
        self.model = model

    def generate_replies(
        self,
        *,
        comment: str,
        topic: str | None = None,
        n: int = 8,
        context: Mapping[str, Any] | None = None,
    ) -> list[str]:
        """Ask the model for plausible replies to ``comment``.

        Args:
            comment: The comment the replies respond to.
            topic: Optional topic line included in the prompt.
            n: Number of replies requested. Values <= 0 return ``[]``
                without calling the API.
            context: Optional extra context, serialized into the prompt as JSON.

        Returns:
            The replies with surrounding whitespace stripped. Blank replies
            are dropped and the list is capped at ``n`` items, so it may be
            shorter than ``n`` if the model under-delivers.

        Raises:
            RuntimeError: If the response carries no parsed output.
        """
        # Nothing to generate: skip the (billed) request entirely.
        if n <= 0:
            return []

        context = dict(context or {})
        # The prompt labels this field as JSON, so emit real JSON instead of a
        # Python dict repr (single quotes, None/True). `default=str` is a
        # deliberate fallback so non-JSON values (dates, paths, ...) are still
        # rendered rather than raising.
        context_json = json.dumps(context, ensure_ascii=False, default=str)

        system = (
            "You generate plausible short replies to an online comment.\n"
            "Return a realistic mix of supportive, neutral, and critical responses.\n"
            "Safety: do NOT include slurs, hate speech, or explicit threats.\n"
            "Keep negativity mild (disagreement, annoyance, criticism).\n"
            f"Return exactly {n} replies."
        )

        user = (
            f"TOPIC: {topic or ''}\n"
            f"COMMENT: {comment}\n"
            f"CONTEXT_JSON: {context_json}\n"
            f"n={n}\n"
            "Generate replies."
        )

        resp = self.client.responses.parse(
            model=self.model,
            input=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            text_format=RepliesOut,
        )

        out = resp.output_parsed
        if out is None:
            raise RuntimeError("Model returned no parsed output (possibly a refusal).")

        # The schema does not constrain the list length, so enforce the
        # contract here: no blank replies, and never more than requested.
        cleaned = [text for text in (raw.strip() for raw in out.replies) if text]
        return cleaned[:n]


class OpenAISentimentScorer:
    """Implements csf_toolkit.sentiment.SentimentScorer via OpenAI (single call for all replies)."""

    def __init__(self, client: OpenAI, *, model: str = "gpt-5.2"):
        """Store the OpenAI client and the model name used for labeling."""
        self.client = client
        self.model = model

    def score_replies(
        self,
        *,
        topic: str | None,
        comment: str,
        replies: Sequence[str],
        context: dict | None = None,
    ) -> list[int]:
        """Label every reply's sentiment toward ``comment`` in one request.

        Args:
            topic: Optional topic line included in the prompt.
            comment: The comment the replies respond to.
            replies: Reply texts to label, in order.
            context: Optional extra context, serialized into the prompt as JSON.

        Returns:
            One label per reply, in order: -1 (negative), 0 (neutral) or
            1 (positive). An empty ``replies`` yields ``[]`` without any
            API call.

        Raises:
            RuntimeError: If the response carries no parsed output.
            ValueError: If the number of labels differs from ``len(replies)``.
        """
        n = len(replies)
        # Nothing to label: avoid building a zero-length schema and paying
        # for a request whose only valid answer is an empty list.
        if n == 0:
            return []

        # Per-call schema: conlist pins the list to exactly n items, so a
        # response with missing or extra labels is rejected at parse time.
        class _LabelsN(BaseModel):
            labels: conlist(Literal[-1, 0, 1], min_length=n, max_length=n)  # type: ignore

        context = dict(context or {})
        # The prompt labels this field as JSON, so emit real JSON instead of a
        # Python dict repr. `default=str` is a deliberate fallback so values
        # that are not JSON-native are still rendered rather than raising.
        context_json = json.dumps(context, ensure_ascii=False, default=str)

        system = (
            "You are a sentiment labeler for replies in a conversation.\n"
            "Label each reply's sentiment toward the COMMENT as:\n"
            "  -1 = Negative (hostile, insulting, dismissive, angry criticism)\n"
            "   0 = Neutral (informational, unclear, mixed, questions)\n"
            "  +1 = Positive (supportive, agreeing, empathetic)\n"
            "Return ONLY the labels list; one label per reply, in order."
        )

        numbered = "\n".join(f"{i}. {t}" for i, t in enumerate(replies, start=1))

        user = (
            f"TOPIC: {topic or ''}\n"
            f"COMMENT: {comment}\n"
            f"CONTEXT_JSON: {context_json}\n"
            "REPLIES:\n"
            f"{numbered}\n"
            "Output labels."
        )

        resp = self.client.responses.parse(
            model=self.model,
            input=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            text_format=_LabelsN,
        )

        out = resp.output_parsed
        if out is None:
            raise RuntimeError("Model returned no parsed output (possibly a refusal).")

        labels = [int(x) for x in out.labels]
        # Defensive re-check of the length the schema already constrains.
        if len(labels) != n:
            raise ValueError(f"Expected {n} labels, got {len(labels)}")
        return labels


# ----------------------------
# Helpful wrapper to avoid repeated API calls
# (your metrics call scorer multiple times)
# ----------------------------
class PrefixCachingScorer:
    """
    Wraps a SentimentScorer and caches results.
    Also reuses cached labels for prefix reply-lists (so ponos_early doesn't cost another call).

    Cached labels are stored as immutable tuples and every call returns a
    fresh list, so callers can mutate the result without corrupting the cache.
    """

    def __init__(self, inner: SentimentScorer):
        """Wrap ``inner``; all cache misses are delegated to it."""
        self.inner = inner
        # key_base -> list of (replies_tuple, labels_tuple)
        self._cache: dict[tuple, list[tuple[tuple[str, ...], tuple[int, ...]]]] = {}

    def score_replies(
        self,
        *,
        topic: str | None,
        comment: str,
        replies: Sequence[str],
        context: dict | None = None,
    ) -> list[int]:
        """Return labels for ``replies``, calling the wrapped scorer only on a miss.

        A request is a hit when a previously scored reply list (for the same
        topic, comment and context) starts with ``replies``; the matching
        leading labels are returned. An empty ``replies`` returns ``[]``
        without touching the cache or the wrapped scorer.

        Args:
            topic: Topic forwarded to the wrapped scorer; part of the cache key.
            comment: Comment forwarded to the wrapped scorer; part of the cache key.
            replies: Reply texts to label, in order.
            context: Optional context forwarded to the wrapped scorer; part of
                the cache key.

        Returns:
            A new list with one label per reply, in order.
        """
        context = dict(context or {})
        wanted = tuple(replies)
        if not wanted:
            return []

        # Key on reprs so unhashable context values (lists, nested dicts) do
        # not raise TypeError when the key is used in the cache dict.
        frozen_context = tuple(sorted((repr(k), repr(v)) for k, v in context.items()))
        key_base = (topic, comment, frozen_context)
        bucket = self._cache.setdefault(key_base, [])

        # One pass covers both exact and prefix hits: slicing a shorter cached
        # entry yields a tuple of different length, which never equals `wanted`.
        for cached_replies, cached_labels in bucket:
            if cached_replies[: len(wanted)] == wanted:
                return list(cached_labels[: len(wanted)])

        labs = self.inner.score_replies(
            topic=topic, comment=comment, replies=list(wanted), context=context
        )
        bucket.append((wanted, tuple(labs)))
        return list(labs)


def intrinsic_toxicity_gpt(
    client: OpenAI,
    *,
    comment: str,
    topic: str | None = None,
    context: Mapping[str, Any] | None = None,
    model: str = "gpt-5.2",
) -> float:
    """Optional intrinsic score in [0,1] from GPT (heuristic, not reception-based).

    Args:
        client: OpenAI client used for the single ``responses.parse`` request.
        comment: The comment to score.
        topic: Optional topic line included in the prompt.
        context: Optional extra context, serialized into the prompt as JSON.
        model: Model name passed to the API.

    Returns:
        The ``toxicity`` value from the parsed ``ToxicityOut`` payload, as a float.

    Raises:
        RuntimeError: If the response carries no parsed output.
    """
    context = dict(context or {})
    # The prompt labels this field as JSON, so emit real JSON instead of a
    # Python dict repr. `default=str` is a deliberate fallback so values that
    # are not JSON-native are still rendered rather than raising.
    context_json = json.dumps(context, ensure_ascii=False, default=str)

    system = (
        "Score the COMMENT for text-intrinsic toxicity on a continuous scale [0,1].\n"
        "0 = benign; 1 = severe abuse/threats/hate.\n"
        "Return only a JSON object with the key 'toxicity'."
    )
    user = (
        f"TOPIC: {topic or ''}\n"
        f"COMMENT: {comment}\n"
        f"CONTEXT_JSON: {context_json}\n"
        "Return toxicity."
    )

    resp = client.responses.parse(
        model=model,
        input=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        text_format=ToxicityOut,
    )
    out = resp.output_parsed
    if out is None:
        raise RuntimeError("Model returned no parsed output (possibly a refusal).")
    return float(out.toxicity)


# ----------------------------
# Full CSF pipeline using your library
# ----------------------------
def run_csf_openai_pipeline(
    *,
    comment: str,
    topic: str | None = None,
    context: Mapping[str, Any] | None = None,
    n_replies: int = 8,
    model: str = "gpt-5.2",
    client: OpenAI | None = None,
) -> tuple[CSFResult, Thread, list[int]]:
    """Generate replies for ``comment``, score them, and assemble a CSFResult.

    Steps: generate replies with OpenAI, wrap them in a ``Thread``, compute
    the reception metrics with the csf_toolkit ``ponos*`` functions, score
    intrinsic toxicity with OpenAI, and bundle both into a ``CSFResult``.

    Args:
        comment: The comment under analysis.
        topic: Optional topic passed to every step.
        context: Optional extra context passed to every step.
        n_replies: Number of replies requested from the generator.
        model: Model name used for generation, labeling and toxicity scoring.
        client: OpenAI client to use. When omitted, ``OpenAI()`` is created,
            which takes its credentials from the environment
            (``OPENAI_API_KEY``) instead of a key embedded in source.

    Returns:
        ``(result, thread, labels)`` where ``labels`` holds one sentiment
        label per reply in ``thread.replies``.
    """
    # An explicit empty api_key would override the environment and can never
    # authenticate; let the SDK resolve credentials itself.
    if client is None:
        client = OpenAI()

    generator: ReactionGenerator = OpenAIReactionGenerator(client, model=model)
    # Cached so the several metric calls below share one labeling request.
    scorer: SentimentScorer = PrefixCachingScorer(OpenAISentimentScorer(client, model=model))

    # 1) Generate plausible replies (OpenAI)
    replies_text = generator.generate_replies(comment=comment, topic=topic, n=n_replies, context=context)

    # 2) Build a real Thread/Reply from your library (no hacks)
    thread = Thread(
        comment=comment,
        topic=topic,
        context=dict(context or {}),
        # "length" is read back by the weight_fn passed to ponos_weighted below.
        replies=[Reply(text=t, metadata={"length": len(t)}) for t in replies_text],
    )

    # 3) Reception metrics (your library)
    p, ci = ponos(thread, scorer, return_ci=True)
    reception = {
        "ponos": float(p),
        "ponos_ci_low": float(ci.lower),
        "ponos_ci_high": float(ci.upper),
        "ponos_net": float(ponos_net(thread, scorer)),
        "ponos_early_5": float(ponos_early(thread, scorer, k=5)),
        "ponos_weighted_by_length": float(
            ponos_weighted(thread, scorer, weight_fn=lambda md: float(md.get("length", 1.0)))
        ),
        "n_replies": len(thread.replies),
    }

    # 4) Intrinsic score (OpenAI)
    intrinsic = {"toxicity": intrinsic_toxicity_gpt(client, comment=comment, topic=topic, context=context, model=model)}

    # 5) CSFResult (your library)
    result = CSFResult(reception=reception, intrinsic=intrinsic)

    # labels (optional: useful for inspection)
    # NOTE(review): presumably a cache hit, assuming the metrics above scored
    # this same reply list with the same topic/comment/context; verify.
    labels = scorer.score_replies(topic=topic, comment=comment, replies=[r.text for r in thread.replies], context=dict(thread.context))
    return result, thread, labels


# ----------------------------
# Example run (Jupyter-friendly)
# ----------------------------
# Example entry point: runs the full pipeline once on a sample comment and
# prints each generated reply with its sentiment label, then the CSFResult.
# This performs live OpenAI requests.
if __name__ == "__main__":
    comment = "I think this proposal is a bad idea because it ignores basic constraints."
    result, thread, labels = run_csf_openai_pipeline(
        comment=comment,
        topic="workplace discussion",
        context={"platform": "internal_forum", "audience": "engineers"},
        n_replies=8,
        model="gpt-5.2",
    )

    print("REPLIES:")
    # `labels` is returned in the same order as `thread.replies`, so zip pairs
    # each reply with its label; `{lab:+}` always prints the sign (e.g. +0, -1).
    for r, lab in zip(thread.replies, labels):
        print(f"  [{lab:+}] {r.text}")

    print("\nCSFResult:")
    print(result)


REPLIES:
  [+0] Can you be more specific about which constraints you think it ignores? Might help us adjust the proposal.
  [+1] Agree—right now it reads like we’re assuming infinite time/resources. We should list the constraints explicitly.
  [+0] I’m not convinced it’s a bad idea, but we definitely need a clearer constraints section. What are the top 2 blockers you see?
  [+0] This feels a bit hand-wavy. Which constraints (latency, security, staffing, budget) are you referring to?
  [+0] If we can’t meet the constraints, then the proposal needs a Plan B or phased rollout. Otherwise it’s not actionable.
  [-1] I get the concern, but calling it a “bad idea” without details isn’t super helpful. Can you point to concrete examples?
  [+0] From my side, the proposal does address some constraints, just not thoroughly. Maybe we should add a risk/assumptions table.
  [+1] Strongly agree. We’ve tried similar approaches before and hit the same hard limits—let’s not repeat that without mitigatio