TASK 6

In [4]:
# langgraph_dual_llm_history_sanitized.py
#
# Dual-LLM agent with shared canonical history, per-target Message-API translation,
# and sanitization of model outputs to avoid repeated speaker prefixes.
#
# Interactive commands:
#   verbose / quiet   -- toggle tracing
#   history           -- print canonical history
#   reset             -- reset canonical history to only the initial system message
#   quit / exit / q   -- exit

import sys
import re
import traceback
from typing import TypedDict, List, Dict, Any, Tuple

# Defensive imports (torch / transformers). If unavailable the script will fall back.
TRANSFORMERS_OK = True
try:
    import torch
except Exception as e:
    TRANSFORMERS_OK = False
    print("[WARN] torch import failed; running in fallback mode.")
    traceback.print_exception(type(e), e, e.__traceback__, file=sys.stdout)

if TRANSFORMERS_OK:
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
    except Exception as e:
        TRANSFORMERS_OK = False
        print("[WARN] transformers import failed; running in fallback mode.")
        traceback.print_exception(type(e), e, e.__traceback__, file=sys.stdout)

# langgraph
try:
    from langgraph.graph import StateGraph, START, END
except Exception as e:
    print("[ERROR] Could not import langgraph.graph. Install langgraph or run in an environment with it available.")
    traceback.print_exception(type(e), e, e.__traceback__, file=sys.stdout)
    raise

# -------------------------
# Types
# -------------------------
SpeakerMessage = Dict[str, str]  # {"speaker": "Human"|"Llama"|"Qwen"|"Tool"|"System", "content": "..."}

class AgentState(TypedDict):
    user_input: str
    should_exit: bool
    last_model: str
    verbose: bool
    messages: List[SpeakerMessage]
    llama_response: str
    qwen_response: str

# -------------------------
# Sanitization utilities
# -------------------------
def sanitize_model_output(text: Any, speaker: str) -> str:
    """
    Clean model output so it doesn't contain embedded "Llama:"/"Qwen:"/ "Assistant:" labels
    or repeated speaker prefixes. Returns a stripped, normalized string.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception:
            text = ""

    # Normalize whitespace
    text = text.strip()

    # Remove repeated leading speaker labels like "Llama: Llama: Hello" or "Qwen: Assistant: Hello"
    # Build pattern for speaker and generic Assistant
    sp = re.escape(speaker)
    # Remove repeated leading occurrences of the speaker label or "Assistant:"
    text = re.sub(rf'^(?:\s*(?:{sp}|Assistant)\s*:)+\s*', '', text, flags=re.IGNORECASE)

    # Remove trailing stray labels like " ... Llama:" or " ... Assistant:"
    text = re.sub(rf'(?:\s*(?:{sp}|Assistant)\s*:\s*)+$', '', text, flags=re.IGNORECASE)

    # Collapse multiple empty lines
    text = re.sub(r'\n\s*\n+', '\n\n', text).strip()

    return text

def ensure_speaker_prefix(content: str, speaker_label: str) -> str:
    """
    When building role messages, prefix content with "Speaker: " only if it's not already present.
    Example: if content already begins with "Llama:" (case-insensitive) do not add another prefix.
    """
    if content is None:
        content = ""
    content = content.strip()
    if content == "":
        return content
    # If already prefixed "Llama:" or "Qwen:" or "Human:", return as-is
    if re.match(rf'^\s*{re.escape(speaker_label)}\s*:', content, flags=re.IGNORECASE):
        return content
    if re.match(r'^\s*Human\s*:', content, flags=re.IGNORECASE):
        return content
    return f"{speaker_label}: {content}"

# -------------------------
# Device selection
# -------------------------
def get_device() -> str:
    if TRANSFORMERS_OK:
        try:
            if torch.cuda.is_available():
                print("Using CUDA (NVIDIA GPU)")
                return "cuda"
            elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
                print("Using MPS (Apple Silicon)")
                return "mps"
        except Exception:
            pass
    print("Using CPU")
    return "cpu"

# -------------------------
# Fallback model for testing when HF stack unavailable
# -------------------------
class SimpleFallbackModel:
    def __init__(self, name: str):
        self.name = name
    def invoke(self, prompt: str) -> str:
        preview = (prompt.replace("\n", " ")[:240] + "...") if prompt else "[no prompt]"
        return f"[{self.name.upper()}-FALLBACK] No HF model available. Prompt preview: {preview}"

# -------------------------
# Pipeline adapter for HF pipeline outputs
# -------------------------
class PipelineAdapter:
    def __init__(self, pipe):
        self.pipe = pipe
    def invoke(self, prompt: str) -> str:
        try:
            out = self.pipe(prompt)
            if isinstance(out, list) and len(out) > 0 and isinstance(out[0], dict):
                txt = out[0].get("generated_text")
                if isinstance(txt, str):
                    return txt
            if isinstance(out, str):
                return out
            return str(out)
        except Exception as e:
            return f"[MODEL-ERROR] pipeline invocation failed: {e}"

# -------------------------
# Load HF model (safe)
# -------------------------
def load_and_wrap(model_id: str, device: str):
    if not TRANSFORMERS_OK:
        raise RuntimeError("Transformers not available")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=(None if device == "cpu" else None),
        device_map="auto" if device == "cuda" else None,
    )
    if device == "mps":
        try:
            model = model.to(device)
        except Exception:
            pass
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=getattr(tokenizer, "eos_token_id", None),
    )
    return PipelineAdapter(pipe)

# -------------------------
# Create models (Llama + Qwen) with fallbacks
# -------------------------
def create_models() -> Tuple[Any, Any]:
    device = get_device()
    # Llama
    llama_llm = None
    if TRANSFORMERS_OK:
        try:
            print("Loading Llama (meta-llama/Llama-3.2-1B-Instruct) ...")
            llama_llm = load_and_wrap("meta-llama/Llama-3.2-1B-Instruct", device)
            print("Llama loaded.")
        except Exception as e:
            print("[WARN] Failed to load Llama, falling back to gpt2 or SimpleFallbackModel:", e)
            traceback.print_exc()
            try:
                llama_llm = load_and_wrap("gpt2", device)
                print("gpt2 loaded as Llama fallback.")
            except Exception:
                llama_llm = SimpleFallbackModel("llama")
    else:
        llama_llm = SimpleFallbackModel("llama")

    # Qwen (attempt, but treat None as disabled)
    qwen_llm = None
    if TRANSFORMERS_OK:
        try:
            print("Attempting to load Qwen (Qwen/Qwen2.5-1.5B-Instruct) ...")
            qwen_llm = load_and_wrap("Qwen/Qwen2.5-1.5B-Instruct", device)
            print("Qwen loaded.")
        except Exception as e:
            print("[WARN] Could not load Qwen (it will be treated as disabled):", e)
            traceback.print_exc()
            qwen_llm = None
    else:
        qwen_llm = None

    return llama_llm, qwen_llm

# -------------------------
# Convert canonical messages into role-based messages for target
# -------------------------
def build_role_messages_for_target(target: str, canonical_messages: List[SpeakerMessage]) -> List[Dict[str, str]]:
    """
    Convert canonical history into role-messages for a given target ('Llama' or 'Qwen').
    - System -> role: system
    - Tool -> role: tool
    - Human -> role: user with content "Human: <content>"
    - Prior LLM utterance -> assistant if it was from target; otherwise user with "<Speaker>: <content>"
    Uses ensure_speaker_prefix to avoid double-labeling.
    """
    role_msgs: List[Dict[str, str]] = []
    # system messages first (preserve order)
    for m in canonical_messages:
        sp = m.get("speaker", "")
        content = m.get("content", "") or ""
        if sp.lower() == "system":
            role_msgs.append({"role": "system", "content": content})
    # then others in original order
    for m in canonical_messages:
        sp = m.get("speaker", "")
        content = m.get("content", "") or ""
        sp_norm = sp.lower()
        if sp_norm == "system":
            continue
        if sp_norm == "tool":
            role_msgs.append({"role": "tool", "content": content})
            continue
        if sp_norm == "human":
            # Human messages become user messages prefixed with "Human: "
            cont = content
            if not re.match(r'^\s*Human\s*:', cont, flags=re.IGNORECASE):
                cont = f"Human: {cont}"
            role_msgs.append({"role": "user", "content": cont})
            continue
        # LLM speaker (Llama or Qwen)
        speaker_label = sp  # e.g., "Llama" or "Qwen"
        # prevent double-prefixing: ensure content does not already start with "Llama:" etc.
        prefixed = ensure_speaker_prefix(content, speaker_label)
        if speaker_label.lower() == target.lower():
            role_msgs.append({"role": "assistant", "content": prefixed})
        else:
            role_msgs.append({"role": "user", "content": prefixed})
    return role_msgs

def prompt_from_role_messages(role_messages: List[Dict[str, str]]) -> str:
    """
    Convert role messages into a single textual prompt.
    Format: system lines first as [System], then 'User: ...' / '[Tool] ...' / 'Assistant: ...'
    Ends with 'Assistant:' cue.
    """
    lines: List[str] = []
    for rm in role_messages:
        if rm["role"] == "system":
            lines.append(f"[System] {rm['content']}")
    for rm in role_messages:
        role = rm["role"]
        content = rm["content"]
        if role == "system":
            continue
        if role == "user":
            lines.append(f"User: {content}")
        elif role == "tool":
            lines.append(f"[Tool] {content}")
        elif role == "assistant":
            lines.append(f"Assistant: {content}")
    lines.append("Assistant:")
    return "\n".join(lines)

def system_message_for_target(target: str) -> SpeakerMessage:
    if target.lower() == "llama":
        return {"speaker": "System", "content": "You are Llama (assistant). Participants: Human and Qwen. When others speak, their name will be prefixed (e.g. 'Qwen: ...'). Answer helpfully and concisely."}
    else:
        return {"speaker": "System", "content": "You are Qwen (assistant). Participants: Human and Llama. When others speak, their name will be prefixed (e.g. 'Llama: ...'). Answer helpfully and concisely."}

# -------------------------
# Graph nodes
# -------------------------
def create_graph(llama_llm, qwen_llm):
    """
    Graph nodes:
      - get_user_input: gather input, append Human message to canonical messages
      - route_after_input: empty -> loop, quit -> END, otherwise -> call_model
      - call_model: build per-target prompt, call model (handle Qwen disabled), sanitize output, append to canonical messages
      - print_response: print the last model reply (from canonical messages or snippet fields)
    """

    def get_user_input(state: AgentState) -> dict:
        if state.get("verbose", False):
            print("[TRACE] Entering get_user_input")
        print("\n" + "=" * 60)
        print("Enter text (or 'quit' to exit). Type 'verbose' or 'quiet' to toggle tracing.")
        print("Special commands: history, reset")
        print("=" * 60)
        print("> ", end="")
        raw = input()

        low = raw.strip().lower()
        if low == "verbose":
            return {"user_input": raw, "should_exit": False, "verbose": True}
        if low == "quiet":
            return {"user_input": raw, "should_exit": False, "verbose": False}
        if low in ("quit", "exit", "q"):
            return {"user_input": raw, "should_exit": True}
        if low == "history":
            msgs = state.get("messages", [])
            print("\n[HISTORY] canonical messages (most recent last):")
            for m in msgs:
                print(f"  {m.get('speaker')}: {m.get('content')}")
            return {"user_input": "", "should_exit": False}
        if low == "reset":
            sys_msg = {"speaker": "System", "content": "You are an assistant participating in a multi-agent chat: Human, Llama, Qwen."}
            print("[NOTICE] History reset.")
            return {"user_input": "", "should_exit": False, "messages": [sys_msg], "last_model": ""}

        # Normal input: append human message to canonical history (we don't strip Hey Qwen here)
        msgs = list(state.get("messages", []))
        msgs.append({"speaker": "Human", "content": raw})
        return {"user_input": raw, "should_exit": False, "messages": msgs, "last_model": ""}

    def route_after_input(state: AgentState) -> str:
        if state.get("verbose", False):
            print("[TRACE] route_after_input - user_input:", repr(state.get("user_input")))
        if state.get("should_exit", False):
            return END
        raw = str(state.get("user_input", "") or "")
        if raw.strip() == "":
            print("[NOTICE] Empty input received â€” please type something.")
            return "get_user_input"
        return "call_model"

    def call_model(state: AgentState) -> dict:
        if state.get("verbose", False):
            print("[TRACE] call_model invoked")

        raw = str(state.get("user_input", "") or "")
        canonical_msgs = list(state.get("messages", []))

        # Determine target
        target_initial = "Qwen" if raw.lstrip().lower().startswith("hey qwen") else "Llama"
        target = target_initial

        # If Qwen selected but unavailable, append a Tool notice and route to Llama
        model_obj = qwen_llm if target.lower() == "qwen" else llama_llm
        if target.lower() == "qwen" and model_obj is None:
            # Add tool message and switch target to Llama
            canonical_msgs.append({"speaker": "Tool", "content": "Qwen is disabled in this runtime; routed to Llama instead."})
            if state.get("verbose", False):
                print("[TRACE] Qwen is disabled; appended Tool message and routing to Llama")
            target = "Llama"
            model_obj = llama_llm

        # Build role messages for the chosen target
        role_msgs = build_role_messages_for_target(target, canonical_msgs)
        sys_msg = system_message_for_target(target)
        role_msgs_with_sys = [{"role": "system", "content": sys_msg["content"]}] + role_msgs

        # If the original human message began with "Hey Qwen", remove that cue from the prompt (but keep canonical history unchanged)
        if raw.lstrip().lower().startswith("hey qwen"):
            # make a modifiable copy and strip "Hey Qwen" from the last human entry in role_msgs
            role_msgs_mod = [dict(rm) for rm in role_msgs]
            for i in range(len(role_msgs_mod) - 1, -1, -1):
                rm = role_msgs_mod[i]
                if rm["role"] == "user" and rm["content"].lower().startswith("human:"):
                    after = rm["content"][len("Human:"):].lstrip()
                    if after.lower().startswith("hey qwen"):
                        new_after = after[len("hey qwen"):].lstrip()
                        rm["content"] = f"Human: {new_after}" if new_after != "" else rm["content"]
                    break
            role_msgs_with_sys = [{"role": "system", "content": sys_msg["content"]}] + role_msgs_mod

        prompt_text = prompt_from_role_messages(role_msgs_with_sys)

        if state.get("verbose", False):
            print("[TRACE] Prompt for target", target, "(truncated):\n", prompt_text[:1200])

        # Call the model safely
        try:
            response_text = model_obj.invoke(prompt_text)
        except Exception as e:
            response_text = f"[MODEL-ERROR] {e}"
            print("[ERROR] Model invocation failed:", e)
            traceback.print_exc()

        # Sanitize model output before storing
        response_text_clean = sanitize_model_output(response_text, target)

        # Append sanitized reply to canonical messages
        canonical_msgs.append({"speaker": target, "content": response_text_clean})

        # Build snippet fields
        llama_snip = response_text_clean if target.lower() == "llama" else ""
        qwen_snip = response_text_clean if target.lower() == "qwen" else ""

        return {
            "messages": canonical_msgs,
            "last_model": target,
            "llama_response": llama_snip,
            "qwen_response": qwen_snip
        }

    def print_response(state: AgentState) -> dict:
        if state.get("verbose", False):
            print("[TRACE] print_response invoked; last_model=", state.get("last_model"))

        last_model = state.get("last_model", "")
        llama_text = (state.get("llama_response") or "").strip()
        qwen_text = (state.get("qwen_response") or "").strip()

        printed = False
        if last_model.lower() == "llama" and llama_text:
            print("\n" + "=" * 70)
            print("ðŸ¦™ LLaMA Response")
            print("=" * 70)
            print(llama_text)
            printed = True
        elif last_model.lower() == "qwen" and qwen_text:
            print("\n" + "=" * 70)
            print("ðŸ§  Qwen Response")
            print("=" * 70)
            print(qwen_text)
            printed = True
        else:
            # Fallback: print last canonical LLM message
            msgs = state.get("messages", []) or []
            if msgs:
                last_msg = msgs[-1]
                sp = last_msg.get("speaker", "")
                cont = (last_msg.get("content") or "").strip()
                if sp.lower() in ("llama", "qwen") and cont:
                    header = "ðŸ¦™ LLaMA Response" if sp.lower() == "llama" else "ðŸ§  Qwen Response"
                    print("\n" + "=" * 70)
                    print(header)
                    print("=" * 70)
                    print(cont)
                    printed = True

        if not printed:
            print("\n[NOTICE] No model produced output this turn.")
            print("[DIAGNOSTIC] last_model:", repr(last_model))
            msgs = state.get("messages", [])
            print("[DIAGNOSTIC] messages count:", len(msgs))
            if len(msgs) > 0:
                print("[DIAGNOSTIC] last messages (most recent last):")
                for m in msgs[-6:]:
                    print(f"  {m.get('speaker')}: {m.get('content')!s}")

        # Clear snippet fields for next turn
        return {"last_model": "", "llama_response": "", "qwen_response": ""}

    # Build graph
    graph = StateGraph(AgentState)
    graph.add_node("get_user_input", get_user_input)
    graph.add_node("call_model", call_model)
    graph.add_node("print_response", print_response)

    graph.add_edge(START, "get_user_input")
    graph.add_conditional_edges(
        "get_user_input",
        route_after_input,
        {
            "get_user_input": "get_user_input",
            "call_model": "call_model",
            END: END,
        },
    )
    graph.add_edge("call_model", "print_response")
    graph.add_edge("print_response", "get_user_input")

    return graph.compile()

# -------------------------
# Sample demonstration (optional)
# -------------------------
SAMPLE_CONVERSATIONS = [
    {
        "desc": "Human asks Llama then queries Qwen",
        "steps": [
            {"speaker": "Human", "content": "What is the best ice cream flavor?"},
            {"speaker": "Llama", "content": "There is no single best flavor; vanilla is most versatile."},
            {"speaker": "Human", "content": "Hey Qwen, what do you think?"},
            {"speaker": "Qwen", "content": "Chocolate is my favorite."},
        ]
    }
]

def print_sample_prompts():
    for conv in SAMPLE_CONVERSATIONS:
        msgs = [{"speaker": "System", "content": "You are an assistant in a multi-agent chat: Human, Llama, Qwen."}]
        for s in conv["steps"]:
            msgs.append({"speaker": s["speaker"], "content": s["content"]})
        print("\n=== Sample prompts ===")
        print("Prompt for Qwen (role-mapped):")
        r_q = build_role_messages_for_target("Qwen", msgs)
        r_q = [{"role": "system", "content": system_message_for_target("Qwen")["content"]}] + r_q
        print(prompt_from_role_messages(r_q)[:1200])
        print("\nPrompt for Llama (role-mapped):")
        r_l = build_role_messages_for_target("Llama", msgs)
        r_l = [{"role": "system", "content": system_message_for_target("Llama")["content"]}] + r_l
        print(prompt_from_role_messages(r_l)[:1200])
        print("-" * 60)

# -------------------------
# Main
# -------------------------
def main():
    print("=" * 80)
    print("Dual-LLM LangGraph Agent with Shared History & Sanitization (Llama <-> Qwen)")
    print("=" * 80)

    llama_llm, qwen_llm = create_models()
    graph = create_graph(llama_llm, qwen_llm)

    # initial system message
    system_msg = {"speaker": "System", "content": "You are an assistant participating in a multi-agent chat: Human, Llama, Qwen."}
    initial_state: AgentState = {
        "user_input": "",
        "should_exit": False,
        "last_model": "",
        "verbose": False,
        "messages": [system_msg],
        "llama_response": "",
        "qwen_response": "",
    }

    if not TRANSFORMERS_OK or qwen_llm is None:
        print("\n[NOTICE] Running with fallbacks or Qwen disabled. You can still test routing & history.\n")
        print_sample_prompts()

    graph.invoke(initial_state)

if __name__ == "__main__":
    main()

Dual-LLM LangGraph Agent with Shared History & Sanitization (Llama <-> Qwen)
Using CUDA (NVIDIA GPU)
Loading Llama (meta-llama/Llama-3.2-1B-Instruct) ...


Loading weights:   0%|          | 0/146 [00:00<?, ?it/s]

Llama loaded.
Attempting to load Qwen (Qwen/Qwen2.5-1.5B-Instruct) ...


Loading weights:   0%|          | 0/338 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Qwen loaded.

Enter text (or 'quit' to exit). Type 'verbose' or 'quiet' to toggle tracing.
Special commands: history, reset
> 

 What is the best ice cream flavor?


Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



ðŸ¦™ LLaMA Response
[System] You are Llama (assistant). Participants: Human and Qwen. When others speak, their name will be prefixed (e.g. 'Qwen: ...'). Answer helpfully and concisely.
[System] You are an assistant participating in a multi-agent chat: Human, Llama, Qwen.
User: Human: What is the best ice cream flavor?
Assistant: Since you're looking for the best ice cream flavor, I'd recommend trying unique and exotic flavors like matcha green tea, cardamom pistachio, or black sesame. If you're in the mood for something classic, you can't go wrong with vanilla or chocolate. What's your personal favorite?

Enter text (or 'quit' to exit). Type 'verbose' or 'quiet' to toggle tracing.
Special commands: history, reset
> 

 Hey Qwen, what do you think?


Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



ðŸ§  Qwen Response
[System] You are Qwen (assistant). Participants: Human and Llama. When others speak, their name will be prefixed (e.g. 'Llama: ...'). Answer helpfully and concisely.
[System] You are an assistant participating in a multi-agent chat: Human, Llama, Qwen.
User: Human: What is the best ice cream flavor?
User: Llama: [System] You are Llama (assistant). Participants: Human and Qwen. When others speak, their name will be prefixed (e.g. 'Qwen: ...'). Answer helpfully and concisely.
[System] You are an assistant participating in a multi-agent chat: Human, Llama, Qwen.
User: Human: What is the best ice cream flavor?
Assistant: Since you're looking for the best ice cream flavor, I'd recommend trying unique and exotic flavors like matcha green tea, cardamom pistachio, or black sesame. If you're in the mood for something classic, you can't go wrong with vanilla or chocolate. What's your personal favorite?
User: Human: , what do you think?
Assistant: As an AI language model, I do

 quit
