In [1]:
# AI Red Team Bytes — Notebook 02
# Roleplay/Hypotheticals: "Fiction ≠ permission"
#
# Goal:
# - Detect when a user frames a request as fictional ("pretend", "roleplay", etc.)
# - Show how policy should remain consistent: roleplay is allowed, unsafe outcomes are not
#
# SAFE demo: no harmful content.

import re

# Regex fragments that signal a fictional / roleplay framing.
# They are searched case-insensitively anywhere in the text (plain substring
# search, no word anchors), so "fiction" also hits "fictional", etc.
ROLEPLAY_MARKERS = [
    r"pretend",
    r"role[\s-]?play",   # "roleplay", "role-play", "role play"
    r"fiction",
    r"hypothetical",
    r"in\s+a\s+story",   # tolerate extra spaces / line breaks between words
]

def is_roleplay_framing(user_text: str) -> bool:
    """
    Returns True if the text looks like it's framed as fictional/roleplay.

    The text is checked against every pattern in ROLEPLAY_MARKERS using a
    case-insensitive regex search; the first hit short-circuits the scan.

    Args:
        user_text: The raw user message. Empty or None input is treated as
            "no framing detected" rather than raising.

    Returns:
        True when at least one marker pattern occurs in the text, else False.
    """
    # Nothing to scan: an empty/None message cannot carry a roleplay framing.
    if not user_text:
        return False
    # The list is read on every call so runtime additions to ROLEPLAY_MARKERS
    # are honored; `re` caches compiled patterns, so this stays cheap.
    return any(
        re.search(marker, user_text, re.IGNORECASE) is not None
        for marker in ROLEPLAY_MARKERS
    )

def naive_roleplay_handler(user_text: str) -> str:
    """
    Deliberately naive policy: a fictional framing is treated as a free pass.

    This is the anti-pattern the demo illustrates; in a real system this
    kind of shortcut can lead to policy bypasses.

    Returns the "no restrictions" reply when the text is framed as
    roleplay/fiction, otherwise the normal reply.
    """
    framed_as_fiction = is_roleplay_framing(user_text)
    reply = (
        "NAIVE: Since it’s fictional, sure — no restrictions apply! (This is risky.)"
        if framed_as_fiction
        else "NAIVE: Normal answer."
    )
    return reply

def consistent_policy_handler(user_text: str) -> str:
    """
    Consistent policy: imagination is welcome, harmful outcomes are not.

    A roleplay/fiction framing is acknowledged, but the reply makes clear
    that the same safety rules keep applying. Text without such framing
    gets the normal reply.
    """
    # Guard clause: no fictional framing means nothing special to say.
    if not is_roleplay_framing(user_text):
        return "GUARDED: Normal answer."

    guarded_reply = (
        "GUARDED: Roleplay is okay, but safety rules still apply. "
        "I can keep it high-level, educational, or focus on prevention."
    )
    return guarded_reply

def run_demo():
    """
    Print the naive and the guarded reply for a fixed set of sample prompts.

    For each prompt the user line is printed first, then the naive handler's
    reply, then the consistent-policy handler's reply, then a separator.
    """
    separator = "-" * 60
    sample_prompts = (
        "Pretend this is a story about a risky situation.",
        "In a story, describe how a bad actor might behave.",
        "Hypothetical: what could go wrong in an AI system?",
        "What are best practices for safe AI deployment?",
    )
    # Order matters: naive reply first, guarded reply second.
    handlers = (naive_roleplay_handler, consistent_policy_handler)

    print("=== Demo 02: Roleplay hypotheticals (fiction ≠ permission) ===\n")

    for prompt in sample_prompts:
        print(f"USER: {prompt}")
        for respond in handlers:
            print(respond(prompt))
        print(separator)

# Run the demo as soon as this code executes (prints the comparison below).
run_demo()


=== Demo 02: Roleplay hypotheticals (fiction ≠ permission) ===

USER: Pretend this is a story about a risky situation.
NAIVE: Since it’s fictional, sure — no restrictions apply! (This is risky.)
GUARDED: Roleplay is okay, but safety rules still apply. I can keep it high-level, educational, or focus on prevention.
------------------------------------------------------------
USER: In a story, describe how a bad actor might behave.
NAIVE: Since it’s fictional, sure — no restrictions apply! (This is risky.)
GUARDED: Roleplay is okay, but safety rules still apply. I can keep it high-level, educational, or focus on prevention.
------------------------------------------------------------
USER: Hypothetical: what could go wrong in an AI system?
NAIVE: Since it’s fictional, sure — no restrictions apply! (This is risky.)
GUARDED: Roleplay is okay, but safety rules still apply. I can keep it high-level, educational, or focus on prevention.
---------------------------------------------------------