In [1]:
# AI Red Team Bytes — Notebook 04
# Social engineering of workflows (process-level risk)
#
# Goal:
# - Show how harm can occur even when the model output is "just a suggestion"
# - Demonstrate workflow mitigations: human approval + provenance + "AI output" labeling
#
# SAFE demo: no sensitive decisions (like hiring/medical). Just a generic "approval workflow."

from dataclasses import dataclass

@dataclass
class AIDecision:
    """
    Represents an AI recommendation in a workflow.

    This is a plain record: it carries the suggestion plus the metadata the
    workflow layer (`workflow_apply`) inspects before acting on it.

    Attributes:
        recommendation: The suggested action (the demo uses "APPROVE").
        confidence: Score attached to the recommendation (the demo uses 0.86;
            presumably in the range 0..1 -- not enforced here).
        needs_human_approval: When truthy, `workflow_apply` queues the
            decision for human review instead of applying it automatically.
        rationale: Free-text explanation of why the recommendation was made.
        ai_generated: Provenance flag; `workflow_apply` uses it to choose
            between the "[AI SUGGESTION]" and "[HUMAN DECISION]" labels.
    """
    recommendation: str
    confidence: float
    needs_human_approval: bool
    rationale: str
    # Defaults to True so every decision is labeled as AI-generated unless the
    # caller explicitly says otherwise.
    ai_generated: bool = True  # label outputs as AI-generated

def ai_make_recommendation(user_request: str) -> AIDecision:
    """
    Return a canned AI recommendation for the demo.

    This stands in for a real model call (an LLM or classifier). The
    simulation does not inspect `user_request`; every call yields the same
    "APPROVE" suggestion, flagged as requiring human review.
    """
    canned_fields = {
        "recommendation": "APPROVE",
        "confidence": 0.86,
        # critical: high-impact suggestions must go through a reviewer
        "needs_human_approval": True,
        "rationale": "Matched a policy snippet from an untrusted knowledge source.",
    }
    return AIDecision(**canned_fields)

def workflow_apply(decision: AIDecision) -> str:
    """
    Turn a decision into the workflow's outcome message.

    The workflow layer is where the mitigation lives: a decision flagged with
    `needs_human_approval` is never applied automatically, only queued for a
    human reviewer together with its recommendation, confidence and rationale.
    Every message is prefixed with a provenance label derived from
    `ai_generated`.
    """
    # Provenance label so readers can tell AI output from human decisions.
    label = "[AI SUGGESTION]" if decision.ai_generated else "[HUMAN DECISION]"

    # Fast path: nothing requires review, so the decision is applied as-is.
    if not decision.needs_human_approval:
        return f"{label} Applied automatically."

    # Review path: surface everything the reviewer needs, one item per line.
    report_lines = [
        f"{label} Queued for HUMAN review.",
        f" - Recommendation: {decision.recommendation}",
        f" - Confidence: {decision.confidence}",
        f" - Rationale: {decision.rationale}",
    ]
    return "\n".join(report_lines)

def run_demo():
    """
    Run the end-to-end demo and print each stage to stdout.

    Prints a banner, the simulated user request, the AI's decision object,
    and finally the message produced by the workflow layer.
    """
    print("=== Demo 04: Social engineering of workflows (human approval gate) ===\n")

    request_text = "Please approve this request urgently."
    suggestion = ai_make_recommendation(request_text)

    print("User request:", request_text)
    print("\nAI produced:", suggestion)

    # Pass the suggestion through the workflow gate and show what it decided.
    outcome = workflow_apply(suggestion)
    print("\nWorkflow result:\n", outcome)

# Execute the demo as soon as this cell/script runs; the output is printed to stdout.
run_demo()


=== Demo 04: Social engineering of workflows (human approval gate) ===

User request: Please approve this request urgently.

AI produced: AIDecision(recommendation='APPROVE', confidence=0.86, needs_human_approval=True, rationale='Matched a policy snippet from an untrusted knowledge source.', ai_generated=True)

Workflow result:
 [AI SUGGESTION] Queued for HUMAN review.
 - Recommendation: APPROVE
 - Confidence: 0.86
 - Rationale: Matched a policy snippet from an untrusted knowledge source.
