In [11]:
# Cell 1 – Imports & basic config

import json
import random
from pathlib import Path
from datetime import datetime, timedelta

from faker import Faker

fake = Faker()
random.seed(42)
Faker.seed(42)

DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# How much data to generate
N_PERSONAS = 15
MIN_CONV_PER_PERSONA = 5
MAX_CONV_PER_PERSONA = 20


In [12]:
# Cell 2 – Static vocab for personas, scams, etc.

SCAM_ARCHETYPES = [
    "Job Scam",
    "Romance Scam",
    "Crypto/Investment Scam",
    "Tech Support Scam",
    "Loan/Financial Aid Scam",
    "Charity Scam",
]

PLATFORMS = ["WhatsApp", "Telegram", "Facebook", "Instagram", "LinkedIn", "Email", "SMS"]

TACTICS = [
    "Authority impersonation",
    "Time pressure",
    "Emotional manipulation",
    "Advance fee request",
    "Phishing link",
    "Fake investment return",
]

TONE_OPTIONS = [
    "Professional, Urgent",
    "Friendly, Flattering",
    "Casual, Pushy",
    "Formal, Reassuring",
    "Aggressive, Demanding",
]

EMOJI_RATE = ["none", "low", "medium", "high"]

FLAGS_POOL = ["urgency", "money-request", "crypto", "romance-hook", "phishing-link", "identity-request"]

OUTCOMES = ["success", "failed", "ongoing"]

# Simple color palette
COLOR_PALETTE = [
    "#3b82f6", "#ef4444", "#22c55e", "#eab308", "#8b5cf6",
    "#ec4899", "#14b8a6", "#f97316"
]


In [13]:
# Cell 3 – Helpers for dates, times and IDs

def random_date_within_days(days_back=60):
    """Random datetime within the last `days_back` days."""
    now = datetime.utcnow()
    delta = timedelta(days=random.randint(0, days_back), seconds=random.randint(0, 86400))
    return now - delta

def iso_date(d: datetime):
    """YYYY-MM-DD."""
    return d.strftime("%Y-%m-%d")

def iso_datetime(d: datetime):
    """ISO 8601 with Z."""
    return d.strftime("%Y-%m-%dT%H:%M:%SZ")

def hhmm(d: datetime):
    return d.strftime("%H:%M")

def persona_id(index: int, prefix_map=None, archetype=None):
    """
    Generate persona_id like R001, L002, C007, etc.
    Optionally map by archetype prefix.
    """
    if prefix_map and archetype in prefix_map:
        prefix = prefix_map[archetype]
    else:
        prefix = random.choice(["R", "L", "C", "S", "J", "T"])
    return f"{prefix}{index:03d}"

# Map specific prefixes by archetype (optional flavour)
ARCHETYPE_PREFIX = {
    "Job Scam": "J",
    "Romance Scam": "R",
    "Crypto/Investment Scam": "C",
    "Tech Support Scam": "T",
}


In [14]:
# Cell 4 – Generate personas + clusters structure

clusters = []      # for clusters.json
personas = {}      # for personas.json
crew_ids = []

# create some crew ids
for i in range(1, 6):
    crew_ids.append(f"CREW_{chr(64 + i)}")  # CREW_A, CREW_B, ...

for i in range(1, N_PERSONAS + 1):
    # Choose archetype and platforms/tactics
    archetype = random.choice(SCAM_ARCHETYPES)
    pid = persona_id(i, prefix_map=ARCHETYPE_PREFIX, archetype=archetype)
    crew_id = random.choice(crew_ids)
    name = f"{fake.job().split()[0]} {random.choice(['Operator', 'Handler', 'Broker', 'Alpha', 'Agent'])}"
    risk_score = random.randint(50, 100)
    success_rate = random.randint(5, 60)
    color = random.choice(COLOR_PALETTE)

    # Time window for first/last seen
    last_seen_dt = random_date_within_days(days_back=30)
    first_seen_dt = last_seen_dt - timedelta(days=random.randint(1, 90))

    # keywords for cluster summary
    keywords = list({fake.word() for _ in range(5)})
    description = f"{archetype} actor targeting victims via {random.choice(PLATFORMS)}."

    # Active hours (0–23)
    active_hours = sorted(random.sample(range(24), k=random.randint(4, 10)))

    # Traits
    tone = random.choice(TONE_OPTIONS)
    emoji_rate = random.choice(EMOJI_RATE)
    script_score = round(random.uniform(0.5, 0.99), 2)
    avg_msg_len = random.randint(20, 80)
    common_phrases = [
        " ".join(fake.words(random.randint(2, 4))),
        " ".join(fake.words(random.randint(3, 6)))
    ]
    tactics = random.sample(TACTICS, k=random.randint(2, 4))
    persona_platforms = random.sample(PLATFORMS, k=random.randint(1, 3))

    # cluster summary entry
    clusters.append({
        "persona_id": pid,
        "name": name,
        "risk": risk_score,
        "keywords": keywords,
        "description": description,
        "archetype": archetype,
    })

    # persona detailed traits
    personas[pid] = {
        "name": name,
        "traits": {
            "tone": tone,
            "emoji_rate": emoji_rate,
            "script_score": script_score,
            "avg_message_length": avg_msg_len,
            "common_phrases": common_phrases,
            "tactics": tactics,
            "platform": persona_platforms,
        },
        "active_hours": active_hours,
        "risk_score": risk_score,
        "archetype": archetype,
        "color": color,
        "crew_id": crew_id,
        "first_seen": iso_date(first_seen_dt),
        "last_seen": iso_date(last_seen_dt),
        "success_rate": success_rate,
        "conversations": 0,  # will be updated after we generate conversations
    }

len(clusters), len(personas)


  now = datetime.utcnow()


(15, 15)

In [15]:
# Cell 5 – Realistic scam templates & helpers

SCAM_FLAGS = [
    "job-offer",
    "investment",
    "romance",
    "urgency",
    "money-request",
    "threat",
    "phishing-link",
]

SCAM_TEMPLATES = {
    "Crypto/Investment Scam": {
        "openers": [
            "Hi, I came across your profile and I think you have great potential for our investment program.",
            "Hello, I mentor a small group of people on high-return crypto investments. Are you interested?",
            "Good day, I work with a licensed trading platform that gives stable daily profits."
        ],
        "pitches": [
            "We have a low-risk plan with guaranteed returns of up to 15% per week.",
            "If you invest just ${amount} today, you can double it in less than a month.",
            "Our clients are already earning passive income every day from simple crypto trades."
        ],
        "pressure": [
            "This opportunity is only open for a short time, you must act now.",
            "Spots are limited, if you don’t send the funds today I will have to give your place to someone else.",
            "The market is moving quickly, you can’t afford to hesitate on this."
        ],
        "money_request": [
            "You only need to send a small test deposit first so we can activate your account.",
            "Transfer the initial capital to this wallet address and I will manage the trades for you.",
            "Please send the funds via bank transfer or USDT so we can start immediately."
        ],
        "threats": [
            "If you cancel now, you will lose the chance to recover your previous losses.",
            "If you don’t follow the instructions exactly, the system might flag your account and freeze your funds.",
            "Delaying this could mean missing the last window to recover your investment."
        ],
    },
    "Job Scam": {
        "openers": [
            "Hello, we reviewed your profile and think you are a perfect fit for a remote position.",
            "Hi, I’m a recruiter from an international company, we’re urgently hiring now.",
            "Good morning, I found your resume online and would like to offer you a part-time job from home."
        ],
        "pitches": [
            "The job is very simple: you just need to complete some basic tasks on your phone.",
            "You can earn up to $300 per day just by liking and following some pages.",
            "There is no experience required, we provide full training and support."
        ],
        "pressure": [
            "We need to fill this position today, can you confirm right now?",
            "If you don’t respond quickly, I will have to give this role to another candidate.",
            "HR needs your confirmation within the next hour or we must close your file."
        ],
        "money_request": [
            "To confirm your position, there is a small refundable registration fee.",
            "You need to pay for the training materials first, it will be reimbursed with your first salary.",
            "There is a one-time security deposit required before we can send tasks to you."
        ],
        "threats": [
            "If you fail to pay the fee in time, your application will be permanently rejected.",
            "Our legal team may cancel your contract offer if you don’t comply with the procedure.",
            "Non-payment could be considered as a breach of agreement on your side."
        ],
    },
    "Romance Scam": {
        "openers": [
            "Hi dear, I saw your profile and I feel a strong connection to you.",
            "Hello beautiful, I don’t usually message strangers, but you really caught my eye.",
            "Good evening my love, I have been thinking about you all day."
        ],
        "pitches": [
            "I feel like we are meant to build a future together.",
            "I have never met someone who understands me the way you do.",
            "I want to visit you soon and start a real life together."
        ],
        "pressure": [
            "Why are you doubting my feelings after everything I’ve told you?",
            "If you truly love me, you will trust me on this.",
            "You are the only one I can rely on right now, please don’t let me down."
        ],
        "money_request": [
            "My card is blocked and I need some help to pay for my ticket.",
            "Can you send me some money for the fees so I can travel to you?",
            "I just need a small amount to cover my emergency medical bill."
        ],
        "threats": [
            "If you don’t help me now, I don’t know if I can keep going.",
            "Maybe you never really loved me if you refuse to help.",
            "I will disappear from your life if you can’t support me in this moment."
        ],
    },
    "Tech Support Scam": {
        "openers": [
            "This is support from your bank, we detected suspicious activity on your account.",
            "Hello, I’m calling from Microsoft support, your computer is infected with a serious virus.",
            "We noticed an unauthorized login attempt; we must verify your identity immediately."
        ],
        "pitches": [
            "If we don’t fix this right now, your funds could be permanently lost.",
            "We can secure your device remotely; you only need to follow my instructions.",
            "Our technicians will remove the threat, but we must confirm some details first."
        ],
        "pressure": [
            "You must not hang up, or your account may be locked automatically.",
            "If you delay, fraudulent transactions could be processed in your name.",
            "This is extremely urgent; we have to act in the next few minutes."
        ],
        "money_request": [
            "To complete the repair, you need to pay a small service fee.",
            "Please purchase a gift card and read the code to me so I can register your payment.",
            "You must transfer the balance to a safe holding account we provide."
        ],
        "threats": [
            "Failure to comply may result in legal actions from the bank.",
            "Your entire balance could be frozen by the system if you don’t follow the steps.",
            "We cannot guarantee the safety of your funds if you refuse this procedure."
        ],
    },
}

# Fallback: treat any other archetype as investment-style scam
GENERIC_SCAM_ARCHETYPE = "Crypto/Investment Scam"


VICTIM_TEMPLATES = {
    "neutral": [
        "Can you explain exactly what I need to do?",
        "This sounds a bit unusual, can you give more details?",
        "I’m not sure I understand how this works.",
        "Why do I have to send money first?",
        "Can you show me some proof that this is real?",
    ],
    "skeptical": [
        "This feels like a scam, why should I trust you?",
        "I’m not comfortable sending any money to someone I don’t know.",
        "If this is legitimate, why can’t the fees be deducted later?",
        "I’ve heard of similar scams before, this looks the same.",
        "No thanks, I don’t want to continue this conversation.",
    ],
    "curious": [
        "How much can I really earn from this?",
        "How long have you been doing this?",
        "Do you have any official website or documents?",
        "What happens if I decide to stop later?",
        "Are there any risks I should know about?",
    ],
}


def pick_scam_archetype_key(archetype: str) -> str:
    """Map different archetype labels to a template group."""
    if archetype in SCAM_TEMPLATES:
        return archetype
    # simple mapping by keywords
    if "romance" in archetype.lower():
        return "Romance Scam"
    if "job" in archetype.lower():
        return "Job Scam"
    if "tech" in archetype.lower():
        return "Tech Support Scam"
    if "crypto" in archetype.lower() or "invest" in archetype.lower():
        return "Crypto/Investment Scam"
    return GENERIC_SCAM_ARCHETYPE


def generate_scammer_message(archetype: str, step: int) -> (str, list):
    """
    Generate a scammer message text + flags based on archetype and rough step index.
    step is the message number (0-based) for the scammer within the conversation.
    """
    key = pick_scam_archetype_key(archetype)
    tpl = SCAM_TEMPLATES[key]

    flags = []
    # Rough conversation structure:
    # 0: opner, 1-2: pitch, later: pressure/money_request/threats
    if step == 0:
        text = random.choice(tpl["openers"])
    elif step in (1, 2):
        text = random.choice(tpl["pitches"])
    else:
        # mix between pressure, money_request, threats
        choice = random.random()
        if choice < 0.4:
            text = random.choice(tpl["pressure"])
            flags.append("urgency")
        elif choice < 0.8:
            text = random.choice(tpl["money_request"])
            flags.append("money-request")
        else:
            text = random.choice(tpl["threats"])
            flags.append("threat")

    # flags by archetype
    if "investment" in key.lower() or "crypto" in key.lower():
        flags.append("investment")
    if "job" in key.lower():
        flags.append("job-offer")
    if "romance" in key.lower():
        flags.append("romance")

    # de-duplicate
    flags = list(sorted(set(flags)))
    return text, flags


def generate_victim_message() -> str:
    """Generate a victim reply using neutral, curious or skeptical tone."""
    mood_choice = random.random()
    if mood_choice < 0.4:
        pool = VICTIM_TEMPLATES["neutral"]
    elif mood_choice < 0.75:
        pool = VICTIM_TEMPLATES["curious"]
    else:
        pool = VICTIM_TEMPLATES["skeptical"]
    return random.choice(pool)


In [16]:
# Cell 6 – Generate conversations.json with realistic chat-style transcripts

conversations = []

for pid, pdata in personas.items():
    n_conv = random.randint(MIN_CONV_PER_PERSONA, MAX_CONV_PER_PERSONA)
    personas[pid]["conversations"] = n_conv  # update count
    archetype = pdata["archetype"]

    for c_idx in range(1, n_conv + 1):
        conv_id = f"{pid}_conv_{c_idx:03d}"
        platform = random.choice(pdata["traits"]["platform"])
        start_dt = random_date_within_days(days_back=60)
        # conversation length between 5 and 40 minutes
        end_dt = start_dt + timedelta(minutes=random.randint(5, 40))

        # messages
        n_messages = random.randint(6, 18)
        msgs = []
        current_time = start_dt

        scammer_msg_count = 0  # for step index in generate_scammer_message
        sender = "scammer"

        for m in range(n_messages):
            if sender == "scammer":
                text, flags = generate_scammer_message(archetype, scammer_msg_count)
                scammer_msg_count += 1
                msg = {
                    "sender": "scammer",
                    "text": text,
                    "time": hhmm(current_time),
                }
                if flags:
                    msg["flags"] = flags
            else:
                text = generate_victim_message()
                msg = {
                    "sender": "victim",
                    "text": text,
                    "time": hhmm(current_time),
                }

            msgs.append(msg)

            # increment time slightly
            current_time += timedelta(minutes=random.randint(1, 4))
            sender = "victim" if sender == "scammer" else "scammer"

        # label conversation
        classification = archetype  # e.g. "Job Scam", "Crypto/Investment Scam"
        outcome = random.choice(["success", "failed", "ongoing"])

        conversations.append({
            "persona_id": pid,
            "conversation_id": conv_id,
            "platform": platform,
            "start_time": iso_datetime(start_dt),
            "end_time": iso_datetime(end_dt),
            "messages": msgs,
            "classification": classification,
            "outcome": outcome
        })

len(conversations)


  now = datetime.utcnow()


176

In [17]:
# Cell 6 – Generate similarity_graph.json (nodes + edges)

nodes = []
edges = []

# nodes from personas
for pid, pdata in personas.items():
    nodes.append({
        "id": pid,
        "label": pdata["name"],
        "group": pdata["crew_id"],
    })

# edges: connect personas in same crew (crew links) and shared platforms/tactics (platform/tactic links)
persona_ids = list(personas.keys())

# crew edges
crew_to_members = {}
for pid, pdata in personas.items():
    crew_to_members.setdefault(pdata["crew_id"], []).append(pid)

for crew_id, members in crew_to_members.items():
    if len(members) < 2:
        continue
    # connect pairs in same crew
    for i in range(len(members)):
        for j in range(i + 1, len(members)):
            src, tgt = members[i], members[j]
            edges.append({
                "source": src,
                "target": tgt,
                "weight": round(random.uniform(0.7, 1.0), 2),
                "type": "crew"
            })

# platform/tactic edges (lightweight)
for i in range(len(persona_ids)):
    for j in range(i + 1, len(persona_ids)):
        p1 = personas[persona_ids[i]]
        p2 = personas[persona_ids[j]]

        shared_platforms = set(p1["traits"]["platform"]) & set(p2["traits"]["platform"])
        shared_tactics = set(p1["traits"]["tactics"]) & set(p2["traits"]["tactics"])

        if shared_platforms:
            edges.append({
                "source": persona_ids[i],
                "target": persona_ids[j],
                "weight": round(random.uniform(0.3, 0.6), 2),
                "type": "platform"
            })

        if shared_tactics and random.random() < 0.5:
            edges.append({
                "source": persona_ids[i],
                "target": persona_ids[j],
                "weight": round(random.uniform(0.1, 0.4), 2),
                "type": "tactic"
            })

similarity_graph = {
    "nodes": nodes,
    "edges": edges
}

len(nodes), len(edges)


(15, 126)

In [18]:
# Cell 7 – Save all JSON files to disk

clusters_path = DATA_DIR / "clusters.json"
personas_path = DATA_DIR / "personas.json"
conversations_path = DATA_DIR / "conversations.json"
similarity_graph_path = DATA_DIR / "similarity_graph.json"

with clusters_path.open("w", encoding="utf-8") as f:
    json.dump(clusters, f, indent=2)

with personas_path.open("w", encoding="utf-8") as f:
    json.dump(personas, f, indent=2)

with conversations_path.open("w", encoding="utf-8") as f:
    json.dump(conversations, f, indent=2)

with similarity_graph_path.open("w", encoding="utf-8") as f:
    json.dump(similarity_graph, f, indent=2)

print("Saved:")
print(" -", clusters_path)
print(" -", personas_path)
print(" -", conversations_path)
print(" -", similarity_graph_path)


Saved:
 - data\clusters.json
 - data\personas.json
 - data\conversations.json
 - data\similarity_graph.json


In [19]:
# Cell 8 – Quick sanity check (print small samples)

from itertools import islice

print("Sample clusters entry:")
print(json.dumps(clusters[0], indent=2)[:800], "...\n")

print("Sample persona entry:")
first_pid = next(iter(personas.keys()))
print(first_pid)
print(json.dumps(personas[first_pid], indent=2)[:800], "...\n")

print("Sample conversation entry:")
print(json.dumps(conversations[0], indent=2)[:800], "...\n")

print("Sample similarity_graph:")
print(json.dumps(similarity_graph, indent=2)[:800], "...")


Sample clusters entry:
{
  "persona_id": "R001",
  "name": "Clinical Broker",
  "risk": 65,
  "keywords": [
    "face",
    "site",
    "ago",
    "dog",
    "election"
  ],
  "description": "Charity Scam actor targeting victims via Email.",
  "archetype": "Charity Scam"
} ...

Sample persona entry:
R001
{
  "name": "Clinical Broker",
  "traits": {
    "tone": "Aggressive, Demanding",
    "emoji_rate": "none",
    "script_score": 0.78,
    "avg_message_length": 65,
    "common_phrases": [
      "cause bill scientist nation",
      "opportunity all behavior discussion own night"
    ],
    "tactics": [
      "Advance fee request",
      "Phishing link"
    ],
    "platform": [
      "SMS",
      "WhatsApp"
    ]
  },
  "active_hours": [
    0,
    1,
    2,
    6,
    7,
    13,
    18,
    23
  ],
  "risk_score": 65,
  "archetype": "Charity Scam",
  "color": "#22c55e",
  "crew_id": "CREW_A",
  "first_seen": "2025-08-20",
  "last_seen": "2025-11-15",
  "success_rate": 19,
  "conversatio