### Load + preprocess dataset

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import hdbscan
import json
from datetime import datetime
import re

# Load dataset
df = pd.read_csv("datasets/Nigerian_Fraud.csv")

# Combine subject+body as full conversation text
df["full_text"] = (
    df["subject"].fillna("") + " " + df["body"].fillna("")
)

# Basic cleaning
def clean(t):
    t = str(t).lower()
    t = re.sub(r"http\S+", "", t)   # remove URLs
    t = re.sub(r"[^a-z0-9 ]", " ", t)
    return t

df["clean_text"] = df["full_text"].apply(clean)


### tf-idf features

In [5]:
tfidf = TfidfVectorizer(max_features=3000, stop_words="english")
X = tfidf.fit_transform(df["clean_text"])

### clustering

In [6]:
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=10,
    metric="euclidean"
)

df["cluster"] = clusterer.fit_predict(X)




## build json files for figma

In [None]:
# clusters.json

clusters_json = []

for cid in sorted(df["cluster"].unique()):
    if cid == -1:
        continue  # skip noise
    
    subset = df[df["cluster"] == cid]
    keywords = (
        " ".join(subset["clean_text"].tolist())
        .split()
    )
    top_keywords = pd.Series(keywords).value_counts().head(8).index.tolist()

    clusters_json.append({
        "persona_id": f"C{cid:03d}",
        "name": f"Persona {cid}",
        "risk": int(np.random.randint(40, 96)),
        "keywords": top_keywords,
        "description": "Auto-generated scam persona based on text similarity.",
        "archetype": "Generic Scam"
    })

with open("json_files/clusters.json", "w") as f:
    json.dump(clusters_json, f, indent=4)


In [None]:
# personas.json

personas_json = {}

for cid in sorted(df["cluster"].unique()):
    if cid == -1: 
        continue
    
    subset = df[df["cluster"] == cid]
    avg_len = int(subset["clean_text"].apply(len).mean())
    
    personas_json[f"C{cid:03d}"] = {
        "name": f"Persona {cid}",
        "traits": {
            "tone": "Formal",
            "emoji_rate": "low",
            "script_score": float(np.random.rand()),
            "avg_message_length": avg_len,
            "common_phrases": [],
            "tactics": ["urgency", "money-request"],
            "platform": ["Email"]
        },
        "active_hours": [9, 10, 11, 12, 13],
        "risk_score": int(np.random.randint(50, 100)),
        "archetype": "Generic Scam",
        "color": "#3b82f6",
        "crew_id": f"CREW_{cid}",
        "first_seen": "2024-01-01",
        "last_seen": "2024-12-31",
        "success_rate": int(np.random.randint(1, 40)),
        "conversations": len(subset)
    }

with open("json_files/personas.json", "w") as f:
    json.dump(personas_json, f, indent=4)


In [9]:
# conversations.json

conversations_json = []

for i, row in df.iterrows():
    conversations_json.append({
        "persona_id": f"C{int(row['cluster']):03d}",
        "conversation_id": f"conv_{i:04d}",
        "platform": "Email",
        "start_time": datetime.utcnow().isoformat(),
        "end_time": datetime.utcnow().isoformat(),
        "messages": [
            {
                "sender": "scammer",
                "text": row["full_text"][:500],
                "time": "10:23",
                "flags": ["urgency"] if "urgent" in row["clean_text"] else []
            }
        ],
        "classification": "Generic Scam",
        "outcome": "ongoing"
    })

with open("json_files/conversations.json", "w") as f:
    json.dump(conversations_json, f, indent=4)


  "start_time": datetime.utcnow().isoformat(),
  "end_time": datetime.utcnow().isoformat(),


In [None]:
# similarity_graph.json

import itertools

nodes = []
edges = []

# Create nodes
for cid in sorted(df["cluster"].unique()):
    if cid == -1:
        continue
    nodes.append({
        "id": f"C{cid:03d}",
        "label": f"Persona {cid}",
        "group": f"CREW_{cid}"
    })

# Create simple similarity edges
for (cid1, cid2) in itertools.combinations(sorted(df["cluster"].unique()), 2):
    if cid1 == -1 or cid2 == -1:
        continue
    
    edges.append({
        "source": f"C{cid1:03d}",
        "target": f"C{cid2:03d}",
        "weight": float(np.random.uniform(0.1, 1.0)),
        "type": "tactic"
    })

graph_json = {"nodes": nodes, "edges": edges}

with open("json_files/similarity_graph.json", "w") as f:
    json.dump(graph_json, f, indent=4)


### fastAPI

In [None]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import json

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"]
)

@app.get("/api/clusters.json")
def get_clusters():
    return json.load(open("json_files/clusters.json"))

@app.get("/api/personas.json")
def get_personas():
    return json.load(open("json_files/personas.json"))
@app.get("/api/conversations.json")
def get_conversations():
    return json.load(open("json_files/conversations.json"))

@app.get("/api/similarity_graph.json")
def get_graph():
    return json.load(open("json_files/similarity_graph.json"))