In [36]:
!pip install -q sentence-transformers scikit-learn umap-learn plotly nbformat

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [37]:
import os
from typing import List, Dict, Any, Optional

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import umap
import matplotlib.pyplot as plt
import json
from sentence_transformers import SentenceTransformer
import tqdm
import plotly.express as px

In [None]:
# Load the classified incidents; the code below iterates it as a list of dict records.
# An explicit encoding keeps the read independent of the platform's default locale.
with open(
    "../data/incidents-dataset-20250906-classified-new.json", encoding="utf-8"
) as f:
    data = json.load(f)

# Join tags into a single string for each record.
# `or []` covers both a missing "tags" key and an explicit null value,
# either of which would otherwise break the join.
for record in data:
    record["tags_str"] = " ".join(record.get("tags") or [])

# Embed all tag strings (embeddings are L2-normalised by the encoder)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
tag_strings = [record["tags_str"] for record in data]
embeddings = model.encode(
    tag_strings, normalize_embeddings=True, show_progress_bar=True
)

# Perform PCA (2 components, fixed seed for reproducibility)
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(embeddings)

# Perform UMAP (2 components, cosine metric, fixed seed for reproducibility)
um = umap.UMAP(
    n_components=2, n_neighbors=15, min_dist=0.1, metric="cosine", random_state=42
)
X_umap = um.fit_transform(embeddings)

# Add reduced dimensions back to records.
# Rows of X_pca / X_umap are in the same order as `data`, so zip pairs them up.
for record, pca_point, umap_point in zip(data, X_pca, X_umap):
    record["pca"] = pca_point.tolist()  # Convert numpy array to list for compatibility
    record["umap"] = umap_point.tolist()  # Convert numpy array to list for compatibility

# Save processed data.
# NOTE: `data` is a list of dicts, so numpy stores it as a pickled object array;
# readers must pass allow_pickle=True and should only load trusted files.
np.savez_compressed("../data/dataset-embed.npz", data=data)

Batches:   0%|          | 0/9 [00:00<?, ?it/s]


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [None]:
# Load serialized dataset
# NOTE: the archive holds a pickled object array of dicts, hence allow_pickle=True.
# Unpickling can execute arbitrary code: only load files written by this notebook.
try:
    # The context manager closes the underlying archive once the array is read.
    with np.load("../data/dataset-embed.npz", allow_pickle=True) as npz_file:
        records = npz_file["data"]
except FileNotFoundError:
    raise SystemExit(
        "Error: `../data/dataset-embed.npz` not found. Please run the previous cell to generate it."
    )

# Collect the union of keys across all records, in order of first appearance
# (a dict is used as an ordered set).
all_keys = dict.fromkeys(key for r in records for key in r.keys())

# Give every record the full key set. `setdefault` only adds keys that are
# absent, so values already present in a record are never overwritten.
for r in records:
    for key in all_keys:
        r.setdefault(key, None)  # Fill missing keys with None

# Sanity check: after normalisation no record should differ from the first one.
for r in records:
    if r.keys() != records[0].keys():
        print(f"Diff: {r.keys() ^ records[0].keys()}")

In [None]:
# DISABLED: enrichment step, kept commented out for reference. When enabled it
# loads a second incidents file and writes `total_value_extracted_usd` onto each record.
# NOTE(review): zip() pairs the two lists by position, and only `with_value` is sorted
# by url here — confirm `records` is in the same order before re-enabling.
# NOTE(review): the `.get("value", 1)` default is an int, which has no `.replace`.
# with open("../data/incidents-dataset-20250907.json", "r", encoding="utf-8") as f:
#    with_value = json.load(f)
#
## sort by url
# with_value = sorted(with_value, key=lambda x: x["url"])
## enrich current data and match by url
# for r, w in zip(records, with_value):
#    r["total_value_extracted_usd"] = int(
#        w.get("value", 1).replace("$", "").replace(",", "").strip()
#    )

In [68]:
# Turn the loaded incident records into a table (one row per record) and preview it.
df_plot = pd.DataFrame.from_records(data=records)
df_plot.head()

Unnamed: 0,proto,type,exploit_type,total_value_extracted,hack_date,hacked_app_name,hacked_app_website,vuln_contract,attacker_addr,attack_complexity,actor_name,actor_type,tags,tags_str,pca,umap,value,total_value_extracted_usd
0,bnb,exploit,Improper access control,$4.5M,2021-06-15,Eleven Finance,,0xdb2d590aCe7cAe51DF1fB3312738038Ec032Bf33,0xdb2d590aCe7cAe51DF1fB3312738038Ec032Bf33,medium,0xdeadf4ce,individual,"[emergency-burn-exploit, flashloan-attack, yie...",emergency-burn-exploit flashloan-attack yield-...,"[0.05837976559996605, 0.045059651136398315]","[-5.735557556152344, 11.38453197479248]",4500000.0,4500000.0
1,eth,exploit,Integer Underflow,$5M,2025-03-05,1inch Fusion v1 resolver,,0xa88800cd213da5ae406ce248380802bd53b47647,0xa7264a43a57ca17012148c46adbc15a5f951766e,medium,,individual,"[integer-underflow-exploit, deprecated-code-vu...",integer-underflow-exploit deprecated-code-vuln...,"[0.08835408091545105, 0.06857344508171082]","[-5.366986274719238, 8.947030067443848]",5000000.0,5000000.0
2,other,exploit,Compromised Keys,$1.75 million,2022-03-24,8ight Finance,,,,low,,,"[private-key-compromise, opsec-failure, multi-...",private-key-compromise opsec-failure multi-sig...,"[-0.16246673464775085, 0.02321554161608219]","[-3.423513889312744, 8.11892032623291]",1750000.0,1750000.0
3,other,exploit,Access Control Vulnerability,$1.18 million,2025-05-09,LNDFi,,0xAA8cc9afE14f3A2B200CA25382e7C87CD883a527,0x5149A7696188F083297281D10293a20476252CDD,medium,,,"[admin-key-compromise, backdoor-insertion, acc...",admin-key-compromise backdoor-insertion access...,"[-0.04258838668465614, 0.24526435136795044]","[-3.2714037895202637, 8.68581771850586]",1180000.0,1180000.0
4,bnb,rugpull,liquidity drain,$2.79M,2022-10-18,"Lucky Star Currency, FSL",,0x8923881e8cae6684c2bb84d69ae88a9bbbec8d5a,0x7249b903da533358c897784438b87bc94b402352,low,,,"[bsc-shitcoin-rug, pre-minted-token-dump, liqu...",bsc-shitcoin-rug pre-minted-token-dump liquidi...,"[-0.17446422576904297, -0.19227005541324615]","[-4.848267555236816, 8.78579330444336]",2790000.0,2790000.0


In [None]:
# Build DataFrame for Plotly
# Each "umap" entry is a 2-element list; split it into separate x / y columns.
df_plot[["x", "y"]] = pd.DataFrame(df_plot["umap"].tolist(), index=df_plot.index)
df_plot["app"] = df_plot["hacked_app_name"]

# --- Data cleaning for plotting ---
# Ensure 'total_value_extracted_usd' is numeric for sizing. Unparseable or missing
# values are replaced with 1, because Plotly rejects NaN in the `size` column.
df_plot["total_value_extracted_usd"] = pd.to_numeric(
    df_plot["total_value_extracted_usd"], errors="coerce"
).fillna(1)
# Fill missing 'exploit_type' to avoid errors in coloring
df_plot["exploit_type"] = df_plot["exploit_type"].fillna("Unknown")


# Interactive scatter plot: colour = exploit type, marker size = extracted value
fig = px.scatter(
    df_plot,
    x="x",
    y="y",
    color="exploit_type",
    size="total_value_extracted_usd",
    hover_data=["app", "exploit_type", "hack_date", "tags_str"],
    title="Global Hacking Landscape – App Embedding Projection (UMAP)",
    width=1400,
    height=900,
)

# sizemin keeps the smallest markers visible next to very large incidents.
fig.update_traces(
    marker=dict(sizemin=3, opacity=0.85, line=dict(width=0.5, color="white"))
)
fig.update_layout(
    legend_title="Attack Vector",
    xaxis_title="Dimension 1",
    yaxis_title="Dimension 2",
    margin=dict(l=40, r=40, t=60, b=40),
)

# Create the output directory first: write_html does not create parent folders
# and would otherwise fail on a fresh checkout.
output_html = "../dashboard/hacking_landscape.html"
os.makedirs(os.path.dirname(output_html), exist_ok=True)
fig.write_html(output_html, include_plotlyjs="cdn")
fig.show()