In [None]:
%load_ext autoreload
%autoreload 2

import polars as pl
from pathlib import Path

import util.cleaning as cu

In [3]:
# assumes that the dataset(s) have been pre-converted to parquet from JSON
def load_dataset(root: Path, split: str) -> pl.DataFrame:
    """Load every parquet file named `<split>-*.parquet` under `root`.

    Args:
        root: Directory that holds the parquet files.
        split: Filename prefix selecting which files to read (e.g. "train").

    Returns:
        A single eagerly-collected DataFrame with the rows of all files
        matched by the glob pattern.
    """
    shard_glob = str(root / f"{split}-*.parquet")
    return pl.scan_parquet(shard_glob).collect()

def make_combined(train: pl.DataFrame, test: pl.DataFrame) -> pl.DataFrame:
    """Stack the train and test frames, recording each row's origin.

    A literal string column named "split" ("train" or "test") is added to
    each frame before they are concatenated vertically, train rows first.

    Args:
        train: Rows belonging to the training split.
        test: Rows belonging to the test split.

    Returns:
        One DataFrame containing all rows of both inputs plus the "split" column.
    """
    tagged = [
        frame.with_columns(pl.lit(origin).alias("split"))
        for origin, frame in (("train", train), ("test", test))
    ]
    # "vertical_relaxed" lets polars reconcile differing column dtypes between the two frames.
    return pl.concat(tagged, how="vertical_relaxed")

# Directory containing the parquet files of both splits.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
ROOT = Path("/mnt/data/tdalton/phreshphish-v1.0.1/data")
# Read each split eagerly into its own DataFrame.
train = load_dataset(ROOT, split="train")
test = load_dataset(ROOT, split="test")

In [4]:
# Merge both splits into one frame; a "split" column records where each row came from.
df = make_combined(train, test)

In [5]:
# Show the combined frame (the notebook repr elides the middle rows).
df

sha256,url,label,target,date,lang,lang_score,html,split
str,str,str,str,date,str,f32,str,str
"""1b1417105281ac24f66aac0b49ec2a…","""https://interprac.xplan.iress.…","""benign""",,2025-09-05,"""en""",0.901835,"""<html xmlns=""http://www.w3.org…","""train"""
"""d6d882b7870fed67819849b261a5f9…","""https://www.nhs.uk/conditions/…","""benign""",,2025-07-06,"""en""",0.93151,"""<html lang=""en""><head>  <me…","""train"""
"""c7a7003c6bed8fc95d3b84f408fd49…","""https://www.nibusinessinfo.co.…","""benign""",,2024-09-04,"""en""",0.846901,"""<html lang=""en"" dir=""ltr"" pref…","""train"""
"""496c41ddeea4f14b2477c7bcb23343…","""https://www.scribbr.com/freque…","""benign""",,2025-07-14,"""en""",0.931154,"""<html lang=""en-US""><head>  …","""train"""
"""d782458f6ce25293299f9004376fb8…","""https://banking.orangecountysc…","""benign""",,2024-08-21,"""en""",0.501184,"""<html manifest=""OCCUOLB/nocach…","""train"""
…,…,…,…,…,…,…,…,…
"""eba44886185cd7786155454cf3bce1…","""https://usa-eng-meta-mask.page…","""phish""","""metamask""",2025-10-25,"""en""",0.804922,"""<!DOCTYPE html><html lang=""en""…","""test"""
"""ae5ece396791104f635b52607cf519…","""http://xx9511.com/scss/bootstr…","""phish""","""grandlisboamacau""",2025-10-23,,-1.0,"""<!DOCTYPE html><html lang=""en""…","""test"""
"""d698531a9b2f93e9f4e9492fbc7819…","""https://biitmrt-en.pages.dev/""","""phish""","""bitmart""",2025-10-20,"""en""",0.773629,"""<!DOCTYPE html><html lang=""en""…","""test"""
"""d80f133174bf19ecca4d6d7afe9349…","""https://wangyeban.566888884.vi…","""phish""","""grandlisboamacau""",2025-10-28,,-1.0,"""<!DOCTYPE html><html lang=""en""…","""test"""


In [4]:
# Run the automated cleaning heuristics from util.cleaning in sequence; each
# call receives the current frame and its return value replaces `df`.
# NOTE(review): presumably each helper adds a flag column (the following cell
# filters on `bad_title` and `empty_html`) — confirm against util.cleaning.
print("Applying automated heuristics...")
df = cu.find_duplicate_urls(df)
df = cu.find_empty_html(df)
df = cu.find_bad_titles(df)

Appling automated heuristics...
Found 0 subsequent duplicate URLs
Found 0 empty HTML documents
Found 0 bad titles


In [8]:
# Keep only rows flagged by neither the bad-title nor the empty-HTML check,
# discard the two flag columns, then add a row-number column (polars names it
# "index" by default) that later cells join on.
# NOTE(review): re-running this cell on its own output will fail because the
# flag columns have already been dropped.
# NOTE(review): nothing produced by cu.find_duplicate_urls is filtered on or
# dropped here — confirm that is intended.
df = (
    df
    .filter(~(pl.col("bad_title") | pl.col("empty_html")))
    .drop(["empty_html", "bad_title"])
    .with_row_index()
)
# Number of remaining rows per class label.
df.group_by("label").agg(pl.col("label").count().alias("n"))

label,n
str,u32
"""benign""",366991
"""phish""",250002


In [9]:
# Build the TF-IDF matrix for every row of `df`. In the recorded run an
# existing vectorizer was loaded from `vectorizer_path` rather than fitted.
# NOTE(review): `sample_frac` presumably only matters when a new vectorizer
# has to be fitted — confirm against cu.build_tfidf.
X_tfidf = cu.build_tfidf(
    df,
    max_features=None,
    sample_frac=0.01,
    vectorizer_path="./tfidf_vectorizer.joblib",
)

[build_tfidf] Starting with 616,993 documents
[build_tfidf] Using 80 processes with batch size 1000
[build_tfidf] Loading existing vectorizer from ./tfidf_vectorizer.joblib...


Loading vectorizer: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [02:49<00:00, 169.61s/step]


[build_tfidf] Loaded vectorizer with 65,379,584 features
[build_tfidf] Transforming 616,993 documents in 617 chunks...
[build_tfidf] Memory-efficient streaming mode: processing 80 chunks at a time


TF-IDF transform: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 616993/616993 [32:59<00:00, 311.67docs/s]


[build_tfidf] Combining 617 sparse matrices...


Combining matrices: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [05:38<00:00, 338.28s/step]


[build_tfidf] Final matrix shape: (616993, 65379584)
[build_tfidf] Matrix sparsity: 99.9736%
[build_tfidf] Complete!


In [10]:
# Fit the LSH model on the TF-IDF matrix using 32 projection vectors.
# NOTE(review): the recorded log shows random vectors being generated and no
# seed is passed here — confirm cu.train_lsh seeds its RNG, otherwise bin
# assignments may differ between runs.
lsh_model = cu.train_lsh(X_tfidf, n_vectors=32)

[train_lsh] Training LSH with 32 projection vectors...
[train_lsh] Documents: 616,993, Features: 65,379,584


Generating random vectors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:10<00:00, 70.24s/step]


[train_lsh] Computing bin assignments via matrix multiplication...


Computing hash bits: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [14:19<00:00, 859.41s/step]
Converting bits to bin indices: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.70step/s]


[train_lsh] Building hash table with 80 processes...


Building hash table: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 321/321 [05:49<00:00,  1.09s/chunks]


[train_lsh] Created 370141 bins
[train_lsh] Complete!


In [11]:
def _bin_membership_rows(table):
    """Yield one {"index", "bin"} record for every index listed under each bin of `table`."""
    for bin_id, members in table.items():
        for member in members:
            yield {"index": member, "bin": bin_id}


# Flatten the LSH hash table into an (index, bin) lookup frame ...
mapping = pl.DataFrame(list(_bin_membership_rows(lsh_model["table"])))

# ... and attach each row's bin id to the main frame via its "index" column.
df = df.join(mapping, on="index", how="left")

In [12]:
# First cleaning pass: rows are grouped by their LSH bin.
cleaned_1 = cu.run_cleaning(
    X_tfidf,
    df,
    grouping_method="bin",
    lsh_model=lsh_model,
    budget=30,
)

Budget exhausted or coverage thresholds met. Finalizing...
Processed: 59256 / 616993 (9.60%)
  Keep:   58785 / 59256 (99.21%)
  Reject: 471 / 59256 (0.79%)
----------------------------------------


In [13]:
# Second bin-grouped pass, starting from the first pass's result, so that the
# largest bins get revisited.
cleaned_2 = cu.run_cleaning(
    X_tfidf,
    cleaned_1,
    grouping_method="bin",
    lsh_model=lsh_model,
    budget=30,
)

Budget exhausted or coverage thresholds met. Finalizing...
Processed: 80934 / 616993 (13.12%)
  Keep:   78217 / 80934 (96.64%)
  Reject: 2717 / 80934 (3.36%)
----------------------------------------


In [14]:
# Per-class count of rejected rows after the bin-grouped passes.
# Rows without a decision (null `keep`) are excluded before grouping, and the
# counts cover every decision recorded in `cleaned_2` so far.
is_rejected = pl.col("keep").eq(False)
reject_stats = (
    cleaned_2
    .filter(pl.col("keep").is_not_null())
    .group_by("label")
    .agg(is_rejected.sum().alias("num_rejected"))
)
reject_stats

label,num_rejected
str,u32
"""phish""",1927
"""benign""",790


In [15]:
# Switch to the "title" grouping scheme.
# The same title may come up more than once, so this pass is run twice with
# budget=50 (here and in the next cell) to make sure at least 50 unique
# titles get processed.
cleaned_3 = cu.run_cleaning(
    X_tfidf,
    cleaned_2,
    grouping_method="title",
    lsh_model=lsh_model,
    budget=50,
)

Budget exhausted or coverage thresholds met. Finalizing...
Processed: 86944 / 616993 (14.09%)
  Keep:   84227 / 86944 (96.88%)
  Reject: 2717 / 86944 (3.12%)
----------------------------------------


In [16]:
# Second title-grouped pass, continuing from the result of the previous one.
cleaned_4 = cu.run_cleaning(
    X_tfidf,
    cleaned_3,
    grouping_method="title",
    lsh_model=lsh_model,
    budget=50,
)

Budget exhausted or coverage thresholds met. Finalizing...
Processed: 90947 / 616993 (14.74%)
  Keep:   87739 / 90947 (96.47%)
  Reject: 3208 / 90947 (3.53%)
----------------------------------------


In [17]:
# Per-class count of rejected rows after the title-grouped passes.
# Rows without a decision (null `keep`) are excluded before grouping. The
# counts are cumulative over every pass so far, not just the title stage:
# they add up to the running "Reject" total printed by the last pass.
is_rejected = pl.col("keep").eq(False)
reject_stats = (
    cleaned_4
    .filter(pl.col("keep").is_not_null())
    .group_by("label")
    .agg(is_rejected.sum().alias("num_rejected"))
)
reject_stats

label,num_rejected
str,u32
"""phish""",2418
"""benign""",790
