In [1]:
%load_ext autoreload
%autoreload 2

import polars as pl
from collections import Counter

import util

In [2]:
import json
import re
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import Optional

import polars as pl
from bs4 import BeautifulSoup
from tqdm import tqdm

def _load_file(filepath: Path) -> dict:
    label = 1 if "phishes" in filepath.parts else -1

    try:
        data = json.loads(filepath.read_text(encoding="utf-8"))
        url = data.get("url")
        raw_html = data.get("html", "")
    except Exception:
        return {"url": None, "html": None, "text": None, "title": None, "label": label}

    # skip non-HTML
    if not re.search(r"<\s*\w+[^>]*>", raw_html):
        return {"url": url, "html": None, "text": None, "title": None, "label": label}

    soup = BeautifulSoup(raw_html, "lxml")
    pretty = soup.prettify()
    title = soup.title.string.strip().lower() if soup.title and soup.title.string else None
    text = soup.get_text(separator=" ", strip=True)

    return {"url": url, "html": pretty, "text": text, "title": title, "label": label}

def load_dataset(phish_dir: str,
                 benign_dir: str,
                 max_workers: Optional[int] = None,
                 chunksize: int = 64) -> pl.DataFrame:
    """Load every ``*.json`` sample from both directories into one frame.

    Files are parsed in parallel by ``_load_file``; phishing samples come
    first, benign samples second, each group in sorted path order so that the
    resulting row order is reproducible across runs and machines.

    Args:
        phish_dir: Directory holding the phishing samples.
        benign_dir: Directory holding the benign samples.
        max_workers: Worker process count; ``None`` lets
            ``ProcessPoolExecutor`` pick its default.
        chunksize: Number of paths handed to a worker per task. Larger values
            cut inter-process overhead when there are very many small files.

    Returns:
        A DataFrame with string columns ``url``, ``html``, ``text``, ``title``
        and an integer ``label`` column; empty (but with those columns) when
        neither directory contains a ``*.json`` file.
    """
    # glob() yields files in filesystem order, which is not stable.
    phish_paths  = sorted(Path(phish_dir).glob("*.json"))
    benign_paths = sorted(Path(benign_dir).glob("*.json"))
    all_paths    = phish_paths + benign_paths

    # NOTE(review): worker processes must be able to import/unpickle
    # _load_file. With the "spawn" start method this does not work for
    # functions defined in an interactive session — confirm the platform or
    # move _load_file into a module.
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        records = list(tqdm(
            pool.map(_load_file, all_paths, chunksize=chunksize),
            total=len(all_paths),
            desc="Loading samples",
            unit="file",
        ))

    # Explicit schema: inference only looks at the leading rows, so a run of
    # failed files (all-None fields) at the start, or no files at all, would
    # otherwise produce wrong or missing column types.
    schema = {
        "url": pl.Utf8,
        "html": pl.Utf8,
        "text": pl.Utf8,
        "title": pl.Utf8,
        "label": pl.Int64,
    }
    return pl.DataFrame(records, schema=schema)

In [None]:
# Build the raw sample frame. Both paths are placeholders and must be pointed
# at the real phishing / benign sample directories before running.
df = load_dataset(
    phish_dir='<path/to>/phishes',
    benign_dir='<path/to>/benigns',
)

In [None]:
# Run the three util checks in sequence, each receiving the previous result.
# Presumably each one adds a boolean flag column (the summary cell reads
# `duplicate_url`, `empty_html` and `bad_title`) — confirm in util.
df = (
    df
    .pipe(util.find_duplicate_urls)
    .pipe(util.find_empty_html)
    .pipe(util.find_bad_titles)
)

In [28]:
# Per-label data-quality summary: how many rows carry each quality flag, the
# group size, and the corresponding rates.
stats = (
    df
    .group_by("label")
    .agg([
        # Raw counts; summing a boolean flag column counts its True values.
        pl.col("empty_html").sum().alias("num_empty_html"),
        pl.col("bad_title").sum().alias("num_bad_title"),
        pl.col("duplicate_url").sum().alias("num_dupes"),
        # pl.len() is the replacement for the deprecated pl.count().
        pl.len().alias("total"),
    ])
    .with_columns([
        # Counts expressed as a fraction of the rows in each label group.
        (pl.col("num_empty_html") / pl.col("total")).alias("empty_html_rate"),
        (pl.col("num_bad_title") / pl.col("total")).alias("bad_title_rate"),
        (pl.col("num_dupes") / pl.col("total")).alias("dupe_url_rate"),
    ])
    # group_by does not guarantee group order; sort so the table is stable.
    .sort("label")
)
stats

  pl.count().alias("total"),


label,num_empty_html,num_bad_title,num_dupes,total,empty_html_rate,bad_title_rate,dupe_url_rate
i64,u32,u32,u32,u32,f64,f64,f64
-1,0,231,0,253936,0.0,0.00091,0.0
1,0,10,0,119858,0.0,8.3e-05,0.0


In [18]:
# Keep only rows with neither quality flag set, drop the two flag columns, and
# add a row-number column (named "index" by default) used later to attach LSH
# bins. This overwrites `df`, so the cell cannot be re-run on its own output:
# the flag columns no longer exist after the first run.
flag_cols = ['empty_html', 'bad_title']
is_flagged = pl.col('bad_title') | pl.col('empty_html')
df = (
    df
    .filter(~is_flagged)
    .drop(flag_cols)
    .with_row_index()
)
df.group_by('label').agg(pl.col('label').count().alias('n'))

label,n
i64,u32
-1,373553


In [30]:
# Build the TF-IDF matrix for the cleaned frame. Judging by the log printed
# below, the vectorizer is fitted on a sample of the documents (sample_frac)
# and then applied to all of them — confirm in util.build_tfidf.
X_tfidf = util.build_tfidf(
    df,
    max_features=None,
    sample_frac=0.3,
)

Fitting TF-IDF on 11561 / 38537 docs...
Transforming all 38537 documents in 3854 chunks on 80 processes...


Transform: 100%|████████████████████████████████████████████████| 3854/3854 [00:38<00:00, 99.98it/s]


In [31]:
lsh_model = util.train_lsh(X_tfidf, n_vectors=16, seed=143)

# lsh_model["table"] maps a bin id to the row indices that fall in that bin.
# Flatten it into one (index, bin) record per row so it can be joined back.
bin_table = lsh_model["table"]
mapping = pl.DataFrame([
    {"index": row_idx, "bin": bucket}
    for bucket in bin_table
    for row_idx in bin_table[bucket]
])

# Left join: every row of df is kept; rows without a bin get a null `bin`.
df = df.join(mapping, how="left", on="index")

In [32]:
# Two cleaning passes grouped by LSH bin; the second pass starts from the
# result of the first so the largest remaining bins are visited again.
# The fourth positional argument is presumably the same `budget` that the
# title passes further down pass by keyword — confirm in util.run_cleaning.
bin_pass_arg = 30
cleaned_1 = util.run_cleaning(
    X_tfidf, df, lsh_model, bin_pass_arg, group_col='bin',
)
cleaned_2 = util.run_cleaning(
    X_tfidf, cleaned_1, lsh_model, bin_pass_arg, group_col='bin',
)

Budget exhausted. Finalizing...
Processed: 11014 / 38537 (28.58%)
  Keep:   3412 / 11014 (30.98%)
  Reject: 7602 / 11014 (69.02%)
----------------------------------------


In [30]:
# Rejected rows per label after the two bin-level passes.
# Must run after the cell that defines `cleaned_2`; executing it earlier
# raises NameError. The comparison is reduced with .sum() so each label gets a
# single count instead of a list of per-row booleans (rows with a null `keep`
# are not counted).
cleaned_2.group_by('label').agg(
    (pl.col('keep') == False).sum().alias('num_rejected')
)

NameError: name 'cleaned_2' is not defined

In [33]:
# Two further cleaning passes, this time grouped by page title; each pass
# continues from the previous pass's output.
title_pass = dict(budget=50, group_col='title')
cleaned_3 = util.run_cleaning(X_tfidf, cleaned_2, lsh_model, **title_pass)
cleaned_4 = util.run_cleaning(X_tfidf, cleaned_3, lsh_model, **title_pass)

Budget exhausted. Finalizing...
Processed: 12188 / 38537 (31.63%)
  Keep:   3748 / 12188 (30.75%)
  Reject: 8440 / 12188 (69.25%)
----------------------------------------


In [34]:
# Final dataset: rows that were explicitly kept plus rows that were never
# reviewed (`keep` is null); only explicit rejects are removed.
final = cleaned_4.filter(
    pl.col('keep').is_null() | (pl.col('keep') == True)
)

# Only a cell's last expression is rendered automatically, so the per-label
# counts have to be displayed explicitly or they are computed and discarded.
display(final.group_by('label').agg(pl.col('label').count().alias('n')))
final

index,url,html,text,title,label,bin,keep
u32,str,str,str,str,i64,i64,bool
0,"""10bestbingorooms.com""","""<!DOCTYPE HTML> <html>  <head>…","""10 Best Bingo Rooms :: Only th…","""10 best bingo rooms :: only th…",1,61597,
1,"""10stepstostartingyouronlinebus…","""<html>  <head>  <meta content…","""Connie Ragen Green | Starting …","""connie ragen green | starting …",1,62394,
2,"""10surdix.com""","""<!DOCTYPE html> <html lang=""fr…","""[10surdix] Architecte d’intéri…","""[10surdix] architecte d’intéri…",1,43022,
3,"""118usa.com""","""<!DOCTYPE html> <html>  <head>…","""Index of / Index of / Name Las…","""index of /""",1,17790,
4,"""123contactform.com/form-241310…","""<!DOCTYPE html PUBLIC ""-//W3C/…","""Surpreenda_MasterCard2017 Desc…","""surpreenda_mastercard2017""",1,7612,
…,…,…,…,…,…,…,…
38518,"""dreamshockdesign.com""","""<!DOCTYPE html> <html xmlns=""h…","""Creative Video Production for …","""creative video production for …",1,37270,
38519,"""dremsm.gob.pe""","""<!DOCTYPE html> <html lang=""en…","""DIRECCION REGIONAL DE ENERGIA …","""direccion regional de energia …",1,56189,
38533,"""dresslikea.com""","""<!DOCTYPE html> <html class=""n…","""Dress Like A Shop Journal Our …","""dress like a""",1,27665,
38534,"""dressymodafeminina.com.br""","""<!DOCTYPE html> <html dir=""ltr…","""Dressy Moda Feminina Plus Size…","""dressy moda feminina plus size…",1,7227,


In [35]:
# Per-label rejection summary over the rows that received a decision.
reject_stats = (
    cleaned_4
    # Only consider rows where a decision was made (`keep` is not null).
    .filter(pl.col("keep").is_not_null())
    .group_by("label")
    .agg([
        # Number of rejected rows in this class.
        pl.col("keep").eq(False).sum().alias("num_rejected"),
        # Fraction of this class's decisions that were rejects.
        pl.col("keep").eq(False).mean().alias("reject_rate"),
        # Total decided rows in this class; pl.len() replaces the deprecated
        # pl.count().
        pl.len().alias("n_decided"),
    ])
    # group_by does not guarantee group order; sort so the table is stable.
    .sort("label")
)
reject_stats

label,num_rejected,reject_rate,n_decided
i64,u32,f64,u32
-1,5637,0.653414,8627
1,2803,0.787138,3561
