In [None]:
import json, joblib, time, lightgbm as lgb
import pandas as pd, numpy as np
from pathlib import Path
from onnxmltools import convert_lightgbm
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch, torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import onnx



In [None]:
# Names of the 53 URL-derived feature columns, in order.
# main() verifies that every one of these names is present in the CSV after
# the IGNORE columns are dropped.  The spellings ("nb_semicolumn",
# "suspecious_tld") are kept exactly as written because they are matched
# against CSV column names.
URL_FEATURES = [
    "length_url", "length_hostname", "ip",
    "nb_dots", "nb_hyphens", "nb_at", "nb_qm", "nb_and", "nb_or", "nb_eq",
    "nb_underscore", "nb_tilde", "nb_percent", "nb_slash", "nb_star",
    "nb_colon", "nb_comma", "nb_semicolumn", "nb_dollar", "nb_space",
    "nb_www", "nb_com", "nb_dslash",
    "http_in_path", "https_token",
    "ratio_digits_url", "ratio_digits_host",
    "punycode", "port",
    "tld_in_path", "tld_in_subdomain",
    "abnormal_subdomain", "nb_subdomains", "prefix_suffix",
    "random_domain", "shortening_service", "path_extension",
    "length_words_raw", "char_repeat",
    "shortest_words_raw", "shortest_word_host", "shortest_word_path",
    "longest_words_raw", "longest_word_host", "longest_word_path",
    "avg_words_raw", "avg_word_host", "avg_word_path",
    "phish_hints",
    "domain_in_brand", "brand_in_subdomain", "brand_in_path",
    "suspecious_tld",
]




In [None]:
# Normalise the feature names by trimming surrounding whitespace.
URL_FEATURES = [feature_name.strip() for feature_name in URL_FEATURES]

# Positional column windows into the feature matrix that remains once the
# IGNORE columns have been dropped.
URL_COLS = slice(0, 53)        # 53 == number of entries in URL_FEATURES
CONTENT_COLS = slice(56, 80)   # 24 columns used for the content model


In [None]:
class ResTiny(nn.Module):
    """Small fully connected binary classifier with one projected skip path.

    Layer widths are ``d_in -> 96 -> 48 -> 24 -> 12 -> 1``.  The raw input is
    also projected straight to width 48 (no bias) and added to the output of
    the second layer before the activation.

    Args:
        d_in: Number of input features per sample.

    The forward pass returns ``sigmoid`` of the final linear layer, i.e. one
    value in [0, 1] per sample, with shape ``(batch, 1)``.
    """

    def __init__(self, d_in=24):
        super().__init__()
        self.fc1 = nn.Linear(d_in, 96)
        self.fc2 = nn.Linear(96, 48)
        # Bias-free projection so the input can be summed with fc2's output.
        self.res = nn.Linear(d_in, 48, bias=False)
        self.fc3 = nn.Linear(48, 24)
        self.fc4 = nn.Linear(24, 12)
        self.out = nn.Linear(12, 1)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(0.30)

    def forward(self, x):
        """Map a ``(batch, d_in)`` tensor to ``(batch, 1)`` probabilities."""
        hidden = self.fc1(x)
        hidden = self.drop(self.act(hidden))

        # Skip-add: combine the main path with the projected input.
        merged = self.fc2(hidden) + self.res(x)
        hidden = self.drop(self.act(merged))

        hidden = self.drop(self.act(self.fc3(hidden)))
        hidden = self.act(self.fc4(hidden))   # no dropout before the head

        score = self.out(hidden)
        return score.sigmoid()



In [None]:
def focal_loss(p, y, α=0.25, γ=2.0, eps=1e-6):
    """Mean focal loss computed from predicted probabilities.

    Args:
        p: Predicted probabilities; clamped to ``[eps, 1 - eps]`` before any
            logarithm is taken.
        y: Targets with the same shape as ``p``; entries equal to 1 are
            treated as the positive class.
        α: Constant factor applied to every element of the loss.
        γ: Exponent of the ``(1 - pt)`` modulating term.
        eps: Clamp margin that keeps ``log`` finite.

    Returns:
        Scalar tensor: the mean of ``α * (1 - pt)**γ * ce`` over all elements,
        where ``ce`` is the element-wise binary cross-entropy and ``pt`` is
        the probability assigned to the true class.
    """
    prob = p.clamp(min=eps, max=1.0 - eps)

    # Element-wise binary cross-entropy.
    pos_term = y * prob.log()
    neg_term = (1 - y) * (1 - prob).log()
    ce = -(pos_term + neg_term)

    # Probability the model gave to the correct class.
    p_true = torch.where(y == 1, prob, 1 - prob)
    modulator = (1 - p_true) ** γ

    return (α * modulator * ce).mean()



In [None]:
def epoch_stats(loader, model, lossf, dev):
    """Evaluate ``model`` over ``loader`` without tracking gradients.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` again.

    Args:
        loader: Iterable yielding ``(features, targets)`` tensor pairs.
        model: Module whose outputs are compared against a 0.5 threshold.
        lossf: Callable ``lossf(preds, targets)`` returning a scalar tensor.
        dev: Device the batches are moved to before the forward pass.

    Returns:
        ``(loss, accuracy)``: the unweighted mean of the per-batch loss values
        and the fraction of elements where thresholded prediction and
        thresholded target agree.
    """
    model.eval()
    batch_losses = []
    n_hit = 0
    n_seen = 0

    with torch.no_grad():
        for features, targets in loader:
            features = features.to(dev)
            targets = targets.to(dev)

            outputs = model(features)
            batch_losses.append(lossf(outputs, targets).item())

            # Both sides are binarised at 0.5 before comparing.
            agree = (outputs >= 0.5) == (targets >= 0.5)
            n_hit += agree.sum().item()
            n_seen += targets.size(0)

    return np.mean(batch_losses), n_hit / n_seen



In [None]:
def _make_loader(X, y, batch_size, shuffle=False):
    """Wrap features and labels in a DataLoader of float32 tensors.

    Labels receive a trailing axis (``unsqueeze(1)``) so each label batch has
    a single column.
    """
    dataset = TensorDataset(
        torch.tensor(X, dtype=torch.float32),
        torch.tensor(y, dtype=torch.float32).unsqueeze(1))
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def _make_loss(loss_type, pos_w):
    """Build ``lossf(pred, tgt)`` for a model that outputs probabilities."""
    if loss_type == "focal":
        return lambda pred, tgt: focal_loss(pred, tgt, α=0.25, γ=2.0)
    if pos_w == 1.0:
        return nn.BCELoss()

    def weighted_bce(pred, tgt):
        # Per-element weight: pos_w where tgt == 1, 1.0 where tgt == 0.
        # This is the probability-space counterpart of
        # BCEWithLogitsLoss(pos_weight=...), which must not be fed values
        # that have already been through a sigmoid.
        weight = 1.0 + (pos_w - 1.0) * tgt
        return nn.functional.binary_cross_entropy(pred, tgt, weight=weight)

    return weighted_bce


def train_one(
        name,                 # "url_model" | "content_model"
        X, y, outdir,         # data slice + labels
        *,                    # force keyword args after here
        build_fn,             # callable(d_in) -> nn.Module  (required)
        loss_type="bce",      # "bce" | "focal"
        pos_w=1.0,            # pos-class weight for BCE
        patience=8,           # early-stop patience
        batch=256, epochs=60  # training schedule
    ):
    """Train one binary classifier and write its artefacts to ``outdir``.

    Steps: standardise ``X``, make a stratified 80/20 train/validation split,
    train with AdamW and early stopping on validation loss, then write
    ``{name}.pt`` (best weights), ``scaler_{name}.json`` (scaler mean/std) and
    ``{name}.onnx`` (best weights, dynamic batch axis).

    The model is expected to output probabilities: every loss used here, and
    the 0.5 threshold in ``epoch_stats``, operate on probabilities.

    Args:
        name: Prefix for log lines and output file names.
        X: 2-D feature array.
        y: 1-D array of 0/1 labels, also used for stratification.
        outdir: Existing directory for the output files (str or Path).
        build_fn: Callable taking the feature count and returning the model.
        loss_type: ``"focal"`` selects ``focal_loss``; anything else selects
            binary cross-entropy.
        pos_w: Weight on positive samples for the BCE loss; ignored for focal.
        patience: Consecutive non-improving epochs allowed before stopping.
        batch: Mini-batch size for both loaders.
        epochs: Maximum number of epochs.
    """
    outdir = Path(outdir)

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    Xtr, Xva, ytr, yva = train_test_split(
        X, y, test_size=.20, stratify=y, random_state=42)

    # Honour the `batch` argument (previously hard-coded to 256).
    tr = _make_loader(Xtr, ytr, batch, shuffle=True)
    va = _make_loader(Xva, yva, batch)

    # pick architecture
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if build_fn is None:
        # NOTE(review): WebPhishCNN is not defined in the visible cells;
        # confirm it exists before relying on this fallback.
        model = WebPhishCNN(X.shape[1])              # default
    else:
        model = build_fn(X.shape[1])
    # Batches are moved to `dev` below, so the model must live there too.
    model = model.to(dev)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

    lossf = _make_loss(loss_type, pos_w)

    best_loss, best_acc, wait = float("inf"), 0.0, patience
    best_state = None
    for ep in range(1, epochs + 1):
        model.train()
        for xb, yb in tr:
            xb, yb = xb.to(dev), yb.to(dev)
            opt.zero_grad()
            loss = lossf(model(xb), yb)
            loss.backward()
            opt.step()

        v_loss, v_acc = epoch_stats(va, model, lossf, dev)
        print(f"{name:14} ep{ep:02d}  loss={v_loss:.4f}  acc={v_acc*100:5.2f}%")

        if v_loss < best_loss:
            best_loss, best_acc, wait = v_loss, v_acc, patience
            torch.save(model.state_dict(), outdir / f"{name}.pt")
            # Keep a CPU copy so the ONNX export can use the same weights.
            best_state = {k: v.detach().cpu().clone()
                          for k, v in model.state_dict().items()}
        else:
            wait -= 1
            if wait <= 0:   # `<=` so patience <= 0 still terminates
                break

    print(f"✔ {name} best loss={best_loss:.4f} acc={best_acc*100:5.2f}%\n")

    # save scaler
    (outdir / f"scaler_{name}.json").write_text(
        json.dumps({"mean": scaler.mean_.tolist(),
                    "std":  scaler.scale_.tolist()}))

    # export ONNX from the best checkpoint, not the last-epoch weights
    if best_state is not None:
        model.load_state_dict(best_state)
    model = model.cpu().eval()
    dummy = torch.randn(1, X.shape[1])
    torch.onnx.export(model, dummy, str(outdir / f"{name}.onnx"),
                      input_names=["x"], output_names=["y"],
                      dynamic_axes={"x": {0: "b"}, "y": {0: "b"}},
                      opset_version=12)



In [None]:
def main():
    """Train and export both phishing models from a single CSV.

    1. Read the CSV and derive binary labels from the ``status`` column
       (``"phishing"`` -> 1).
    2. Drop the ``IGNORE`` columns and check that every ``URL_FEATURES`` name
       is still present.
    3. Train the URL model (``ResTiny`` with focal loss) on the ``URL_COLS``
       slice via ``train_one``.
    4. Train a LightGBM classifier on the ``CONTENT_COLS`` slice, then save
       the pickled model, the scaler statistics and an ONNX export that has
       no ZipMap node.

    Raises:
        ValueError: If any ``URL_FEATURES`` column is missing from the CSV.
    """
    out = Path("outdir"); out.mkdir(exist_ok=True)
    df = pd.read_csv("csv_path")

    status_is_phishing = df["status"] == "phishing"
    y = status_is_phishing.astype(np.float32).values

    # NOTE(review): IGNORE is not defined in the visible cells; confirm it is
    # defined (and includes every non-numeric column) before main() runs.
    features = df.drop(columns=IGNORE)
    missing = [c for c in URL_FEATURES if c not in features.columns]
    if missing:
        # Report the missing names, not the whole DataFrame.
        raise ValueError(f"CSV is missing URL feature(s): {missing}")
    X_all = features.values.astype(np.float32)

    train_one(
        "url_model", X_all[:, URL_COLS], y, out,
        build_fn=lambda d: ResTiny(d),
        loss_type="focal", pos_w=1.0, patience=5)

    # CONTENT_COLS indexes the matrix *after* IGNORE is dropped.  The raw
    # frame still holds the string "status" column, so casting it to float32
    # would raise.
    X = X_all[:, CONTENT_COLS]
    y = status_is_phishing.astype(np.int32).values   # no second CSV read

    # The model is trained on standardised inputs, so the scaler statistics
    # are saved next to it for use at inference time.
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    Xtr, Xva, ytr, yva = train_test_split(
        X, y, test_size=.20, stratify=y, random_state=42)

    lgbm = lgb.LGBMClassifier(
            n_estimators=400, learning_rate=0.05,
            num_leaves=64, subsample=0.8, colsample_bytree=0.8,
            max_depth=-1, objective="binary", n_jobs=-1)

    lgbm.fit(Xtr, ytr, eval_set=[(Xva, yva)], eval_metric="auc")

    print("val-accuracy:",
          (lgbm.predict(Xva) == yva).mean().round(4))

    # Save model + scaler (write_text closes the file handle for us)
    joblib.dump(lgbm, out / "content_lgbm.pkl")
    (out / "scaler_content_lgbm.json").write_text(
        json.dumps({"mean": scaler.mean_.tolist(),
                    "std":  scaler.scale_.tolist()}))

    initial = [("x", FloatTensorType([None, X.shape[1]]))]
    onx = convert_lightgbm(
            lgbm,                    # fitted model
            initial_types=initial,
            zipmap=False,            # disable ZipMap so ORT gets a tensor
            target_opset=13)

    # Write the ONNX file once, without leaving a file handle open.
    onnx_path = out / "content_lgbm.onnx"
    onnx_path.write_bytes(onx.SerializeToString())
    print("✔  ZipMap removed:",
          all(n.op_type != "ZipMap" for n in onx.graph.node))

    # Reload from disk to verify what was actually written.
    m = onnx.load(str(onnx_path))
    assert "ZipMap" not in [n.op_type for n in m.graph.node]
    print("✔ tensor output, ready for onnxruntime-web")



In [None]:
# Run the whole pipeline: trains the URL model and the content LightGBM model
# and writes their artefacts to the output directory.
main()