Parse HTTP responses into a Cookie Hygiene Score (CHS)

In [None]:
# analysis/cookie_hygiene.py
import re, math, pandas as pd

ATTRS = ["secure", "httponly", "samesite"]


def _parse_max_age(raw):
    """Return a Max-Age value as an int, or None if it is missing or malformed."""
    if not isinstance(raw, str):
        # The attribute was given without "=value" (the caller passes True).
        return None
    digits = raw[1:] if raw.startswith("-") else raw
    # Require ASCII digits: str.isdigit() alone also accepts characters such
    # as superscript digits, which int() then rejects with ValueError.
    if digits.isascii() and digits.isdigit():
        return int(raw)
    return None


def parse_set_cookie(header):
    """Parse one Set-Cookie header value into a flat dict of attributes.

    The header is split on ";". The first segment supplies the cookie name
    (the text before the first "="; the cookie value is discarded). Each
    remaining segment is treated as an attribute whose name is matched
    case-insensitively; unrecognised attributes are ignored. When an
    attribute is repeated, the last occurrence wins.

    Args:
        header: The Set-Cookie header value as a string.

    Returns:
        A dict with the keys:
          - "name": cookie name.
          - "secure", "httponly", "samesite": True if the attribute is present.
          - "samesite_val": lower-cased SameSite value, True if the attribute
            was given without a value, or None if absent.
          - "max_age": Max-Age as an int (may be zero or negative), or None
            if absent, valueless, or not an integer.
          - "expires": lower-cased Expires value, True if given without a
            value, or None if absent.
    """
    parts = [p.strip() for p in header.split(";")]
    name = parts[0].partition("=")[0]
    flags = {k: False for k in ATTRS}
    samesite = None
    max_age = None
    expires = None
    for part in parts[1:]:
        key, sep, raw = part.partition("=")
        key = key.strip().lower()
        # Valueless attributes (e.g. "Secure") carry True as their value.
        value = raw.strip().lower() if sep else True
        if key in ("secure", "httponly"):
            flags[key] = True
        elif key == "samesite":
            flags["samesite"] = True
            samesite = value
        elif key == "max-age":
            max_age = _parse_max_age(value)
        elif key == "expires":
            expires = value
    return {
        "name": name,
        "secure": flags["secure"],
        "httponly": flags["httponly"],
        "samesite": flags["samesite"],
        "samesite_val": samesite,
        "max_age": max_age,
        "expires": expires,
    }

def cookie_score(df_setcookie):  # df_setcookie columns: url, set_cookie_header
    """Score every Set-Cookie header and aggregate the scores per URL.

    Each cookie earns one point for each of Secure, HttpOnly and SameSite
    being present, and loses one point when its Max-Age exceeds 30 days,
    so a per-cookie score ranges from -1 to 3.

    Args:
        df_setcookie: DataFrame with columns "url" and "set_cookie_header",
            one row per Set-Cookie header.

    Returns:
        A tuple (chs, out):
          - chs: one row per url with columns "url", "chs_mean", "chs_med"
            and "cookie_count".
          - out: one row per cookie with the parsed attributes, "url" and
            "score".
        Both frames are empty (but carry these columns) when the input has
        no rows.
    """
    long_lived_s = 60 * 60 * 24 * 30  # 30 days, in seconds
    rows = []
    for url, header in zip(df_setcookie["url"], df_setcookie["set_cookie_header"]):
        meta = parse_set_cookie(header)
        score = int(meta["secure"]) + int(meta["httponly"]) + int(meta["samesite"])
        # expiry weighting: very long lifetimes penalized (tracking risk)
        max_age = meta["max_age"]
        life_pen = 1 if (max_age is not None and max_age > long_lived_s) else 0
        rows.append({**meta, "url": url, "score": score - life_pen})

    if not rows:
        # pd.DataFrame([]) has no columns, so groupby("url") would raise
        # KeyError; return correctly-shaped empty frames instead.
        out = pd.DataFrame(columns=[
            "name", "secure", "httponly", "samesite", "samesite_val",
            "max_age", "expires", "url", "score",
        ])
        chs = pd.DataFrame(columns=["url", "chs_mean", "chs_med", "cookie_count"])
        return chs, out

    out = pd.DataFrame(rows)
    chs = (
        out.groupby("url")["score"]
        .agg(["mean", "median", "count"])
        .reset_index()
        .rename(columns={"mean": "chs_mean", "median": "chs_med", "count": "cookie_count"})
    )
    return chs, out


Extract Navigator feature usage (NSP)

In [None]:
# analysis/extract_navigator.py
import pandas as pd

NAV_KEYS = ["userAgent","language","languages","platform","deviceMemory",
            "hardwareConcurrency","plugins","webdriver","userAgentData.brands",
            "userAgentData.platform","userAgentData.mobile"]


def build_nsp(js_calls_df):  # columns: url, api, prop
    """Build a per-URL binary matrix of which Navigator properties were accessed.

    Rows whose "api" contains "navigator" (case-insensitive) are kept. Each
    row's lower-cased "prop" is assigned to the longest lower-cased NAV_KEYS
    entry it contains; rows matching no key, or with a missing/non-string
    "prop", are dropped.

    Args:
        js_calls_df: DataFrame with columns "url", "api" and "prop".

    Returns:
        DataFrame with one row per url, a 0/1 column for every lower-cased
        key that was matched at least once in the input, and "n_nav_keys",
        the number of distinct keys seen for that url.
    """
    is_nav = js_calls_df["api"].str.contains("navigator", case=False, na=False)
    df = js_calls_df[is_nav].copy()
    df["prop_norm"] = df["prop"].str.lower()

    # Try longer keys first: "language" is a substring of "languages", and
    # "useragent"/"platform" are substrings of the "useragentdata.*" keys, so
    # first-match in list order would never select the more specific key.
    feats = sorted((k.lower() for k in NAV_KEYS), key=len, reverse=True)

    def bucket(prop):
        # Missing props arrive as NaN/None; leave them unbucketed.
        if not isinstance(prop, str):
            return None
        return next((f for f in feats if f in prop), None)

    df["prop_bucket"] = df["prop_norm"].map(bucket)
    mat = (df.dropna(subset=["prop_bucket"])
             .assign(val=1)
             .pivot_table(index="url", columns="prop_bucket", values="val",
                          aggfunc="sum", fill_value=0)
             .reset_index())
    # binary presence
    feature_cols = [c for c in mat.columns if c != "url"]
    for c in feature_cols:
        mat[c] = (mat[c] > 0).astype(int)
    mat["n_nav_keys"] = mat[feature_cols].sum(axis=1)
    return mat


Simple model & ablation

In [None]:
# analysis/build_features.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score

def train_eval(X, y):
    """Fit a logistic regression on a stratified 70/30 split and report metrics.

    Args:
        X: Feature table; must expose ``.columns`` (used to label the
            coefficients).
        y: Target labels, also used to stratify the split.

    Returns:
        Dict with "AUC" (ROC AUC) and "AP" (average precision), both computed
        on the held-out 30% from column 1 of ``predict_proba``, and "coef",
        a mapping of feature name to the first row of fitted coefficients.
    """
    split = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    X_train, X_test, y_train, y_test = split

    model = LogisticRegression(max_iter=200)
    model.fit(X_train, y_train)

    # Column 1 is the score for the second class in the fitted model
    # (presumably the positive label for a binary target).
    held_out_scores = model.predict_proba(X_test)[:, 1]

    report = {}
    report["AUC"] = roc_auc_score(y_test, held_out_scores)
    report["AP"] = average_precision_score(y_test, held_out_scores)
    report["coef"] = {col: weight for col, weight in zip(X.columns, model.coef_[0])}
    return report


In [None]:
# crawl (baseline vs EASP)