In [None]:
import pandas as pd
import numpy as np

# ---------- CONFIG ----------
# Map each obfuscation level to the table it should be loaded from.
# The insertion order here is also the order the datasets are processed in.
# NOTE(review): the "Original" filename looks misspelled; it is kept as-is
# because it must match the name of the file on disk -- confirm.
FILES = {
    "Original": "OroginalDatasetBalanced.csv",  # or .tsv
    "Mild": "Obfuscated_Mild.csv",
    "Moderate": "/content/modarate_new.csv",
    "Strong": "Obfuscated_Strong.csv",
}

# Field delimiter shared by every input file:
#   ","  for comma-separated (CSV) files
#   "\t" for tab-separated (TSV) files
SEP = ","  # change to "\t" if needed

# Numeric quasi-identifiers. These are discretized into bins using the same
# edges for every level, so equivalence classes are comparable across levels.
NUM_QI = [
    "Flow Duration",
    "Total Fwd Packet",
    "Total Bwd packets",
    "Flow Bytes/s",
    "Flow Packets/s",
]

# Full quasi-identifier set (Q): the categorical columns followed by the
# numeric ones above. Every column must exist in ALL datasets
# (avoid Flow ID / Src IP / Dst IP / Timestamp).
QI_COLS = [
    "Protocol",
    "Dst Port",
    *NUM_QI,
]

# Number of quantile bins requested for each numeric quasi-identifier;
# keep consistent with the "qcut (5 bins)" idea.
N_BINS = 5
# ---------------------------

def read_table(path: str) -> pd.DataFrame:
    """Load one delimited text file into a DataFrame.

    The file is parsed with the module-level ``SEP`` delimiter using pandas'
    Python parsing engine. Empty strings, single spaces, ``"NA"`` and
    ``"NaN"`` are passed as additional missing-value markers.

    Args:
        path: Location of the file to read.

    Returns:
        The parsed table, with leading/trailing whitespace stripped from
        every column name.
    """
    missing_markers = ["", " ", "NA", "NaN"]
    frame = pd.read_csv(path, sep=SEP, engine="python", na_values=missing_markers)
    # Headers may carry stray padding; normalise them so the column lookups
    # done later match exactly.
    frame.columns = [label.strip() for label in frame.columns]
    return frame

def coerce_numeric(df: pd.DataFrame, cols):
    """Convert the listed columns of ``df`` to numeric dtype, in place.

    Values that cannot be parsed as numbers become NaN. Names in ``cols``
    that are not columns of ``df`` are silently skipped.

    Args:
        df: Frame to modify; it is mutated rather than copied.
        cols: Iterable of column names to convert.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for label in cols:
        if label not in df.columns:
            continue
        df[label] = pd.to_numeric(df[label], errors="coerce")
    return df

def make_bin_edges_from_reference(ref: pd.DataFrame, col: str, n_bins: int):
    """Build quantile-based bin edges for ``col`` from a reference dataset.

    The edges are the distinct values of the ``n_bins + 1`` evenly spaced
    quantiles of the column. The outermost edges are then widened to
    ``-inf`` / ``+inf`` so that values outside the reference range still
    land in the first or last bin.

    Only finite values contribute to the quantiles. NaN is dropped, and so
    are ``+/-inf``: interpolating a quantile across an infinite value yields
    NaN edges, which cannot be used as monotonically increasing bins.

    Args:
        ref: Reference frame the edges are derived from.
        col: Name of the numeric column in ``ref``.
        n_bins: Number of quantile bins requested. Fewer bins result when
            several quantiles coincide.

    Returns:
        A strictly increasing float array of edges starting at ``-inf`` and
        ending at ``+inf``, or ``None`` when the column has no finite values
        or fewer than three distinct quantile values (i.e. fewer than two
        bins could be formed).
    """
    values = ref[col].dropna().to_numpy(dtype=float)
    finite = values[np.isfinite(values)]
    if finite.size == 0:
        return None

    quantile_levels = np.linspace(0, 1, n_bins + 1)
    # np.unique both sorts and collapses repeated quantiles (heavy ties).
    edges = np.unique(np.quantile(finite, quantile_levels))
    if len(edges) < 3:
        return None

    # Open the outer bins so unseen extremes (including inf) are still binned.
    edges[0] = -np.inf
    edges[-1] = np.inf
    return edges

def apply_binning(df: pd.DataFrame, col: str, edges, n_bins: int = 5):
    """Replace ``df[col]`` with integer bin labels, in place.

    If the column is numeric and every non-missing value is already one of
    the labels ``0 .. n_bins - 1``, it is treated as pre-binned and only cast
    to the nullable ``Int64`` dtype. Otherwise the values are cut with
    ``edges`` and the resulting bin indices are stored as ``Int64``; missing
    values stay missing.

    NOTE(review): the pre-binned check is a heuristic. A raw column whose
    values all happen to lie in ``0 .. n_bins - 1`` (e.g. small packet
    counts) is indistinguishable from an already-binned one and is left
    uncut.

    Args:
        df: Frame to modify; it is mutated rather than copied.
        col: Column to discretize.
        edges: Increasing bin edges as accepted by ``pandas.cut``, or
            ``None`` to leave the column untouched.
        n_bins: Number of labels that count as "already binned". Defaults
            to 5, the value that was previously hard-coded.

    Returns:
        The same ``df`` object.
    """
    if edges is None or col not in df.columns:
        return df

    series = df[col]
    if pd.api.types.is_integer_dtype(series) or pd.api.types.is_float_dtype(series):
        # Normalise to a plain float array so nullable Int64 input (e.g. a
        # column binned by an earlier call) is compared the same way as
        # int64/float64 input.
        seen = np.asarray(series.dropna().unique(), dtype=float)
        if seen.size > 0 and np.all(np.isin(seen, np.arange(n_bins))):
            df[col] = series.astype("Int64")
            return df

    binned = pd.cut(series.astype(float), bins=edges, labels=False, include_lowest=True)
    df[col] = binned.astype("Int64")
    return df

def compute_k_metrics(df: pd.DataFrame, qi_cols):
    """Compute k-anonymity style re-identification metrics for ``df``.

    Rows with a missing value in any quasi-identifier are excluded. The
    remaining rows are grouped into equivalence classes (rows sharing the
    same value in every ``qi_cols`` column), and each row is assigned ``k``,
    the size of its class.

    Args:
        df: Frame containing at least the ``qi_cols`` columns.
        qi_cols: Quasi-identifier column names defining the classes.

    Returns:
        A tuple ``(uniq_pct, avg_k, prosecutor, n_rows)``:

        * ``uniq_pct`` -- percentage of rows that are alone in their class
          (``k == 1``).
        * ``avg_k`` -- mean of ``k`` taken over rows, so large classes are
          weighted by their size.
        * ``prosecutor`` -- mean of ``1 / k`` over rows, which equals the
          number of classes divided by the number of rows.
        * ``n_rows`` -- number of rows that entered the computation.

        When no row survives the missing-value filter, the three metrics
        are NaN and ``n_rows`` is 0.
    """
    qi_cols = list(qi_cols)
    complete = df[qi_cols].dropna()
    n_rows = len(complete)
    if n_rows == 0:
        # Nothing to group; report "undefined" explicitly instead of letting
        # NumPy emit mean-of-empty-slice warnings.
        return float("nan"), float("nan"), float("nan"), 0

    # Number each equivalence class, then look up every row's class size.
    # This stays row-aligned without joining the frame against a MultiIndex.
    class_ids = complete.groupby(qi_cols, sort=False).ngroup().to_numpy()
    k = np.bincount(class_ids)[class_ids]

    uniq_pct = float((k == 1).mean() * 100.0)
    avg_k = float(np.mean(k))
    prosecutor = float(np.mean(1.0 / k))

    return uniq_pct, avg_k, prosecutor, n_rows

def main():
    """Run the full comparison and print one row of metrics per level.

    Steps, in order: load every file in ``FILES``; verify each contains all
    ``QI_COLS`` (raising ``ValueError`` otherwise); coerce the ``NUM_QI``
    columns to numeric; derive bin edges from the "Original" dataset and
    apply them to every dataset; compute the metrics; print the summary
    table ordered Original, Mild, Moderate, Strong.
    """
    # Load everything up front so a missing file is reported before any
    # schema problem.
    data = {}
    for level, path in FILES.items():
        data[level] = read_table(path)

    # Every dataset must expose the complete quasi-identifier set.
    for level, frame in data.items():
        absent = [col for col in QI_COLS if col not in frame.columns]
        if absent:
            raise ValueError(f"{level} is missing columns: {absent}")

    # Numeric quasi-identifiers may have been read as text.
    data = {level: coerce_numeric(frame, NUM_QI) for level, frame in data.items()}

    # Edges come from the Original data only, so the same thresholds are
    # applied to every level.
    reference = data["Original"]
    edges_map = {}
    for col in NUM_QI:
        edges_map[col] = make_bin_edges_from_reference(reference, col, N_BINS)

    for level in list(data):
        frame = data[level]
        for col in NUM_QI:
            frame = apply_binning(frame, col, edges_map[col])
        data[level] = frame

    # One result row per level: name followed by the four metric values.
    rows = [[level, *compute_k_metrics(frame, QI_COLS)] for level, frame in data.items()]

    header = ["Level", "Uniq% (k=1)", "Avg k", "Prosecutor risk (mean 1/k)", "Rows used"]
    rank = {"Original": 0, "Mild": 1, "Moderate": 2, "Strong": 3}
    out = pd.DataFrame(rows, columns=header)
    out = out.sort_values("Level", key=lambda levels: levels.map(rank))
    print(out.to_string(index=False))


if __name__ == "__main__":
    main()


   Level  Uniq% (k=1)    Avg k  Prosecutor risk (mean 1/k)  Rows used
Original        43.93 127.4438                      0.4564      10000
    Mild        96.46   1.1230                      0.9748      10000
Moderate        17.91  17.7002                      0.3427      10000
  Strong        20.19   9.8338                      0.3766      10000
