# 01 — Data Preprocessing (CIC-IDS2017)

This notebook cleans and prepares the **CIC-IDS2017** dataset and produces standardized splits for three approaches:

- **RF (supervised)** — binary BENIGN vs. attack.  
- **IF (unsupervised)** — trained only on BENIGN traffic, validated/tested on both classes.  
- **ET-SSL (self-supervised)** — encoder trained on both benign and malicious (binary labels used for validation and thresholding).  

### Processing steps
1. **Column cleaning**: normalize column names, remove duplicates, standardize `Label` values (dash variants unified).  
2. **Drop rows with missing values** (any NaN/±Inf across features).  
3. **Drop categorical features** (keep only numeric + `Label` + binary `label`).  
4. **Drop constant features** (manual zero-variance filter).  
5. Create a **binary label**: `0` = BENIGN, `1` = any attack.  
6. Keep only numeric features plus `Label` (string) and `label` (binary int).  
7. **Train/val/test splits (70/15/15)** with class-stratification:  
   - **RF/ET-SSL**: both benign and malicious in train/val/test.  
   - **IF**: train restricted to BENIGN only; val/test contain both.  
8. **Zero-day scenario**: remove `"Bot"`, `"Web Attack - Brute Force"`, and `"Infiltration"` from train/val; keep them in test.  
   - RF/ET-SSL: train/val exclude these, test includes full label space.  
   - IF: train BENIGN-only as usual; val/test include all.  

### Notes
- No normalization is applied here; MinMaxScaler is reconstructed later during training.  
- PCA reduction (>0.95 corr → keep 0.99 var) is only applied inside RF/IF training pipelines.  
- Splits are written to Parquet per approach/scenario.  

**Input**: `./cicids2017_full_clean.parquet`  
**Outputs**:  
- `data/splits/<approach>/<scenario>/{train,val,test}.parquet`  
- `data/meta/preprocess_summary.json`

In [1]:
# %% [markdown]
# ## Imports & configuration

import os, json, time, math, warnings
from pathlib import Path
from typing import Dict, Tuple, List, Optional
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

SEED = 42
np.random.seed(SEED)

ROOT = Path(".").resolve()
INPUT_PARQUET = ROOT / "cicids2017_full_clean.parquet"

SPLITS_DIR = ROOT / "data" / "splits"
META_DIR   = ROOT / "data" / "meta"
for p in [SPLITS_DIR, META_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Label conventions
BENIGN_NAME = "BENIGN"
POSSIBLE_LABEL_COLS = ["Label", "label", "labels", "attack", "Attack", "category"]

# Zero-day classes (must match cleaned names below)
ZERO_DAY_ATTACKS = {"Bot", "Web Attack - Brute Force", "Infiltration"}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# %% [markdown]
# ## Utility helpers

def find_label_column(df: pd.DataFrame) -> str:
    for col in POSSIBLE_LABEL_COLS:
        if col in df.columns:
            return col
    # heuristic: last object dtype column
    obj_cols = [c for c in df.columns if df[c].dtype == "object"]
    if not obj_cols:
        raise ValueError("No obvious label column found; please rename your label to 'Label'.")
    return obj_cols[-1]

def normalize_labels_column(df: pd.DataFrame, label_col: str) -> pd.DataFrame:
    """Standardize strange characters and unify names."""
    df = df.copy()
    # Replace weird '�' with '-'
    if df[label_col].dtype == "object":
        df[label_col] = (
            df[label_col].astype(str)
            .str.replace("�", "-", regex=False)
            .str.strip()
        )
    # Map a few known variants:
    replacements = {
        "Web Attack – Brute Force": "Web Attack - Brute Force",
        "Web Attack — Brute Force": "Web Attack - Brute Force",
        "Web Attack – XSS": "Web Attack - XSS",
        "Web Attack — XSS": "Web Attack - XSS",
        "Web Attack – Sql Injection": "Web Attack - Sql Injection",
        "Web Attack — Sql Injection": "Web Attack - Sql Injection",
        "DoS Slowhttptest": "DoS SlowHTTPTest",
    }
    df[label_col] = df[label_col].replace(replacements)
    return df

def to_binary_label(df: pd.DataFrame, label_col: str) -> pd.Series:
    """Return 0 for BENIGN, 1 for everything else."""
    return (df[label_col] != BENIGN_NAME).astype(np.int64)

def drop_categorical_and_keep_label(df: pd.DataFrame, label_col: str) -> pd.DataFrame:
    """Keep only numeric columns + the label column."""
    num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c].dtype)]
    keep = list(dict.fromkeys(num_cols + [label_col]))
    return df[keep].copy()

def remove_constant_features(df: pd.DataFrame, label_col: str) -> Tuple[pd.DataFrame, List[str]]:
    """Drop features with zero variance (constant)."""
    df = df.copy()
    removed = []
    for c in list(df.columns):
        if c == label_col: 
            continue
        s = df[c]
        # treat NaNs as separate — but we dropped rows with NaN already
        if s.nunique(dropna=False) <= 1:
            removed.append(c)
            df.drop(columns=[c], inplace=True)
    return df, removed

def train_val_test_split_binary(
    df: pd.DataFrame, label_col: str, train=0.70, val=0.15, test=0.15, seed=SEED
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Stratified split auf binäres Label, robust bei Miniklassen und garantiert 1D-Label."""
    assert abs(train + val + test - 1.0) < 1e-6

    df = df.loc[:, ~df.columns.duplicated(keep="first")].copy()

    y = df[label_col]
    if isinstance(y, pd.DataFrame):
        y = y.iloc[:, 0]
    y = y.to_numpy().ravel()

    rng = np.random.default_rng(seed)
    df = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    indices = np.arange(len(df))

    tr_idx, va_idx, te_idx = [], [], []
    classes = np.unique(y)

    for cls in classes:
        mask = (y == cls)
        cls_idx = indices[mask]
        rng.shuffle(cls_idx)
        n = len(cls_idx)

        if n < 3:
            tr_idx.extend(cls_idx)
            continue

        n_tr = max(1, int(round(train * n)))
        n_va = max(1, int(round(val   * n)))
        n_te = n - n_tr - n_va

        if n_te < 1:
            n_te = 1
            if n_va > 1:
                n_va -= 1
            else:
                n_tr = max(1, n_tr - 1)

        tr_idx.extend(cls_idx[:n_tr])
        va_idx.extend(cls_idx[n_tr:n_tr+n_va])
        te_idx.extend(cls_idx[n_tr+n_va:])

    tr = df.iloc[tr_idx].reset_index(drop=True)
    va = df.iloc[va_idx].reset_index(drop=True)
    te = df.iloc[te_idx].reset_index(drop=True)

    # final shuffles
    tr = tr.sample(frac=1.0, random_state=seed)
    va = va.sample(frac=1.0, random_state=seed+1)
    te = te.sample(frac=1.0, random_state=seed+2)
    return tr, va, te


def ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def write_parquet(df: pd.DataFrame, path: Path):
    ensure_dir(path.parent)
    df = df.copy()
    df.columns = pd.Index(df.columns).map(str)
    df = df.loc[:, ~df.columns.duplicated(keep="first")]
    if "label" in df.columns:
        df["label"] = pd.to_numeric(df["label"], errors="coerce").fillna(0).astype(np.int64)
    df.to_parquet(path, index=False)


def summary_counts(df: pd.DataFrame, label_col: str) -> Dict:
    counts = df[label_col].value_counts().to_dict()
    return {"rows": int(len(df)), "label_counts": {str(k): int(v) for k, v in counts.items()}}

In [3]:
# %% [markdown]
# ## Load dataset and run cleaning pipeline (with progress)

t_start = time.time()
if not INPUT_PARQUET.exists():
    raise FileNotFoundError(f"Input parquet not found: {INPUT_PARQUET}")

print(f"Reading {INPUT_PARQUET} ...")
df = pd.read_parquet(INPUT_PARQUET)
df.columns = pd.Index(df.columns).map(str).str.strip()
dup_mask = df.columns.duplicated(keep="first")
if dup_mask.any():
    print(f"[warn] duplicate column names removed: {df.columns[dup_mask].tolist()}")
    df = df.loc[:, ~dup_mask]
print(f"Loaded shape: {df.shape}")

label_col = find_label_column(df)
df = normalize_labels_column(df, label_col=label_col)

# Drop NA rows (log progress in chunks)
n_before = len(df)
na_rows = df.isna().any(axis=1)
n_nas = int(na_rows.sum())
if n_nas > 0:
    print(f"Dropping {n_nas} rows with any missing values ...")
    df = df.loc[~na_rows].reset_index(drop=True)

# Drop categoricals (except label)
df = drop_categorical_and_keep_label(df, label_col=label_col)

# Remove constant features
df, removed_const = remove_constant_features(df, label_col=label_col)
print(f"Removed {len(removed_const)} constant feature(s).")

# Create binary column 'label'
df = df.rename(columns={label_col: "Label"})
label_col = "Label"
df["label"] = to_binary_label(df, label_col="Label").astype(np.int64)

# Quick report of attack-name normalization
if df["Label"].dtype == "object":
    unique_attacks = sorted(set(df["Label"].unique().tolist()))
    # show a few
    print("Unique label examples:", unique_attacks[:10], "... (total:", len(unique_attacks), ")")

# Keep only numeric features + 'Label' + 'label'
keep_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c].dtype)] + ["Label", "label"]
df = df[keep_cols]

print(f"Final cleaned shape: {df.shape}")
print(f"Prep time: {time.time() - t_start:.1f}s")

Reading /home/user/project/cicids2017_full_clean.parquet ...
Loaded shape: (2830743, 79)
Dropping 2867 rows with any missing values ...
Removed 8 constant feature(s).
Final cleaned shape: (2827876, 73)
Prep time: 4.7s


In [None]:
# %% [markdown]
# ## Build splits for both scenarios (base & zero-day) and all approaches (rf, if, etssl)

def save_splits_for_rf_etssl_base(df: pd.DataFrame):
    tr, va, te = train_val_test_split_binary(df, label_col="label")
    base_dir_rf = SPLITS_DIR / "rf" / "base"
    base_dir_et = SPLITS_DIR / "etssl" / "base"
    for d in [base_dir_rf, base_dir_et]:
        ensure_dir(d)
    for split, data in zip(["train", "val", "test"], [tr, va, te]):
        write_parquet(data, base_dir_rf / f"{split}.parquet")
        write_parquet(data, base_dir_et / f"{split}.parquet")
    return {
        "rf/base": {k: summary_counts(v, "label") for k, v in zip(["train","val","test"], [tr,va,te])},
        "etssl/base": {k: summary_counts(v, "label") for k, v in zip(["train","val","test"], [tr,va,te])},
    }

def save_splits_for_if_base(df: pd.DataFrame):
    tr_full, va_full, te_full = train_val_test_split_binary(df, label_col="label")
    tr = tr_full[tr_full["label"] == 0].copy()
    base_dir_if = SPLITS_DIR / "if" / "base"
    ensure_dir(base_dir_if)
    for split, data in zip(["train", "val", "test"], [tr, va_full, te_full]):
        write_parquet(data, base_dir_if / f"{split}.parquet")
    return {"if/base": {k: summary_counts(v, "label") for k, v in zip(["train","val","test"], [tr,va_full,te_full])}}

def save_splits_zeroday(df: pd.DataFrame):
    df = df.loc[:, ~df.columns.duplicated(keep="first")].reset_index(drop=True).copy()

    Label_series = df["Label"]
    if isinstance(Label_series, pd.DataFrame):
        Label_series = Label_series.iloc[:, 0]
    label_text = Label_series.astype(str).to_numpy()
    y_bin = df["label"].to_numpy().reshape(-1)

    is_zero_day = np.isin(label_text, list(ZERO_DAY_ATTACKS))

    keep_mask = ~(np.logical_and(is_zero_day, y_bin == 1))
    df_no_zd_mal = df.iloc[keep_mask].copy()

    # RF/ETSSL
    tr, va, _ = train_val_test_split_binary(df_no_zd_mal, label_col="label")
    
    _, _, test_full = train_val_test_split_binary(df, label_col="label")

    # RF/ETSSL
    for algo in ["rf", "etssl"]:
        base = SPLITS_DIR / algo / "zeroday"
        ensure_dir(base)
        for split, data in zip(["train", "val", "test"], [tr, va, test_full]):
            write_parquet(data, base / f"{split}.parquet")

    # IF
    tr_b, va_b, te_b = train_val_test_split_binary(df, label_col="label")
    tr_b = tr_b[tr_b["label"] == 0].copy()
    zif = SPLITS_DIR / "if" / "zeroday"
    ensure_dir(zif)
    for split, data in zip(["train", "val", "test"], [tr_b, va_b, te_b]):
        write_parquet(data, zif / f"{split}.parquet")

    return {
        "rf/zeroday": {**{k: summary_counts(v, "label") for k, v in zip(["train","val"], [tr,va])},
                       "test": summary_counts(test_full, "label")},
        "etssl/zeroday": {**{k: summary_counts(v, "label") for k, v in zip(["train","val"], [tr,va])},
                          "test": summary_counts(test_full, "label")},
        "if/zeroday": {k: summary_counts(v, "label") for k, v in zip(["train","val","test"], [tr_b,va_b,te_b])},
    }


# Run and record metadata
meta = {}
meta.update(save_splits_for_rf_etssl_base(df))
meta.update(save_splits_for_if_base(df))
meta.update(save_splits_zeroday(df))

with open(META_DIR / "preprocess_summary.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Done. Split summaries:\n", json.dumps(meta, indent=2)[:2000], "...")


Done. Split summaries:
 {
  "rf/base": {
    "train": {
      "rows": 1979513,
      "label_counts": {
        "0": 1589654,
        "1": 389859
      }
    },
    "val": {
      "rows": 424181,
      "label_counts": {
        "0": 340942,
        "1": 83239
      }
    },
    "test": {
      "rows": 424182,
      "label_counts": {
        "0": 340724,
        "1": 83458
      }
    }
  },
  "etssl/base": {
    "train": {
      "rows": 1979513,
      "label_counts": {
        "0": 1589654,
        "1": 389859
      }
    },
    "val": {
      "rows": 424181,
      "label_counts": {
        "0": 340942,
        "1": 83239
      }
    },
    "test": {
      "rows": 424182,
      "label_counts": {
        "0": 340724,
        "1": 83458
      }
    }
  },
  "if/base": {
    "train": {
      "rows": 1589654,
      "label_counts": {
        "0": 1589654
      }
    },
    "val": {
      "rows": 424181,
      "label_counts": {
        "0": 340942,
        "1": 83239
      }
    },
    "test"