# ==============================================================================
# 02_preprocessing.ipynb — NSL-KDD (Clean → Locked Schema CSVs)
# ==============================================================================
# GOAL
# - Single source of truth for *data cleaning* and *label mapping*.
# - Outputs: `data/processed/train_cleaned.csv` and `data/processed/test_cleaned.csv`
# - Notebook 03 will **only** train models (no cleaning duplication).
#
# NOTES
# - We rename `difficulty` → `level` (and keep it) so Notebook 03 can safely drop it.
# - We do NOT fit/serialize sklearn preprocessors here (Notebook 03 owns that).
# ==============================================================================


In [None]:
from __future__ import annotations

import os
from pathlib import Path

import numpy as np
import pandas as pd

# NOTE(review): presumably a reproducibility seed; none of the visible cells
# in this notebook reads it — confirm whether it is still needed here.
RANDOM_STATE = 42


In [None]:
# ----------------------------------------------------------------------
# Paths
# ----------------------------------------------------------------------
def get_project_root() -> Path:
    """Return the project root derived from the current working directory.

    If the resolved working directory is named ``notebooks``, its parent
    directory is returned; otherwise the working directory itself is returned.
    """
    here = Path.cwd().resolve()
    if here.name == "notebooks":
        return here.parent
    return here

# Every location used below is derived from the project root.
PROJECT_ROOT = get_project_root()
RAW_DIR = PROJECT_ROOT.joinpath("data", "raw")
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")

# The output directory is created up front (no error if it already exists).
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

train_path = RAW_DIR.joinpath("KDDTrain+.txt")
test_path = RAW_DIR.joinpath("KDDTest+.txt")

print(f"[✓] Project Root: {PROJECT_ROOT}")
print(f"[✓] RAW_DIR:      {RAW_DIR}")
print(f"[✓] OUT_DIR:      {PROCESSED_DIR}")

# Stop immediately unless both raw files are present.
if not (train_path.exists() and test_path.exists()):
    raise FileNotFoundError("Missing NSL-KDD raw files in data/raw (KDDTrain+.txt, KDDTest+.txt).")


In [None]:
# ----------------------------------------------------------------------
# NSL-KDD column names (official)
# ----------------------------------------------------------------------
# Positional names for the 43 fields of each raw record, in file order.
# The last two entries are the raw label and the difficulty score.
COLUMNS = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "label",
    "difficulty",
]


In [None]:
# ----------------------------------------------------------------------
# Load raw data
# ----------------------------------------------------------------------
print("[1] Loading raw datasets...")

# Column names are supplied via `names`, so every line of the files is read as
# data. `header=None` states explicitly what pandas already infers in that case.
train_df = pd.read_csv(train_path, names=COLUMNS, header=None)
test_df = pd.read_csv(test_path, names=COLUMNS, header=None)

print(f"[✓] Loaded train={train_df.shape}, test={test_df.shape}")

# Some versions of KDDTest+ come with 42 cols; in that case the trailing
# 'difficulty' column is read as all-NaN. The cleaning step that follows fills
# numeric NaNs with 0, so the message reports that instead of claiming the
# NaNs are kept.
if 'difficulty' in test_df.columns and test_df['difficulty'].isna().all():
    print("[i] Test difficulty column appears missing in file (all NaN). It will be filled with 0 during cleaning.")


In [None]:
# ----------------------------------------------------------------------
# Basic cleaning (NO leakage)
# ----------------------------------------------------------------------
print("\n[2] Basic cleaning...")


def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a raw frame; the input is left untouched.

    Steps, in order:
      1. ``label`` is cast to str, lower-cased and stripped.
      2. ``su_attempted`` (when present) is coerced to numeric, NaN becomes 0
         and every value above 0 is collapsed to 1.
      3. Every column other than protocol_type/service/flag/label is coerced
         to numeric; unparseable values become NaN.
      4. NaNs in those numeric columns are replaced by 0.
      5. ``difficulty`` (when present) is renamed to ``level``.
    """
    text_cols = {'protocol_type', 'service', 'flag', 'label'}
    cleaned = df.copy()

    # 1. Normalise the label text.
    cleaned['label'] = cleaned['label'].astype(str).str.lower().str.strip()

    # 2. Known NSL-KDD quirk: su_attempted sometimes holds 2; treat it as binary.
    if 'su_attempted' in cleaned.columns:
        su = pd.to_numeric(cleaned['su_attempted'], errors='coerce').fillna(0)
        su[su > 0] = 1
        cleaned['su_attempted'] = su

    # 3. Everything that is not a categorical/text column must be numeric.
    numeric_cols = cleaned.columns.difference(sorted(text_cols))
    for name in numeric_cols:
        cleaned[name] = pd.to_numeric(cleaned[name], errors='coerce')

    # 4. Remaining numeric NaNs default to 0.
    cleaned[numeric_cols] = cleaned[numeric_cols].fillna(0)

    # 5. rename() ignores a missing key, so no presence check is needed.
    return cleaned.rename(columns={'difficulty': 'level'})

# Clean each split on its own; nothing is shared between train and test.
train_df = basic_clean(train_df)
test_df = basic_clean(test_df)

# Remove exact duplicate rows per split, remembering the sizes beforehand
# so the number of dropped rows can be reported.
before_tr, before_te = len(train_df), len(test_df)
train_df, test_df = train_df.drop_duplicates(), test_df.drop_duplicates()
print(f"[✓] Dropped duplicates: train {before_tr-len(train_df)}, test {before_te-len(test_df)}")

# Report how many missing values are left in each split.
print(f"[✓] Total NaNs: train={int(train_df.isna().sum().sum())}, test={int(test_df.isna().sum().sum())}")


In [None]:
# ----------------------------------------------------------------------
# Label mapping (4-class + binary)
# ----------------------------------------------------------------------
print("\n[3] Mapping labels...")

# Lookup from the normalised (lower-cased, stripped) raw label to its class.
attack_mapping = {
    'normal': 'Normal',
    # DoS
    'back': 'DoS', 'land': 'DoS', 'neptune': 'DoS', 'pod': 'DoS', 'smurf': 'DoS', 'teardrop': 'DoS',
    'mailbomb': 'DoS', 'apache2': 'DoS', 'processtable': 'DoS', 'udpstorm': 'DoS', 'worm': 'DoS',
    # Probe
    'satan': 'Probe', 'ipsweep': 'Probe', 'nmap': 'Probe', 'portsweep': 'Probe', 'mscan': 'Probe', 'saint': 'Probe',
    # R2L
    'guess_passwd': 'R2L', 'ftp_write': 'R2L', 'imap': 'R2L', 'phf': 'R2L', 'multihop': 'R2L',
    'warezmaster': 'R2L', 'warezclient': 'R2L', 'spy': 'R2L', 'xlock': 'R2L', 'xsnoop': 'R2L',
    'snmpguess': 'R2L', 'snmpgetattack': 'R2L', 'httptunnel': 'R2L', 'sendmail': 'R2L', 'named': 'R2L',
    # U2R
    'buffer_overflow': 'U2R', 'loadmodule': 'U2R', 'perl': 'U2R', 'rootkit': 'U2R', 'ps': 'U2R',
    'sqlattack': 'U2R', 'xterm': 'U2R',
}


def map_attack_class(label_series: pd.Series) -> pd.Series:
    """Map raw labels to their class ('Normal', 'DoS', 'Probe', 'R2L', 'U2R').

    Labels are cast to str, lower-cased and stripped before being looked up
    in ``attack_mapping``.

    Labels that are not in ``attack_mapping`` still fall back to 'Normal', so
    the returned values are the same as before. Because that fallback turns an
    unknown attack name into the benign class, a ``UserWarning`` listing the
    unmapped labels is now emitted instead of hiding them.

    Args:
        label_series: Series of raw label values.

    Returns:
        Series of class names, aligned with ``label_series``.
    """
    import warnings

    normalized = label_series.astype(str).str.lower().str.strip()
    mapped = normalized.map(attack_mapping)

    # Rows whose label had no entry in the lookup come back as NaN from map().
    unmapped = mapped.isna()
    if unmapped.any():
        unknown = sorted(normalized[unmapped].unique())
        warnings.warn(
            f"{int(unmapped.sum())} row(s) have labels missing from attack_mapping "
            f"and default to 'Normal': {unknown}",
            stacklevel=2,
        )

    return mapped.fillna('Normal')

# Add the class column and a 0/1 target (1 = anything other than 'Normal')
# to both splits in place.
for df in (train_df, test_df):
    df['attack_class'] = map_attack_class(df['label'])
    df['binary_target'] = df['attack_class'].ne('Normal').astype(int)

# Each header is followed on the next line by its value counts.
print("[✓] Distributions (train):", train_df['attack_class'].value_counts(), sep="\n")
print("\n[✓] Binary distribution (train):", train_df['binary_target'].value_counts(), sep="\n")
print("\n[✓] Binary distribution (test):", test_df['binary_target'].value_counts(), sep="\n")


In [None]:
# ----------------------------------------------------------------------
# Schema lock (feature columns only) + save cleaned CSVs
# ----------------------------------------------------------------------
print("\n[4] Locking schema and saving cleaned CSVs...")

# Snapshot of the training columns at this point; nothing further down in
# this cell reads it.
KEEP_COLS = list(train_df.columns)

# Shared schema: the union of both column sets in alphabetical order.
# A column missing from one split is created there and filled with 0.
all_cols = sorted(set(train_df.columns) | set(test_df.columns))
train_out = train_df.reindex(columns=all_cols, fill_value=0)
test_out = test_df.reindex(columns=all_cols, fill_value=0)

out_train_path = PROCESSED_DIR.joinpath("train_cleaned.csv")
out_test_path = PROCESSED_DIR.joinpath("test_cleaned.csv")

# Written without the index, so the CSVs contain only the schema columns.
train_out.to_csv(out_train_path, index=False)
test_out.to_csv(out_test_path, index=False)

print(f"[✓] Saved: {out_train_path}  shape={train_out.shape}")
print(f"[✓] Saved: {out_test_path}   shape={test_out.shape}")

# Minimal integrity checks.
# NOTE(review): both frames were just reindexed to the same `all_cols`, and
# the files are already written, so these checks run on the aligned output
# rather than on the pre-alignment frames — confirm that is the intent.
assert set(test_out.columns) == set(train_out.columns)
assert all('label' in frame.columns for frame in (train_out, test_out))
print("[✓] Integrity checks passed.")
