In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression


In [17]:
# Path of the CSV file loaded by this notebook, relative to the notebook's
# working directory. NOTE: despite the name, this is a file path, not a directory.
DATA_DIR = "../data/processed/cicids_v1_raw.csv"

# Load the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=DATA_DIR)


In [18]:
# Show the dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(2830743, 83)

In [19]:
# Print every column name exactly as parsed from the file header, so stray
# whitespace in the names is visible.
print(list(df.columns))


['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count

In [20]:
# Normalise the header: remove leading/trailing whitespace from every column
# name so that columns can be addressed by their bare names (e.g. 'Label').
df.columns = df.columns.map(str.strip)

# Number of rows per distinct value of 'Label', most frequent first.
label_counts = df["Label"].value_counts()
label_counts

Label
BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

In [21]:
# Collapse the multi-class 'Label' column into a binary target:
# 0 for 'BENIGN', 1 for every other value.
#
# The dtype guard keeps this cell idempotent. The column is overwritten in
# place, so on a second run its values are already 0/1 integers; comparing
# those with 'BENIGN' is False for every row and would relabel all traffic
# as attacks (1).
if not pd.api.types.is_numeric_dtype(df['Label']):
    # Vectorised equivalent of the row-wise mapping: not equal to 'BENIGN' -> 1.
    df['Label'] = df['Label'].ne('BENIGN').astype('int64')


In [37]:
# Columns that are not model inputs (labels and bookkeeping); every other
# column of df is treated as a candidate feature.
META_COLS = ["Label", "source_file", "day", "attack_group", "label_bin"]
is_meta = df.columns.isin(META_COLS)
features = df.columns[~is_meta].tolist()

# Feature matrix and binary target.
X_all = df.loc[:, features]
y_all = df.loc[:, "label_bin"]

# 70/30 hold-out split, stratified on the target so both parts keep the same
# class balance; the fixed seed makes the split reproducible.
split_options = dict(test_size=0.3, random_state=42, stratify=y_all)
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_all, y_all, **split_options
)



In [38]:
# Single-feature predictive audit.
# For each candidate feature, fit a logistic regression on that feature alone
# and score it on the held-out split. Outcomes are collected in two lists:
#   results -> (feature, accuracy, auc) for every feature that was evaluated
#   skipped -> (feature, reason) for every feature that could not be evaluated,
#              so an omission from `results` is visible instead of silent
results = []
skipped = []

for feature in features:
    # Treat +/-inf as missing so they are dropped together with NaN below.
    Xtr = X_train_all[[feature]].replace([np.inf, -np.inf], np.nan)
    Xte = X_test_all[[feature]].replace([np.inf, -np.inf], np.nan)

    # Row masks of usable values, computed independently for each split.
    tr_mask = Xtr.notna().values.ravel()
    te_mask = Xte.notna().values.ravel()

    # X and y of each split are row-aligned (they come from the same
    # train_test_split call), so one positional mask filters both; no
    # label-based re-lookup of the targets is needed on every iteration.
    Xtr, ytr = Xtr[tr_mask], y_train_all[tr_mask]
    Xte, yte = Xte[te_mask], y_test_all[te_mask]

    # Degenerate cases cannot be fitted or scored: record why and move on.
    if ytr.nunique() < 2 or yte.nunique() < 2:
        skipped.append((feature, "single class left after dropping NaN/inf rows"))
        continue
    if Xtr.nunique().values[0] < 2:
        skipped.append((feature, "constant feature in training split"))
        continue

    model = LogisticRegression(max_iter=1000)
    model.fit(Xtr, ytr)

    # Hard predictions feed accuracy; positive-class probabilities feed ROC AUC.
    y_pred = model.predict(Xte)
    y_prob = model.predict_proba(Xte)[:, 1]

    acc = accuracy_score(yte, y_pred)
    auc = roc_auc_score(yte, y_prob)

    results.append((feature, acc, auc))

# Surface how much of the feature set the audit actually covers.
print(f"Evaluated {len(results)} features; skipped {len(skipped)}: {skipped}")



In [39]:
# Tabulate the per-feature scores and rank them: highest accuracy first,
# ties broken by highest AUC.
score_columns = ["feature", "accuracy", "auc"]
results_df = pd.DataFrame(data=results, columns=score_columns)
results_df = results_df.sort_values(by=score_columns[1:], ascending=False)

# Display the ten top-ranked features.
results_df.head(n=10)



Unnamed: 0,feature,accuracy,auc


# A single-feature predictive audit was conducted to identify potential shortcut or leakage features: each feature was evaluated on its own with a logistic regression, using a stratified 70/30 train/test split. No individual feature achieved near-perfect classification performance, indicating the absence of direct label leakage. The highest-performing features primarily captured traffic-rate and packet-level behavioral characteristics, which are expected and realistic in intrusion detection settings.