In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression


In [6]:
DATA_DIR = "../data/raw/MachineLearningCVE"

import os


files = [f for f in os.listdir(DATA_DIR) if f.endswith(".csv")]
sample_file = files[1]
df = pd.read_csv(os.path.join(DATA_DIR, sample_file))




In [9]:
df.shape

(286467, 79)

In [15]:
print(df.columns.tolist())


[' Destination Port', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count', ' SYN Flag Count', ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Count', ' URG Flag 

In [16]:
# Strip whitespace from all column names
df.columns = df.columns.str.strip()

# Now this will work perfectly:
df['Label'].value_counts()

Label
PortScan    158930
BENIGN      127537
Name: count, dtype: int64

In [17]:
df['Label'] = df['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)


In [18]:
features = [c for c in df.columns if c != 'Label']


In [19]:
results = []

for feature in features:
    X = df[[feature]].replace([np.inf, -np.inf], np.nan).dropna()
    y = df.loc[X.index, 'Label']

    # Skip if feature becomes useless
    if X.nunique().values[0] < 2:
        continue

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    results.append((feature, acc, auc))


In [20]:
results_df = pd.DataFrame(results, columns=['feature', 'accuracy', 'auc'])
results_df = results_df.sort_values(by='auc', ascending=False)

results_df.head(15)


Unnamed: 0,feature,accuracy,auc
34,Bwd Packets/s,0.909263,0.942313
4,Total Length of Fwd Packets,0.928172,0.932503
53,Subflow Fwd Bytes,0.928172,0.932503
6,Fwd Packet Length Max,0.957576,0.932396
49,Avg Fwd Segment Size,0.950198,0.932372
8,Fwd Packet Length Mean,0.950198,0.932372
37,Packet Length Mean,0.878196,0.90207
48,Average Packet Size,0.878312,0.895686
1,Flow Duration,0.655263,0.89169
18,Flow IAT Max,0.652331,0.888755


# A single-feature predictive audit was conducted to identify potential shortcut or leakage features. No individual feature achieved near-perfect classification performance, indicating the absence of direct label leakage. High-performing features primarily captured traffic rate and packet-level behavioral characteristics, which are expected and realistic in intrusion detection settings