In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import joblib
import missingno as msno
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    average_precision_score,
    classification_report
)

In [2]:
RANDOM_STATE = 42

In [3]:
df = pd.read_csv("All_dataset.csv", encoding="utf-8", encoding_errors="replace")

In [4]:
df.rename(columns={col: col.strip() for col in df.columns}, inplace=True)

df["Label"] = (
    df["Label"]
    .str.replace("√Ø¬ø¬Ω", "-", regex=False)
    .str.replace("ÔøΩ", "-", regex=False)
    .str.strip()
)

In [5]:
df["AttackBinary"] = df["Label"].apply(lambda x: "Normal" if x == "BENIGN" else "Attack")
df.drop_duplicates(inplace=True)

# Replace infinities
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill Flow Bytes/s and Flow Packets/s
df["Flow Bytes/s"].fillna(df["Flow Bytes/s"].median(), inplace=True)
df["Flow Packets/s"].fillna(df["Flow Packets/s"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Flow Bytes/s"].fillna(df["Flow Bytes/s"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Flow Packets/s"].fillna(df["Flow Packets/s"].median(), inplace=True)


In [6]:
df["AttackBinary"] = df["AttackBinary"].map({"Normal": 0, "Attack": 1})

In [7]:
SELECTED_FEATURES = [
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Down/Up Ratio',
    'Average Packet Size',
    'Packet Length Mean',
    'Packet Length Std',
    'Min Packet Length',
    'Max Packet Length',
    'Packet Length Variance',
    'Fwd Packets/s',
    'Bwd Packets/s',
    'SYN Flag Count',
    'FIN Flag Count',
    'RST Flag Count',
    'PSH Flag Count',
    'ACK Flag Count',
    'URG Flag Count',
    'Init_Win_bytes_forward',
    'Init_Win_bytes_backward',
    'Avg Fwd Segment Size',
    'Avg Bwd Segment Size',
    'Destination Port',
    'Fwd Header Length',
    'Bwd Header Length',
    'Subflow Fwd Packets',
    'Subflow Bwd Packets'
]

In [8]:
available_features = [f for f in SELECTED_FEATURES if f in df.columns]
X = df[available_features]
y = df["AttackBinary"]

print(f"üìä Using {len(available_features)} features")

üìä Using 27 features


**Cross Validation**

In [9]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [10]:
f1_scores = []
roc_scores = []
ap_scores = []

In [11]:
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
    print(f"\nüîÅ Fold {fold}")

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    CONTAMINATION = len(y_train[y_train == 1]) / len(y_train)

    model = IsolationForest(
        n_estimators=300,
        contamination=CONTAMINATION,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    model.fit(X_train_scaled)

    scores = -model.decision_function(X_test_scaled)
    threshold = np.percentile(scores, 81)

    y_pred = (scores >= threshold).astype(int)

    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, scores)
    ap = average_precision_score(y_test, scores)

    f1_scores.append(f1)
    roc_scores.append(roc)
    ap_scores.append(ap)

    print(f"F1: {f1:.4f} | ROC-AUC: {roc:.4f} | Avg Precision: {ap:.4f}")


üîÅ Fold 1
F1: 0.5124 | ROC-AUC: 0.7605 | Avg Precision: 0.3526

üîÅ Fold 2
F1: 0.5436 | ROC-AUC: 0.7684 | Avg Precision: 0.3671

üîÅ Fold 3
F1: 0.5407 | ROC-AUC: 0.7669 | Avg Precision: 0.3635

üîÅ Fold 4
F1: 0.5069 | ROC-AUC: 0.7652 | Avg Precision: 0.3588

üîÅ Fold 5
F1: 0.4954 | ROC-AUC: 0.7668 | Avg Precision: 0.3555


In [12]:
print("\n==============================")
print("Cross-Validation Results (5-Fold)")
print("==============================")
print(f"Mean F1 Score        : {np.mean(f1_scores):.4f}")
print(f"Mean ROC-AUC         : {np.mean(roc_scores):.4f}")
print(f"Mean Avg Precision   : {np.mean(ap_scores):.4f}")


Cross-Validation Results (5-Fold)
Mean F1 Score        : 0.5198
Mean ROC-AUC         : 0.7655
Mean Avg Precision   : 0.3595


In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

CONTAMINATION = len(y[y == 1]) / len(y)

final_model = IsolationForest(
    n_estimators=300,
    contamination=CONTAMINATION,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

final_model.fit(X_scaled)

scores_all = -final_model.decision_function(X_scaled)
threshold = np.percentile(scores_all, 81)

In [None]:
from sklearn.decomposition import PCA

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=0.95, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X_scaled)

# Inspect PCA loadings
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f"PC{i+1}" for i in range(pca.n_components_)],
    index=available_features
)

# top contributors for each component
for i in range(pca.n_components_):
    pc = f"PC{i+1}"
    top_features = loadings[pc].abs().sort_values(ascending=False).head(5)
    print(f"\nüîπ Top features for {pc}:")
    print(top_features)


print(f"\nFinal PCA components: {pca.n_components_}")


üîπ Top features for PC1:
Packet Length Std       0.393078
Max Packet Length       0.389176
Packet Length Mean      0.386013
Average Packet Size     0.384072
Avg Bwd Segment Size    0.373326
Name: PC1, dtype: float64

üîπ Top features for PC2:
Total Backward Packets    0.499474
Subflow Bwd Packets       0.499474
Subflow Fwd Packets       0.499470
Total Fwd Packets         0.499470
Packet Length Variance    0.021160
Name: PC2, dtype: float64

üîπ Top features for PC3:
ACK Flag Count       0.533599
Destination Port     0.493463
URG Flag Count       0.377220
Min Packet Length    0.277936
Fwd Packets/s        0.253624
Name: PC3, dtype: float64

üîπ Top features for PC4:
Init_Win_bytes_forward     0.538841
PSH Flag Count             0.517104
Min Packet Length          0.445566
Down/Up Ratio              0.327963
Init_Win_bytes_backward    0.227812
Name: PC4, dtype: float64

üîπ Top features for PC5:
Down/Up Ratio              0.485528
URG Flag Count             0.455990
SYN Flag Count

In [18]:
# üîç Compare PCA features with SELECTED_FEATURES
selected_set = set(SELECTED_FEATURES)
used_set = set(available_features)

print("\n================ Feature Validation ================\n")

# 1Ô∏è‚É£ Features used in PCA but not in SELECTED_FEATURES
not_in_selected = used_set - selected_set
print("‚ùå In PCA/X but NOT in SELECTED_FEATURES:")
print(not_in_selected if not_in_selected else "‚úî None")

# 2Ô∏è‚É£ Features in SELECTED_FEATURES but not used in PCA/X
not_used = selected_set - used_set
print("\n‚ö† In SELECTED_FEATURES but NOT in X / DataFrame:")
print(not_used if not_used else "‚úî None")

# 3Ô∏è‚É£ Final sanity check
print("\n‚úî Final feature list passed into PCA:")
print(available_features)




‚ùå In PCA/X but NOT in SELECTED_FEATURES:
‚úî None

‚ö† In SELECTED_FEATURES but NOT in X / DataFrame:
‚úî None

‚úî Final feature list passed into PCA:
['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Down/Up Ratio', 'Average Packet Size', 'Packet Length Mean', 'Packet Length Std', 'Min Packet Length', 'Max Packet Length', 'Packet Length Variance', 'Fwd Packets/s', 'Bwd Packets/s', 'SYN Flag Count', 'FIN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Destination Port', 'Fwd Header Length', 'Bwd Header Length', 'Subflow Fwd Packets', 'Subflow Bwd Packets']


In [19]:
for i in range(pca.n_components_):
    pc = f"PC{i+1}"
    top_features = loadings[pc].abs().sort_values(ascending=False).head(5)
    print(f"\nüîπ Top features for {pc} (validated):")
    for feat in top_features.index:
        status = "‚úî in SELECTED_FEATURES" if feat in SELECTED_FEATURES else "‚ùå NOT in SELECTED_FEATURES"
        print(f"  {feat}: {status}")



üîπ Top features for PC1 (validated):
  Packet Length Std: ‚úî in SELECTED_FEATURES
  Max Packet Length: ‚úî in SELECTED_FEATURES
  Packet Length Mean: ‚úî in SELECTED_FEATURES
  Average Packet Size: ‚úî in SELECTED_FEATURES
  Avg Bwd Segment Size: ‚úî in SELECTED_FEATURES

üîπ Top features for PC2 (validated):
  Total Backward Packets: ‚úî in SELECTED_FEATURES
  Subflow Bwd Packets: ‚úî in SELECTED_FEATURES
  Subflow Fwd Packets: ‚úî in SELECTED_FEATURES
  Total Fwd Packets: ‚úî in SELECTED_FEATURES
  Packet Length Variance: ‚úî in SELECTED_FEATURES

üîπ Top features for PC3 (validated):
  ACK Flag Count: ‚úî in SELECTED_FEATURES
  Destination Port: ‚úî in SELECTED_FEATURES
  URG Flag Count: ‚úî in SELECTED_FEATURES
  Min Packet Length: ‚úî in SELECTED_FEATURES
  Fwd Packets/s: ‚úî in SELECTED_FEATURES

üîπ Top features for PC4 (validated):
  Init_Win_bytes_forward: ‚úî in SELECTED_FEATURES
  PSH Flag Count: ‚úî in SELECTED_FEATURES
  Min Packet Length: ‚úî in SELECTED_FEATURES
 