In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import joblib
import missingno as msno
sns.set(style='darkgrid')
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    precision_recall_curve,
    average_precision_score,
    classification_report
)

In [2]:
RANDOM_STATE = 42
# THRESHOLD_PERCENTILE = 85 

In [3]:
print("Libraries imported successfully.")

Libraries imported successfully.


In [4]:
df = pd.read_csv('All_dataset.csv', encoding="utf-8", encoding_errors="replace")

**Data Cleaning**

In [5]:
# removing leading/trailing whitespace in col names
col_names = {col: col.strip() for col in df.columns}
df.rename(columns = col_names, inplace = True)

In [6]:
# fix the encoding issues in the label
df['Label'] = df['Label'] \
    .str.replace('Ã¯Â¿Â½', '-', regex=False) \
    .str.replace('ï¿½', '-', regex=False) \
    .str.strip()

In [7]:
df['AttackBinary'] = df['Label'].apply(lambda x: 'Normal' if x == 'BENIGN' else 'Attack')

In [8]:
# handle duplicates
df.drop_duplicates(inplace = True)

In [9]:
# handle missing values
missing_values = df.isnull().sum()
missing_values

print(missing_values.loc[missing_values > 0])

Flow Bytes/s    353
dtype: int64


In [10]:
# check infinity values
numeric_cols = df.select_dtypes(include = np.number).columns
inf_count = np.isinf(df[numeric_cols]).sum()
print(inf_count[inf_count > 0])

Flow Bytes/s      1211
Flow Packets/s    1564
dtype: int64


In [11]:
inf_rows = df[np.isinf(df['Flow Bytes/s']) | np.isinf(df['Flow Packets/s'])]
print('Rows with infinity values in \'Flow Bytes/s\' or \'Flow Packets/s\':')
display(inf_rows[['Flow Bytes/s', 'Flow Packets/s', 'Label']].sample(5))

Rows with infinity values in 'Flow Bytes/s' or 'Flow Packets/s':


Unnamed: 0,Flow Bytes/s,Flow Packets/s,Label
2548570,inf,inf,PortScan
1897451,inf,inf,BENIGN
1858158,inf,inf,BENIGN
2604731,inf,inf,BENIGN
2445504,inf,inf,PortScan


In [12]:
# replace infinite values with NaN
print(f'Initial missing values: {df.isna().sum().sum()}')

df.replace([np.inf, -np.inf], np.nan, inplace = True)

print(f'Missing values after processing infinite values: {df.isna().sum().sum()}')

Initial missing values: 353
Missing values after processing infinite values: 3128


In [13]:
missing = df.isna().sum()
print(missing.loc[missing > 0])

Flow Bytes/s      1564
Flow Packets/s    1564
dtype: int64


In [14]:
fb_median = df['Flow Bytes/s'].median()
fp_median = df['Flow Packets/s'].median()

print('Median of Flow Bytes/s: ', fb_median)
print('Median of Flow Packets/s: ', fp_median)

Median of Flow Bytes/s:  3715.0378579999997
Median of Flow Packets/s:  69.742244285


In [15]:
# Fill missing values with median
df['Flow Bytes/s'].fillna(fb_median, inplace = True)
df['Flow Packets/s'].fillna(fp_median, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Flow Bytes/s'].fillna(fb_median, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Flow Packets/s'].fillna(fp_median, inplace = True)


In [16]:
print('Number of \'Flow Bytes/s\' missing values:', df['Flow Bytes/s'].isna().sum())
print('Number of \'Flow Packets/s\' missing values:', df['Flow Packets/s'].isna().sum())

Number of 'Flow Bytes/s' missing values: 0
Number of 'Flow Packets/s' missing values: 0


In [17]:
# Creating a dictionary that maps each label to its attack type
attack_map = {
    'BENIGN': 'BENIGN',
    'DDoS': 'DDoS',
    'DoS Hulk': 'DoS',
    'DoS GoldenEye': 'DoS',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'PortScan': 'Port Scan',
    'FTP-Patator': 'Brute Force',
    'SSH-Patator': 'Brute Force',
    'Bot': 'Bot',
    'Web Attack - Brute Force': 'Web Attack',
    'Web Attack - XSS': 'Web Attack',
    'Web Attack - Sql Injection': 'Web Attack',
    'Infiltration': 'Infiltration',
    'Heartbleed': 'Heartbleed'
}

In [18]:
df['Attack Type'] = df['Label'].map(attack_map)

In [19]:
df['Attack Type'].value_counts()

Attack Type
BENIGN          2096484
DoS              193748
DDoS             128016
Port Scan         90819
Brute Force        9152
Web Attack         2143
Bot                1953
Infiltration         36
Heartbleed           11
Name: count, dtype: int64

In [20]:
df.drop('Label', axis = 1, inplace = True)

In [21]:
df['AttackBinary'].unique()

array(['Normal', 'Attack'], dtype=object)

In [22]:
df['Attack Type'].unique()

array(['BENIGN', 'Brute Force', 'DoS', 'Heartbleed', 'Web Attack',
       'Infiltration', 'Bot', 'Port Scan', 'DDoS'], dtype=object)

In [23]:
# Dropping columns with only one unique value
num_unique = df.nunique()
one_variable = num_unique[num_unique == 1]
not_one_variable = num_unique[num_unique > 1].index

dropped_cols = one_variable.index
df = df[not_one_variable]

print('Dropped columns:')
dropped_cols

Dropped columns:


Index(['Bwd PSH Flags', 'Bwd URG Flags', 'Fwd Avg Bytes/Bulk',
       'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk',
       'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate'],
      dtype='object')

In [24]:
df['AttackBinary'] = df['AttackBinary'].map({'Normal': 0, 'Attack': 1})

In [25]:
SELECTED_FEATURES = [
    # Flow characteristics
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Down/Up Ratio',
    
    'Average Packet Size',
    'Packet Length Mean',
    'Packet Length Std',
    'Min Packet Length',
    'Max Packet Length',
    'Packet Length Variance',
    
    'Fwd Packets/s',
    'Bwd Packets/s',
    
    'SYN Flag Count',
    'FIN Flag Count',
    'RST Flag Count',
    'PSH Flag Count',
    'ACK Flag Count',
    'URG Flag Count',
    
    'Init_Win_bytes_forward',
    'Init_Win_bytes_backward',
    'Avg Fwd Segment Size',
    'Avg Bwd Segment Size',
    
    'Destination Port',
    'Fwd Header Length',
    'Bwd Header Length',
    'Subflow Fwd Packets',
    'Subflow Bwd Packets'
]


In [26]:
# Filter to only include features that exist in the dataframe
available_features = [f for f in SELECTED_FEATURES if f in df.columns]
print(f"\nðŸ“Š Using {len(available_features)} selected features out of {len(SELECTED_FEATURES)} requested")
print(f"Features used: {available_features}\n")


ðŸ“Š Using 27 selected features out of 27 requested
Features used: ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Down/Up Ratio', 'Average Packet Size', 'Packet Length Mean', 'Packet Length Std', 'Min Packet Length', 'Max Packet Length', 'Packet Length Variance', 'Fwd Packets/s', 'Bwd Packets/s', 'SYN Flag Count', 'FIN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Destination Port', 'Fwd Header Length', 'Bwd Header Length', 'Subflow Fwd Packets', 'Subflow Bwd Packets']



In [27]:
# ðŸ”€ Shuffle the entire dataset to remove ordering bias
df = df.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)

**Model Training**

In [28]:
y = df["AttackBinary"]

In [29]:
X = df[available_features]

In [30]:
X.sample(10)

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Down/Up Ratio,Average Packet Size,Packet Length Mean,Packet Length Std,Min Packet Length,Max Packet Length,Packet Length Variance,...,URG Flag Count,Init_Win_bytes_forward,Init_Win_bytes_backward,Avg Fwd Segment Size,Avg Bwd Segment Size,Destination Port,Fwd Header Length,Bwd Header Length,Subflow Fwd Packets,Subflow Bwd Packets
260234,208,2,1,0,24.666667,18.5,21.36196,0,37,456.333333,...,0,28944,0,18.5,0.0,58049,64,20,2,1
2321547,101,2,2,1,56.25,45.0,35.601966,6,71,1267.5,...,0,65535,0,71.0,6.0,389,64,40,2,2
1253902,362530,10,4,0,80.5,75.133333,146.482991,0,549,21457.266667,...,0,65535,61,79.2,83.75,443,332,136,10,4
183387,4502,2,0,0,9.0,6.0,0.0,6,6,0.0,...,0,253,-1,6.0,0.0,80,40,0,2,0
1127477,10,1,1,1,5.0,3.333333,2.309401,2,6,5.333333,...,0,1024,0,2.0,6.0,179,24,20,1,1
820809,4,2,0,0,37.0,24.666667,21.36196,0,37,456.333333,...,0,972,-1,18.5,0.0,64121,64,0,2,0
1811191,70,1,1,1,9.0,6.0,0.0,6,6,0.0,...,1,65535,256,6.0,6.0,49842,20,20,1,1
1333675,53,1,1,1,0.0,0.0,0.0,0,0,0.0,...,1,404,417,0.0,0.0,60608,32,32,1,1
156851,4,3,0,0,10.333333,7.75,15.5,0,31,240.25,...,0,71,-1,10.333333,0.0,50800,96,0,3,0
1024582,5183832,1,5,5,7.0,6.0,0.0,6,6,0.0,...,1,229,0,6.0,6.0,52742,20,100,1,5


In [31]:
print("Total features:", X.shape[1])

Total features: 27


**Train - Test Split**

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y, 
    random_state=RANDOM_STATE
)

In [33]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [34]:
CONTAMINATION = len(y_train[y_train == 1]) / len(y_train)

In [35]:
iso_forest = IsolationForest(
    n_estimators=300,
    contamination=CONTAMINATION,
    random_state=RANDOM_STATE,
    max_samples='auto',
    n_jobs=-1
)

In [36]:
iso_forest.fit(X_train_scaled)

0,1,2
,n_estimators,300
,max_samples,'auto'
,contamination,0.16884080343368738
,max_features,1.0
,bootstrap,False
,n_jobs,-1
,random_state,42
,verbose,0
,warm_start,False


In [37]:
train_scores = -iso_forest.decision_function(X_train_scaled)
test_scores = -iso_forest.decision_function(X_test_scaled)

In [38]:
THRESHOLD_PERCENTILE = 81.0
threshold = -0.0073
print(f'Threshold at {THRESHOLD_PERCENTILE} percentile: {threshold:.4f}')

Threshold at 81.0 percentile: -0.0073


In [39]:
# Prediction logic: If score > threshold, it's an Attack (1)
y_pred = (test_scores >= threshold).astype(int)

**Evaluation**

In [40]:
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, test_scores)
avg_precision = average_precision_score(y_test, test_scores)
precision, recall, _ = precision_recall_curve(y_test, test_scores)

In [41]:
print("\n==============================")
print("Isolation Forest Evaluation")
print("==============================")
print(f"F1 Score        : {f1:.4f}")
print(f"ROC-AUC         : {roc_auc:.4f}")
print(f"Avg Precision   : {avg_precision:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["BENIGN", "ATTACK"]))


Isolation Forest Evaluation
F1 Score        : 0.5281
ROC-AUC         : 0.7634
Avg Precision   : 0.3592

Classification Report:
              precision    recall  f1-score   support

      BENIGN       0.91      0.88      0.90    419297
      ATTACK       0.50      0.56      0.53     85176

    accuracy                           0.83    504473
   macro avg       0.70      0.72      0.71    504473
weighted avg       0.84      0.83      0.83    504473

