<a href="https://colab.research.google.com/github/SVL98/IoMT-Based-Intrusion-Detection-Using-Nature-Inspired-Algorithms-and-ML-Techniques/blob/main/NIA%2BRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Firely + RF on WUSTL dataset

In [None]:
import os
import pandas as pd
import numpy as np
import random
import math
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

def print_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    support = len(y_true)
    print("\nüß© Confusion Matrix:")
    print(cm)
    print("\nüìä Metrics Summary:")
    print(f"{'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<10}")
    print(f"{accuracy:<10.4f} {precision:<10.4f} {recall:<10.4f} {f1:<10.4f} {support:<10}")

# Dataset load and preprocessing (same as your code)
file_path = "/content/wustl-ehms-2020_with_attacks_categories.csv"
df = pd.read_csv(file_path)
df.drop(columns=['Dir', 'Flgs', 'SrcAddr', 'DstAddr', 'SrcMac', 'DstMac', 'Attack Category'], inplace=True, errors='ignore')
df.dropna(subset=['Label'], inplace=True)
benign = df[df['Label'] == 0].sample(n=min(14000, df[df['Label'] == 0].shape[0]), random_state=42)
attack = df[df['Label'] == 1].sample(n=min(1400, df[df['Label'] == 1].shape[0]), random_state=42)
df_sampled = pd.concat([benign, attack]).sample(frac=1.0, random_state=42)
X_raw = df_sampled.drop(columns=["Label"])
y = df_sampled["Label"].reset_index(drop=True)
feature_names = X_raw.columns.tolist()
for column in X_raw.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_raw[column] = le.fit_transform(X_raw[column].astype(str))
imp = SimpleImputer(strategy='median')
X_imp = imp.fit_transform(X_raw)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imp)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)
num_features = X_train.shape[1]

# Fitness
def fitness(mask):
    mask = np.round(mask).astype(int)
    if np.count_nonzero(mask) == 0:
        return 0
    clf = RandomForestClassifier(n_estimators=50, random_state=42)
    clf.fit(X_train[:, mask == 1], y_train)
    y_pred = clf.predict(X_test[:, mask == 1])
    return accuracy_score(y_test, y_pred)

# Firefly Algorithm
def firefly_algorithm(n=20, max_gen=30, alpha=0.2, beta0=1, gamma=1):
    pop = np.random.randint(0, 2, size=(n, num_features))
    scores = [fitness(ind) for ind in pop]

    for gen in range(max_gen):
        for i in range(n):
            for j in range(n):
                if scores[j] > scores[i]:
                    r = np.sum((pop[i] - pop[j]) ** 2)
                    beta = beta0 * math.exp(-gamma * r)
                    move = alpha * (np.random.rand(num_features) - 0.5)
                    pop[i] = np.clip(pop[i] + beta * (pop[j] - pop[i]) + move, 0, 1)
                    pop[i] = np.round(pop[i])
        scores = [fitness(ind) for ind in pop]
    best_idx = np.argmax(scores)
    return pop[best_idx], scores[best_idx]

print("\n‚ú® Running Firefly Algorithm for Feature Selection...")
best_mask, _ = firefly_algorithm()
selected_features = np.where(best_mask == 1)[0]
selected_feature_names = [feature_names[i] for i in selected_features]
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train[:, selected_features], y_train)
y_pred = model.predict(X_test[:, selected_features])

print(f"\n‚úÖ Selected {len(selected_features)} features using FA:")
print(selected_feature_names)
print(f"\nüéØ Accuracy (FA + RF): {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nüìÑ Classification Report:\n", classification_report(y_test, y_pred))
print_metrics(y_test, y_pred)



‚ú® Running Firefly Algorithm for Feature Selection...

‚úÖ Selected 16 features using FA:
['DstBytes', 'DstLoad', 'SIntPkt', 'DIntPkt', 'SIntPktAct', 'DIntPktAct', 'sMaxPktSz', 'dMinPktSz', 'TotBytes', 'Rate', 'Packet_num', 'Temp', 'Pulse_Rate', 'SYS', 'Resp_Rate', 'ST']

üéØ Accuracy (FA + RF): 95.62%

üìÑ Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98      2800
           1       0.98      0.53      0.69       280

    accuracy                           0.96      3080
   macro avg       0.97      0.76      0.83      3080
weighted avg       0.96      0.96      0.95      3080


üß© Confusion Matrix:
[[2797    3]
 [ 132  148]]

üìä Metrics Summary:
Accuracy   Precision  Recall     F1-Score   Support   
0.9562     0.9675     0.7637     0.8316     3080      


ACO+RF

In [None]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

# üßÆ Metrics printer
def print_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    support = len(y_true)

    print("\nüß© Confusion Matrix:")
    print(cm)
    print("\nüìä Metrics Summary:")
    print(f"{'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<10}")
    print(f"{accuracy:<10.4f} {precision:<10.4f} {recall:<10.4f} {f1:<10.4f} {support:<10}")

# Step 1: Load dataset
file_path = "/content/wustl-ehms-2020_with_attacks_categories.csv"
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")
df = pd.read_csv(file_path)

# Step 2: Clean and preprocess
df.drop(columns=['Dir', 'Flgs', 'SrcAddr', 'DstAddr', 'SrcMac', 'DstMac', 'Attack Category'], inplace=True, errors='ignore')
df.dropna(subset=['Label'], inplace=True)

# Step 3: Sampling
benign = df[df['Label'] == 0].sample(n=min(14000, df[df['Label'] == 0].shape[0]), random_state=42)
attack = df[df['Label'] == 1].sample(n=min(1400, df[df['Label'] == 1].shape[0]), random_state=42)
df_sampled = pd.concat([benign, attack]).sample(frac=1.0, random_state=42)

# Step 4: Feature preprocessing
X_raw = df_sampled.drop(columns=["Label"])
y = df_sampled["Label"].reset_index(drop=True)
feature_names = X_raw.columns.tolist()

# Encoding
for column in X_raw.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_raw[column] = le.fit_transform(X_raw[column].astype(str))

# Imputation and scaling
imp = SimpleImputer(strategy='median')
X_imp = imp.fit_transform(X_raw)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imp)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)
num_features = X_train.shape[1]

# Step 5: Fitness function
def fitness(mask):
    mask = np.array(mask)
    if np.count_nonzero(mask) < 5:
        return 0
    clf = RandomForestClassifier(n_estimators=50, random_state=42)
    clf.fit(X_train[:, mask == 1], y_train)
    y_pred = clf.predict(X_test[:, mask == 1])
    return accuracy_score(y_test, y_pred)

# Step 6: ACO function with enhanced feature selection
def aco_feature_selection(num_ants=20, num_iterations=20, evaporation_rate=0.1, alpha=1):
    pheromone = np.ones(num_features)
    best_mask = None
    best_score = 0

    for it in range(num_iterations):
        all_solutions = []
        all_scores = []

        for _ in range(num_ants):
            probs = pheromone ** alpha
            probs /= probs.sum()

            # üîß Force selection of 10‚Äì20 features randomly
            k = random.randint(10, 20)
            selected = np.random.choice(range(num_features), size=k, replace=False, p=probs)
            mask = np.zeros(num_features)
            mask[selected] = 1

            acc = fitness(mask)
            all_solutions.append(mask)
            all_scores.append(acc)

            if acc > best_score:
                best_score = acc
                best_mask = mask.copy()

        # Update pheromone
        pheromone *= (1 - evaporation_rate)
        for mask, score in zip(all_solutions, all_scores):
            pheromone += score * mask

        pheromone = np.clip(pheromone, 1e-6, 1e6)
        print(f"Iteration {it + 1}/{num_iterations} | Best Accuracy: {best_score:.4f}")

    return best_mask, best_score

# Step 7: Run ACO
print("\nüêú Running Ant Colony Optimization for Feature Selection...")
best_mask, best_score = aco_feature_selection()
selected_features = np.where(best_mask == 1)[0]
selected_feature_names = [feature_names[i] for i in selected_features]

# Step 8: Final model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train[:, selected_features], y_train)
y_pred = model.predict(X_test[:, selected_features])

# Step 9: Evaluation
print(f"\n‚úÖ Selected {len(selected_features)} features using ACO:")
print(selected_feature_names)
print(f"\nüéØ Accuracy (ACO + RF): {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nüìÑ Classification Report:\n", classification_report(y_test, y_pred))
print_metrics(y_test, y_pred)



üêú Running Ant Colony Optimization for Feature Selection...
Iteration 1/20 | Best Accuracy: 0.9740
Iteration 2/20 | Best Accuracy: 0.9740
Iteration 3/20 | Best Accuracy: 0.9740
Iteration 4/20 | Best Accuracy: 0.9740
Iteration 5/20 | Best Accuracy: 0.9740
Iteration 6/20 | Best Accuracy: 0.9740
Iteration 7/20 | Best Accuracy: 0.9740
Iteration 8/20 | Best Accuracy: 0.9740
Iteration 9/20 | Best Accuracy: 0.9740
Iteration 10/20 | Best Accuracy: 0.9740
Iteration 11/20 | Best Accuracy: 0.9740
Iteration 12/20 | Best Accuracy: 0.9740
Iteration 13/20 | Best Accuracy: 0.9740
Iteration 14/20 | Best Accuracy: 0.9740
Iteration 15/20 | Best Accuracy: 0.9740
Iteration 16/20 | Best Accuracy: 0.9789
Iteration 17/20 | Best Accuracy: 0.9789
Iteration 18/20 | Best Accuracy: 0.9789
Iteration 19/20 | Best Accuracy: 0.9789
Iteration 20/20 | Best Accuracy: 0.9789

‚úÖ Selected 14 features using ACO:
['Sport', 'SrcBytes', 'DstBytes', 'DstGap', 'SIntPkt', 'SIntPktAct', 'DIntPktAct', 'sMaxPktSz', 'TotBytes', '

cuckoo +rf

In [None]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

# ------------------ Metric Printer ------------------
def print_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    support = len(y_true)
    print("\nüß© Confusion Matrix:")
    print(cm)
    print("\nüìä Metrics Summary:")
    print(f"{'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<10}")
    print(f"{accuracy:<10.4f} {precision:<10.4f} {recall:<10.4f} {f1:<10.4f} {support:<10}")

# ------------------ 1. Load Data ------------------
file_path = "/content/wustl-ehms-2020_with_attacks_categories.csv"
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")
df = pd.read_csv(file_path)

# ------------------ 2. Clean & Preprocess ------------------
df.drop(columns=['Dir', 'Flgs', 'SrcAddr', 'DstAddr', 'SrcMac', 'DstMac', 'Attack Category'], inplace=True, errors='ignore')
df.dropna(subset=['Label'], inplace=True)

# ------------------ 3. Sample Data ------------------
benign = df[df['Label'] == 0].sample(n=min(14000, df[df['Label'] == 0].shape[0]), random_state=42)
attack = df[df['Label'] == 1].sample(n=min(1400, df[df['Label'] == 1].shape[0]), random_state=42)
df_sampled = pd.concat([benign, attack]).sample(frac=1.0, random_state=42)

# ------------------ 4. Feature Preparation ------------------
X_raw = df_sampled.drop(columns=["Label"])
y = df_sampled["Label"].reset_index(drop=True)
feature_names = X_raw.columns.tolist()

for col in X_raw.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_raw[col] = le.fit_transform(X_raw[col].astype(str))

imp = SimpleImputer(strategy='median')
X_imp = imp.fit_transform(X_raw)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imp)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)
num_features = X_train.shape[1]

# ------------------ 5. Fitness Function ------------------
def fitness(solution):
    mask = np.array(solution, dtype=int)
    if np.count_nonzero(mask) == 0:
        return 0.0
    selected = mask == 1
    clf = RandomForestClassifier(n_estimators=50, random_state=42)
    clf.fit(X_train[:, selected], y_train)
    y_pred = clf.predict(X_test[:, selected])
    return accuracy_score(y_test, y_pred)

# ------------------ 6. Cuckoo Search Algorithm ------------------
def levy_flight(Lambda):
    u = np.random.normal(0, 1) * 0.01
    v = np.random.normal(0, 1)
    step = u / abs(v) ** (1 / Lambda)
    return step

def cuckoo_search(n=30, pa=0.25, alpha=1.0, generations=30):
    nests = [np.random.randint(0, 2, num_features).tolist() for _ in range(n)]
    best = nests[0]
    best_score = fitness(best)

    for gen in range(generations):
        for i in range(n):
            step_size = levy_flight(1.5)
            new_nest = nests[i][:]
            for j in range(num_features):
                if random.random() < 0.5:
                    new_nest[j] = 1 - new_nest[j] if random.random() < abs(step_size) else new_nest[j]
            score = fitness(new_nest)
            if score > fitness(nests[i]):
                nests[i] = new_nest
            if score > best_score:
                best = new_nest
                best_score = score

        # Abandon some nests
        for i in range(n):
            if random.random() < pa:
                nests[i] = [random.randint(0, 1) for _ in range(num_features)]
    return np.array(best), best_score

# ------------------ 7. Run Cuckoo + RF ------------------
print("\nüê¶ Running Cuckoo Search for Feature Selection...")
best_mask, best_accuracy = cuckoo_search(n=30, generations=30)
selected_features = np.where(best_mask == 1)[0]
selected_feature_names = [feature_names[i] for i in selected_features]

# ------------------ 8. Final Model ------------------
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train[:, selected_features], y_train)
y_pred = model.predict(X_test[:, selected_features])

# ------------------ 9. Evaluation ------------------
print(f"\n‚úÖ Selected {len(selected_features)} features using Cuckoo Search:")
print(selected_feature_names)
print(f"\nüéØ Accuracy (Cuckoo + RF): {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nüìÑ Classification Report:\n", classification_report(y_test, y_pred))
print_metrics(y_test, y_pred)



üê¶ Running Cuckoo Search for Feature Selection...

‚úÖ Selected 16 features using Cuckoo Search:
['Sport', 'SrcBytes', 'DstBytes', 'DstLoad', 'SrcGap', 'DstGap', 'SIntPktAct', 'DIntPktAct', 'sMaxPktSz', 'sMinPktSz', 'pSrcLoss', 'pDstLoss', 'Packet_num', 'Temp', 'SpO2', 'Pulse_Rate']

üéØ Accuracy (Cuckoo + RF): 98.02%

üìÑ Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      2800
           1       0.97      0.81      0.88       280

    accuracy                           0.98      3080
   macro avg       0.97      0.90      0.94      3080
weighted avg       0.98      0.98      0.98      3080


üß© Confusion Matrix:
[[2792    8]
 [  53  227]]

üìä Metrics Summary:
Accuracy   Precision  Recall     F1-Score   Support   
0.9802     0.9737     0.9039     0.9354     3080      
