In [1]:
import pandas as pd

In [2]:
import os

# Limit threads globally (optional but useful)
os.environ["OMP_NUM_THREADS"] = "2"
os.environ["OPENBLAS_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "2"
os.environ["VECLIB_MAXIMUM_THREADS"] = "2"
os.environ["NUMEXPR_NUM_THREADS"] = "2"

In [3]:
import pandas as pd

# Load and combine datasets (as before)
datasets = {
    'benign': pd.read_csv('/home/azwad/Works/CIC-IoT/labeled_dataset_benign.csv'),
    'dns': pd.read_csv('/home/azwad/Works/CIC-IoT/labeled_dataset_DNS_spoofing.csv'),
    'mitm': pd.read_csv('/home/azwad/Works/CIC-IoT/labeled_dataset_MITM_ArpSpoofing.csv')
}
combined_df = pd.concat([datasets['benign'], datasets['dns'], datasets['mitm']], ignore_index=True)
combined_df = combined_df.drop('dst_mac', axis=1)

print(combined_df.shape)


  'dns': pd.read_csv('/home/azwad/Works/CIC-IoT/labeled_dataset_DNS_spoofing.csv'),
  'mitm': pd.read_csv('/home/azwad/Works/CIC-IoT/labeled_dataset_MITM_ArpSpoofing.csv')


(883525, 135)


In [4]:
import re

def is_hex_mac(mac):
    if pd.isna(mac) or mac == 'unknown':  # Skip NaN or filled values
        return False
    pattern = r'^([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})$'
    return bool(re.match(pattern, str(mac)))

# Remove rows where src_mac is a hex MAC address
combined_df = combined_df[~combined_df['src_mac'].apply(is_hex_mac)]


print(f"Number of duplicate rows: {combined_df.duplicated().sum()}")
combined_df = combined_df.drop_duplicates()
print(f"New shape after removing duplicates: {combined_df.shape}")

categorical_cols = [col for col in combined_df.columns if combined_df[col].nunique() < 10 or combined_df[col].dtype == 'object']
print(f"Categorical features: {categorical_cols}")



for col in categorical_cols:
    combined_df[col] = combined_df[col].astype(str).fillna('unknown')

#Fill numerical columns with median (or mean)
numerical_cols = combined_df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_cols:
    combined_df[col] = combined_df[col].fillna(combined_df[col].median())


# Features (X) and targets
X = combined_df.drop(['Label', 'src_mac'], axis=1, errors='ignore')  
y_attack = combined_df['Label']  # For attack detection
y_device = combined_df['src_mac']  # For device identification


from sklearn.preprocessing import LabelEncoder

le_device = LabelEncoder()
y_device_encoded = le_device.fit_transform(y_device)


from sklearn.decomposition import PCA

# Apply PCA to retain 95% of variance (adjust n_components)

X_numeric = X.select_dtypes(include=['number'])


X = X_numeric.fillna(X_numeric.mean())


pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)

print(f"Number of features reduced to: {X_reduced.shape[1]}")
print(f"Explained variance ratio: {sum(pca.explained_variance_ratio_):.2f}")

# Update X (note: PCA output is not labeled, use as numpy array)
X = pd.DataFrame(X_reduced, index=X.index)

from sklearn.model_selection import train_test_split

# Split for attack detection
X_train_attack, X_test_attack, y_train_attack, y_test_attack = train_test_split(
    X, y_attack, test_size=0.2, random_state=42, stratify=y_attack
)

# Split for device identification
X_train_device, X_test_device, y_train_device, y_test_device = train_test_split(
    X, y_device_encoded, test_size=0.2, random_state=42, stratify=y_device_encoded
)

Number of duplicate rows: 0
New shape after removing duplicates: (432555, 135)
Categorical features: ['src_mac', 'src_ip', 'dst_ip', 'port_class_dst', 'l4_tcp', 'l4_udp', 'ttl', 'handshake_version', 'handshake_ciphersuites', 'tls_server', 'http_request_method', 'http_host', 'http_response_code', 'user_agent', 'dns_server', 'dns_query_type', 'dns_len_ans', 'device_mac', 'eth_src_oui', 'eth_dst_oui', 'highest_layer', 'http_uri', 'http_content_len', 'http_content_type', 'icmp_type', 'icmp_checksum_status', 'icmp_data_size', 'Label']
Number of features reduced to: 5
Explained variance ratio: 0.95


In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch
# Encode string labels into integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_attack)

# Convert X to numpy as before
X = X.to_numpy().astype(np.float32)
y_attack = y_encoded.astype(np.int64)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y_attack, test_size=0.2, random_state=42)

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [8]:
class TabularDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [9]:
batch_size = 64

train_dataset = TabularDataset(X_train, y_train)
val_dataset = TabularDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [15]:
class MLP(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        return self.model(x)


In [16]:
input_dim = X.shape[1]
num_classes = len(np.unique(y_train_attack))
model = MLP(input_dim, num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [17]:
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1/20, Loss: 972.6782
Epoch 2/20, Loss: 1.6664
Epoch 3/20, Loss: 1.1455
Epoch 4/20, Loss: 1.2834
Epoch 5/20, Loss: 1.1840


KeyboardInterrupt: 

In [13]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())

print("Validation Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds))


Validation Accuracy: 0.36645050918380323
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     26077
           1       0.00      0.00      0.00     28732
           2       0.37      1.00      0.54     31702

    accuracy                           0.37     86511
   macro avg       0.12      0.33      0.18     86511
weighted avg       0.13      0.37      0.20     86511



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
