In [1]:
# Install PyTorch plus its vision/audio companion packages via the %pip magic.
# Versions are not pinned here, so the installed releases depend on when this cell is run.
%pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Install torchmetrics (provides the Accuracy metric imported below); version is not pinned.
%pip install torchmetrics

Note: you may need to restart the kernel to use updated packages.


In [26]:
#importing necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, TensorDataset
from torchmetrics.classification import Accuracy

In [4]:
#Loading the dataset
# The CSV location can be overridden through the IDS2018_CSV environment variable so the
# notebook is not tied to a single machine; the original local path remains the default.
DATA_PATH = os.environ.get('IDS2018_CSV', r'C:\Users\LENOVO\Downloads\IDS-2018_Intrusion.csv')
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,02/03/2018 08:47:38,141385,9,7,553,3773.0,202,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,49684,6,02/03/2018 08:47:38,281,2,1,38,0.0,38,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,443,6,02/03/2018 08:47:40,279824,11,15,1086,10527.0,385,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443,6,02/03/2018 08:47:40,132,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443,6,02/03/2018 08:47:41,274016,9,13,1285,6141.0,517,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [5]:
# (number of rows, number of columns) of the loaded frame
df.shape

(1048575, 80)

In [6]:
# Per-column dtype and non-null count summary
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  float64
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

In [7]:
# Count of missing (NaN/None) entries in each column
df.isna().sum()

Dst Port         0
Protocol         0
Timestamp        0
Flow Duration    0
Tot Fwd Pkts     0
                ..
Idle Mean        0
Idle Std         0
Idle Max         0
Idle Min         0
Label            0
Length: 80, dtype: int64

In [8]:
# Count rows that are exact duplicates of an earlier row (first occurrence is not counted).
# NOTE(review): this only reports the count; no visible cell drops the duplicated rows.
df.duplicated().sum()

5459

In [9]:
# Normalise the header: trim leading/trailing whitespace from every column name
stripped_names = df.columns.str.strip()
df.columns = stripped_names

In [10]:
# Class distribution of the target column (row count per label)
df['Label'].value_counts()

Label
Benign    762384
Bot       286191
Name: count, dtype: int64

In [11]:
#dropping constant columns
# A column counts as constant when it holds at most one distinct value,
# with NaN treated as a value of its own (dropna=False).
distinct_counts = df.nunique(dropna=False)
const_cols = distinct_counts[distinct_counts <= 1].index.tolist()
df = df.drop(columns=const_cols)

In [12]:
#Target mapping
label_map = {'Benign': 0, 'Bot': 1}
encoded = df['Label'].map(label_map)
# Series.map turns every value that is missing from label_map into NaN. Fail loudly
# instead of letting such rows flow into the numeric median imputation that follows,
# where they would silently be assigned a class.
if encoded.isna().any():
    unexpected = df.loc[encoded.isna(), 'Label'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'Label' (cell run twice, or an unmapped class?): {unexpected}"
    )
df['Label'] = encoded.astype('int64')

In [13]:
#Handling missing/infinite values if any
# Step 1: treat +/-inf as missing so they are imputed together with real NaNs.
df = df.replace(to_replace=[np.inf, -np.inf], value=np.nan)
# Step 2: impute every numeric column with its own median (computed after step 1).
numeric_cols = df.select_dtypes(include='number').columns
column_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(column_medians)

In [14]:
#Encoding categorical features
# Candidate columns: every object/category column except the target.
object_like_cols = df.select_dtypes(include=['object', 'category']).columns
cat_cols = [col for col in object_like_cols if col != 'Label']
# Only low-cardinality columns (at most 20 distinct values) are one-hot encoded;
# the remaining categorical columns are left in the frame unchanged.
low_cardinality_cols = [col for col in cat_cols if df[col].nunique() <= 20]
df = pd.get_dummies(df, columns=low_cardinality_cols, drop_first=True)

In [15]:
#Feature Scaling
# numeric_cols was built from every numeric column, which includes the integer-encoded
# 'Label'. The target must stay as raw class ids, so it is excluded from standardisation.
feature_cols = [col for col in numeric_cols if col != 'Label']
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on the full frame before the train/test split, so
# test-set statistics influence the scaling; consider fitting on the training rows only.
df[feature_cols] = scaler.fit_transform(df[feature_cols])

In [None]:
#Splitting the dataset into training and testing sets
# Target vector and feature frame (everything except the target column).
y = df['Label']
X = df.drop(columns='Label')
# 80/20 split, stratified on the target so both parts keep the same class ratio;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [19]:
#Computing class weights to handle class imbalance
# One weight per distinct training label, using scikit-learn's 'balanced' scheme.
classes = np.unique(y_train)
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

In [23]:
#Creating a Neural Network model
# Size the input layer from the float64 feature columns only: those are the columns the
# training tensor is built from, whereas X_train itself may still carry non-float
# columns (e.g. an un-encoded object column), which would make shape[1] too large.
n_features = X_train.select_dtypes(include=['float64']).shape[1]
n_classes = 2  # Benign (0) vs Bot (1)
layer0 = nn.Linear(n_features, 512)
layer1 = nn.Linear(512, 256)
layer2 = nn.Linear(256, 128)
output_layer = nn.Linear(128, n_classes)
#Weight initialization
# He/Kaiming initialisation is designed for ReLU layers. nn.init.uniform_ with its
# default bounds draws from U(0, 1), i.e. all-positive weights, which inflates the
# activations layer after layer.
for hidden_layer in (layer0, layer1, layer2):
    nn.init.kaiming_uniform_(hidden_layer.weight, nonlinearity='relu')
model = nn.Sequential(
    layer0,
    nn.ReLU(),
    nn.Dropout(0.25),
    layer1,
    nn.ReLU(),
    nn.Dropout(0.25),
    layer2,
    nn.ReLU(),
    # Final layer emits one raw logit per class. No sigmoid/softmax here:
    # nn.CrossEntropyLoss applies log-softmax itself and expects (N, n_classes) logits.
    output_layer,
)

In [None]:
#Choosing loss function
scores = torch.tensor([])
# Build the class-index tensor with an explicit dtype instead of the legacy
# torch.LongTensor constructor, then expand it to one-hot float targets.
target_indices = torch.as_tensor(y_train.values, dtype=torch.long)
one_hot_targets = F.one_hot(target_indices, num_classes=2).float()
# CUDA availability is reported by torch.cuda.is_available(); torch.cuda.amp does not
# provide an is_available() function.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The per-class weights must share the dtype of the model's float32 logits, so cast
# explicitly rather than inheriting the dtype of the NumPy array.
# NOTE(review): the model and every batch must live on the same device as these weights.
class_weights = torch.tensor(cw, dtype=torch.float32).to(device=device)
loss = nn.CrossEntropyLoss(weight=class_weights)

In [31]:
#Defining dataset
# Features: keep only the float64 columns of X_train and store them as float32.
float_features = X_train.select_dtypes(include=['float64']).values
X_train_tensor = torch.tensor(float_features, dtype=torch.float32)
# Targets: convert to float first, then store as int64 class indices.
label_values = y_train.values.astype(float)
y_train_tensor = torch.tensor(label_values, dtype=torch.long)
# Pair features with targets and serve them in shuffled mini-batches of 1024 rows.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=1024, shuffle=True)

In [30]:
#Setting up optimizer
# Plain SGD with momentum and a small L2 penalty over all model parameters.
optimizer = optim.SGD(
    params=model.parameters(),
    lr=0.001,
    momentum=0.9,
    weight_decay=1e-5,
)