In [None]:
# Colab-only setup: mount Google Drive at /content/drive so files stored
# under it (e.g. the dataset CSVs in MyDrive) can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load and preprocess the data

def load_data_in_chunks(file_path, chunk_size=10000):
    """Read a CSV file piece by piece and return it as a single DataFrame.

    Args:
        file_path: location of the CSV file, passed straight to ``pd.read_csv``.
        chunk_size: number of rows read per chunk (default 10000).

    Returns:
        One DataFrame made by concatenating every chunk row-wise, in file order.
    """
    reader = pd.read_csv(file_path, chunksize=chunk_size)
    # Materialize all chunks first, then stitch them together along the row axis
    return pd.concat(list(reader), axis=0)

In [None]:
# Locations of the training and testing CSV files on the mounted Drive
training_data_path = "/content/drive/MyDrive/CISC684/Project/normalized_dataset_training.csv"
testing_data_path = "/content/drive/MyDrive/CISC684/Project/normalized_dataset_testing.csv"

# Read each file chunk by chunk into one full DataFrame (training first, then testing)
training_data, testing_data = map(
    load_data_in_chunks, (training_data_path, testing_data_path)
)


In [None]:
# Pool the two loaded sets so a fresh split can be drawn from all rows
combined_data = pd.concat([training_data, testing_data], axis=0)

# Target vector is the 'label' column; every column other than
# 'label' and 'attack_cat' is used as a feature
labels = combined_data['label'].values
features = combined_data.drop(columns=['label', 'attack_cat']).values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Inspect the class balance over the full (training + testing) dataset
combined_labels = combined_data['label']

# Absolute count per class, then each count as a percentage of all rows
class_counts = combined_labels.value_counts()
class_percentages = class_counts.div(combined_labels.shape[0]).mul(100)

class_counts, class_percentages


(label
 1.0    164649
 0.0     92997
 Name: count, dtype: int64,
 label
 1.0    63.905126
 0.0    36.094874
 Name: count, dtype: float64)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Apply SMOTE to balance the training data.
# Only the training split is resampled; X_test / y_test are left untouched.
# NOTE(review): X_train / y_train are overwritten here, so re-running this
# cell feeds the already-resampled arrays back into SMOTE -- re-run the
# split cell first to start again from the original data.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Report the per-class sample count and share of the resampled training labels
print("Class balance after SMOTE:")
unique, counts = torch.unique(torch.tensor(y_train), return_counts=True)
for cls, count in zip(unique.cpu().numpy(), counts.cpu().numpy()):
    print(f"Class {int(cls)}: {count} ({(count / len(y_train)) * 100:.2f}%)")

Class balance after SMOTE:
Class 0: 131720 (50.00%)
Class 1: 131720 (50.00%)


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Turn the arrays into tensors created directly on the selected device:
# float32 for the features, int64 (long) for the class labels
X_train = torch.tensor(X_train, dtype=torch.float32, device=device)
y_train = torch.tensor(y_train, dtype=torch.long, device=device)
X_test = torch.tensor(X_test, dtype=torch.float32, device=device)
y_test = torch.tensor(y_test, dtype=torch.long, device=device)

# Pair features with labels so they are batched together
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Mini-batches of 64; only the training data is reshuffled on each pass
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)


In [None]:
# Define the MLP model
class MLPModel(nn.Module):
    """Feed-forward classifier built from Linear -> ReLU -> Dropout stages.

    Each entry of ``hidden_sizes`` adds one hidden stage; a final Linear layer
    maps to ``num_classes`` outputs. The forward pass returns raw logits (no
    softmax is applied), which is the input ``nn.CrossEntropyLoss`` expects.

    With the default arguments the layer layout, and therefore the
    ``state_dict`` keys, is 128 -> 64 -> 2 with dropout 0.3.

    Args:
        input_size: number of input features per sample.
        hidden_sizes: widths of the hidden layers, in order. Default ``(128, 64)``.
        num_classes: number of output logits. Default ``2``.
        dropout: dropout probability applied after each hidden ReLU. Default ``0.3``.
    """

    def __init__(self, input_size, hidden_sizes=(128, 64), num_classes=2, dropout=0.3):
        super().__init__()
        layers = []
        in_features = input_size
        # One Linear/ReLU/Dropout stage per requested hidden width
        for width in hidden_sizes:
            layers.extend([nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)])
            in_features = width
        # Output layer: one logit per class, no activation
        layers.append(nn.Linear(in_features, num_classes))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by passing ``x`` through the layer stack."""
        return self.model(x)

# Initialize the model, loss function, and optimizer.
# Seed PyTorch first so that weight initialization, dropout masks and
# DataLoader shuffling repeat from run to run (42 matches the random_state
# used for the train/test split and SMOTE). Some GPU kernels can still be
# non-deterministic, so results may differ slightly across hardware.
torch.manual_seed(42)
input_size = X_train.shape[1]  # number of feature columns
model = MLPModel(input_size).to(device)
criterion = nn.CrossEntropyLoss()  # takes raw logits and integer class targets
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train the model
def train_model(model, train_loader, criterion, optimizer, epochs=20):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    The average is taken over the batches actually iterated in the epoch, so
    the function also works with loaders that do not define ``__len__``.

    Args:
        model: the ``nn.Module`` to optimize; it is switched to training mode.
        train_loader: iterable yielding ``(inputs, targets)`` batches; it is
            iterated once per epoch.
        criterion: callable mapping ``(outputs, targets)`` to a scalar loss tensor.
        optimizer: optimizer holding the parameters of ``model``.
        epochs: number of passes over ``train_loader`` (default 20).

    Raises:
        ValueError: if an epoch yields no batches, since there is nothing to
            train on and no average loss to report.
    """
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()  # clear gradients left over from the previous step
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            # Fail with a clear message instead of dividing by zero below
            raise ValueError("train_loader yielded no batches; nothing to train on")
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/num_batches:.4f}")

# Run the training loop with the function's default number of epochs
train_model(model, train_loader, criterion=criterion, optimizer=optimizer)


Epoch [1/20], Loss: 0.1896
Epoch [2/20], Loss: 0.1459
Epoch [3/20], Loss: 0.1375
Epoch [4/20], Loss: 0.1327
Epoch [5/20], Loss: 0.1297
Epoch [6/20], Loss: 0.1273
Epoch [7/20], Loss: 0.1253
Epoch [8/20], Loss: 0.1240
Epoch [9/20], Loss: 0.1224
Epoch [10/20], Loss: 0.1214
Epoch [11/20], Loss: 0.1207
Epoch [12/20], Loss: 0.1203
Epoch [13/20], Loss: 0.1198
Epoch [14/20], Loss: 0.1190
Epoch [15/20], Loss: 0.1184
Epoch [16/20], Loss: 0.1179
Epoch [17/20], Loss: 0.1173
Epoch [18/20], Loss: 0.1174
Epoch [19/20], Loss: 0.1167
Epoch [20/20], Loss: 0.1162


In [None]:
# Evaluate the model
def evaluate_model(model, test_loader):
    """Run ``model`` over every batch of ``test_loader`` and collect its predictions.

    The model is put in evaluation mode and gradient tracking is disabled for
    the pass. The predicted class of a sample is the index of its largest
    output along dimension 1.

    Args:
        model: the network to evaluate.
        test_loader: iterable yielding ``(inputs, targets)`` batches.

    Returns:
        ``(all_labels, all_preds)``: two flat lists, in loader order, holding
        the true labels and the predicted class indices.
    """
    model.eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for inputs, targets in test_loader:
            batch_preds = model(inputs).argmax(dim=1)
            # Move results to host memory before accumulating them
            all_labels.extend(targets.cpu().numpy())
            all_preds.extend(batch_preds.cpu().numpy())
    return all_labels, all_preds

# Score the trained network on the held-out test batches.
# Note: this rebinds `labels`, which earlier held the label array of the combined dataset.
labels, predictions = evaluate_model(model=model, test_loader=test_loader)


In [None]:
# Overall accuracy first, then the per-class precision / recall / F1 table
print("Accuracy:", accuracy_score(y_true=labels, y_pred=predictions))
print(classification_report(y_true=labels, y_pred=predictions))

Accuracy: 0.9471958082670289
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     18601
           1       0.97      0.95      0.96     32929

    accuracy                           0.95     51530
   macro avg       0.94      0.95      0.94     51530
weighted avg       0.95      0.95      0.95     51530

