**Data Preprocessing for KDD99 Dataset**

In [None]:
import numpy as np
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
#LabelEncoder -> Encodes categorical features as numbers.
#MinmaxScaler -> Scales features to range [0,1](default values).
import torch
from torch.utils.data import DataLoader, TensorDataset
#TensorDataset -> Dataset wrapper which combines multiple tensors. Useful when you have features and labels in separate tensors and want to use them together
#DataLoader -> Iterator provides many features like batching, shuffling, parallel data loading, etc

class kddData(object):
  def __init__(self,batch_size):
    kddcup99 = sklearn.datasets.fetch_kddcup99()
    self._encoder = {       #Private variable used within class
          'protocal' : LabelEncoder(),
          'service'  : LabelEncoder(),
          'flag' : LabelEncoder(),
          'label' : LabelEncoder()
    } #self._encoder: Dictionary of LabelEncoder instances for categorical features

    self._scaler = MinMaxScaler()
    self.batch_size = batch_size
    data_x, data_y = self._encode_and_scale_data(kddcup99.data,kddcup99.target)

    self.train_dataset,self.test_dataset = self.__split_data_to_tensor(data_x, data_y)

    self.train_dataloader = DataLoader(self.train_dataset, self.batch_size, shuffle=True)
    self.test_dataloader = DataLoader(self.test_dataset, self.batch_size, shuffle=True)

  def _encode_and_scale_data(self,data_X,data_y):
    # Analyzing the total input samples
    print("output by Vamsidhar Vuddagiri")
    print("Original data samples: ", data_X[:5])
    print("Original labels samples: ", data_y[:5])

    #Normalizing the values that is converting to Labelencoder
    #.fit -> Also known as training the encoder in this context. During the fitting process, the encoder examines all unique values in a categorical feature. assigns a unique integer to each unique category.
    # Eg: Taking the feature protocol, it has many values eg tcp, udp, so on. Converting it into set to get unique values and converting to list to get uniques label encoders.
    self._encoder['protocal'].fit(list(set(data_X[:, 1])))
    self._encoder['service'].fit(list(set(data_X[:,2])))
    self._encoder['flag'].fit(list(set(data_X[:,3])))

    #Uses the fitted label encoders to transform categorical features into numerical values.
    data_X[:,1] = self._encoder['protocal'].transform(data_X[:,1])
    data_X[:,2] = self._encoder['service'].transform(data_X[:,2])
    data_X[:,3] = self._encoder['flag'].transform(data_X[:,3])

    #Normalizing the samples between [0,1]
    data_X = self._scaler.fit_transform(data_X)
    print("Normalized samples: ", data_X[:5])

    #np.pad -> Extends array along with specified diensions. Constant specifies 0
    data_X = np.pad(data_X, ((0, 0), (0, 64 - len(data_X[0]))), 'constant')
    print("padded data samples：", data_X[:5])

    #Converts output labels to 0 & 1. 0 -> normal, 1 -> abnormal(anomaly)
    data_y = np.where(data_y == b'normal.', 0, 1)
    print("encoded data samples：", data_y[:5])

    return data_X , data_y

  def __split_data_to_tensor(self, data_X, data_y):
    normal_label = 0
    normal_idx = data_y == normal_label
    anomaly_idx = ~normal_idx

    X_normal, X_anomaly = data_X[normal_idx], data_X[anomaly_idx]
    y_normal, y_anomaly = data_y[normal_idx], data_y[anomaly_idx]

    print("normal data samples：", X_normal[:5])
    print("anomaly data samples：", X_anomaly[:5])

    X_train_normal, X_test_normal, y_train_normal, y_test_normal = train_test_split(X_normal, y_normal, test_size=0.3)
    X_train_anomaly, X_test_anomaly, y_train_anomaly, y_test_anomaly = train_test_split(X_anomaly, y_anomaly, test_size=0.3)

    X_train = np.concatenate((X_train_normal, X_train_anomaly), axis=0)
    y_train = np.concatenate((y_train_normal, y_train_anomaly), axis=0)
    X_test = np.concatenate((X_test_normal, X_test_anomaly), axis=0)
    y_test = np.concatenate((y_test_normal, y_test_anomaly), axis=0)

    print("training dataset samples：", X_train[:5])
    print("test dataset samples：", X_test[:5])

    #TensorDataset -> function in PyTorch is a dataset wrapper that combines multiple tensors into a single dataset.
    train_dataset = TensorDataset(
          torch.from_numpy(X_train.astype(np.float32)),
          torch.from_numpy(y_train.astype(int))
      )

    test_dataset = TensorDataset(
        torch.from_numpy(X_test.astype(np.float32)),
        torch.from_numpy(y_test.astype(int))
    )
    return train_dataset, test_dataset




In [None]:
import torch.nn as nn
import torch.optim as optim

# Hyperparameters
batch_size = 64
lr = 0.0002
z_dim = 41    #Number of features in the dataset
epochs = 50

In [None]:
class Generator(nn.Module):
  def __init__(self,z_dim):  #initialization
    super(Generator,self).__init__()   #Ensures that nn.module is properly initialized
    self.model = nn.Sequential(       #nn.sequential -> Allows us to build neural network by stacking layers in the order they should execute
        nn.Linear(z_dim,256),
        nn.ReLU(True),            #Activation function to introduce non-linearity,True - indicates that operation should be done inplace, modifying input data without allocating new memory
        nn.Linear(256,512),
        nn.ReLU(True),
        nn.Linear(512,64),
        #nn.Relu(True),
        #nn.Linear(1024,28*28)
        nn.Tanh()      #Activation function applies hyperbolic tangent function element wise. To scale output to a range suitable for the data. Espcially if the normalized data is between -1 to 1
    )
  def forward(self,z):
    return self.model(z)

class Discriminator(nn.Module):
  def __init__(self,z_dim):
    super(Discriminator,self).__init__()
    self.model = nn.Sequential(
        #nn.Linear(28*28,1024),
        #nn.LeakyReLU(0.2,inplace = True) # This adds slope for negative values, which helps to address the dying of neurons
        #This small gradient prevents the weights from not updating, which helps avoid the situation where neurons stop learning (dying) and stay inactive.
        # We are using this in GAN discriminator for many reasons Avoiding sparse gradients, Imporoved
        #traing stability, discriminator sensitivity, architecture compatibility
        nn.Linear(64,512),
        nn.LeakyReLU(0.2,inplace=True),
        nn.Linear(512,256),
        nn.LeakyReLU(0.2,inplace=True),
        nn.Linear(256,1),
        nn.Sigmoid()
    )
  def forward(self,x):
    return self.model(x)

**Task - 1:**

In [None]:
#Loss-D The discriminator's objective is to distinguish between real data (from the training set) and fake data (generated by the generator).
# A lower Loss D indicates that the discriminator is getting better at distinguishing real from fake data.
#loss - G The generator's objective is to produce data that is realistic enough to fool the discriminator.
# A lower Loss G suggests that the generator is producing data that is more likely to be classified as real by the discriminator.
#GANs operate in a minimax game where the generator tries to minimize Loss G while the discriminator tries to minimize Loss D.
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Hyperparameters

batch_size = 64
kdd_data = kddData(batch_size=batch_size)
train_dl = kdd_data.train_dataloader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step -1 Train the GAN on normal data only, so the generator learns to produce realistic normal data.

generator = Generator(z_dim).to(device)
discriminator = Discriminator(z_dim).to(device)

optimizer_G = optim.Adam(generator.parameters(), lr=lr)
optimizer_D = optim.Adam(discriminator.parameters(), lr=lr)

epochs = 2
criterion = nn.BCELoss()

for epoch in range(epochs):
    for i, data in enumerate(train_dl):

        real_data, labels = data
        normal_data = real_data[labels == 0]  # Extract only normal data based on labels

        # Check if there's any normal data in the batch
        if normal_data.size(0) == 0:
            continue  # Skip batch if there's no normal data

        normal_data = normal_data.to(device)
        batch_size = normal_data.size(0)
        valid = torch.ones(batch_size, 1).to(device)
        fake = torch.zeros(batch_size, 1).to(device)

        optimizer_G.zero_grad()
        z = torch.randn(batch_size, z_dim).to(device)

        gen_data = generator(z)
        g_loss = criterion(discriminator(gen_data), valid)

        g_loss.backward()
        optimizer_G.step()

        optimizer_D.zero_grad()

        # The discriminator is trained to classify normal_data as real by comparing it to valid
        real_loss = criterion(discriminator(normal_data), valid)
        fake_loss = criterion(discriminator(gen_data.detach()), fake)

        d_loss = (real_loss + fake_loss) / 2
        d_loss.backward()
        optimizer_D.step()

        if i % 100 == 0:
            print(f"Output by Vamsidhar Vuddagiri: Epoch [{epoch+1}/{epochs}] Batch [{i}/{len(train_dl)}] \
                  Loss D: {d_loss.item():.4f}, Loss G: {g_loss.item():.4f}")




output by Vamsidhar Vuddagiri
Original data samples:  [[0 b'tcp' b'http' b'SF' 181 5450 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 8 8 0.0
  0.0 0.0 0.0 1.0 0.0 0.0 9 9 1.0 0.0 0.11 0.0 0.0 0.0 0.0 0.0]
 [0 b'tcp' b'http' b'SF' 239 486 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 8 8 0.0
  0.0 0.0 0.0 1.0 0.0 0.0 19 19 1.0 0.0 0.05 0.0 0.0 0.0 0.0 0.0]
 [0 b'tcp' b'http' b'SF' 235 1337 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 8 8 0.0
  0.0 0.0 0.0 1.0 0.0 0.0 29 29 1.0 0.0 0.03 0.0 0.0 0.0 0.0 0.0]
 [0 b'tcp' b'http' b'SF' 219 1337 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 6 6 0.0
  0.0 0.0 0.0 1.0 0.0 0.0 39 39 1.0 0.0 0.03 0.0 0.0 0.0 0.0 0.0]
 [0 b'tcp' b'http' b'SF' 217 2032 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 6 6 0.0
  0.0 0.0 0.0 1.0 0.0 0.0 49 49 1.0 0.0 0.02 0.0 0.0 0.0 0.0 0.0]]
Original labels samples:  [b'normal.' b'normal.' b'normal.' b'normal.' b'normal.']
Normalized samples:  [[0.00000000e+00 5.00000000e-01 3.38461538e-01 9.00000000e-01
  2.61041764e-07 1.05713002e-03 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.0

In [None]:
# Step - 2 After training, use the Discriminator's Score: For a data instance in the test-dataset, pass it through the discriminator.

discriminator.eval()
with torch.no_grad():
  discriminator_anomaly_score = []
  for data in kdd_data.test_dataloader:
    test_data,labels = data
    test_data = test_data.to(device)

    output = discriminator(test_data)
    discriminator_anomaly_score.extend(output.cpu().numpy())

for i in discriminator_anomaly_score[:10]:
  print(i,end = ' ')
print('\n')
print("Output By Vamsidhar Vuddagiri")
print(len(discriminator_anomaly_score))



[0.04108508] [0.04108508] [0.02047094] [0.0194628] [0.97011864] [0.04108508] [0.96617436] [0.68150353] [0.06674822] [0.04108508] 

Output By Vamsidhar Vuddagiri
148207


**Task-2 After training a GAN with normal data, we utilize Generator to detect anomaly data:**

In [None]:
#Step - 1 Train the GAN on normal data, is done in the above code cells

generator.eval()

mse_loss = nn.MSELoss()
generator_recon_error = []

with torch.no_grad():

  for data in kdd_data.test_dataloader:
    test_data,labels= data
    test_data = test_data.to(device)
    z = torch.randn(test_data.size(0),z_dim).to(device)
    gen_data = generator(z) #Step - 2 Generating synthetic data

    error = mse_loss(test_data,gen_data) #step - 3 Calculate Reconstruction Error
    generator_recon_error.append(error.item())
#item -> Using .item() in PyTorch is necessary when one want to extract a scalar value (like a loss or a single element from a tensor) from a PyTorch tensor.

print("Output By Vamsidhar Vuddagiri")
print(generator_recon_error)
print(generator_recon_error[:5])

Output By Vamsidhar Vuddagiri
[0.08483822643756866, 0.07727351784706116, 0.08414161205291748, 0.08524221926927567, 0.08313155174255371, 0.08663501590490341, 0.08269824087619781, 0.08233568072319031, 0.08717755973339081, 0.08617262542247772, 0.07647980749607086, 0.08966077864170074, 0.08200601488351822, 0.08586807548999786, 0.08271805197000504, 0.07702021300792694, 0.084801584482193, 0.09080754965543747, 0.07904180884361267, 0.076869897544384, 0.09350565075874329, 0.07832103967666626, 0.07831224799156189, 0.07695070654153824, 0.08650258183479309, 0.08212855458259583, 0.08364462852478027, 0.08121958374977112, 0.07860980927944183, 0.08284422010183334, 0.07994355261325836, 0.07463324815034866, 0.08462869375944138, 0.0832321047782898, 0.08702519536018372, 0.08111811429262161, 0.0834161564707756, 0.07985278964042664, 0.07940258085727692, 0.08205792307853699, 0.08309320360422134, 0.07962082326412201, 0.08564326167106628, 0.08609050512313843, 0.07992836833000183, 0.08595743030309677, 0.0845723

**Task-3 Designing a score based on Discriminator's score and Generator's reconstruction error**

In [None]:
def normalize(data):
    min_val = min(data)
    max_val = max(data)
    normalized_data = [(val - min_val) / (max_val - min_val) for val in data]
    return normalized_data
norm_discriminator_score = normalize(discriminator_anomaly_score)
norm_generator_recon_error = normalize(generator_recon_error)

anomaly_score = [disc_score + gen_score for disc_score, gen_score in zip(norm_discriminator_score, norm_generator_recon_error)]
#anomaly_score = [0.7 * disc_score + 0.3 * gen_score for disc_score, gen_score in zip(norm_discriminator_score, norm_generator_recon_error)]
anomaly_score = [score[0] for score in anomaly_score]
print("Output By Vamsidhar Vuddagiri")
print(anomaly_score)
print(anomaly_score[:5])

Output By Vamsidhar Vuddagiri
[1.2875819, 1.2361817, 0.5658106, 1.2359478, 0.53041905, 0.635278, 1.0055411, 0.47631326, 1.3004619, 1.5245037, 0.9738647, 0.75919706, 0.5022348, 0.60265535, 1.1314658, 0.29379654, 0.56588113, 0.7993789, 0.38711825, 0.3110165, 0.8939178, 0.33971214, 0.36155513, 1.2182148, 0.6485368, 1.165553, 0.54839677, 1.4201628, 0.34968507, 0.5206655, 0.41902885, 0.21422791, 0.5828776, 0.53394234, 0.6677985, 0.48965383, 1.1848066, 1.0795095, 0.39975938, 1.2168689, 0.5290753, 0.38717237, 0.61874133, 1.2920365, 0.39679575, 0.6294352, 0.5619885, 1.1888382, 1.4257975, 1.4769931, 0.53764755, 0.66381574, 0.48718885, 0.59613687, 1.2069747, 0.6850238, 0.5397024, 1.2619779, 0.35905808, 1.4851136, 1.4242991, 0.46166125, 0.5126261, 0.5257304, 0.7408407, 0.60358095, 0.22783951, 0.55264944, 0.25701678, 0.49932382, 0.50777614, 0.33808622, 0.84235877, 0.31577897, 0.37372237, 0.36539456, 1.143275, 0.48202533, 0.79599005, 0.5451123, 0.7457806, 0.5847534, 0.5179899, 0.6823398, 1.5460596,

In [None]:
#Since, we haven't trained with actual labels, I have assumed a threshold. If the the anomaly score is greater than threshold then the predicted label is 1 else 0
#Like this, we will get a list and, we will calculate the Accuracy, Precision, Recall, F1 Score

# Determine threshold for anomaly detection
anomaly_threshold = 0.4

# Classify instances as normal or anomalous based on the anomaly score
predicted_labels = [1 if score > anomaly_threshold else 0 for score in anomaly_score]


true_labels = [label.item() for _, labels in kdd_data.test_dataloader for label in labels]

# Calculate precision, recall, and F1 score
precision = precision_score(true_labels[:len(anomaly_score)], predicted_labels) * 100
recall = recall_score(true_labels[:len(anomaly_score)], predicted_labels)* 100
f1 = f1_score(true_labels[:len(anomaly_score)], predicted_labels) * 100
accuracy = accuracy_score(true_labels[:len(anomaly_score)], predicted_labels) * 100

print("Output By Vamsidhar Vuddagiri")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")


Output By Vamsidhar Vuddagiri
Precision: 80.8930
Recall: 83.1821
F1-score: 82.0216
Accuracy: 70.5095
