# Load and Preprocess Data
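Import the required libraries (numpy, pandas, torch) and load the dataset. Parse each stored sequence, zero-pad all sequences to a common length, scale the values to [0, 1], and one-hot encode the labels; handle missing values and convert categorical variables if needed.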

In [6]:
import numpy as np
import pandas as pd
import torch
from ast import literal_eval

# Read the preprocessed CSV. The 'sequence' column is stored as text holding a
# Python literal, so every cell is parsed back into a Python object.
data = pd.read_csv('adfa_ld_processed.csv')
data['sequence'] = data['sequence'].map(literal_eval)

# Length of the longest sequence; used as the common padded width.
max_len = max(map(len, data['sequence']))

def pad_sequence(seq, target_len=None):
    """Right-pad ``seq`` with zeros up to a fixed length.

    Args:
        seq: One-dimensional sequence of numbers.
        target_len: Length of the returned array. When omitted, the
            module-level ``max_len`` is used (the original behaviour).

    Returns:
        A 1-D NumPy array of length ``target_len`` holding the values of
        ``seq`` followed by zeros.

    Raises:
        ValueError: If ``seq`` is longer than the target length.
    """
    if target_len is None:
        target_len = max_len
    missing = target_len - len(seq)
    if missing < 0:
        # np.pad would also raise ValueError here, but with an opaque message.
        raise ValueError(
            f"sequence length {len(seq)} exceeds target length {target_len}"
        )
    return np.pad(seq, (0, missing), 'constant', constant_values=0)

# Convert to numeric array: cast every token to int, then zero-pad each row
X = np.array([pad_sequence([int(x) for x in seq]) for seq in data['sequence']])

# Normalize to [0,1] using one global min/max over the whole padded matrix
x_min, x_max = X.min(), X.max()
if x_max == x_min:
    # Constant input: (X - min) / 0 would produce NaN, so map everything to 0.0
    X = np.zeros(X.shape, dtype=np.float64)
else:
    X = (X - x_min) / (x_max - x_min)

# Convert labels to one-hot columns (one column per distinct label value)
y = pd.get_dummies(data['label']).values

# Convert to tensors
X_tensor = torch.FloatTensor(X)
y_tensor = torch.FloatTensor(y)

# Add channel dimension for CNN-GAN: [N, L] -> [N, 1, L]
X_tensor = X_tensor.unsqueeze(1)

print(f"Data shape: {X_tensor.shape}")
print(f"Value range: [{X_tensor.min():.3f}, {X_tensor.max():.3f}]")

Data shape: torch.Size([1579, 1, 2948])
Value range: [0.000, 1.000]


In [7]:
# Inspect the preprocessed tensor; as the last expression in the cell, its repr is shown as the cell output.
X_tensor

tensor([[[0.4941, 0.4941, 0.4941,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.0176, 0.0971, 0.0176,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.4941, 0.7794, 0.0088,  ..., 0.0000, 0.0000, 0.0000]],

        ...,

        [[0.0176, 0.6500, 0.6500,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.2676, 0.5647, 0.0176,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.7059, 0.7059, 0.0971,  ..., 0.0000, 0.0000, 0.0000]]])

# Scale Features to [-1, 1]
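Use MinMaxScaler or a custom normalization to scale numerical features to the range [-1, 1], which suits GAN training when the generator's output layer is `tanh`. (The generator defined later in this notebook ends in a sigmoid, which produces values in [0, 1], so the data scaled to [0, 1] above is what it is trained against.)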

NameError: name 'data_array' is not defined

# Convert to Tensor Format
Convert the normalized numpy arrays to PyTorch tensors with appropriate data types and shapes.

In [9]:
import torch

# Build float tensors from the NumPy arrays X and y.
# X additionally receives a singleton channel axis at dim 1 for the CNN-GAN.
X_tensor = torch.FloatTensor(X).unsqueeze(1)
y_tensor = torch.FloatTensor(y)

print(f"X_tensor shape: {X_tensor.shape}")
print(f"y_tensor shape: {y_tensor.shape}")

X_tensor shape: torch.Size([1579, 1, 2948])
y_tensor shape: torch.Size([1579, 2])


In [12]:
# Sanity-check the tensor shape and count the rows whose one-hot label has column 1 set.
# NOTE(review): column 1 is treated as the "malicious" class — confirm against the label encoding.
n_malicious = (y_tensor[:, 1] == 1).sum()
print(f"X_tensor shape: {X_tensor.shape}")
print(f"Number of malicious samples: {n_malicious}")


X_tensor shape: torch.Size([1579, 1, 2948])
Number of malicious samples: 746


In [17]:
import torch
import torch.nn as nn
import torch.utils.data as data

# Reshape tensor - drop the singleton channel dimension: [N, 1, L] -> [N, L].
# squeeze(1) leaves the tensor unchanged when dim 1 is not of size 1, so
# re-running this cell does not alter the shape again.
X_tensor = X_tensor.squeeze(1)

# Constants
input_dim = X_tensor.shape[1]  # feature width taken from the data rather than hard-coded
latent_dim = 100               # size of the generator's noise vector
batch_size = 64
hidden_dim = 512               # base width of the hidden layers

# Generator
class Generator(nn.Module):
    """Fully connected generator mapping a noise vector to a sample in [0, 1].

    Layout: Linear -> LayerNorm -> LeakyReLU(0.2), twice, followed by a final
    Linear and a Sigmoid.

    Args:
        noise_dim: Size of the input noise vector. Defaults to the
            module-level ``latent_dim``.
        width: Width of the first hidden layer (the second is ``2 * width``).
            Defaults to the module-level ``hidden_dim``.
        out_dim: Size of each generated sample. Defaults to the module-level
            ``input_dim``.
    """

    def __init__(self, noise_dim=None, width=None, out_dim=None):
        super().__init__()
        # Fall back to the notebook-level constants so ``Generator()`` keeps working.
        noise_dim = latent_dim if noise_dim is None else noise_dim
        width = hidden_dim if width is None else width
        out_dim = input_dim if out_dim is None else out_dim

        self.model = nn.Sequential(
            nn.Linear(noise_dim, width),
            nn.LayerNorm(width),  # Replace BatchNorm
            nn.LeakyReLU(0.2),
            nn.Linear(width, width * 2),
            nn.LayerNorm(width * 2),  # Replace BatchNorm
            nn.LeakyReLU(0.2),
            nn.Linear(width * 2, out_dim),
            nn.Sigmoid()  # bounds every output value to (0, 1)
        )

    def forward(self, z):
        """Return generated samples for the noise batch ``z``."""
        return self.model(z)

# Discriminator
class Discriminator(nn.Module):
    """Fully connected discriminator producing one probability per sample.

    Layout: Linear -> LeakyReLU(0.2) -> Dropout(0.3), twice, followed by a
    final Linear to a single unit and a Sigmoid.

    Args:
        in_dim: Size of each input sample. Defaults to the module-level
            ``input_dim``.
        width: Width of the second hidden layer (the first is ``2 * width``).
            Defaults to the module-level ``hidden_dim``.
    """

    def __init__(self, in_dim=None, width=None):
        super().__init__()
        # Fall back to the notebook-level constants so ``Discriminator()`` keeps working.
        in_dim = input_dim if in_dim is None else in_dim
        width = hidden_dim if width is None else width

        self.model = nn.Sequential(
            nn.Linear(in_dim, width * 2),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(width * 2, width),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(width, 1),
            nn.Sigmoid()  # output in (0, 1), as required by nn.BCELoss
        )

    def forward(self, x):
        """Return a ``[batch, 1]`` tensor of scores for the batch ``x``."""
        return self.model(x)

# Setup training: run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
generator = Generator().to(device)
discriminator = Discriminator().to(device)

# Optimizers: both networks use Adam with the same learning rate and betas
adam_settings = {'lr': 0.0002, 'betas': (0.5, 0.999)}
g_optimizer = torch.optim.Adam(generator.parameters(), **adam_settings)
d_optimizer = torch.optim.Adam(discriminator.parameters(), **adam_settings)

# Binary cross-entropy, applied to the discriminator's sigmoid output
criterion = nn.BCELoss()
# Training loop


# 1. Setup data: keep only the rows whose one-hot label has column 1 set
malicious_mask = y_tensor[:, 1].eq(1)
malicious_data = X_tensor[malicious_mask].squeeze(1)  # Remove channel dim
dataset = data.TensorDataset(malicious_data)
# drop_last=True discards a final batch smaller than batch_size
dataloader = data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
import torch.nn.utils as utils

# Training hyper-parameters
# NOTE(review): max_grad_norm, best_loss, patience, patience_counter and the
# `utils` import are not referenced by the training loop in this cell.
num_epochs = 10000
max_grad_norm = 1.0
best_loss = float('inf')
patience, patience_counter = 5, 0

# Training loop
# d_optimizer, g_optimizer and criterion are the objects created in the setup
# section of this cell, so every update is applied to the `generator` and
# `discriminator` instances built there.
try:
    for epoch in range(num_epochs):
        generator.train()
        discriminator.train()

        # Running sums for the per-epoch averages
        total_d_loss = 0
        total_g_loss = 0
        batches = 0

        for (real_samples,) in dataloader:
            current_batch_size = real_samples.size(0)

            # Train Discriminator
            d_optimizer.zero_grad()
            real_samples = real_samples.to(device)
            real_labels = torch.ones(current_batch_size, 1, device=device)
            fake_labels = torch.zeros(current_batch_size, 1, device=device)

            z = torch.randn(current_batch_size, latent_dim, device=device)
            fake_samples = generator(z)

            d_real = discriminator(real_samples)
            # detach(): the discriminator step must not backpropagate into the generator
            d_fake = discriminator(fake_samples.detach())
            d_loss = (criterion(d_real, real_labels) +
                      criterion(d_fake, fake_labels)) / 2

            d_loss.backward()
            d_optimizer.step()

            # Train Generator: it is rewarded when the discriminator labels fakes as real
            g_optimizer.zero_grad()
            g_output = discriminator(fake_samples)
            g_loss = criterion(g_output, real_labels)
            g_loss.backward()
            g_optimizer.step()

            total_d_loss += d_loss.item()
            total_g_loss += g_loss.item()
            batches += 1

        # Generate samples
        if epoch % 10 == 0:
            # With drop_last=True the loader yields nothing when the dataset is
            # smaller than one batch; report NaN instead of dividing by zero.
            avg_d_loss = total_d_loss / batches if batches else float('nan')
            avg_g_loss = total_g_loss / batches if batches else float('nan')

            generator.eval()  # Set to eval mode
            with torch.no_grad():
                z = torch.randn(batch_size, latent_dim, device=device)  # Use full batch
                fake = generator(z)
                print(f"\nEpoch {epoch} Summary:")
                print(f"Average D loss: {avg_d_loss:.4f}")
                print(f"Average G loss: {avg_g_loss:.4f}")
                print(f"Sample output: {fake[0][:10].cpu().numpy()}\n")
            generator.train()  # Back to train mode

except Exception as e:
    # NOTE(review): failures are logged rather than re-raised, so later cells
    # still execute after a failed training run.
    print(f"Error during training: {str(e)}")
    import traceback
    traceback.print_exc()


Epoch 0 Summary:
Average D loss: 0.7019
Average G loss: 0.6676
Sample output: [0.40641993 0.51213396 0.26225832 0.7073802  0.49365905 0.512072
 0.5425186  0.65626186 0.7273992  0.46501726]


Epoch 10 Summary:
Average D loss: 0.7011
Average G loss: 0.6670
Sample output: [0.46024296 0.3321407  0.24131423 0.42760295 0.46592218 0.49297282
 0.39785013 0.63778764 0.61452615 0.48506957]


Epoch 20 Summary:
Average D loss: 0.7021
Average G loss: 0.6662
Sample output: [0.29485673 0.30687013 0.22003867 0.4762129  0.4585861  0.49001122
 0.47168234 0.64999634 0.6198393  0.32966527]


Epoch 30 Summary:
Average D loss: 0.7013
Average G loss: 0.6651
Sample output: [0.30894536 0.31976867 0.21658246 0.68099475 0.60940033 0.5584117
 0.49394962 0.57751983 0.46970698 0.39768016]


Epoch 40 Summary:
Average D loss: 0.7020
Average G loss: 0.6650
Sample output: [0.45597905 0.51905435 0.34841186 0.48495027 0.40114132 0.37034667
 0.41007972 0.6965505  0.50605434 0.4538072 ]


Epoch 50 Summary:
Average D loss:

KeyboardInterrupt: 

# Create Data Loader
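Create a PyTorch DataLoader for efficient batch processing during GAN training. (The DataLoader itself is built in the training cell above; the cell below defines helpers for saving checkpoints, metrics, and generated samples.)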

In [22]:
import os
import json
import numpy as np
import torch
from datetime import datetime

# Output layout: checkpoints go to <output_dir>/models, generated arrays to <output_dir>/samples
output_dir = "gan_output"
model_dir = os.path.join(output_dir, "models")
samples_dir = os.path.join(output_dir, "samples")

# Create all three directories; exist_ok=True makes re-running the cell harmless
for directory in (output_dir, model_dir, samples_dir):
    os.makedirs(directory, exist_ok=True)

# Save function for training metrics
def save_metrics(metrics, epoch):
    """Write ``metrics`` as indented JSON to ``metrics_epoch_<epoch>.json``.

    The file is created directly inside the module-level ``output_dir``.

    Args:
        metrics: JSON-serialisable object to store.
        epoch: Value embedded in the file name.
    """
    target_path = os.path.join(output_dir, f"metrics_epoch_{epoch}.json")
    with open(target_path, 'w') as handle:
        json.dump(metrics, handle, indent=4)

# Save function for generated samples
def save_samples(samples, epoch):
    """Save a tensor of generated samples as ``samples_epoch_<epoch>.npy``.

    The file is written inside the module-level ``samples_dir``.

    Args:
        samples: Tensor to store; it may live on any device and may still be
            attached to an autograd graph.
        epoch: Value embedded in the file name.
    """
    samples_file = os.path.join(samples_dir, f"samples_epoch_{epoch}.npy")
    # detach() first: Tensor.numpy() raises on tensors that require grad
    np.save(samples_file, samples.detach().cpu().numpy())

# Save function for model checkpoints
def save_checkpoint(generator, discriminator, g_opt, d_opt, epoch):
    """Store the state of both networks and both optimizers in one file.

    The checkpoint is written with ``torch.save`` to
    ``checkpoint_epoch_<epoch>.pt`` inside the module-level ``model_dir``.

    Args:
        generator: Generator module whose ``state_dict()`` is saved.
        discriminator: Discriminator module whose ``state_dict()`` is saved.
        g_opt: Optimizer of the generator.
        d_opt: Optimizer of the discriminator.
        epoch: Epoch number, stored in the checkpoint and used in the file name.
    """
    destination = os.path.join(model_dir, f"checkpoint_epoch_{epoch}.pt")
    generator_weights = generator.state_dict()
    discriminator_weights = discriminator.state_dict()
    torch.save(
        {
            'epoch': epoch,
            'generator_state': generator_weights,
            'discriminator_state': discriminator_weights,
            'g_optimizer': g_opt.state_dict(),
            'd_optimizer': d_opt.state_dict(),
        },
        destination,
    )

# Add to training loop:
# Every 100 epochs, persist a checkpoint, the epoch's average losses and a
# batch of generated samples. Expects `epoch`, `total_d_loss`, `total_g_loss`
# and `batches` to hold the values of the epoch that just finished.
if epoch % 100 == 0:
    # Save models (g_optimizer / d_optimizer are the optimizers created in the training cell)
    save_checkpoint(generator, discriminator, g_optimizer, d_optimizer, epoch)

    # Save metrics; report NaN instead of dividing by zero when no batch was processed
    metrics = {
        'epoch': epoch,
        'd_loss': total_d_loss / batches if batches else float('nan'),
        'g_loss': total_g_loss / batches if batches else float('nan')
    }
    save_metrics(metrics, epoch)

    # Generate and save samples in eval mode without tracking gradients
    generator.eval()
    with torch.no_grad():
        z = torch.randn(batch_size, latent_dim, device=device)
        fake_samples = generator(z)
        save_samples(fake_samples, epoch)
    generator.train()  # restore training mode for the next epoch