# Generating flow duration and flow size

**NOTE(20/12/23)** So far with the training, the GAN cannot emulate the distribution of training data accurately with a small number of data samples and a smaller batch. Increasing both values seems to improve the accuracy of distribution. Model will need to optimised to find the most appropriate hyperparameter values.


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import pandas as pd
import time

## Discriminator class

In [9]:
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            # Input is 2D, first hidden layer is composed of 256 neurons with ReLU activation
            nn.Linear(2, 256), 
            nn.ReLU(),

            # Have to use dropout to avoid overfitting
            nn.Dropout(0.3),

            # second and third layers are composed to 128 and 64 neurons, respectively
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            # output is composed of a single neuron with sigmoidal activation to represent a probability
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        output = self.model(x)
        return output

## Generator class 

In [10]:
class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(2, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
        )

    def forward(self, x):
        output = self.model(x)
        return output

### Prepare data

In [11]:
def train_data_length(data, length):
    return data[:length]
    
def load_data():
    data = torch.load("data.pt")
    data = data.to(torch.float32)
    train_data = train_data_length(data,TRAINING_DATA_LENGTH)
    return train_data

In [12]:
TRAINING_DATA_LENGTH = 1024
BATCH_SIZE = 32

train_data = load_data()
train_labels = torch.zeros(TRAINING_DATA_LENGTH)
train_set = [(train_data[i], train_labels[i]) for i in range(TRAINING_DATA_LENGTH)]
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, drop_last = True)

print("Done")

Done


## Prepare for training

In [13]:
generator = Generator()
discriminator = Discriminator()

# Before training the models, need to define some parameters
RANDOM_SEED = 77  # Ensures reproducibility in the randomness
LEARNING_RATE = 0.001  # Used to adapt the network weights
NUM_EPOCHS = 10  # How many repetitions of training using the whole training set will be performed
loss_function = nn.BCELoss()
optimiser_discriminator = optim.Adam(discriminator.parameters(), lr = LEARNING_RATE)
optimiser_generator = optim.Adam(generator.parameters(), lr = LEARNING_RATE)

## Training process

In [14]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    # Taking the real samples of the current batch from the data loader and assign them to real_samples
    # The first dimension of the tensor has the number of elements equal to the batch size. 
    # This is the standard way of organising data in PyTorch, with each line of the tensor representing one sample from the batch.
    for n, (real_samples, _) in enumerate(tqdm(train_loader)):
        # DATA FOR TRAINING THE DISCRIMINATOR

        # Using torch.ones() to create labels with the value 1 for real samples, and then assign to real_samples_labels
        real_samples_labels = torch.ones((BATCH_SIZE, 1))

        # Create the generated samples by storing random data in latent_space_samples
        # This is fed into the generator to obtain generated_samples
        torch.manual_seed(RANDOM_SEED)
        latent_space_samples = torch.randn((BATCH_SIZE, 2))
        generated_samples = generator(latent_space_samples)


        # Use torch.zeros() to assign 0 to the labels for the generated samples
        generated_samples_labels = torch.zeros((BATCH_SIZE, 1))


        # Concatenate the real and generated samples and labels and store them in all_samples
        # and all_samples_labels to train the discriminator
        all_samples = torch.cat((real_samples, generated_samples))
        all_samples_labels = torch.cat((real_samples_labels, generated_samples_labels))



        # TRAINING THE DISCRIMINATOR

        # Clear the gradients at each training step to avoid accumulating them
        discriminator.zero_grad()

        # Calculate the output of the discriminator from the training data in all_samples
        output_discriminator = discriminator(all_samples)

        # Calculate the loss function using discriminator output and all the labels
        loss_discriminator = loss_function(output_discriminator, all_samples_labels)
        
        # Calculate the gradients to update the weights
        loss_discriminator.backward()

        # Update the discriminator weights
        optimiser_discriminator.step()



        # DATA FOR TRAINING THE GENERATOR
        
        # Storing random data in latent_space_samples with a number of lines to equal batch_size
        torch.manual_seed(RANDOM_SEED)
        latent_space_samples = torch.randn((BATCH_SIZE, 2))

        # TRAINING THE GENERATOR
        generator.zero_grad()
        generated_samples = generator(latent_space_samples)

        # Feeding generator's output into the discriminator and store its output, which is used
        # as the output of the whole model
        output_discriminator_generated = discriminator(generated_samples)

        # Calculate the loss function
        loss_generator = loss_function(output_discriminator_generated, real_samples_labels)
        
        # Calculate and update the gradients
        # REMEMBER:
        # When the generator is trained, the discriminator weights are frozen since optimiser_generator
        # was created with its first argument equal to generator.parameters()
        loss_generator.backward()
        optimiser_generator.step()


        # Show loss
        if epoch % 10 == 0 and n == BATCH_SIZE - 1:
            print(f"Epoch: {epoch} Loss D.: {loss_discriminator}")
            print(f"Epoch: {epoch} Loss G.: {loss_generator}\n")

end_time = time.time()
run_time = round(end_time - start_time, 2)
print(f"Run time for {NUM_EPOCHS} epoch(s) is {run_time} seconds.")
print("done")

  0%|          | 0/32 [00:00<?, ?it/s]

Epoch: 0 Loss D.: 50.00044631958008
Epoch: 0 Loss G.: 8.099625587463379



  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

Run time for 10 epoch(s) is 4.73 seconds.
done


## Generating synthetic data from the trained adversaries

In [15]:
# Using 1000 samples of random noise to generate 1000 samples of synthetic data
torch.manual_seed(RANDOM_SEED)
latent_space_samples = torch.randn(1000, 2)
generated_samples = generator(latent_space_samples)

generated_samples = generated_samples.detach().numpy()
df = pd.DataFrame(generated_samples, columns = ["Duration", "Size"])
print(df)

     Duration      Size
0   -0.099430 -0.031040
1   -0.116570  0.035047
2   -0.287762  0.034298
3   -0.164122  0.091254
4   -0.078110 -0.051378
..        ...       ...
995 -0.095357 -0.005502
996 -0.110841 -0.021174
997 -0.089764 -0.039157
998 -0.209945  0.062785
999 -0.249876 -0.099508

[1000 rows x 2 columns]
