In [None]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from glob import glob

In [None]:
# Number of numeric flow features per sample; used as the Generator's output
# width and the Critic's input width.
FEATURE_DIM = 74
# Length of the random noise vector fed to the Generator.
LATENT_DIM = 100

In [None]:
class FlowDataset(Dataset):
    """Numeric flow features read from a single CSV file.

    The ``Label`` column is dropped and only numeric columns are kept.

    Args:
        csv_file: Path to a CSV file that contains a ``Label`` column.
        transform: Optional callable applied to every sample. It receives the
            sample as a float32 NumPy array and must return something
            ``torch.as_tensor`` can convert (array or tensor).

    Attributes:
        data_frame: The raw frame exactly as read from ``csv_file``.
        features: Numeric feature columns (``Label`` removed) as a DataFrame.
        transform: The per-sample transform, or ``None``.

    Raises:
        KeyError: If the CSV has no ``Label`` column.
    """

    def __init__(self, csv_file, transform=None):
        self.data_frame = pd.read_csv(csv_file)

        self.features = self.data_frame.drop(columns=["Label"])
        self.features = self.features.select_dtypes(include=[np.number])

        # Convert to float32 once instead of doing a pandas ``iloc`` lookup and
        # a dtype conversion on every __getitem__ call. This is a snapshot:
        # later edits to ``self.features`` are not reflected in the samples.
        self._values = self.features.to_numpy(dtype=np.float32)

        self.transform = transform

    def __len__(self):
        return len(self._values)

    def __getitem__(self, idx):
        # Copy so that neither the transform nor the caller can mutate the cache.
        sample = self._values[idx].copy()
        # Previously ``transform`` was stored but silently ignored.
        if self.transform is not None:
            sample = self.transform(sample)
        return torch.as_tensor(sample, dtype=torch.float32)

In [None]:
class Generator(nn.Module):
    """MLP that maps a latent noise vector to a synthetic feature vector.

    Three hidden layers (256, 512, 1024 units) with LeakyReLU(0.2), followed
    by a Sigmoid, so every output value lies in [0, 1].

    Args:
        latent_dim: Size of the input noise vector. Defaults to the
            module-level ``LATENT_DIM`` when ``None``.
        feature_dim: Size of the generated feature vector. Defaults to the
            module-level ``FEATURE_DIM`` when ``None``.
    """

    def __init__(self, latent_dim=None, feature_dim=None):
        super().__init__()
        # Resolve defaults at call time so ``Generator()`` keeps using the
        # notebook's current constants, exactly as before.
        if latent_dim is None:
            latent_dim = LATENT_DIM
        if feature_dim is None:
            feature_dim = FEATURE_DIM
        self.latent_dim = latent_dim
        self.feature_dim = feature_dim

        # Layer order and indices are unchanged, so existing checkpoints
        # (state_dict keys ``model.0.*`` ... ``model.6.*``) still load.
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 1024),
            nn.LeakyReLU(0.2),
            nn.Linear(1024, feature_dim),
            nn.Sigmoid()
        )

    def forward(self, z):
        """Return generated samples for a batch of latent vectors ``z``."""
        return self.model(z)

In [None]:
class Critic(nn.Module):
    """MLP critic that scores a feature vector with a single unbounded value.

    Three hidden layers (512, 256, 128 units) with LeakyReLU(0.2) and a linear
    output with no activation.

    Args:
        feature_dim: Size of the input feature vector. Defaults to the
            module-level ``FEATURE_DIM`` when ``None``.
    """

    def __init__(self, feature_dim=None):
        super().__init__()
        # Resolve the default at call time so ``Critic()`` keeps using the
        # notebook's current constant, exactly as before.
        if feature_dim is None:
            feature_dim = FEATURE_DIM
        self.feature_dim = feature_dim

        # Layer order and indices are unchanged, so existing checkpoints
        # (state_dict keys ``model.0.*`` ... ``model.6.*``) still load.
        self.model = nn.Sequential(
            nn.Linear(feature_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        """Return one score per row of ``x``, shaped ``(batch, 1)``."""
        return self.model(x)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Learning rate shared by the critic and generator RMSprop optimisers.
learning_rate = 0.0001

In [None]:
# The generator is updated only on batches whose index is a multiple of
# n_critic; the critic is updated on every batch.
n_critic = 5
# Critic weights are clamped to [-clip_value, clip_value] after each critic step.
clip_value = 0.01
# Mini-batch size (samples per training batch).
batch_size = 64
# Target number of training batches; the training cell derives the epoch count
# from steps / batches-per-epoch.
steps = 1e6

In [None]:
# Train one weight-clipped WGAN per CSV file found under ./data.
# After the loop, `dataset`, `basename`, `generator`, `critic` and both
# optimisers refer to the last file that was actually trained on.

# glob order is filesystem-dependent; sort so runs are reproducible.
files = sorted(glob("./data/*.csv"))

# Periodic sample dumps are written here; create it up front so `to_csv`
# cannot fail on a missing directory.
os.makedirs("./fake", exist_ok=True)

for file in files:
    loaded = FlowDataset(file)
    print(f"Training on {file}: {len(loaded)} samples")

    # An empty file would otherwise raise ZeroDivisionError in the epoch
    # computation below.
    if len(loaded) == 0:
        print(f"Skipping {file}: no rows to train on")
        continue

    # Fail early with a readable message instead of a matmul shape error
    # deep inside the first critic forward pass.
    n_features = loaded.features.shape[1]
    if n_features != FEATURE_DIM:
        raise ValueError(
            f"{file} has {n_features} numeric feature columns, "
            f"but the models expect FEATURE_DIM={FEATURE_DIM}"
        )

    # Only rebind the notebook-level names once the file is known to be usable.
    dataset = loaded
    basename = os.path.splitext(os.path.basename(file))[0]

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # The generator ends in a Sigmoid, so its samples live in [0, 1]. Min-max
    # scale the real data to the same range so the critic compares like with
    # like; these are the same per-column statistics the denormalisation cell
    # uses to map generated samples back to the original scale.
    # NOTE(review): non-finite values (inf/NaN) in the CSV are not handled
    # here - confirm the inputs are clean.
    feature_min = torch.tensor(
        dataset.features.min().values.astype(np.float32), device=device
    )
    feature_max = torch.tensor(
        dataset.features.max().values.astype(np.float32), device=device
    )
    feature_range = feature_max - feature_min
    # Constant columns have zero range: divide by 1 so they map to 0 (and
    # denormalise back to their constant) instead of producing NaN.
    feature_range = torch.where(
        feature_range > 0, feature_range, torch.ones_like(feature_range)
    )

    generator = Generator().to(device)
    critic = Critic().to(device)
    critic_optimizer = optim.RMSprop(critic.parameters(), lr=learning_rate)
    generator_optimizer = optim.RMSprop(generator.parameters(), lr=learning_rate)

    epochs = int(steps / len(dataloader)) + 1

    for epoch in range(epochs):
        for i, real_data in enumerate(dataloader):
            real_data = (real_data.to(device) - feature_min) / feature_range
            # Use a separate name: the last batch may be smaller, and
            # rebinding the global `batch_size` would corrupt the
            # hyper-parameter for later cells and re-runs.
            current_batch_size = real_data.size(0)

            # ---- Critic step: maximise E[critic(real)] - E[critic(fake)].
            critic_optimizer.zero_grad()

            critic_real = critic(real_data)

            z = torch.randn(current_batch_size, LATENT_DIM, device=device)
            # No generator gradients are needed for the critic update.
            with torch.no_grad():
                fake_data = generator(z)
            critic_fake = critic(fake_data)

            critic_loss = -(torch.mean(critic_real) - torch.mean(critic_fake))
            critic_loss.backward()
            critic_optimizer.step()

            # Weight clipping after every critic step.
            with torch.no_grad():
                for p in critic.parameters():
                    p.clamp_(-clip_value, clip_value)

            # ---- Generator step, once every n_critic batches.
            if i % n_critic == 0:
                generator_optimizer.zero_grad()

                z = torch.randn(current_batch_size, LATENT_DIM, device=device)
                fake_data = generator(z)
                critic_fake = critic(fake_data)

                generator_loss = -torch.mean(critic_fake)
                generator_loss.backward()
                generator_optimizer.step()

        # One progress line per epoch, overwritten in place via "\r". Adjacent
        # literals are used instead of a backslash continuation, which leaked
        # the source indentation into the printed text.
        print(
            f"\rEpoch [{epoch:3d}/{epochs}] Batch {i}/{len(dataloader)} "
            f"Loss D: {critic_loss.item():.4f}, Loss G: {generator_loss.item():.4f}",
            end="",
        )

        if epoch % 100 == 0:
            with torch.no_grad():
                z = torch.randn(64, LATENT_DIM, device=device)
                samples = generator(z).cpu().numpy()
            # Samples are in the normalised [0, 1] space. The dataset name is
            # part of the filename so that files from different CSVs no longer
            # overwrite each other.
            sample_frame = pd.DataFrame(samples, columns=dataset.features.columns)
            sample_frame.to_csv(f"./fake/{basename}_fake_data_epoch_{epoch}.csv")

    # Per-dataset checkpoint. The scaling statistics are stored alongside the
    # weights because generated samples cannot be denormalised without them.
    torch.save({
        'generator_state_dict': generator.state_dict(),
        'critic_state_dict': critic.state_dict(),
        'generator_optimizer_state_dict': generator_optimizer.state_dict(),
        'critic_optimizer_state_dict': critic_optimizer.state_dict(),
        'feature_min': feature_min.cpu(),
        'feature_max': feature_max.cpu(),
    }, f'wgan_cicflowmeter_{basename}.pth')

print("\nTraining complete.")

In [None]:
def generate_fake_data(generator, latent_dim, num_samples=1000, columns=None):
    """Sample synthetic rows from a generator and return them as a DataFrame.

    Args:
        generator: Module mapping ``(num_samples, latent_dim)`` noise to
            feature vectors.
        latent_dim: Size of the noise vector the generator expects.
        num_samples: Number of rows to generate.
        columns: Column labels for the result. When ``None``, the feature
            columns of the notebook-level ``dataset`` are used.

    Returns:
        ``pandas.DataFrame`` with ``num_samples`` rows of raw generator output.
    """
    if columns is None:
        columns = dataset.features.columns

    # Sample on whatever device the generator lives on; fall back to the
    # notebook-level ``device`` only for a parameter-free module.
    first_param = next(generator.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    # Switch to eval mode only for the duration of the call, so sampling does
    # not silently change the mode of a model that is still being trained.
    was_training = generator.training
    generator.eval()
    try:
        with torch.no_grad():
            z = torch.randn(num_samples, latent_dim, device=target_device)
            samples = generator(z).cpu().numpy()
    finally:
        generator.train(was_training)

    return pd.DataFrame(samples, columns=columns)

# Draw 1000 synthetic rows from the `generator` currently in scope.
generated_samples = generate_fake_data(generator, LATENT_DIM, num_samples=1000)

def denormalize(data, min_values, max_values):
    """Invert min-max scaling: ``data * (max_values - min_values) + min_values``.

    ``data`` is expected to hold values normalised to the range [0, 1];
    ``min_values`` and ``max_values`` are the per-feature bounds of the
    original scale. Adapt this if the data was normalised differently.
    """
    value_range = max_values - min_values
    return data * value_range + min_values

# Per-column minimum and maximum of the numeric features of the `dataset`
# currently in scope.
min_values = dataset.features.min().values
max_values = dataset.features.max().values

# Rescale the generated samples with those column-wise bounds.
denormalize_data = denormalize(generated_samples, min_values, max_values)

# Write the rescaled samples to disk (the DataFrame index is written too).
denormalize_data.to_csv("./denormalized_fake_data.csv")

In [None]:
# Save the weights and optimiser state of the generator and critic currently
# in scope to a single checkpoint with a fixed filename.
torch.save({
    'generator_state_dict': generator.state_dict(),
    'critic_state_dict': critic.state_dict(),
    'generator_optimizer_state_dict': generator_optimizer.state_dict(),
    'critic_optimizer_state_dict': critic_optimizer.state_dict(),
}, 'wgan_cicflow_model.pth')