In [1]:
# For tips on running notebooks in Google Colab, see
# https://pytorch.org/tutorials/beginner/colab
%matplotlib inline

In [2]:
#%matplotlib inline
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

# Set random seed for reproducibility
manualSeed = 999
#manualSeed = random.randint(1, 10000) # use if you want new results
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)
torch.use_deterministic_algorithms(True) # Needed for reproducible results

  Referenced from: /Users/siyuwu/anaconda3/lib/python3.11/site-packages/torchvision/image.so
  Expected in: /Users/siyuwu/anaconda3/lib/python3.11/site-packages/torch/lib/libc10.dylib
  warn(


Random Seed:  999


In [3]:
# Generator Code

class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(in_features=100, out_features=128), # Random noise dimension to 128
            nn.LeakyReLU(0.2),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 3) # Output features to match tabular data columns
        )

    def forward(self, x):
        return self.model(x)

Discriminator Code


In [4]:
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(in_features=3, out_features=512), # Input features to match tabular data columns
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

In [5]:
import sys, os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# The datasets needed are computed by the `ComputeGluon.py` script in PseudoData
filename1='https://raw.githubusercontent.com/rabah-khalek/TF_tutorials/master/PseudoData/gluon_NNPDF31_nlo_pch_as_0118_xmin1e-3.dat'
filename2='https://raw.githubusercontent.com/rabah-khalek/TF_tutorials/master/PseudoData/gluon_NNPDF31_nlo_pch_as_0118_xmin1e-4.dat'
filename3='https://raw.githubusercontent.com/rabah-khalek/TF_tutorials/master/PseudoData/gluon_NNPDF31_nlo_pch_as_0118_xmin1e-5.dat'
filename4='https://raw.githubusercontent.com/rabah-khalek/TF_tutorials/master/PseudoData/gluon_NNPDF31_nlo_pch_as_0118_xmin1e-6.dat'

# Headers to skip
lines_to_skip = 5

# Defining the columns (cv = central value, sd = standard deviation)
columns=["x", "gluon_cv", "gluon_sd"]

# Loading data from txt file
# Change filename1 to another filename for data that extends to lower x
# (see exercises at the bottom of this notebook)
df = pd.read_csv(filename1,
                 sep="\s+",
                 skiprows=lines_to_skip,
                 usecols=[0,1,2],
                 names=columns)


In [15]:
# Generate random noise as input for the generator
batch_size = 20
noise = torch.randn(batch_size, 100)

In [16]:
import torch.optim as optim

# Instantiate the generator and discriminator
generator = Generator()
discriminator = Discriminator()

# Binary Cross Entropy Loss
criterion = nn.BCELoss()

# Optimizers
lr = 0.0002
betas = (0.5, 0.999)
optimizerG = optim.Adam(generator.parameters(), lr=lr, betas=betas)
optimizerD = optim.Adam(discriminator.parameters(), lr=lr, betas=betas)


In [17]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch

class TabularDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        return torch.tensor(self.dataframe.iloc[idx].values, dtype=torch.float)
dataset = TabularDataset(df)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


In [18]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
generator = Generator().to(device)
discriminator = Discriminator().to(device)


In [19]:
num_epochs = 100
for epoch in range(num_epochs):
    for i, data in enumerate(dataloader, 0):
        # Update Discriminator: maximize log(D(x)) + log(1 - D(G(z)))
        discriminator.zero_grad()
        # Train with real data
        real_data = data.to(device)
        batch_size = real_data.size(0)
        label = torch.full((batch_size,), 1, dtype=torch.float, device=device) # Real data label = 1
        output = discriminator(real_data).view(-1)
        lossD_real = criterion(output, label)
        lossD_real.backward()
        D_x = output.mean().item()

        # Train with fake data
        noise = torch.randn(batch_size, 100, device=device)
        fake_data = generator(noise)
        label.fill_(0) # Fake data label = 0
        output = discriminator(fake_data.detach()).view(-1)
        lossD_fake = criterion(output, label)
        lossD_fake.backward()
        D_G_z1 = output.mean().item()
        lossD = lossD_real + lossD_fake
        optimizerD.step()

        # Update Generator: maximize log(D(G(z)))
        generator.zero_grad()
        label.fill_(1) # Fake labels are real for generator cost
        output = discriminator(fake_data).view(-1)
        lossG = criterion(output, label)
        lossG.backward()
        D_G_z2 = output.mean().item()
        optimizerG.step()

        if i % 50 == 0:
            print(f'[{epoch}/{num_epochs}][{i}/{len(dataloader)}] '
                  f'Loss_D: {lossD.item():.4f} Loss_G: {lossG.item():.4f} '
                  f'D(x): {D_x:.4f} D(G(z)): {D_G_z1:.4f} / {D_G_z2:.4f}')


[0/100][0/16] Loss_D: 1.5692 Loss_G: 0.7886 D(x): 0.3853 D(G(z)): 0.4546 / 0.4546
[1/100][0/16] Loss_D: 1.5681 Loss_G: 0.7880 D(x): 0.3858 D(G(z)): 0.4549 / 0.4549
[2/100][0/16] Loss_D: 1.5640 Loss_G: 0.7890 D(x): 0.3869 D(G(z)): 0.4544 / 0.4544
[3/100][0/16] Loss_D: 1.5646 Loss_G: 0.7879 D(x): 0.3868 D(G(z)): 0.4549 / 0.4549
[4/100][0/16] Loss_D: 1.5577 Loss_G: 0.7892 D(x): 0.3898 D(G(z)): 0.4544 / 0.4544
[5/100][0/16] Loss_D: 1.5846 Loss_G: 0.7880 D(x): 0.3789 D(G(z)): 0.4549 / 0.4549
[6/100][0/16] Loss_D: 1.5692 Loss_G: 0.7890 D(x): 0.3853 D(G(z)): 0.4544 / 0.4544
[7/100][0/16] Loss_D: 1.5872 Loss_G: 0.7884 D(x): 0.3780 D(G(z)): 0.4547 / 0.4547
[8/100][0/16] Loss_D: 1.5518 Loss_G: 0.7882 D(x): 0.3917 D(G(z)): 0.4548 / 0.4548
[9/100][0/16] Loss_D: 1.5633 Loss_G: 0.7877 D(x): 0.3874 D(G(z)): 0.4550 / 0.4550
[10/100][0/16] Loss_D: 1.5870 Loss_G: 0.7894 D(x): 0.3782 D(G(z)): 0.4542 / 0.4542
[11/100][0/16] Loss_D: 1.5381 Loss_G: 0.7880 D(x): 0.3978 D(G(z)): 0.4548 / 0.4548
[12/100][0/16]

In [20]:
num_samples = 100  # Number of synthetic samples want to generate
noise_dim = 100
noise = torch.randn(num_samples, noise_dim, device=device)


In [21]:
with torch.no_grad(): 
    generator.eval()  # Set the generator to evaluation mode
    synthetic_data = generator(noise)
    generator.train()  # Set it back to train mode if you're continuing training later


In [22]:
import pandas as pd

# real data had column names: 'x', 'gluon_cv', 'gluon_sd'
column_names = ['x', 'gluon_cv', 'gluon_sd']
synthetic_df = pd.DataFrame(synthetic_data.cpu().numpy(), columns=column_names)


In [23]:
# Inspect the first few rows of your synthetic data
print(synthetic_df.head())

# save the synthetic data to a CSV file
synthetic_df.to_csv('synthetic_data.csv', index=False)

          x  gluon_cv  gluon_sd
0  0.498872  0.101677  0.354555
1  0.522283  0.091723  0.390013
2  0.452715  0.056384 -0.148304
3  0.550166  0.146122 -0.042480
4  0.727594  0.597430 -0.381009
