In [81]:
from sklearn.datasets import make_classification
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import numpy as np

In [2]:
X, y = make_classification(n_samples=1000, n_features=10, n_informative=6)
X = torch.tensor(X, dtype=torch.float32)

loader = DataLoader(TensorDataset(X), batch_size=64, shuffle=True)

In [3]:
class BayesianLinear(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.mu = nn.Parameter(torch.randn(out_features, in_features) * 0.1)
        self.log_sigma = nn.Parameter(torch.full((out_features, in_features), -3.0))
        self.bias_mu = nn.Parameter(torch.zeros(out_features))
        self.bias_log_sigma = nn.Parameter(torch.full((out_features,), -3.0))

    def forward(self, x):
        weight = self.mu + torch.exp(self.log_sigma) * torch.randn_like(self.mu)
        bias = self.bias_mu + torch.exp(self.bias_log_sigma) * torch.randn_like(self.bias_mu)
        return x @ weight.t() + bias

    def kl_loss(self):
        return 0.5 * torch.sum(
            torch.exp(2 * self.log_sigma) + self.mu**2 - 1 - 2 * self.log_sigma
        ) + 0.5 * torch.sum(
            torch.exp(2 * self.bias_log_sigma) + self.bias_mu**2 - 1 - 2 * self.bias_log_sigma
        )

class BayesianGenerator(nn.Module):
    def __init__(self, z_dim, hidden_dim, data_dim):
        super().__init__()
        self.fc1 = BayesianLinear(z_dim, hidden_dim)
        self.fc2 = BayesianLinear(hidden_dim, data_dim)
        self.relu = nn.ReLU()

    def forward(self, z):
        x = self.relu(self.fc1(z))
        return self.fc2(x)

    def kl_loss(self):
        return self.fc1.kl_loss() + self.fc2.kl_loss()


In [4]:
class Discriminator(nn.Module):
    def __init__(self, data_dim, hidden_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(data_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)


In [50]:
z_dim = 16
gen = BayesianGenerator(z_dim, 32, X.shape[1])
disc = Discriminator(X.shape[1], 32)

g_opt = torch.optim.Adam(gen.parameters(), lr=2e-4)
d_opt = torch.optim.Adam(disc.parameters(), lr=2e-4)

for epoch in range(100):
    for real_data, in loader:
        batch_size = real_data.size(0)
        
        # Train Discriminator
        z = torch.randn(batch_size, z_dim)
        fake_data = gen(z).detach()
        d_real = disc(real_data)
        d_fake = disc(fake_data)
        d_loss = -torch.mean(torch.log(d_real + 1e-8) + torch.log(1 - d_fake + 1e-8))

        d_opt.zero_grad()
        d_loss.backward()
        d_opt.step()

        # Train Generator
        z = torch.randn(batch_size, z_dim)
        fake_data = gen(z)
        g_preds = disc(fake_data)
        g_loss = -torch.mean(torch.log(g_preds + 1e-8))

        kl = gen.kl_loss() / len(loader.dataset)
        total_g_loss = g_loss + kl

        g_opt.zero_grad()
        total_g_loss.backward()
        g_opt.step()
    
    print(f"Epoch {epoch+1} | D Loss: {d_loss.item():.4f} | G Loss: {g_loss.item():.4f} | KL: {kl.item():.4f}")


Epoch 1 | D Loss: 1.4758 | G Loss: 0.6935 | KL: 2.1879
Epoch 2 | D Loss: 1.4853 | G Loss: 0.6669 | KL: 2.1852
Epoch 3 | D Loss: 1.4666 | G Loss: 0.6463 | KL: 2.1825
Epoch 4 | D Loss: 1.4217 | G Loss: 0.6662 | KL: 2.1799
Epoch 5 | D Loss: 1.4226 | G Loss: 0.6096 | KL: 2.1772
Epoch 6 | D Loss: 1.3995 | G Loss: 0.6309 | KL: 2.1746
Epoch 7 | D Loss: 1.3489 | G Loss: 0.6312 | KL: 2.1720
Epoch 8 | D Loss: 1.3164 | G Loss: 0.6255 | KL: 2.1693
Epoch 9 | D Loss: 1.3158 | G Loss: 0.6318 | KL: 2.1667
Epoch 10 | D Loss: 1.3602 | G Loss: 0.6476 | KL: 2.1641
Epoch 11 | D Loss: 1.3732 | G Loss: 0.6442 | KL: 2.1615
Epoch 12 | D Loss: 1.3546 | G Loss: 0.6293 | KL: 2.1589
Epoch 13 | D Loss: 1.2750 | G Loss: 0.6236 | KL: 2.1563
Epoch 14 | D Loss: 1.2696 | G Loss: 0.6252 | KL: 2.1537
Epoch 15 | D Loss: 1.2408 | G Loss: 0.6626 | KL: 2.1511
Epoch 16 | D Loss: 1.2251 | G Loss: 0.6433 | KL: 2.1486
Epoch 17 | D Loss: 1.1852 | G Loss: 0.6276 | KL: 2.1460
Epoch 18 | D Loss: 1.2307 | G Loss: 0.6279 | KL: 2.1434
E

In [5]:
# ------------------ Network Definitions ------------------
class Generator(nn.Module):
    def __init__(self, z_dim, hidden_dim, output_dim):
        super(Generator, self).__init__()
        self.fc1 = nn.Linear(z_dim, hidden_dim)
        #self.fc1 = BayesianLinear(z_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        #self.fc2 = BayesianLinear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, z):
        x = self.relu(self.fc1(z))
        return self.fc2(x)

    
class Discriminator(nn.Module):
    def __init__(self, data_dim, hidden_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(data_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)


def sghmc_update(params, grads, momentum, step_size, friction, noise_std):
    for p, g, m in zip(params, grads, momentum):
        if g is None: continue
        m.data = (1 - friction) * m.data + step_size * g + noise_std * torch.randn_like(p)
        p.data -= m.data

# ------------------ Training Loop ------------------
def train_bayesian_gan_sghmc(real_data_loader, z_dim=16, hidden_dim=32, output_dim=X.shape[1], 
                              num_epochs=100, step_size=1e-3, friction=0.05):
    G = Generator(z_dim, hidden_dim, output_dim)
    D = Discriminator(output_dim, hidden_dim)
    d_optimizer = torch.optim.Adam(D.parameters(), lr=1e-4)

    # SGHMC state for Generator
    g_params = list(G.parameters())
    g_momentum = [torch.zeros_like(p) for p in g_params]
    noise_std = torch.sqrt(torch.tensor(2 * step_size * friction))

    for epoch in range(num_epochs):
        for real_batch, in real_data_loader:
            real_batch = real_batch.float()
            batch_size = real_batch.size(0)

            # ------------------ Train Discriminator ------------------
            z = torch.randn(batch_size, z_dim)
            fake_data = G(z).detach()
            
            d_real = D(real_batch)
            d_fake = D(fake_data)
            d_loss = -torch.mean(torch.log(d_real + 1e-8) + torch.log(1 - d_fake + 1e-8))

            d_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()

            # ------------------ SGHMC Generator Update ------------------
            real_labels = torch.zeros(batch_size, 1)
            z = torch.randn(batch_size, z_dim)
            fake_data = G(z)
            g_loss = F.binary_cross_entropy(D(fake_data), real_labels)

            G.zero_grad()
            g_loss.backward()
            g_grads = [p.grad for p in g_params]
            sghmc_update(g_params, g_grads, g_momentum, step_size, friction, noise_std)

        print(f"Epoch {epoch} | D Loss: {d_loss.item():.4f} | G Loss: {g_loss.item():.4f}")

    return G

train_bayesian_gan_sghmc(loader)

Epoch 0 | D Loss: 1.3185 | G Loss: 0.5994
Epoch 1 | D Loss: 1.1703 | G Loss: 0.4114
Epoch 2 | D Loss: 1.1065 | G Loss: 0.4029
Epoch 3 | D Loss: 1.2548 | G Loss: 0.6963
Epoch 4 | D Loss: 1.2957 | G Loss: 0.3425
Epoch 5 | D Loss: 1.4968 | G Loss: 0.5220
Epoch 6 | D Loss: 0.9136 | G Loss: 0.0736
Epoch 7 | D Loss: 1.3639 | G Loss: 0.3352
Epoch 8 | D Loss: 0.9493 | G Loss: 0.0696
Epoch 9 | D Loss: 0.8289 | G Loss: 0.0952
Epoch 10 | D Loss: 0.8122 | G Loss: 0.0018
Epoch 11 | D Loss: 0.8407 | G Loss: 0.0764
Epoch 12 | D Loss: 1.1089 | G Loss: 0.0519
Epoch 13 | D Loss: 0.8390 | G Loss: 0.0180
Epoch 14 | D Loss: 1.0571 | G Loss: 0.0127
Epoch 15 | D Loss: 1.0992 | G Loss: 2.5099
Epoch 16 | D Loss: 0.8154 | G Loss: 0.0000
Epoch 17 | D Loss: 0.8250 | G Loss: 0.0000
Epoch 18 | D Loss: 0.8223 | G Loss: 0.0000
Epoch 19 | D Loss: 0.8103 | G Loss: 0.0000
Epoch 20 | D Loss: 0.8103 | G Loss: 0.0002
Epoch 21 | D Loss: 0.9751 | G Loss: 0.0001
Epoch 22 | D Loss: 0.7716 | G Loss: 0.0000
Epoch 23 | D Loss: 0.

Generator(
  (fc1): Linear(in_features=16, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=10, bias=True)
  (relu): ReLU()
)

## Bayesian CTGAN

## Functions

In [77]:
class Generator(nn.Module):
    def __init__(self, z_dim, hidden_dim, output_dim):
        super(Generator, self).__init__()
        self.fc1 = nn.Linear(z_dim, hidden_dim)
        #self.fc1 = BayesianLinear(z_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        #self.fc2 = BayesianLinear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, z):
        x = self.relu(self.fc1(z))
        return self.fc2(x)
    
class Discriminator(nn.Module):
    def __init__(self, data_dim, hidden_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(data_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

In [80]:
def gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
    """Deals with the instability of the gumbel_softmax for older versions of torch.

    For more details about the issue:
    https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing

    Args:
        logits […, num_features]:
            Unnormalized log probabilities
        tau:
            Non-negative scalar temperature
        hard (bool):
            If True, the returned samples will be discretized as one-hot vectors,
            but will be differentiated as if it is the soft sample in autograd
        dim (int):
            A dimension along which softmax will be computed. Default: -1.

    Returns:
        Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
    """
    for _ in range(10):
        transformed = F.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)
        if not torch.isnan(transformed).any():
            return transformed

    raise ValueError('gumbel_softmax returning NaN.')

def apply_activate(data, transformer):
    """Apply proper activation function to the output of the generator."""
    data_t = []
    st = 0
    for column_info in transformer.output_info_list:
        for span_info in column_info:
            if span_info.activation_fn == 'tanh':
                ed = st + span_info.dim
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif span_info.activation_fn == 'softmax':
                ed = st + span_info.dim
                transformed = gumbel_softmax(data[:, st:ed], tau=0.2)
                data_t.append(transformed)
                st = ed
            else:
                raise ValueError(f'Unexpected activation function {span_info.activation_fn}.')

    return torch.cat(data_t, dim=1)

def cond_loss(data, c, m, transformer):
    """Compute the cross entropy loss on the fixed discrete column."""
    loss = []
    st = 0
    st_c = 0
    for column_info in transformer.output_info_list:
        for span_info in column_info:
            if len(column_info) != 1 or span_info.activation_fn != 'softmax':
                # not discrete column
                st += span_info.dim
            else:
                ed = st + span_info.dim
                ed_c = st_c + span_info.dim
                tmp = F.cross_entropy(
                    data[:, st:ed], torch.argmax(c[:, st_c:ed_c], dim=1), reduction='none'
                )
                loss.append(tmp)
                st = ed
                st_c = ed_c

    loss = torch.stack(loss, dim=1)  # noqa: PD013

    return (loss * m).sum() / data.size()[0]

## Code

In [18]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 

In [25]:
X = adult.data.features
y = adult.data.targets

In [None]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States


In [44]:
distinct_values = {col: X[col].unique() for col in X.columns}

# Optional: print in readable format
for col, values in distinct_values.items():
    print(f"{col} size: {len(values)}")

age size: 74
workclass size: 10
fnlwgt size: 28523
education size: 16
education-num size: 16
marital-status size: 7
occupation size: 16
relationship size: 6
race size: 5
sex size: 2
capital-gain size: 123
capital-loss size: 99
hours-per-week size: 96
native-country size: 43


All except fnlwgt are categorical.

In [46]:
X.drop('fnlwgt', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop('fnlwgt', inplace=True, axis=1)


In [48]:
discrete_columns = X.columns
discrete_columns

Index(['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [52]:
from data_transformer import DataTransformer

transformer = DataTransformer()
transformer.fit(X, discrete_columns)
train_data = transformer.transform(X)

train_data

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
transformer.output_info_list

[[SpanInfo(dim=74, activation_fn='softmax')],
 [SpanInfo(dim=10, activation_fn='softmax')],
 [SpanInfo(dim=16, activation_fn='softmax')],
 [SpanInfo(dim=16, activation_fn='softmax')],
 [SpanInfo(dim=7, activation_fn='softmax')],
 [SpanInfo(dim=16, activation_fn='softmax')],
 [SpanInfo(dim=6, activation_fn='softmax')],
 [SpanInfo(dim=5, activation_fn='softmax')],
 [SpanInfo(dim=2, activation_fn='softmax')],
 [SpanInfo(dim=123, activation_fn='softmax')],
 [SpanInfo(dim=99, activation_fn='softmax')],
 [SpanInfo(dim=96, activation_fn='softmax')],
 [SpanInfo(dim=43, activation_fn='softmax')]]

In [55]:
from data_sampler import DataSampler

data_sampler = DataSampler(train_data, transformer.output_info_list, log_frequency=True)

In [72]:
c1, m1, col, opt = data_sampler.sample_condvec(3)
print(c1.shape, m1.shape, col, opt)

(3, 513) (3, 13) [1 9 5] [ 4 31  1]


In [None]:
batch_size = 64
embedding_dim = 128
generator_dim = 128
discriminator_dim = 128
data_dim = X.shape[1]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

generator = Generator(embedding_dim + data_sampler.dim_cond_vec(), generator_dim, data_dim).to(device)
discriminator = Discriminator(data_dim + data_sampler.dim_cond_vec(), discriminator_dim).to(device)

optimizerG = torch.optim.Adam(
            generator.parameters(),
            betas=(0.5, 0.9),
        )

optimizerD = torch.optim.Adam(
            discriminator.parameters(),
            betas=(0.5, 0.9),
        )

mean = torch.zeros(batch_size, embedding_dim, device=device)
std = mean + 1


In [None]:
num_epochs = 100
for epoch in range(num_epochs):
    for i in range(len(data_sampler)):
        # Sample data
        c1, m1, col, opt = data_sampler.sample_condvec(batch_size)
        c1 = torch.from_numpy(c1).to(device)
        m1 = torch.from_numpy(m1).to(device)
        
        fakez = torch.normal(mean=mean, std=std) # Sample noise
        fakez = torch.cat([fakez, c1], dim=1)

        # Generate fake data
        fake_data = generator(fakez)
        fakeact = apply_activate(fake_data)

        # Get real data
        perm = np.arange(batch_size)
        np.random.shuffle(perm)
        real = data_sampler.sample_data(train_data, batch_size, col[perm], opt[perm])
        c2 = c1[perm]

        # Train Discriminator
        fake_cat = torch.cat([fakeact, c1], dim=1)
        real_cat = torch.cat([real, c2], dim=1)
        y_fake = discriminator(fake_cat)
        y_real = discriminator(real_cat)
        
        d_loss = -torch.mean(torch.log(y_real + 1e-8) + torch.log(1 - y_fake + 1e-8))
        optimizerD.zero_grad()
        d_loss.backward()
        optimizerD.step()
        
        # Train Generator
        optimizerG.zero_grad()
        g_loss = -discriminator(fake_data, c1, m1).mean()
        g_loss.backward()
        optimizerG.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")