In [41]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [45]:
train_csn = pd.read_csv("/hpc/home/js1207/sparkECMO/Adult ECMO RL/train_data_continuous.csv")
train_csn = train_csn.csn.unique()
data = pd.read_csv("non_discritized_states.csv",index_col=0)
train_data = data[data['csn'].isin(train_csn)]
train_data.reset_index(drop=True, inplace=True)
train_data



scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)


In [46]:
train_data

array([[-2.85569266,  1.67395706,  1.70083535, ..., -0.72938625,
        -0.96354668,  1.35900057],
       [-2.85569266,  0.14115349,  0.09902896, ..., -0.67965633,
        -0.54515025,  1.35900057],
       [-2.85569266, -1.27374211,  0.40611101, ..., -0.59014247,
        -0.40568477,  1.35900057],
       ...,
       [ 0.74926432, -0.56629431, -1.43741875, ..., -0.03556639,
         0.15217714, -0.79712112],
       [ 0.74926432, -0.71367927, -0.58153127, ..., -0.03556639,
         0.15217714, -0.79712112],
       [ 0.74926432, -1.15583415, -3.02988818, ..., -0.03556639,
         0.15217714, -0.79712112]])

In [55]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

# We'll assume 'train_data' is already defined as a pandas dataframe with shape (11147, 43)

class MLPVAE(nn.Module):
    def __init__(self, input_dim=43, hidden_dim=64, latent_dim=16):
        super(MLPVAE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
        )
        self.mu_layer = nn.Linear(hidden_dim, latent_dim)
        self.logvar_layer = nn.Linear(hidden_dim, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )
    
    def encode(self, x):
        h = self.encoder(x)
        mu = self.mu_layer(h)
        logvar = self.logvar_layer(h)
        return mu, logvar
    
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std
    
    def decode(self, z):
        return self.decoder(z)
    
    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

def vae_loss(recon_x, x, mu, logvar):
    recon_loss = ((recon_x - x) ** 2).sum(dim=0)  # Compute loss per column
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss, kld

# Convert your dataframe to a torch tensor
train_tensor = torch.tensor(train_data, dtype=torch.float32)

# Create a simple dataset and dataloader
class TabularDataset(data.Dataset):
    def __init__(self, tensor):
        self.tensor = tensor
    def __getitem__(self, idx):
        return self.tensor[idx]
    def __len__(self):
        return self.tensor.shape[0]

dataset = TabularDataset(train_tensor)
dataloader = data.DataLoader(dataset, batch_size=64, shuffle=True)

# Initialize model, optimizer
model = MLPVAE(input_dim=43, hidden_dim=64, latent_dim=16)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train the model
num_epochs = 100
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    total_recon_loss = torch.zeros(train_tensor.shape[1])
    total_kld = 0
    num_batches = 0

    for batch in dataloader:
        optimizer.zero_grad()
        recon, mu, logvar = model(batch)
        recon_loss, kld = vae_loss(recon, batch, mu, logvar)
        loss = recon_loss.sum() + kld  # Total loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_recon_loss += recon_loss.detach()
        total_kld += kld.item()
        num_batches += 1

    avg_recon_loss = total_recon_loss / (num_batches * dataloader.batch_size)
    avg_kld = total_kld / (num_batches * dataloader.batch_size)

    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_recon_loss.numpy().mean():.3f}")
    # print(f"Avg per-column reconstruction loss: {}")

torch.save(model, "mlp_vae.pth")

Epoch 1/100, Average Loss: 0.932
Epoch 2/100, Average Loss: 0.741
Epoch 3/100, Average Loss: 0.637
Epoch 4/100, Average Loss: 0.574
Epoch 5/100, Average Loss: 0.532
Epoch 6/100, Average Loss: 0.504
Epoch 7/100, Average Loss: 0.482
Epoch 8/100, Average Loss: 0.468
Epoch 9/100, Average Loss: 0.454
Epoch 10/100, Average Loss: 0.445
Epoch 11/100, Average Loss: 0.436
Epoch 12/100, Average Loss: 0.427
Epoch 13/100, Average Loss: 0.420
Epoch 14/100, Average Loss: 0.414
Epoch 15/100, Average Loss: 0.409
Epoch 16/100, Average Loss: 0.404
Epoch 17/100, Average Loss: 0.400
Epoch 18/100, Average Loss: 0.395
Epoch 19/100, Average Loss: 0.393
Epoch 20/100, Average Loss: 0.389
Epoch 21/100, Average Loss: 0.386
Epoch 22/100, Average Loss: 0.383
Epoch 23/100, Average Loss: 0.381
Epoch 24/100, Average Loss: 0.378
Epoch 25/100, Average Loss: 0.377
Epoch 26/100, Average Loss: 0.374
Epoch 27/100, Average Loss: 0.373
Epoch 28/100, Average Loss: 0.372
Epoch 29/100, Average Loss: 0.368
Epoch 30/100, Average L

In [58]:
torch.save(model, "mlp_vae.pth")

In [61]:
model = torch.load("mlp_vae.pth")
model.eval()  # Set to evaluation mode if not training

import torch

def generate_synthetic_data(model, sample, column_idx, new_value):
    """
    Modifies a specific column in the input and generates a synthetic output.
    
    :param model: Trained MLPVAE model
    :param sample: A single input sample (1D tensor)
    :param column_idx: Index of the column to modify
    :param new_value: New value to assign to the column
    :return: Generated output with modified column
    """
    model.eval()
    
    # Convert to batch format (1 sample)
    sample = sample.clone().unsqueeze(0)  # Shape (1, input_dim)
    
    # Encode to latent space
    with torch.no_grad():
        mu, logvar = model.encode(sample)
        z = model.reparameterize(mu, logvar)

    # Modify the column in latent space (alternative: modify directly in input)
    modified_sample = sample.clone()
    modified_sample[0, column_idx] = new_value  # Change the specified column

    # Re-encode after modification
    with torch.no_grad():
        new_mu, new_logvar = model.encode(modified_sample)
        new_z = model.reparameterize(new_mu, new_logvar)

    # Decode back to see changes
    generated_output = model.decode(new_z)

    return generated_output.squeeze().detach().numpy()  # Convert back to NumPy for easier analysis

# Example usage:
sample_idx = 0  # Pick any row from your dataset
sample_data = train_tensor[sample_idx]  # Original sample

column_to_change = 5  # Example: Modify column 5
new_value = 2.0  # New value to assign

synthetic_output = generate_synthetic_data(model, sample_data, column_to_change, new_value)

print("Modified synthetic output:", synthetic_output)


Modified synthetic output: [-1.1699022   0.42113316  0.46528575  0.38823992  0.3147081  -0.24146605
  1.0364234   0.01297785  0.03269807 -0.16800848 -0.03724345  0.23910508
 -0.01513278  1.3327388  -0.64124554  1.1138294   0.54673445 -1.0044123
 -0.08894157 -0.16174182 -0.3544161  -0.11003835  1.2203125   0.14217015
 -0.36982304 -0.2851519   1.8047109  -0.02481666  0.06410962 -0.6459687
  0.07594274  0.05963591 -0.62476987 -0.12943527  0.35917825 -0.95582557
  0.5780245  -0.17491864 -0.79985195 -0.08835298 -0.7651638  -0.874994
  1.542209  ]


  model = torch.load("mlp_vae.pth")


In [33]:
train_csn

array([   43640203,  2178911314,  2207761225,  4510511039,  6648317299,
        7270121193, 10017307254, 10057670084, 10243791111, 11197349249,
       11367061032, 11598700132, 12128379018, 12156649330, 14719067105,
       15668590152, 15738357227, 16251981187, 17249721021, 19073406214,
       19085650202, 19330760302, 19657017003, 31978091155, 32313439276,
       36950570009, 38450329331, 39299217144, 42008659061, 42989860004,
       43323090183, 43917519266, 43946031211, 44991746336, 45457761150,
       46056470115, 46429729018, 48783979065, 48791437223, 48994390085,
       51299946263, 52278286092, 52461796114, 53029789123, 53045406183,
       53096266192, 53243526208, 53393046226, 53467166235, 53476486236,
       54565637009, 54705307026, 55395459196, 55403120241, 55511607117,
       55953247177, 56063101211, 56258940008, 56831407291, 57284130211,
       57642960169, 58928481104, 60780209038, 60845329119, 60879489278,
       60956019056, 61200959078, 61260430112, 61391379093, 62369

In [28]:
sparkECMO/Adult ECMO RL/train_data_continuous.csn

SyntaxError: invalid syntax (2854671802.py, line 1)