In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from model import MLPVAE  # Importing the model
import joblib
from torch.utils import data

In [2]:
# train_csn = pd.read_csv("/hpc/home/js1207/sparkECMO/Adult ECMO RL/train_data_continuous.csv")
# train_csn = train_csn.csn.unique()
train_csn = [   43640203,  2178911314,  2207761225,  4510511039,  6648317299,
        7270121193, 10017307254, 10057670084, 10243791111, 11197349249,
       11367061032, 11598700132, 12128379018, 12156649330, 14719067105,
       15668590152, 15738357227, 16251981187, 17249721021, 19073406214,
       19085650202, 19330760302, 19657017003, 31978091155, 32313439276,
       36950570009, 38450329331, 39299217144, 42008659061, 42989860004,
       43323090183, 43917519266, 43946031211, 44991746336, 45457761150,
       46056470115, 46429729018, 48783979065, 48791437223, 48994390085,
       51299946263, 52278286092, 52461796114, 53029789123, 53045406183,
       53096266192, 53243526208, 53393046226, 53467166235, 53476486236,
       54565637009, 54705307026, 55395459196, 55403120241, 55511607117,
       55953247177, 56063101211, 56258940008, 56831407291, 57284130211,
       57642960169, 58928481104, 60780209038, 60845329119, 60879489278,
       60956019056, 61200959078, 61260430112, 61391379093, 62369850199,
       62405229244, 63083139237, 63121461075, 63152719298, 63433549223,
       63512809230, 63869689272, 64309929313, 64723420073, 64778739353,
       64843720260, 64908640003, 64939920143, 65075060016, 65098750021,
       65157230024, 65189640027, 65209900029, 65264380035, 65390550048,
       65682400100, 65706900107, 65816090128, 65968410147, 66144110163,
       66303050184, 66322050188, 66385050238, 66614490225, 66713900239,
       66713901041, 66724300239, 66795550261, 66803750251, 66926880266,
       67152710303, 67307200314, 67448150349, 67546400366, 67620281055,
       67682541003, 67763261127, 67779421014, 67786001015, 67796811017,
       67843131173, 67872761027, 67958411160, 68121041056, 68177151063,
       68192231065, 68315071120, 68321591079, 68349741082, 68463091091,
       68780921116, 68797341117, 68845081130, 68862911124, 68909051127,
       68989721134, 69241991159, 69356011171, 69372091173, 69487791185,
       69567811194, 69626971200, 69649561202, 69746211211, 69900791287,
       70201931257, 70225691258, 70293951265, 70341401291, 70454151283,
       70666401306, 70840791325]
raw_data = pd.read_csv("non_discritized_states.csv",index_col=0)
train_data = raw_data[raw_data['csn'].isin(train_csn)]

train_data.reset_index(drop=True, inplace=True)
train_data.drop(columns=['csn'], inplace=True)

scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
# Save the scaler
joblib.dump(scaler, "scaler.pkl")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.drop(columns=['csn'], inplace=True)


['scaler.pkl']

In [15]:
from sklearn.model_selection import train_test_split

# Load NumPy Training Data
train_data_tensor = torch.tensor(train_data[:, :42], dtype=torch.float32)

# Split into Training and Validation Sets
train_tensor, val_tensor = train_test_split(train_data_tensor, test_size=0.2, random_state=42)

seed = 42  # Choose any fixed seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # If using CUDA
torch.backends.cudnn.deterministic = True  # Ensures determinism
torch.backends.cudnn.benchmark = False  # Avoids non-deterministic optimizations

# Create Dataset & DataLoader
class TabularDataset(data.Dataset):
    def __init__(self, tensor):
        self.tensor = tensor
    def __getitem__(self, idx):
        return self.tensor[idx]
    def __len__(self):
        return self.tensor.shape[0]

train_dataset = TabularDataset(train_tensor)
val_dataset = TabularDataset(val_tensor)

train_dataloader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = data.DataLoader(val_dataset, batch_size=64, shuffle=True)

# Initialize Model & Optimizer
model = MLPVAE(input_dim=42, hidden_dim=256, latent_dim=32)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)

# Early Stopping Parameters
patience = 10
best_val_loss = float("inf")
epochs_no_improve = 0

# Revised VAE Loss Function
def vae_loss(recon_x, x, mu, logvar, beta=1.0):
    recon_loss = nn.MSELoss()(recon_x, x)
    kld = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + beta * kld, recon_loss, kld

# Training Loop with Validation & Early Stopping
num_epochs = 200
model.train()

for epoch in range(num_epochs):
    total_train_loss, total_train_recon_loss, total_train_kld = 0, 0, 0
    total_val_loss, total_val_recon_loss, total_val_kld = 0, 0, 0
    num_train_batches, num_val_batches = 0, 0
    # beta = min(1.0, epoch / 300)  # Gradual increase over full training
    beta = 0.05
    # beta = 1.0 if epoch > 50 else epoch / 50  # Full KL weight only after 50 epochs

    
    # Training Phase
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        recon, mu, logvar = model(batch)
        loss, recon_loss, kld = vae_loss(recon, batch, mu, logvar, beta=beta)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_train_loss += loss.item()
        total_train_recon_loss += recon_loss.item()
        total_train_kld += kld.item()
        num_train_batches += 1

    avg_train_loss = total_train_loss / num_train_batches
    avg_train_recon_loss = total_train_recon_loss / num_train_batches
    avg_train_kld = total_train_kld / num_train_batches

    # Validation Phase
    model.eval()
    with torch.no_grad():
        for batch in val_dataloader:
            recon, mu, logvar = model(batch)
            loss, recon_loss, kld = vae_loss(recon, batch, mu, logvar, beta=beta)
            total_val_loss += loss.item()
            total_val_recon_loss += recon_loss.item()
            total_val_kld += kld.item()
            num_val_batches += 1

    avg_val_loss = total_val_loss / num_val_batches
    avg_val_recon_loss = total_val_recon_loss / num_val_batches
    avg_val_kld = total_val_kld / num_val_batches

    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Recon: {avg_val_recon_loss:.4f}, KL: {avg_val_kld:.4f} (Beta={beta:.2f})")

    # Early Stopping Based on Validation Loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), "mlp_vae.pth")
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f"Early stopping triggered at epoch {epoch+1}.")
        break


Epoch 1/200, Train Loss: 1.0252, Val Loss: 0.9715, Recon: 0.9544, KL: 0.1708 (Beta=0.10)
Epoch 2/200, Train Loss: 0.8819, Val Loss: 0.8441, Recon: 0.8086, KL: 0.3549 (Beta=0.10)
Epoch 3/200, Train Loss: 0.7866, Val Loss: 0.7600, Recon: 0.7108, KL: 0.4924 (Beta=0.10)
Epoch 4/200, Train Loss: 0.7153, Val Loss: 0.6872, Recon: 0.6250, KL: 0.6218 (Beta=0.10)
Epoch 5/200, Train Loss: 0.6507, Val Loss: 0.6277, Recon: 0.5552, KL: 0.7257 (Beta=0.10)
Epoch 6/200, Train Loss: 0.5997, Val Loss: 0.5803, Recon: 0.4988, KL: 0.8143 (Beta=0.10)
Epoch 7/200, Train Loss: 0.5600, Val Loss: 0.5430, Recon: 0.4566, KL: 0.8646 (Beta=0.10)
Epoch 8/200, Train Loss: 0.5271, Val Loss: 0.5129, Recon: 0.4222, KL: 0.9072 (Beta=0.10)
Epoch 9/200, Train Loss: 0.5019, Val Loss: 0.4868, Recon: 0.3932, KL: 0.9361 (Beta=0.10)
Epoch 10/200, Train Loss: 0.4777, Val Loss: 0.4678, Recon: 0.3723, KL: 0.9551 (Beta=0.10)
Epoch 11/200, Train Loss: 0.4594, Val Loss: 0.4490, Recon: 0.3518, KL: 0.9728 (Beta=0.10)
Epoch 12/200, Train

In [7]:
saved_model = torch.load("mlp_vae.pth")
print(saved_model.keys())  # Check the layers present in the checkpoint

odict_keys(['encoder.0.weight', 'encoder.0.bias', 'encoder.1.weight', 'encoder.1.bias', 'encoder.4.weight', 'encoder.4.bias', 'encoder.5.weight', 'encoder.5.bias', 'mu_layer.weight', 'mu_layer.bias', 'logvar_layer.weight', 'logvar_layer.bias', 'decoder.0.weight', 'decoder.0.bias', 'decoder.1.weight', 'decoder.1.bias', 'decoder.4.weight', 'decoder.4.bias', 'decoder.5.weight', 'decoder.5.bias', 'decoder.7.weight', 'decoder.7.bias'])


In [13]:
torch.save(model, "mlp_vae.pth")

In [61]:
model = torch.load("mlp_vae.pth")
model.eval()  # Set to evaluation mode if not training

import torch

def generate_synthetic_data(model, sample, column_idx, new_value):
    """
    Modifies a specific column in the input and generates a synthetic output.
    
    :param model: Trained MLPVAE model
    :param sample: A single input sample (1D tensor)
    :param column_idx: Index of the column to modify
    :param new_value: New value to assign to the column
    :return: Generated output with modified column
    """
    model.eval()
    
    # Convert to batch format (1 sample)
    sample = sample.clone().unsqueeze(0)  # Shape (1, input_dim)
    
    # Encode to latent space
    with torch.no_grad():
        mu, logvar = model.encode(sample)
        z = model.reparameterize(mu, logvar)

    # Modify the column in latent space (alternative: modify directly in input)
    modified_sample = sample.clone()
    modified_sample[0, column_idx] = new_value  # Change the specified column

    # Re-encode after modification
    with torch.no_grad():
        new_mu, new_logvar = model.encode(modified_sample)
        new_z = model.reparameterize(new_mu, new_logvar)

    # Decode back to see changes
    generated_output = model.decode(new_z)

    return generated_output.squeeze().detach().numpy()  # Convert back to NumPy for easier analysis

# Example usage:
sample_idx = 0  # Pick any row from your dataset
sample_data = train_tensor[sample_idx]  # Original sample

column_to_change = 5  # Example: Modify column 5
new_value = 2.0  # New value to assign

synthetic_output = generate_synthetic_data(model, sample_data, column_to_change, new_value)

print("Modified synthetic output:", synthetic_output)


Modified synthetic output: [-1.1699022   0.42113316  0.46528575  0.38823992  0.3147081  -0.24146605
  1.0364234   0.01297785  0.03269807 -0.16800848 -0.03724345  0.23910508
 -0.01513278  1.3327388  -0.64124554  1.1138294   0.54673445 -1.0044123
 -0.08894157 -0.16174182 -0.3544161  -0.11003835  1.2203125   0.14217015
 -0.36982304 -0.2851519   1.8047109  -0.02481666  0.06410962 -0.6459687
  0.07594274  0.05963591 -0.62476987 -0.12943527  0.35917825 -0.95582557
  0.5780245  -0.17491864 -0.79985195 -0.08835298 -0.7651638  -0.874994
  1.542209  ]


  model = torch.load("mlp_vae.pth")


In [33]:
train_csn

array([   43640203,  2178911314,  2207761225,  4510511039,  6648317299,
        7270121193, 10017307254, 10057670084, 10243791111, 11197349249,
       11367061032, 11598700132, 12128379018, 12156649330, 14719067105,
       15668590152, 15738357227, 16251981187, 17249721021, 19073406214,
       19085650202, 19330760302, 19657017003, 31978091155, 32313439276,
       36950570009, 38450329331, 39299217144, 42008659061, 42989860004,
       43323090183, 43917519266, 43946031211, 44991746336, 45457761150,
       46056470115, 46429729018, 48783979065, 48791437223, 48994390085,
       51299946263, 52278286092, 52461796114, 53029789123, 53045406183,
       53096266192, 53243526208, 53393046226, 53467166235, 53476486236,
       54565637009, 54705307026, 55395459196, 55403120241, 55511607117,
       55953247177, 56063101211, 56258940008, 56831407291, 57284130211,
       57642960169, 58928481104, 60780209038, 60845329119, 60879489278,
       60956019056, 61200959078, 61260430112, 61391379093, 62369

In [28]:
sparkECMO/Adult ECMO RL/train_data_continuous.csn

SyntaxError: invalid syntax (2854671802.py, line 1)