In [None]:
# Mount Google Drive into the Colab filesystem; the data paths used later
# in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import ast
import os
import numpy as np
import logging
logging.basicConfig(level=logging.INFO)
import torch
from sklearn.decomposition import PCA

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
!pip install synthcity

Collecting synthcity
  Downloading synthcity-0.2.12-py3-none-any.whl.metadata (37 kB)
Collecting torch<2.3,>=2.1 (from synthcity)
  Downloading torch-2.2.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting nflows>=0.14 (from synthcity)
  Downloading nflows-0.14.tar.gz (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy<2.0,>=1.20 (from synthcity)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lifelines<0.30.0,>=0.29.0 (from synthcity)
  Downloading lifelines-0.29.0-py3-none-any.whl.metadata (3.2 kB)
Collecting opacus>=1.3 (from synthcity)
  Downloading opacus-1.5.3-py3-none-any.whl.metadata (8.4 kB)
Collecting networkx<3.0,>2.0 (from s

In [None]:
from synthcity.plugins.core.models.tabular_vae import TabularVAE



In [None]:
# Folder on the mounted Drive that holds every input table
base_path = "/content/drive/MyDrive/proj72_tab_compare/data"


def _read_indexed(file_name):
    """Read a CSV from base_path, using its first column as the row index."""
    return pd.read_csv(os.path.join(base_path, file_name), index_col=0)


# One table per modality; rows are looked up by patient ID via .loc later on
clinical = _read_indexed("clinical.csv")
mutation = _read_indexed("mutation.csv")
methylation = _read_indexed("methylation.csv")
scnv = _read_indexed("scnv.csv")

# Task table is read with the default integer index (no index_col)
task_table = pd.read_csv(os.path.join(base_path, "multimodal_tasks.csv"))

In [None]:
# Keep only the numeric columns of every modality table; non-numeric columns
# cannot be concatenated into the feature vectors built below.
clinical, mutation, scnv, methylation = (
    frame.select_dtypes(include=["number"])
    for frame in (clinical, mutation, scnv, methylation)
)

In [None]:
# config
# Fixed train/test split, given as explicit patient IDs. These IDs are used
# as row labels when indexing each modality table with .loc.
train_patients = ['TCGA-AK-3443', 'TCGA-BP-4971', 'TCGA-BP-4330', 'TCGA-BP-4998', 'TCGA-BP-4768', 'TCGA-BP-4804', 'TCGA-BP-4961', 'TCGA-BP-4161', 'TCGA-BP-4347', 'TCGA-A3-3362', 'TCGA-BP-4981', 'TCGA-BP-4965', 'TCGA-BP-4787', 'TCGA-BP-4986', 'TCGA-BP-4774', 'TCGA-B0-4837', 'TCGA-BP-4807', 'TCGA-BP-5004', 'TCGA-B0-5088', 'TCGA-CJ-4889', 'TCGA-BP-4989', 'TCGA-BP-4331', 'TCGA-A3-3374', 'TCGA-CJ-4871', 'TCGA-AK-3440', 'TCGA-BP-5006', 'TCGA-BP-4337', 'TCGA-BP-4994', 'TCGA-BP-4799', 'TCGA-CJ-4644', 'TCGA-BP-5007', 'TCGA-B8-4148', 'TCGA-BP-4167', 'TCGA-BP-4962', 'TCGA-BP-4960', 'TCGA-BP-4983', 'TCGA-B2-4099', 'TCGA-CZ-4862', 'TCGA-BP-4995', 'TCGA-CJ-4891', 'TCGA-BP-5001', 'TCGA-BP-4797', 'TCGA-BP-4974', 'TCGA-BP-4992', 'TCGA-BP-4771', 'TCGA-BP-4968', 'TCGA-BP-4351', 'TCGA-A3-3317', 'TCGA-BP-4326', 'TCGA-BP-4162', 'TCGA-BP-4975', 'TCGA-A3-3311', 'TCGA-CZ-4854', 'TCGA-BP-4977', 'TCGA-B2-4098', 'TCGA-CJ-4870', 'TCGA-BP-4174', 'TCGA-A3-3323', 'TCGA-AK-3436', 'TCGA-BP-4338', 'TCGA-CJ-4881', 'TCGA-BP-4777', 'TCGA-CJ-4895', 'TCGA-CJ-4638', 'TCGA-BP-4963', 'TCGA-BP-4967', 'TCGA-CJ-4894', 'TCGA-BP-4159', 'TCGA-BP-4346', 'TCGA-CJ-4868', 'TCGA-BP-4329', 'TCGA-B2-3923', 'TCGA-BP-4164', 'TCGA-B0-4838', 'TCGA-CJ-4888', 'TCGA-BP-5008', 'TCGA-BP-4343', 'TCGA-B0-5077', 'TCGA-CJ-4899', 'TCGA-BP-4341', 'TCGA-CJ-4639', 'TCGA-BP-4976', 'TCGA-BP-4761', 'TCGA-AK-3453', 'TCGA-BP-4999', 'TCGA-CJ-4887', 'TCGA-A3-3347', 'TCGA-A3-3326', 'TCGA-CZ-4858', 'TCGA-BP-4766', 'TCGA-B8-4143', 'TCGA-CJ-4640', 'TCGA-CJ-4893', 'TCGA-CJ-4875', 'TCGA-A3-3322', 'TCGA-A3-3349', 'TCGA-CJ-4878', 'TCGA-B8-4620', 'TCGA-BP-4158', 'TCGA-CJ-4890', 'TCGA-BP-4176', 'TCGA-BP-4790', 'TCGA-BP-4169', 'TCGA-BP-4775', 'TCGA-B0-4836', 'TCGA-CZ-4857', 'TCGA-CJ-4634', 'TCGA-BP-4982', 'TCGA-AK-3455', 'TCGA-CJ-4873', 'TCGA-BP-4972', 'TCGA-BP-4970', 'TCGA-BP-4170', 'TCGA-AK-3465', 'TCGA-CJ-4874', 'TCGA-BP-4354', 'TCGA-CJ-4886', 'TCGA-CZ-4861', 'TCGA-BP-4781', 'TCGA-B0-5081', 'TCGA-BP-4163']
test_patients = ['TCGA-B0-5075', 'TCGA-B0-4839', 'TCGA-BP-4160', 'TCGA-CJ-4643', 'TCGA-AK-3427', 'TCGA-B8-4151', 'TCGA-B2-3924', 'TCGA-BP-4988', 'TCGA-BP-4964', 'TCGA-AK-3430', 'TCGA-AK-3451', 'TCGA-A3-3331', 'TCGA-AS-3777', 'TCGA-B0-5084', 'TCGA-CJ-4637', 'TCGA-BP-4756', 'TCGA-BP-5009', 'TCGA-BP-4973', 'TCGA-AK-3429', 'TCGA-BP-4991', 'TCGA-B8-4619', 'TCGA-BP-4985', 'TCGA-CJ-4635', 'TCGA-BP-5000', 'TCGA-B0-5085', 'TCGA-BP-4352', 'TCGA-BP-4987', 'TCGA-A3-3313', 'TCGA-BP-4173', 'TCGA-AK-3456', 'TCGA-CJ-4641', 'TCGA-CJ-4876', 'TCGA-CJ-4636', 'TCGA-CJ-4900', 'TCGA-B8-4154']

# Modalities whose features are concatenated (in this order) to form the
# conditioning input, and the single modality that is predicted from them.
input_modalities = ["clinical", "mutation", "scnv"]
target_modality = "methylation"

In [None]:
# Sanity check: number of patient IDs listed in the training split
len(train_patients)

121

In [None]:
# Map each modality name to its numeric table so it can be looked up by name
modality_data = {
    "clinical": clinical,
    "mutation": mutation,
    "scnv": scnv,
    "methylation": methylation
}

# Build the training matrices: one row per patient, where the input row is the
# concatenation of all input-modality features and the target row holds the
# target-modality features.
train_input, train_target = [], []
train_skipped = []  # patients for which a lookup raised KeyError

for pid in train_patients:
    try:
        x = np.concatenate([modality_data[m].loc[pid].values for m in input_modalities])
        y = modality_data[target_modality].loc[pid].values
    except KeyError:
        # Best-effort: a patient missing from any table is left out, but it is
        # recorded so the drop is visible instead of silent.
        train_skipped.append(pid)
        continue
    train_input.append(x)
    train_target.append(y)

if train_skipped:
    logging.warning(
        "Skipped %d of %d training patients missing from a modality table: %s",
        len(train_skipped), len(train_patients), train_skipped,
    )

In [None]:
def _tidy_train_frame(rows, prefix):
    """Turn a list of per-patient feature arrays into a cleaned DataFrame.

    Columns are named ``<prefix>_<i>`` (SynthCity requires named columns),
    columns that are entirely NaN are dropped, and any remaining NaN is
    replaced by the mean of its column.
    """
    frame = pd.DataFrame(rows)
    frame.columns = [f"{prefix}_{i}" for i in range(frame.shape[1])]
    frame = frame.dropna(axis=1, how="all")
    return frame.fillna(frame.mean())


# Conditioning inputs and prediction targets, cleaned the same way
train_cond_df = _tidy_train_frame(train_input, "cond")
train_target_df = _tidy_train_frame(train_target, "target")

# Report the resulting shapes
print("✅ train_cond_df shape:", train_cond_df.shape)
print("✅ train_target_df shape:", train_target_df.shape)
print("✅ final train samples:", len(train_cond_df))

✅ train_cond_df shape: (121, 25542)
✅ train_target_df shape: (121, 13738)
✅ final train samples: 121


In [None]:
# Compress the methylation targets into a low-dimensional latent with PCA.
# random_state is fixed because, for a wide matrix like this one,
# scikit-learn's automatic solver selection may use the randomized SVD, which
# would otherwise make the latent (and everything trained on it) vary per run.
target_dim = 64
pca = PCA(n_components=target_dim, random_state=42)

train_target_latent = pca.fit_transform(train_target_df)
print("🎯 PCA latent shape:", train_target_latent.shape)

# Named columns are needed when the latent is handed to SynthCity
train_target_latent_df = pd.DataFrame(
    train_target_latent,
    columns=[f"target_{i}" for i in range(train_target_latent.shape[1])],
)

🎯 PCA latent shape: (121, 64)


In [None]:
# Compress the concatenated conditioning features into a latent with PCA.
# NOTE(review): the previous comment said this dimension was 256 "same as our
# ldm", but the value actually used is 64 — confirm which one is intended.
# random_state is fixed because scikit-learn's automatic solver selection may
# use the randomized SVD for a matrix of this shape.
cond_dim = 64
pca_cond = PCA(n_components=cond_dim, random_state=42)

train_cond_latent = pca_cond.fit_transform(train_cond_df)
train_cond_latent_df = pd.DataFrame(train_cond_latent, columns=[f"cond_{i}" for i in range(cond_dim)])

print("🎯 PCA condition latent shape:", train_cond_latent_df.shape)

🎯 PCA condition latent shape: (121, 64)


In [None]:
# Round-trip the training targets through the PCA to see what the latent keeps
train_target_restored = pca.inverse_transform(train_target_latent)
print("🔁 Restored methylation shape:", train_target_restored.shape)

# The reconstruction should have the same shape as the cleaned input frame
target_shapes_agree = train_target_restored.shape == train_target_df.shape
print("✅ Is shape match:", target_shapes_agree)

# Mean squared reconstruction error over every element of the matrix
target_original_flat = train_target_df.to_numpy().ravel()
target_restored_flat = train_target_restored.ravel()
mse_pca = mean_squared_error(target_original_flat, target_restored_flat)
print(f"📉 PCA Reconstruction MSE: {mse_pca:.4f}")

# Fraction of total variance captured by the retained components
explained = pca.explained_variance_ratio_.sum()
print(f"🔎 Explained variance by {target_dim} components: {explained:.2%}")

🔁 Restored methylation shape: (121, 13738)
✅ Is shape match: True
📉 PCA Reconstruction MSE: 0.0012
🔎 Explained variance by 64 components: 83.76%


In [None]:
# Round-trip the training conditions through their PCA as a sanity check
train_cond_restored = pca_cond.inverse_transform(train_cond_latent)
print("🔁 Restored condition shape:", train_cond_restored.shape)

# The reconstruction should have the same shape as the cleaned input frame
cond_shapes_agree = train_cond_restored.shape == train_cond_df.shape
print("✅ Is shape match:", cond_shapes_agree)

# Mean squared reconstruction error over every element of the matrix
cond_original_flat = train_cond_df.to_numpy().ravel()
cond_restored_flat = train_cond_restored.ravel()
mse_cond_pca = mean_squared_error(cond_original_flat, cond_restored_flat)
print(f"📉 Condition PCA Reconstruction MSE: {mse_cond_pca:.4f}")

# Fraction of total variance captured by the retained components
explained_cond = pca_cond.explained_variance_ratio_.sum()
print(f"🔎 Explained variance by {cond_dim} components: {explained_cond:.2%}")

🔁 Restored condition shape: (121, 25542)
✅ Is shape match: True
📉 Condition PCA Reconstruction MSE: 0.0004
🔎 Explained variance by 64 components: 98.99%


In [None]:
# Conditional tabular VAE: learns the methylation latent given the condition
# latent. Trained for 20 iterations with batch size 16 on CPU.
# NOTE(review): the printed encoder reports in_features=9088, far more than the
# 64 + 64 latent columns passed in. This suggests SynthCity expands its inputs
# internally, possibly one-hot encoding the condition columns. If so, unseen
# continuous condition values at generate() time may carry little or no signal
# — TODO confirm against the TabularVAE source before trusting the conditioning.
vae_settings = {
    "n_units_embedding": 64,
    "n_iter": 20,
    "batch_size": 16,
    "device": "cpu",
}
vae = TabularVAE(X=train_target_latent_df, cond=train_cond_latent_df, **vae_settings)
vae.fit(train_target_latent_df, cond=train_cond_latent_df)

100%|██████████| 20/20 [03:21<00:00, 10.08s/it]


TabularVAE(
  (model): VAE(
    (encoder): Encoder(
      (shared): MLP(
        (model): Sequential(
          (0): LinearLayer(
            (model): Sequential(
              (0): Linear(in_features=9088, out_features=300, bias=True)
              (1): LeakyReLU(negative_slope=0.01)
            )
          )
          (1): LinearLayer(
            (model): Sequential(
              (0): Dropout(p=0.1, inplace=False)
              (1): Linear(in_features=300, out_features=300, bias=True)
              (2): LeakyReLU(negative_slope=0.01)
            )
          )
          (2): Linear(in_features=300, out_features=300, bias=True)
        )
        (loss): MSELoss()
      )
      (mu_fc): Linear(in_features=300, out_features=64, bias=True)
      (logvar_fc): Linear(in_features=300, out_features=64, bias=True)
    )
    (decoder): Decoder(
      (model): MLP(
        (model): Sequential(
          (0): SkipConnection(LinearLayer)(
            (model): Sequential(
              (0): Linea

In [None]:
# Build the held-out matrices the same way as the training ones: one row per
# patient, inputs concatenated across input modalities.
test_input, test_target = [], []
test_skipped = []  # patients for which a lookup raised KeyError

for pid in test_patients:
    try:
        x = np.concatenate([modality_data[m].loc[pid].values for m in input_modalities])
        y = modality_data[target_modality].loc[pid].values
    except KeyError:
        # Best-effort: leave the patient out, but record it so the drop is visible
        test_skipped.append(pid)
        continue
    test_input.append(x)
    test_target.append(y)

if test_skipped:
    logging.warning(
        "Skipped %d of %d test patients missing from a modality table: %s",
        len(test_skipped), len(test_patients), test_skipped,
    )

# to DataFrame
test_cond_df = pd.DataFrame(test_input)
test_target_df = pd.DataFrame(test_target)

# Same positional column names as the training frames
test_cond_df.columns = [f"cond_{i}" for i in range(test_cond_df.shape[1])]
test_target_df.columns = [f"target_{i}" for i in range(test_target_df.shape[1])]

# Keep exactly the columns that survived training-time cleaning, in the same
# order, so the frames match what the PCAs were fitted on. Dropping all-NaN
# columns independently on the test split could remove or keep a different
# set of features than training did. Remaining gaps are filled with the
# TRAINING means so no test statistics leak into preprocessing.
test_cond_df = test_cond_df.reindex(columns=train_cond_df.columns).fillna(train_cond_df.mean())
test_target_df = test_target_df.reindex(columns=train_target_df.columns).fillna(train_target_df.mean())


In [None]:
# Project the test conditions with the PCA already fitted on training data
# (transform only, no refit).
test_cond_latent = pca_cond.transform(test_cond_df)
test_cond_latent_names = [f"cond_{i}" for i in range(test_cond_latent.shape[1])]
test_cond_latent_df = pd.DataFrame(test_cond_latent, columns=test_cond_latent_names)

In [None]:
# Ask the trained VAE for one methylation latent per test row, conditioned on
# that row's condition latent, and keep the result as a plain NumPy array.
n_test_rows = len(test_cond_latent_df)
generated_latent = vae.generate(count=n_test_rows, cond=test_cond_latent_df).to_numpy()

In [None]:
# Map the generated latents back to methylation feature space using the
# target PCA that was fitted on the training targets.
predicted_methylation = pca.inverse_transform(generated_latent)

In [None]:
# Guard against silent misalignment: both arrays are flattened below, so a
# row/column mismatch with the same total size would still produce a number.
if test_target_df.shape != predicted_methylation.shape:
    raise ValueError(
        f"Shape mismatch: ground truth {test_target_df.shape} "
        f"vs prediction {predicted_methylation.shape}"
    )

# Element-wise error between true and predicted methylation on the test split
mse = mean_squared_error(test_target_df.values.flatten(), predicted_methylation.flatten())
rmse = mse ** 0.5
print(f"📊 RMSE (VAE → PCA → methylation vs ground-truth): {rmse:.4f}")

📊 RMSE (VAE → PCA → methylation vs ground-truth): 0.0981
