In [1]:
import jax.random as random
from datasets import LinearGaussianDataset  # 请按实际路径调整
import numpy as np


In [17]:

def generate_linear_gaussian(seed: int,
                             intrinsic_dim: int,
                             ambient_dim: int,
                             noise_var: float = 0.0,
                             n_samples: int = 20000):
    """
    对应 run.py 中：
      --dataset linear_gaussian
      -dd <intrinsic_dim>      -> dimension
      --padding_dim <pad_dim>   -> padding_dimension
      --var_added <noise_var>   -> var_added

    ambient_dim = intrinsic_dim + padding_dim
    """
    padding_dim = ambient_dim - intrinsic_dim
    ds = LinearGaussianDataset(seed=seed,
                                dimension=intrinsic_dim,
                                intrinsic_dimension=intrinsic_dim,
                                padding_dimension=padding_dim,
                                var_added=noise_var)
    # get_batch 返回 shape = (n_samples, intrinsic_dim + padding_dim)
    X = ds.get_batch(n_samples)
    return np.array(X)

if __name__ == "__main__":
    # 对应论文中这几条命令的（seed=2）示例：
    configs = [
        # intrinsic=3, ambient=12
        {"seed": 2,  "intrinsic_dim": 3,  "ambient_dim": 12},
        # intrinsic=3, ambient=20
        {"seed": 2,  "intrinsic_dim": 3,  "ambient_dim": 20},
        # intrinsic=6, ambient=12
        {"seed": 2,  "intrinsic_dim": 6,  "ambient_dim": 12},
        # intrinsic=6, ambient=20
        {"seed": 2,  "intrinsic_dim": 6,  "ambient_dim": 20},
        # intrinsic=9, ambient=12
        {"seed": 2,  "intrinsic_dim": 9,  "ambient_dim": 12},
        # intrinsic=9, ambient=20
        {"seed": 2,  "intrinsic_dim": 9,  "ambient_dim": 20},
        # intrinsic=12, ambient=20
        {"seed": 2,  "intrinsic_dim": 12, "ambient_dim": 20},
    ]

    all_datasets = {}

    for cfg in configs:
        key = f"intrinsic{cfg['intrinsic_dim']}_ambient{cfg['ambient_dim']}"
        X = generate_linear_gaussian(seed=cfg["seed"],
                                 intrinsic_dim=cfg["intrinsic_dim"],
                                 ambient_dim=cfg["ambient_dim"],
                                 noise_var=0.0,
                                 n_samples=20000)
        all_datasets[key] = X
        print(f"Generated {key}: X.shape = {X.shape}")



Generated intrinsic3_ambient12: X.shape = (20000, 12)
Generated intrinsic3_ambient20: X.shape = (20000, 20)
Generated intrinsic6_ambient12: X.shape = (20000, 12)
Generated intrinsic6_ambient20: X.shape = (20000, 20)
Generated intrinsic9_ambient12: X.shape = (20000, 12)
Generated intrinsic9_ambient20: X.shape = (20000, 20)
Generated intrinsic12_ambient20: X.shape = (20000, 20)


## Diffusion Model (DDPM)

In [34]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from tqdm import tqdm
import os


In [35]:
class DenoiseMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim + 1, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
        )

    def forward(self, x, t):
        # t: shape [batch_size], expand to [batch_size, 1]
        t = t.unsqueeze(1).float() / 1000  # Normalize timestep
        xt = torch.cat([x, t], dim=1)
        return self.net(xt)


In [36]:
class Diffusion:
    def __init__(self, model, timesteps=1000, beta_start=1e-4, beta_end=0.02):
        self.model = model
        self.timesteps = timesteps
        self.betas = torch.linspace(beta_start, beta_end, timesteps)
        self.alphas = 1. - self.betas
        self.alpha_hats = torch.cumprod(self.alphas, dim=0)

    def q_sample(self, x0, t, noise=None):
        if noise is None:
            noise = torch.randn_like(x0)
        alpha_hat = self.alpha_hats[t].unsqueeze(1).to(x0.device)
        return torch.sqrt(alpha_hat) * x0 + torch.sqrt(1 - alpha_hat) * noise

    def p_losses(self, x0, t):
        noise = torch.randn_like(x0)
        x_noisy = self.q_sample(x0, t, noise)
        noise_pred = self.model(x_noisy, t)
        return nn.MSELoss()(noise_pred, noise)

    def sample(self, num_samples, dim, device):
        x = torch.randn(num_samples, dim).to(device)
        for t in reversed(range(self.timesteps)):
            t_tensor = torch.full((num_samples,), t, device=device, dtype=torch.long)
            beta = self.betas[t].to(device)
            alpha = self.alphas[t].to(device)
            alpha_hat = self.alpha_hats[t].to(device)
            noise_pred = self.model(x, t_tensor)

            x = (1 / torch.sqrt(alpha)) * (
                x - (beta / torch.sqrt(1 - alpha_hat)) * noise_pred
            )
            if t > 0:
                noise = torch.randn_like(x)
                x += torch.sqrt(beta) * noise
        return x


In [37]:
def train_diffusion_and_evaluate(all_datasets, device="cuda" if torch.cuda.is_available() else "cpu"):
    os.makedirs("generated_samples", exist_ok=True)
    results = {}

    for key, X in all_datasets.items():
        print(f"\n🚀 Training on config: {key}, shape = {X.shape}")
        X_tensor = torch.tensor(X, dtype=torch.float32)
        dataset = TensorDataset(X_tensor)
        dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

        model = DenoiseMLP(input_dim=X.shape[1]).to(device)
        diffusion = Diffusion(model, timesteps=1000)
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

        # === Train ===
        for epoch in range(10):  # 可加大
            for batch in dataloader:
                x0 = batch[0].to(device)
                t = torch.randint(0, 1000, (x0.shape[0],), device=device)
                loss = diffusion.p_losses(x0, t)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print(f"[{key}] Epoch {epoch+1} - Loss: {loss.item():.4f}")

        # === Sample ===
        model.eval()
        with torch.no_grad():
            samples = diffusion.sample(num_samples=1000, dim=X.shape[1], device=device)
            samples = samples.cpu().numpy()
            np.save(f"generated_samples/{key}_samples.npy", samples)

        # === Evaluate PCA eigenvalue error ===
        error = pca_eigenvalue_error(X, samples, n_components=min(10, X.shape[1]))
        results[key] = error
        print(f"[{key}] ✅ PCA Eigenvalue Error: {error:.6f}")

        # === Visualize if dim is 2 or 3 ===
        if X.shape[1] in [2, 3]:
            fig = plt.figure()
            if X.shape[1] == 2:
                plt.scatter(X[:, 0], X[:, 1], s=5, alpha=0.5, label="Real")
                plt.scatter(samples[:, 0], samples[:, 1], s=5, alpha=0.5, label="Generated")
                plt.legend()
                plt.title(f"{key} - 2D")
            else:
                ax = fig.add_subplot(111, projection='3d')
                ax.scatter(X[:, 0], X[:, 1], X[:, 2], s=5, alpha=0.5, label="Real")
                ax.scatter(samples[:, 0], samples[:, 1], samples[:, 2], s=5, alpha=0.5, label="Generated")
                ax.legend()
                plt.title(f"{key} - 3D")
            plt.savefig(f"generated_samples/{key}_viz.png")
            plt.close()

    print("\n📊 Final PCA Eigenvalue Errors:")
    for key, err in results.items():
        print(f"{key}: {err:.6f}")


In [38]:
train_diffusion_and_evaluate(all_datasets)



🚀 Training on config: intrinsic3_ambient12, shape = (20000, 12)
[intrinsic3_ambient12] Epoch 1 - Loss: 0.2265
[intrinsic3_ambient12] Epoch 2 - Loss: 0.1552
[intrinsic3_ambient12] Epoch 3 - Loss: 0.1789
[intrinsic3_ambient12] Epoch 4 - Loss: 0.0864
[intrinsic3_ambient12] Epoch 5 - Loss: 0.1369
[intrinsic3_ambient12] Epoch 6 - Loss: 0.1539
[intrinsic3_ambient12] Epoch 7 - Loss: 0.1561
[intrinsic3_ambient12] Epoch 8 - Loss: 0.1908
[intrinsic3_ambient12] Epoch 9 - Loss: 0.1048
[intrinsic3_ambient12] Epoch 10 - Loss: 0.0871
[intrinsic3_ambient12] ✅ PCA Eigenvalue Error: 0.007976

🚀 Training on config: intrinsic3_ambient20, shape = (20000, 20)
[intrinsic3_ambient20] Epoch 1 - Loss: 0.1450
[intrinsic3_ambient20] Epoch 2 - Loss: 0.1332
[intrinsic3_ambient20] Epoch 3 - Loss: 0.1830
[intrinsic3_ambient20] Epoch 4 - Loss: 0.1331
[intrinsic3_ambient20] Epoch 5 - Loss: 0.1455
[intrinsic3_ambient20] Epoch 6 - Loss: 0.1230
[intrinsic3_ambient20] Epoch 7 - Loss: 0.0787
[intrinsic3_ambient20] Epoch 8 

### Metrics

In [9]:
from sklearn.decomposition import PCA
import numpy as np

def pca_eigenvalue_error(real, fake, n_components=None):
    pca_real = PCA(n_components=n_components)
    pca_fake = PCA(n_components=n_components)

    pca_real.fit(real)
    pca_fake.fit(fake)

    ev_real = pca_real.explained_variance_
    ev_fake = pca_fake.explained_variance_

    ev_real /= ev_real.sum()
    ev_fake /= ev_fake.sum()

    return np.mean(np.abs(ev_real - ev_fake))


In [40]:
all_datasets['intrinsic3_ambient20'].shape

(20000, 20)