In [None]:
import torch
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from tabpfn_extensions import TabPFNClassifier, TabPFNRegressor, unsupervised

def generate_tabpfn_style_data(n_samples=3000, feature_indices=None, temp=1.0, seed=42):
    """Generate synthetic tabular data with TabPFN's unsupervised experiment.

    The scikit-learn breast cancer dataset is loaded and split 50/50; the
    training half is converted to float32 tensors and passed, together with
    a ``TabPFNUnsupervisedModel``, to ``GenerateSyntheticDataExperiment.run``.

    Args:
        n_samples: Forwarded to the experiment as ``n_samples``.
        feature_indices: Forwarded to the experiment as ``indices``.
            ``None`` (the default) means ``[0, 1]``.
        temp: Forwarded to the experiment as ``temp``.
        seed: ``random_state`` of the train/test split. It is not passed to
            the experiment, so it only controls which rows are used as the
            base data.

    Returns:
        A ``(X_synthetic, y_synthetic)`` tuple built from ``results["X"]``
        and ``results["y"]`` converted with ``.numpy()``. ``y_synthetic`` is
        ``None`` when the experiment result has no ``"y"`` entry.
    """
    # A fresh list per call: a literal ``[0, 1]`` in the signature would be a
    # single object shared by every call and handed to third-party code.
    if feature_indices is None:
        feature_indices = [0, 1]

    # Load the source dataset (any supervised dataset could be substituted).
    df = load_breast_cancer(return_X_y=False)
    X, y = df["data"], df["target"]
    attribute_names = df["feature_names"]

    # Keep only the training half; it serves as the base data distribution.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.5, random_state=seed
    )

    # Set up the TabPFN model components.
    clf = TabPFNClassifier(n_estimators=3)
    reg = TabPFNRegressor(n_estimators=3)
    model_unsupervised = unsupervised.TabPFNUnsupervisedModel(
        tabpfn_clf=clf,
        tabpfn_reg=reg,
    )

    # Set up the synthetic data generator.
    exp_synthetic = unsupervised.experiments.GenerateSyntheticDataExperiment(
        task_type="unsupervised",
    )

    # Convert to PyTorch tensors.
    X_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_tensor = torch.tensor(y_train, dtype=torch.float32)

    # Run the generation.
    results = exp_synthetic.run(
        tabpfn=model_unsupervised,
        X=X_tensor,
        y=y_tensor,
        attribute_names=attribute_names,
        temp=temp,
        n_samples=n_samples,
        indices=feature_indices,
    )

    # Return the generated data.
    # NOTE(review): assumes `results` is a mapping of CPU tensors that do not
    # require grad, since `.numpy()` is called directly — confirm against the
    # installed tabpfn_extensions version.
    X_synthetic = results["X"].numpy()
    y_synthetic = results["y"].numpy() if "y" in results else None
    return X_synthetic, y_synthetic