In [3]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import pandas as pd
import numpy as np
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer, TVAESynthesizer

# Seed NumPy's global random number generator.
# NOTE(review): only NumPy's global RNG is seeded here; whether the SDV
# synthesizers used later draw from it is not visible in this cell — confirm.
np.random.seed(42)

# Column labels: x1 .. x511 followed by a final column named y (512 in total).
column_names = [f'x{i}' for i in range(1, 512)]
column_names.append('y')

# Load both CSV files; header=None means no row is consumed as a header, so
# the labels above are applied to every column.
v1, v2 = (
    pd.read_csv(csv_path, header=None, names=column_names)
    for csv_path in ("v1.csv", "v2.csv")
)

# Stack the two tables row-wise and renumber the index from zero.
data = pd.concat([v1, v2], ignore_index=True)

# Print the shape of each input table and of the merged table.
for shape_label, frame in (
    ("v1 shape:", v1),
    ("v2 shape:", v2),
    ("Merged data shape:", data),
):
    print(shape_label, frame.shape)

# Number of synthetic rows to generate.
num_samples = 41400

# Build the single-table metadata, starting from automatic detection on the
# merged DataFrame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

# Override the detected type of every column to 'numerical'.
# Iterating a DataFrame yields its column labels.
for column_name in data:
    metadata.update_column(column_name, sdtype='numerical')

def _fit_sample_save(synthesizer, output_path, done_message):
    """Fit a synthesizer on ``data``, sample from it, and write the result.

    The synthesizer is fitted on the module-level ``data`` table, then
    ``num_samples`` rows are sampled and written to ``output_path`` as CSV
    without an index column or header row. ``done_message`` is printed once
    the file has been written.

    Args:
        synthesizer: An unfitted synthesizer exposing ``fit`` and ``sample``.
        output_path: Destination CSV file path.
        done_message: Text printed after the CSV has been saved.

    Returns:
        The sampled synthetic table.
    """
    synthesizer.fit(data)
    synthetic = synthesizer.sample(num_samples)
    synthetic.to_csv(output_path, index=False, header=False)
    print(done_message)
    return synthetic


# 1. GaussianCopulaSynthesizer
gaussian_synthesizer = GaussianCopulaSynthesizer(metadata)
gaussian_synthetic_data = _fit_sample_save(
    gaussian_synthesizer,
    "gaus.csv",
    "GaussianCopulaSynthesizer 合成数据已保存至 gaus.csv",
)

# 2. CTGANSynthesizer, configured with epochs=300
ctgan_synthesizer = CTGANSynthesizer(metadata, epochs=300)
ctgan_synthetic_data = _fit_sample_save(
    ctgan_synthesizer,
    "ctgan.csv",
    "CTGANSynthesizer 合成数据已保存至 ctgan.csv",
)

# 3. TVAESynthesizer, configured with epochs=300
tvae_synthesizer = TVAESynthesizer(metadata, epochs=300)
tvae_synthetic_data = _fit_sample_save(
    tvae_synthesizer,
    "tvae.csv",
    "TVAESynthesizer 合成数据已保存至 tvae.csv",
)

v1 shape: (100, 512)
v2 shape: (100, 512)
Merged data shape: (200, 512)




GaussianCopulaSynthesizer 合成数据已保存至 gaus.csv




PerformanceAlert: Using the CTGANSynthesizer on this data is not recommended. To model this data, CTGAN will generate a large number of columns.

Original Column Name   Est # of Columns (CTGAN)
x1                     11
x2                     11
x3                     11
x4                     11
x5                     11
x6                     11
x7                     11
x8                     11
x9                     11
x10                    11
x11                    11
x12                    11
x13                    11
x14                    11
x15                    11
x16                    11
x17                    11
x18                    11
x19                    11
x20                    11
x21                    11
x22                    11
x23                    11
x24                    11
x25                    11
x26                    11
x27                    11
x28                    11
x29                    11
x30                    11
x31                    11




KeyboardInterrupt: 