In [2]:
from pathlib import Path

import pandas as pd
import torch
import torch.version
from ctgan import CTGAN
from sdmetrics.reports.single_table import QualityReport
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer
from sdv.single_table.copulagan import CopulaGANSynthesizer

In [3]:

print(torch.cuda.is_available())  # Returns True if CUDA is available
print(torch.version.cuda)  # Prints the version of CUDA supported by the installed PyTorch
print(torch.__version__)
print(torch.cuda.get_device_name(0))      # Should print the name of your GPU

True
12.4
2.5.1
NVIDIA GeForce RTX 2080 Ti


In [7]:
def create_ctgan_model(input_csv_path, num_samples, model_save_path, synthetic_data_save_path):
    # Load the dataset
    real_data = pd.read_csv(input_csv_path)
    real_data = real_data.drop(columns=['Code'])  # Remove the first column if it's an index column
    metadata = {
        'columns': {
            'Entity': {'sdtype': 'categorical'},
            'Year': {'sdtype': 'mumerical'},
            'DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Depressive disorders': {'sdtype': 'numerical'},
            'DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Schizophrenia': {'sdtype': 'numerical'},
            'DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Bipolar disorder': {'sdtype': 'numerical'},
            'DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Eating disorders': {'sdtype': 'numerical'},
            'DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Anxiety disorders': {'sdtype': 'numerical'}
        }
    }
    
    # Identify categorical columns
    categorical_columns = real_data.select_dtypes(include=['object']).columns.tolist()

    # Convert object columns to category
    for column in categorical_columns:
        real_data[column] = real_data[column].astype('category')

    # Initialize CTGAN
    model = CTGAN(cuda=torch.cuda.is_available())

    # Fit the model
    model.fit(real_data, discrete_columns=categorical_columns)
    # Generate synthetic 
    synthetic_data = model.sample(num_samples)

    # Save the model and synthetic data
    model.save(model_save_path)
    synthetic_data.to_csv(synthetic_data_save_path, index=False)
    
    my_report = QualityReport()
    my_report.generate(real_data, synthetic_data, metadata)

    return model, synthetic_data

# Usage example
input_csv_path = 'heath2.csv'
model_save_path = 'model/synthetic_data_mental_illness_ctgan.pkl'
synthetic_data_save_path = 'output/synthetic_data_mental_illness_ctgan.csv'
num_samples = 50

ctgan_model, synthetic_samples = create_ctgan_model(
    input_csv_path=input_csv_path,
    num_samples=num_samples,
    model_save_path=model_save_path,
    synthetic_data_save_path=synthetic_data_save_path
)


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 7/7 [00:00<00:00, 635.78it/s]|
Column Shapes Score: 76.9%

(2/2) Evaluating Column Pair Trends: |██████████| 21/21 [00:00<00:00, 187.33it/s]|
Column Pair Trends Score: 64.52%

Overall Score (Average): 70.71%



## CopulaGAN

In [10]:
def create_ctgan_model(input_csv_path, num_samples, model_save_path, synthetic_data_save_path):
    # Load the dataset
    real_data = pd.read_csv(input_csv_path)
    real_data = real_data.drop(columns=['Code'])  
    
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(real_data)
    model = CopulaGANSynthesizer(cuda=torch.cuda.is_available(), metadata=metadata)
    model.fit(real_data) 
    synthetic_data = model.sample(num_samples)

    model.save(model_save_path)
    synthetic_data.to_csv(synthetic_data_save_path, index=False)
    
    my_report = QualityReport()
    my_report.generate(real_data, synthetic_data, metadata.to_dict())

    return model, synthetic_data

# Usage example
input_csv_path = 'heath2.csv'
model_save_path = 'model/synthetic_data_mental_illness_copula.pkl'
synthetic_data_save_path = 'output/synthetic_data_mental_illness_copula.csv'
num_samples = 50

ctgan_model, synthetic_samples = create_ctgan_model(
    input_csv_path=input_csv_path,
    num_samples=num_samples,
    model_save_path=model_save_path,
    synthetic_data_save_path=synthetic_data_save_path
)




Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 7/7 [00:00<00:00, 582.81it/s]|
Column Shapes Score: 76.61%

(2/2) Evaluating Column Pair Trends: |██████████| 21/21 [00:00<00:00, 203.70it/s]|
Column Pair Trends Score: 69.13%

Overall Score (Average): 72.87%



In [22]:


def create_tvae_model(input_csv_path, num_samples, model_save_path, synthetic_data_save_path):
    # Load the dataset
    real_data = pd.read_csv(input_csv_path)
    real_data = real_data.drop(columns=['Code'])  # Adjust this based on your dataset structure
    
    # Define metadata
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(real_data)
    
    # Initialize the TVAE synthesizer
    model = TVAESynthesizer(metadata=metadata, cuda=torch.cuda.is_available())
    
    # Fit the model
    model.fit(real_data)
    
    # Generate synthetic data
    synthetic_data = model.sample(num_samples)
    
    # Save the trained model and synthetic data
    model.save(model_save_path)
    synthetic_data.to_csv(synthetic_data_save_path, index=False)
    
    # Generate a quality report
    my_report = QualityReport()
    my_report.generate(real_data, synthetic_data, metadata.to_dict())
    
    print("Quality Report:")
    print(my_report.get_score())
    
    return model, synthetic_data

# Usage example
input_csv_path = 'heath2.csv'
model_save_path = 'model/synthetic_data_mental_illness_tvae.pkl'
synthetic_data_save_path = 'output/synthetic_data_mental_illness_tvae.csv'
num_samples = 50

tvae_model, synthetic_samples = create_tvae_model(
    input_csv_path=input_csv_path,
    num_samples=num_samples,
    model_save_path=model_save_path,
    synthetic_data_save_path=synthetic_data_save_path
)




Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 7/7 [00:00<00:00, 466.24it/s]|
Column Shapes Score: 78.4%

(2/2) Evaluating Column Pair Trends: |██████████| 21/21 [00:00<00:00, 194.27it/s]|
Column Pair Trends Score: 70.49%

Overall Score (Average): 74.44%

Quality Report:
0.7444187972229181
