## Synthesizing Data for One Table

In [17]:
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality, get_column_plot, QualityReport
from sdv.sampling import Condition

### **Sample Table with 6 Columns (ID plus 5 Features) and 10 Rows**

In [3]:
import pandas as pd
import numpy as np

# Fixed seed: without it every Restart & Run All yields a different sample
# table, so the scores and plots further down cannot be reproduced.
SEED = 42
N_ROWS = 10
rng = np.random.default_rng(SEED)

categories_1 = ['Category A', 'Category B']
categories_2 = ['Type X', 'Type Y', 'Type Z']

# Raw columns under generic names; the numeric ones get domain names below.
data = {
    'Category_1': rng.choice(categories_1, N_ROWS),
    'Category_2': rng.choice(categories_2, N_ROWS),
    'Numerical_1': rng.integers(10, 100, N_ROWS),  # integers in [10, 100)
    'Numerical_2': rng.random(N_ROWS) * 100,       # uniform floats in [0, 100)
    'Numerical_3': rng.normal(50, 15, N_ROWS),     # normal, mean 50 / std 15
}

# Build the frame and rename the numeric columns in one expression instead of
# mutating the frame with inplace=True.
df = pd.DataFrame(data).rename(columns={
    'Numerical_1': 'Age',
    'Numerical_2': 'Apparel_Sales',
    'Numerical_3': 'Electronics_Goods_Sales',
})

# Sequential 1..N_ROWS identifier as the leading column.
df.insert(0, 'ID', range(1, N_ROWS + 1))

df

Unnamed: 0,ID,Category_1,Category_2,Age,Apparel_Sales,Electronics_Goods_Sales
0,1,Category A,Type X,40,66.64035,41.849214
1,2,Category B,Type Z,57,62.349246,68.389856
2,3,Category B,Type X,97,5.710185,75.532637
3,4,Category A,Type Z,39,20.161883,51.181728
4,5,Category A,Type Z,78,85.208133,46.665949
5,6,Category B,Type Y,41,14.18956,61.307478
6,7,Category A,Type Y,49,20.413878,66.459694
7,8,Category B,Type Y,31,23.868608,38.556675
8,9,Category A,Type X,86,58.094261,45.611471
9,10,Category A,Type Y,38,57.184065,39.790349


In [4]:
from sdv.metadata import SingleTableMetadata

# Let SDV infer a first-pass schema from the frame, then pin the sdtype of
# every feature column explicitly so it does not depend on auto-detection.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df)

feature_sdtypes = {
    'Age': 'numerical',
    'Apparel_Sales': 'numerical',
    'Electronics_Goods_Sales': 'numerical',
    'Category_1': 'categorical',
    'Category_2': 'categorical',
}
for feature, feature_sdtype in feature_sdtypes.items():
    metadata.update_column(column_name=feature, sdtype=feature_sdtype)

print(metadata.to_dict())

{'primary_key': 'ID', 'columns': {'ID': {'sdtype': 'id'}, 'Category_1': {'sdtype': 'categorical'}, 'Category_2': {'sdtype': 'categorical'}, 'Age': {'sdtype': 'numerical'}, 'Apparel_Sales': {'sdtype': 'numerical'}, 'Electronics_Goods_Sales': {'sdtype': 'numerical'}}, 'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1'}


In [5]:
# GaussianCopulaSynthesizer is not among this notebook's opening imports (only
# CTGANSynthesizer is), so import it here; otherwise this cell raises
# NameError on a fresh kernel (Restart & Run All).
from sdv.single_table import GaussianCopulaSynthesizer

# Create the synthesizer from the table metadata prepared in the previous cell.
synthesizer = GaussianCopulaSynthesizer(metadata)



In [6]:
# Fit the synthesizer on the 10-row sample table `df`.
synthesizer.fit(df)

In [7]:
# Draw 100 synthetic rows from the fitted synthesizer.
synthetic_data = synthesizer.sample(num_rows=100)

In [9]:
# Sanity-check the dimensions (rows, columns) of the sampled table.
synthetic_data.shape

(100, 6)

In [12]:
# Run SDV's diagnostic on the synthetic table against the real one, using the
# same metadata the synthesizer was built from.
diagnostic_report = run_diagnostic(
    real_data=df,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 519.55it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 118.66it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [13]:
# Plot real vs. synthetic values for one column, guarding against a column
# name that is missing from the metadata.
column_name = 'Electronics_Goods_Sales'
if column_name not in metadata.columns:
    print(f"Column '{column_name}' not found in the metadata.")
else:
    fig = get_column_plot(
        real_data=df,
        synthetic_data=synthetic_data,
        metadata=metadata,
        column_name=column_name,
    )
    fig.show()

In [14]:
# Same real-vs-synthetic comparison, this time for the Age column.
column_name = 'Age'
if column_name not in metadata.columns:
    print(f"Column '{column_name}' not found in the metadata.")
else:
    fig = get_column_plot(
        real_data=df,
        synthetic_data=synthetic_data,
        metadata=metadata,
        column_name=column_name,
    )
    fig.show()

In [15]:
# Plain-dict form of the metadata; passed to QualityReport.generate in a later cell.
metadata_dict = metadata.to_dict()

In [18]:
# Build the quality report comparing real and synthetic tables; generate()
# is given the dict form of the metadata rather than the metadata object.
report = QualityReport()
report.generate(
    real_data=df,
    synthetic_data=synthetic_data,
    metadata=metadata_dict,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 592.22it/s]|
Column Shapes Score: 84.6%

(2/2) Evaluating Column Pair Trends: |███████████████████████████████████████████████| 15/15 [00:00<00:00, 138.77it/s]|
Column Pair Trends Score: 53.39%

Overall Score (Average): 68.99%



In [19]:
# Print the sampled table and then the report's score, each under its own
# heading; sep="\n" puts the heading and the value on separate lines.
print("Synthetic Data:", synthetic_data, sep="\n")
print("\nQuality Score:", report.get_score(), sep="\n")

Synthetic Data:
           ID  Category_1 Category_2  Age  Apparel_Sales  \
0   773153107  Category B     Type Y   38      21.555952   
1   295010557  Category B     Type Z   77      52.604986   
2   475410988  Category B     Type Y   38       6.290133   
3   421328571  Category A     Type Z   59      32.938631   
4   646285693  Category A     Type Y   94      79.781897   
..        ...         ...        ...  ...            ...   
95  192569005  Category A     Type Y   57      17.248572   
96  960388574  Category A     Type Y   92      70.650213   
97  685786765  Category A     Type X   61      63.587531   
98  413990076  Category A     Type Z   38      33.348042   
99  461969586  Category A     Type Z   38      67.881579   

    Electronics_Goods_Sales  
0                 38.791689  
1                 69.070041  
2                 47.419499  
3                 48.266162  
4                 59.649302  
..                      ...  
95                74.725519  
96                74.12

## Synthesizing Data for Two Related Tables

In [49]:
import numpy as np
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer  

# Fixed seed: without it the parent table changes on every Restart & Run All,
# so the fitted synthesizer and its samples cannot be reproduced.
SEED = 42
rng = np.random.default_rng(SEED)

unique_category_1 = ['Category A', 'Category B', 'Category C', 'Category D',
                     'Category E', 'Category F', 'Category G', 'Category H',
                     'Category I', 'Category J']
n_categories = len(unique_category_1)

data_category_1 = {
    'Category_1': unique_category_1,
    'Number_of_Items': rng.integers(1, 100, n_categories),  # integers in [1, 100)
    'Is_Present_in_Warehouse': rng.choice([0, 1], n_categories)  # Binary values: 0 (No) or 1 (Yes)
}

# Cast the 0/1 flag to bool while building the frame rather than mutating a
# column afterwards.
df_category_1 = pd.DataFrame(data_category_1).astype({'Is_Present_in_Warehouse': bool})

print(df_category_1)

# Infer a first-pass schema, then pin the sdtypes of the two feature columns.
metadata_category_1 = SingleTableMetadata()
metadata_category_1.detect_from_dataframe(data=df_category_1)

metadata_category_1.update_column(column_name='Number_of_Items', sdtype='numerical')
metadata_category_1.update_column(column_name='Is_Present_in_Warehouse', sdtype='boolean')

# Make the category label the primary key explicitly. When auto-detection has
# already picked it, SDV only warns that the existing key is being replaced.
metadata_category_1.set_primary_key('Category_1')

# NOTE: this rebinds `synthesizer`, replacing any synthesizer bound to that
# name earlier in the notebook.
synthesizer = GaussianCopulaSynthesizer(metadata_category_1)

print(metadata_category_1)

synthesizer.fit(df_category_1)

# Sample 100 rows from the synthesizer fitted on the 10-row table.
synthetic_data_category_1 = synthesizer.sample(100)
print(synthetic_data_category_1)


   Category_1  Number_of_Items  Is_Present_in_Warehouse
0  Category A               64                    False
1  Category B               29                    False
2  Category C               95                     True
3  Category D               75                     True
4  Category E               94                    False
5  Category F               21                     True
6  Category G               12                     True
7  Category H               54                     True
8  Category I                2                    False
9  Category J               67                     True
{
    "primary_key": "Category_1",
    "columns": {
        "Category_1": {
            "sdtype": "id"
        },
        "Number_of_Items": {
            "sdtype": "numerical"
        },
        "Is_Present_in_Warehouse": {
            "sdtype": "boolean"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}



There is an existing primary key 'Category_1'. This key will be removed.


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



       Category_1  Number_of_Items  Is_Present_in_Warehouse
0   sdv-id-PQzGJB               64                     True
1   sdv-id-EPOaTD               73                     True
2   sdv-id-ITzRka               62                     True
3   sdv-id-yvKnPw               39                     True
4   sdv-id-uTKDmB               22                     True
..            ...              ...                      ...
95  sdv-id-HURgXY               93                    False
96  sdv-id-IMTGhp               22                    False
97  sdv-id-kEdUyK               61                    False
98  sdv-id-NbRDbA               78                    False
99  sdv-id-gJdUOd               14                    False

[100 rows x 3 columns]
