#Technical Requirements

In [None]:
%pip install sdv
%pip install pandas

#Import the dataset

Import the *Adult* dataset and convert it to a pandas dataframe.

In [5]:
import pandas as pd

names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                 'capital-gain', 'capital-loss', 'hours-per-week',
                 'native-country', 'income']

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(url, header=None, names=names, na_values=['?', ' ?'])

df.head()  # prints the first five rows of the dataset


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#Create metadata object

We import `SingleTableMetadata` which is a class that provides methods to manage metadata about a single table of data, such as the names and types of columns, relationships between columns, etc. SDV’s modelling suite needs this metadata object as input.

In [6]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()

We use `detect_from_dataframe()` method to analyze the pandas dataframe `df`, and automatically detect and set metadata about the table.

In [7]:
metadata.detect_from_dataframe(df)

Finally, we load the appropriate APIs and objects from SDV and instantiate the GaussianCopula model. Then we use the `fit()` method to generate the model.

In [8]:
from sdv.single_table import GaussianCopulaSynthesizer

gc_model = GaussianCopulaSynthesizer(metadata)
gc_model.fit(df)

Then we generate and view the synthetic dataset.

In [9]:
gc_synthetic = gc_model.sample(num_rows=df.shape[0] )

gc_synthetic.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,47,Private,164076,HS-grad,10,Divorced,Sales,Husband,White,Female,98874,4,46,United-States,<=50K
1,38,Private,361564,Masters,10,Never-married,Prof-specialty,Not-in-family,White,Female,57457,0,37,United-States,<=50K
2,32,Private,101155,Prof-school,11,Married-civ-spouse,Other-service,Unmarried,White,Male,5013,0,20,United-States,<=50K
3,23,Private,141706,HS-grad,14,Married-civ-spouse,Sales,Own-child,White,Male,41684,2,47,United-States,<=50K
4,36,Private,99347,10th,7,Separated,Machine-op-inspct,Not-in-family,White,Male,3,32,52,United-States,<=50K


# Run diagnostics on the synthetic dataset

We use the Diagnostic Report to compare the real and synthetic datasets. The Diagnostic Report should always produce a score of 100%, which tells you that primary keys are unique and non-null, continuous values in the synthetic data adhere to the min/max range in the original data, discrete values line up with the same categories across real and synthetic data, and column names are the same.

In [None]:
from sdv.evaluation.single_table import run_diagnostic

diagnostic = run_diagnostic(
    real_data=df,
    synthetic_data=gc_synthetic,
    metadata=metadata
)

Generating report ...
(1/2) Evaluating Data Validity: : 100%|██████████| 15/15 [00:00<00:00, 186.34it/s]
(2/2) Evaluating Data Structure: : 100%|██████████| 1/1 [00:00<00:00, 180.77it/s]

Overall Score: 100.0%

Properties:
- Data Validity: 100.0%
- Data Structure: 100.0%


#Produce Quality Report

The code produces a quality report with various metrics and visualizations that show the overall similarities between original and synthetic data.

In [None]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=df,
    synthetic_data=gc_synthetic,
    metadata=metadata
)

fig = quality_report.get_visualization(property_name='Column Shapes')
fig.show()

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 63.78it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:06<00:00, 15.23it/s]

Overall Score: 84.6%

Properties:
- Column Shapes: 87.57%
- Column Pair Trends: 81.63%


We use a correlation matrix to visualize column pairs.

Hover over the cells to see individual similarity scores.

In [None]:
fig = quality_report.get_visualization(property_name='Column Pair Trends')
fig.show()

#Comparing the distribution of individual features

We generate a comparison of real and synthetic data distributions for columns 'age' and 'capital-gain'.

In [None]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=df,
    synthetic_data=gc_synthetic,
    metadata=metadata,
    column_name='age'
)

fig.show()

In [None]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=df,
    synthetic_data=gc_synthetic,
    metadata=metadata,
    column_name='capital-gain'
)

fig.show()

# Varying column distribution functions

We want to try a different distribution function for 'captial-gain', but first we must understand which kind of function was used to generate the ill-fitting synthetic data.

We use the below statement to generate a list of the distrubution functions for all columns in the dataset.

In [None]:
gc_model.get_learned_distributions()

{'age': {'distribution': 'beta',
  'learned_parameters': {'loc': 16.5633701072336,
   'scale': 78.2052931361366,
   'a': 1.4888659174328716,
   'b': 3.8300079535837446}},
 'workclass': {'distribution': 'beta',
  'learned_parameters': {'loc': 0.00014000790102874233,
   'scale': 0.9997723707240331,
   'a': 0.9935079305954462,
   'b': 0.987964348598118}},
 'fnlwgt': {'distribution': 'beta',
  'learned_parameters': {'loc': -18811.329923631703,
   'scale': 726959319.8586707,
   'a': 3.965860273449791,
   'b': 13811.374293487625}},
 'education': {'distribution': 'beta',
  'learned_parameters': {'loc': 0.00011055141375610795,
   'scale': 0.9998443698067665,
   'a': 1.000849777729731,
   'b': 0.9990186436603206}},
 'education-num': {'distribution': 'beta',
  'learned_parameters': {'loc': -85.40817976569028,
   'scale': 1883.7601342667583,
   'a': 1299.5019417282163,
   'b': 24333.902706150526}},
 'marital-status': {'distribution': 'beta',
  'learned_parameters': {'loc': 6.179276248577762e-05,


We then replace the original beta distribution with a gamma distribution. Gamma distributions are used for positive-only, skewed data.

Then we fit a new model on the dataframe `df` and create a new synthetic dataset `gc_synthetic2`

In [None]:
gc_model2 = GaussianCopulaSynthesizer(
    metadata,
    numerical_distributions={
        'capital-gain': 'gamma',
    })
gc_model2.fit(df)

In [None]:
gc_synthetic2 = gc_model2.sample(num_rows=df.shape[0] )

We visualize the new version of 'capital-gain' and see that the gamma distribution is a better fit for this feature.

In [None]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=df,
    synthetic_data=gc_synthetic2,
    metadata=metadata,
    column_name='capital-gain'
)

fig.show()

#Other useful visualizations

A column pair plot is another useful tool for interrogating relationships in the synthetic data.

For instance, the chart below shows that the 'Armed-Forces' category within the synthetic 'occupation' feature has a strong skew towards older age groups, which is quite the opposite to the distribution seen in the real data. This would need to be fixed before using the synthetic dataset for modelling purposes.

In [None]:
from sdv.evaluation.single_table import get_column_pair_plot

fig = get_column_pair_plot(
    real_data=df,
    synthetic_data=gc_synthetic,
    metadata=metadata,
    column_names=['age', 'occupation'],
    )

fig.show()

This chart shows that the synthetic dataset hasn't maintained the one-to-one relationship between 'education' and its numeric counterpart 'education-num'. This would also need to be fixed.

In [None]:
from sdv.evaluation.single_table import get_column_pair_plot

fig = get_column_pair_plot(
    real_data=df,
    synthetic_data=synthetic,
    metadata=metadata,
    column_names=['education', 'education-num'],
    )

fig.show()

# Generating synthetic data using a CopulaGAN model

We import the `CopulaGANSynthesizer` and use the fit() method to generate a new model on the *Adult* dataset.

In [None]:
from sdv.single_table import CopulaGANSynthesizer

cg_model = CopulaGANSynthesizer(metadata)
cg_model.fit(df)

In [None]:
cg_synthetic = cg_model.sample(num_rows=df.shape[0] )

We look at data quality from the CopulaGAN model, repeating some of the same techniques we used with GaussianCopula.

In [None]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=df,
    synthetic_data=cg_synthetic,
    metadata=metadata
)

fig = quality_report.get_visualization(property_name='Column Pair Trends')
fig.show()

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 48.78it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:06<00:00, 17.20it/s]

Overall Score: 87.39%

Properties:
- Column Shapes: 91.75%
- Column Pair Trends: 83.04%


#Measuring Gower Distance

Install and import `gower` package.

In [None]:
!pip install gower
import gower

Collecting gower
  Downloading gower-0.1.2-py3-none-any.whl (5.2 kB)
Installing collected packages: gower
Successfully installed gower-0.1.2


We recommend using a small subset of the *Adult* dataset (e.g. 1,000 rows) to practice, as the Gower matrix calculation can take a long time to run on larger sets.

First, we create the dataframe for our model, based on the top 1,000 rows from the existing dataframe `df`, containing the full *Adult* dataset.

Then we fit a GaussianCopula model on this dataset, and generate a new synthetic dataset called synthetic.


In [None]:
new_df = df.head(1000)

model = GaussianCopulaSynthesizer(metadata)
model.fit(new_df)

synthetic = model.sample(num_rows=df.shape[0] )

We generate the Gower matrix of the real and synthetic data

In [None]:
gowerMatrix=gower.gower_matrix(new_df, synthetic)
print(gowerMatrix)

[[0.4460268  0.41092068 0.39990428 ... 0.30849126 0.33108675 0.5243855 ]
 [0.5248843  0.16020963 0.36551976 ... 0.4707944  0.36200836 0.46959284]
 [0.30875853 0.4095393  0.3763651  ... 0.36865947 0.30754757 0.367513  ]
 ...
 [0.3892014  0.43856084 0.4261839  ... 0.46071026 0.35872692 0.5520257 ]
 [0.37288257 0.26404738 0.31969768 ... 0.3632057  0.25088012 0.37751225]
 [0.4604928  0.2709656  0.39038438 ... 0.5448743  0.41212356 0.53527355]]


We then find the index of the top 10 closest rows and their Gower Distance. In this case, the smallest distance between two rows in our datasets is 0.02205, which means our synthetic dataset is not sufficiently different from the original at the individual row level.

In [None]:
gower.gower_topn(df.iloc[:,], synthetic.iloc[:,], n = 10)

{'index': array([26320, 29200, 18735, 24149, 18316, 22925,  4836, 15360,    42,
         3523]),
 'values': array([0.02205753, 0.02578343, 0.03649067, 0.0374441 , 0.03785798,
        0.04503146, 0.06345809, 0.08126822, 0.08237292, 0.08368524],
       dtype=float32)}