## Installation

In [1]:
# %%capture
!pip install numpy pandas matplotlib pycaret
!pip install -U gretel-client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-2.3.10-py3-none-any.whl (320 kB)
[K     |████████████████████████████████| 320 kB 4.7 MB/s 
Collecting pandas-profiling>=2.8.0
  Downloading pandas_profiling-3.6.1-py2.py3-none-any.whl (328 kB)
[K     |████████████████████████████████| 328 kB 63.4 MB/s 
[?25hCollecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting kmodes>=0.10.1
  Downloading kmodes-0.12.2-py2.py3-none-any.whl (20 kB)
Collecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp38-cp38-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 50.6 MB/s 
[?25hCollecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 6.6 MB/s 
[?25hCollecting spacy<2.4.0
  Downloading spacy-2.3.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.0 MB)
[K     |████

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gretel-client
  Downloading gretel_client-0.15.7-py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 5.2 MB/s 
[?25hCollecting urllib3<1.26,>=1.25.3
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 59.8 MB/s 
[?25hCollecting tenacity==6.2.0
  Downloading tenacity-6.2.0-py2.py3-none-any.whl (24 kB)
Collecting smart-open<6.0,>=2.1.0
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 6.1 MB/s 
Collecting requests==2.25.0
  Downloading requests-2.25.0-py2.py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 6.9 MB/s 
Collecting docker==4.4.1
  Downloading docker-4.4.1-py2.py3-none-any.whl (146 kB)
[K     |████████████████████████████████| 146 kB 55.4 MB/s 
[?25hCollecting kubernetes==12.0.1
  Downloading kubernetes-12.0.

## Log in to gretel using our API key

In [4]:
import pandas as pd
from gretel_client import configure_session

# Do not truncate long cell values when DataFrames are displayed. The option
# name is written out in full: pandas resolves abbreviated names by pattern
# matching, which can stop working if a similarly named option is added later.
pd.set_option("display.max_colwidth", None)

# api_key="prompt" asks for the key interactively instead of hard-coding it in
# the notebook. NOTE(review): validate/clear presumably verify the key against
# the API and discard previously stored credentials — confirm in the
# gretel-client documentation.
configure_session(api_key="prompt", validate=True, clear=True)

Gretel Api Key··········
Using endpoint https://api.gretel.cloud
Logged in as pmorenogonzalez@gmail.com ✅


## Load data

We're going to explore using synthetic data as input to a downstream classification task. 

In [5]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Location of the grocery-orders sample CSV used throughout this notebook.
GROCERY_ORDERS_URL = "https://gretel-blueprints-pub.s3.us-west-2.amazonaws.com/rdb/grocery_orders.csv"

# Read the raw (real) dataset straight from the URL into a DataFrame.
df = pd.read_csv(GROCERY_ORDERS_URL)

In [6]:
# Preview the first rows of the real dataset to sanity-check the load.
df.head()

Unnamed: 0,order_id,order_dow,order_hour_of_day,days_since_prior_order,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
0,1597,1,8,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2011,4,10,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2822,0,8,29,0,0,0,0,1,0,...,0,0,0,2,0,0,0,0,0,2
3,2889,1,15,8,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,3971,2,18,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Since we are going to train both a synthetic data generating model and a downstream classification model, we need to hold out a small validation set that is seen by neither model. We use it to test the eventual performance of a classification model trained purely on synthetic data and validated on unseen real data.

In [7]:
# Hold out 5% of the real rows as a validation set; only `train_df` is used
# for training below. `random_state` is fixed so that the split (and every
# result derived from it) is reproducible across kernel restarts.
train_df, valid_df = train_test_split(df, test_size=0.05, random_state=42)

## Train a synthetic model and look at the generated data

In [8]:
from gretel_client.projects import create_or_get_unique_project
from gretel_client.helpers import poll
from gretel_client.projects.models import read_model_config


# Fetch (or create) the Gretel project that will own the model.
project = create_or_get_unique_project(name="downstream-ML")

# The high-dimensionality template is chosen because the data has 100+ columns.
config = read_model_config("synthetics/high-dimensionality")

# The model is given a file path as its data source, so write the training
# split to disk first (without the DataFrame index).
train_df.to_csv("train.csv", index=False)

model = project.create_model_obj(
    model_config=config,
    data_source="train.csv",
)

# Submit the training job to Gretel Cloud, then poll it (status/log output
# appears below the cell).
model.submit_cloud()
poll(model)

# Load the model's gzip-compressed "data_preview" artifact as the synthetic dataset.
preview_link = model.get_artifact_link("data_preview")
synthetic = pd.read_csv(preview_link, compression="gzip")
synthetic.head()

INFO: Starting poller


{
    "uid": "63ab6771492edd78be81087d",
    "guid": "model_2JVyPYBwsHjp38i8qycT8hxbeqU",
    "model_name": "high-dimensionality",
    "runner_mode": "cloud",
    "user_id": "63ab66debff6212d126c182b",
    "user_guid": "user_2JVy75bNvKTfJq5fWBeB1ikI0g1",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "63ab6769e73fa5a26d624362",
    "project_guid": "proj_2JVyOXiAgC2ye6ZTObuuUywI339",
    "status_history": {
        "created": "2022-12-27T21:45:21.046018Z"
    },
    "last_modified": "2022-12-27T21:45:21.132159Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "annotations": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/models/actgan@sha256:b48d4d53fea936bce609034f0d005432aa60e6e62d7b3f526ae207f108e32837",
    "container_image_version": "2.10.20",
    "model_type": "actgan",
    "model_type_alias": null,
    "config": {
 

INFO: Status is created. Model creation has been queued.
INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2022-12-27T21:45:33.995498Z  Analyzing input data and checking for auto-params...
2022-12-27T21:45:33.997165Z  Found 2 auto-params that were set based on input data.
{
    "epochs": 600,
    "batch_size": 600
}
2022-12-27T21:45:34.055585Z  Using updated model configuration: 
{
    "schema_version": "1.0",
    "name": "high-dimensionality",
    "models": [
        {
            "actgan": {
                "privacy_filters": {
                    "outliers": "auto",
                    "similarity": "auto",
                    "max_iterations": 10
                },
                "data_source": [
                    "gretel_c911fdcf2a45498b80c9c4edf9c27724_train.csv"
                ],
                "ref_data": {},
                "params": {
                    "embedding_

Unnamed: 0,order_id,order_dow,order_hour_of_day,days_since_prior_order,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
0,3420909,1,14,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2168882,2,16,19,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,800192,4,14,11,0,0,0,0,0,0,...,1,1,0,1,0,0,0,1,0,0
3,1979977,4,18,26,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,2628941,6,7,29,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,1


In [9]:
from gretel_client.evaluation import QualityReport

In [10]:
# Write the synthetic preview to disk so it can be passed to the report by
# file path, alongside the train.csv file written earlier.
synthetic.to_csv("synthetic.csv", index=False)

# Evaluate the synthetic records (data_source) against the real training
# data (ref_data).
report = QualityReport(
    ref_data="train.csv",
    data_source="synthetic.csv",
)

In [11]:
# Execute the quality-report job; the log below shows it being polled until
# the evaluate job completes and its artifacts are uploaded.
report.run()

INFO: Starting poller


{
    "uid": "63ab6a50c552a427ee525786",
    "guid": "model_2JVztxhR33AtcAq2Zgo0JqP2lYK",
    "model_name": "quizzical-exotic-dog",
    "runner_mode": "cloud",
    "user_id": "63ab66debff6212d126c182b",
    "user_guid": "user_2JVy75bNvKTfJq5fWBeB1ikI0g1",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "63ab6a47e73fa5a26d624366",
    "project_guid": "proj_2JVzsoDrSTm8hiw6z63ng2q1lXs",
    "status_history": {
        "created": "2022-12-27T21:57:36.163053Z"
    },
    "last_modified": "2022-12-27T21:57:36.391064Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "annotations": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/models/evaluate@sha256:601abac171a5bb70e57bc5eb6f2ed5322ffe1eaacf5cfa241f8b8d4490331129",
    "container_image_version": "2.10.20",
    "model_type": "evaluate",
    "model_type_alias": null,
    "config"

INFO: Status is created. Model creation has been queued.
INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2022-12-27T21:58:49.577475Z  Starting Gretel Evaluate
2022-12-27T21:58:49.578022Z  Loading data sets for SQS creation...
2022-12-27T21:58:49.792116Z  Creating SQS...
2022-12-27T22:00:50.525513Z  SQS finished, exporting report artifacts...
2022-12-27T22:00:52.385693Z  Evaluate job completed!
2022-12-27T22:00:52.386680Z  Uploading artifacts to Gretel Cloud


In [12]:
# Print the headline result of the quality report (raw score, grade, score).
report_summary = report.peek()
print(report_summary)

{'raw_score': 95.9037037037037, 'grade': 'Excellent', 'score': 95}


## Downstream usecase

One huge benefit of synthetic data, beyond privacy preservation, is utility. The data isn't simply fake: it preserves the correlations found in the original data, which means it can be used as input to a machine learning model. We train several classifiers and observe their performance on various folds of the data.

In [14]:
from pycaret.classification import *

In [15]:
# Remove the order identifier from every dataset before modeling, since it is
# an ID rather than a feature of the order.
# errors='ignore' keeps this cell idempotent: train_df and valid_df are
# rebound to their own result, so without it a second run of the cell would
# raise KeyError because 'order_id' has already been dropped.
synthetic_df = synthetic.drop(columns=['order_id'], errors='ignore')
train_df = train_df.drop(columns=['order_id'], errors='ignore')
valid_df = valid_df.drop(columns=['order_id'], errors='ignore')

In [26]:
# Summary statistics for the synthetic data, transposed so that each column
# of the dataset becomes one row of the summary table.
synthetic_summary = synthetic_df.describe(include='all')
synthetic_summary.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
order_dow,5000.0,2.5360,1.933042,0.0,1.0,2.0,4.0,6.0
order_hour_of_day,5000.0,13.0334,4.784189,0.0,11.0,13.0,15.0,23.0
days_since_prior_order,5000.0,17.3772,10.591891,0.0,8.0,17.0,29.0,30.0
air fresheners candles,5000.0,0.0068,0.095685,0.0,0.0,0.0,0.0,2.0
asian foods,5000.0,0.0536,0.331586,0.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...
trash bags liners,5000.0,0.0100,0.099509,0.0,0.0,0.0,0.0,1.0
vitamins supplements,5000.0,0.0104,0.101459,0.0,0.0,0.0,0.0,1.0
water seltzer sparkling water,5000.0,0.2940,0.601362,0.0,0.0,0.0,0.0,6.0
white wines,5000.0,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0


In [27]:
# Same summary for the full real dataset, for side-by-side comparison with
# the synthetic one. Note that `df` still contains the order_id column.
real_summary = df.describe(include='all')
real_summary.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
order_id,5000.0,1.691644e+06,983478.788817,1597.0,834548.75,1682633.0,2535833.5,3420909.0
order_dow,5000.0,2.778400e+00,2.094414,0.0,1.00,3.0,5.0,6.0
order_hour_of_day,5000.0,1.359980e+01,4.262731,0.0,10.00,14.0,17.0,23.0
days_since_prior_order,5000.0,1.709300e+01,10.761948,0.0,7.00,15.0,30.0,30.0
air fresheners candles,5000.0,7.800000e-03,0.100703,0.0,0.00,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...
trash bags liners,5000.0,1.320000e-02,0.119284,0.0,0.00,0.0,0.0,2.0
vitamins supplements,5000.0,1.480000e-02,0.133359,0.0,0.00,0.0,0.0,3.0
water seltzer sparkling water,5000.0,2.836000e-01,0.698046,0.0,0.00,0.0,0.0,10.0
white wines,5000.0,4.600000e-03,0.085909,0.0,0.00,0.0,0.0,3.0


In [28]:
# Preview the first rows of the synthetic data (order_id already removed).
synthetic_df.head()

Unnamed: 0,order_dow,order_hour_of_day,days_since_prior_order,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
0,1,14,30,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,16,19,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,14,11,0,0,0,0,0,0,1,...,1,1,0,1,0,0,0,1,0,0
3,4,18,26,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,6,7,29,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,1


In [29]:
# Preview the first rows of the real data for comparison with the synthetic rows.
df.head()

Unnamed: 0,order_id,order_dow,order_hour_of_day,days_since_prior_order,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
0,1597,1,8,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2011,4,10,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2822,0,8,29,0,0,0,0,1,0,...,0,0,0,2,0,0,0,0,0,2
3,2889,1,15,8,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,3971,2,18,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Split both the synthetic data and the real training data 80/20 into a
# training part (used to fit classifiers) and a test part (used to score them).
# random_state is fixed so both splits are reproducible across re-runs.
synthetic_train_data, synthetic_test_data = train_test_split(
    synthetic_df, test_size=0.2, random_state=42
)
original_train_data, original_test_data = train_test_split(
    train_df, test_size=0.2, random_state=42
)

We want to predict whether a customer will buy frozen pizza (and how many). This turns into a multi-class classification problem. We use the PyCaret library to test a large number of hypothesis classes. This will take a few minutes, since many different models are fit on a variety of folds.

In [17]:
# Initialise a PyCaret experiment on the *synthetic* training split with
# 'frozen pizza' as the target. session_id seeds the experiment so the
# leaderboard is reproducible from run to run.
s = setup(synthetic_train_data, target='frozen pizza', session_id=42)

# Fit and cross-validate the candidate models and keep the top-ranked one.
# Subsequent predict_model(best, ...) cells evaluate this synthetic-trained model.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9257,0.2033,0.23,0.8569,0.89,0.0,0.0,0.442
rf,Random Forest Classifier,0.9257,0.2985,0.23,0.8569,0.89,0.0,0.0,0.686
ada,Ada Boost Classifier,0.9257,0.1984,0.225,0.8572,0.8901,0.0042,0.0104,0.488
dummy,Dummy Classifier,0.9257,0.2,0.23,0.8569,0.89,0.0,0.0,0.036
et,Extra Trees Classifier,0.9253,0.2942,0.2299,0.8569,0.8898,-0.0006,-0.0015,0.764
lightgbm,Light Gradient Boosting Machine,0.9243,0.2364,0.2297,0.8568,0.8893,-0.0025,-0.006,0.784
ridge,Ridge Classifier,0.9153,0.0,0.2324,0.8638,0.8873,0.0199,0.0257,0.055
lr,Logistic Regression,0.9067,0.268,0.2351,0.8658,0.8849,0.0406,0.0446,6.485
gbc,Gradient Boosting Classifier,0.905,0.2737,0.2424,0.8641,0.8829,0.029,0.0339,7.465
svm,SVM - Linear Kernel,0.8993,0.0,0.2436,0.8696,0.8825,0.0607,0.0678,0.458


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')
INFO:logs:compare_models() succesfully completed......................................


In [18]:
# Show the estimator that compare_models() returned.
best

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

We then see how our "Best" classification model performs on the original data when trained on the synthetic data

In [19]:
# Score the current `best` model (selected on synthetic data in the preceding
# compare_models() run) on the 20% test split of the real training data.
test_predictions = predict_model(
    estimator=best,
    data=original_test_data,
)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform'), probability_threshold=None, encoded_labels=False, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.CLASSIFICATION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:logs:Preloading libraries
INFO:logs:Preparing display monitor


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.9474,0,0.25,0.8975,0.9218,0.0,0.0


In [20]:
# Score the current `best` model on the real validation hold-out, which was
# split off before the synthetic model was trained.
valid_predictions = predict_model(
    estimator=best,
    data=valid_df,
)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform'), probability_threshold=None, encoded_labels=False, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.CLASSIFICATION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:logs:Preloading libraries
INFO:logs:Preparing display monitor


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.956,0,0.3333,0.9139,0.9345,0.0,0.0


In [21]:
# Score the current `best` model on the 20% test split of the synthetic data.
synthetic_predictions = predict_model(
    estimator=best,
    data=synthetic_test_data,
)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform'), probability_threshold=None, encoded_labels=False, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.CLASSIFICATION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:logs:Preloading libraries
INFO:logs:Preparing display monitor


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.912,0,0.25,0.8317,0.87,0.0,0.0


In [22]:
# Baseline: repeat the experiment on the *real* training split with the same
# target. session_id seeds the experiment so the leaderboard is reproducible.
s = setup(original_train_data, target='frozen pizza', session_id=42)

# NOTE: this rebinds `best`; predict_model(best, ...) cells executed after this
# point evaluate the model trained on real data, not the synthetic-trained one.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9466,0.1477,0.3083,0.896,0.9206,0.0,0.0,0.454
rf,Random Forest Classifier,0.9466,0.1935,0.3083,0.896,0.9206,0.0,0.0,0.665
et,Extra Trees Classifier,0.9466,0.1917,0.3083,0.896,0.9206,0.0,0.0,0.749
dummy,Dummy Classifier,0.9466,0.15,0.3083,0.896,0.9206,0.0,0.0,0.026
lightgbm,Light Gradient Boosting Machine,0.9458,0.1849,0.3081,0.896,0.9203,-0.0013,-0.0026,0.495
ridge,Ridge Classifier,0.9443,0.0,0.3105,0.9004,0.9202,0.0087,0.0189,0.068
gbc,Gradient Boosting Classifier,0.9342,0.1966,0.2988,0.8973,0.9152,0.0056,0.0072,7.947
lr,Logistic Regression,0.9312,0.1791,0.3138,0.9014,0.9155,0.0312,0.0329,6.007
svm,SVM - Linear Kernel,0.9312,0.0,0.3173,0.9012,0.9155,0.0374,0.0382,0.389
dt,Decision Tree Classifier,0.9082,0.163,0.3201,0.9046,0.9063,0.0674,0.0674,0.189


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')
INFO:logs:compare_models() succesfully completed......................................


In [23]:
# Score the current `best` model (re-selected on real training data in the
# preceding compare_models() run) on the 20% test split of the real data.
test_predictions = predict_model(
    estimator=best,
    data=original_test_data,
)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform'), probability_threshold=None, encoded_labels=False, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.CLASSIFICATION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:logs:Preloading libraries
INFO:logs:Preparing display monitor


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.9474,0,0.25,0.8975,0.9218,0.0,0.0


In [24]:
# Score the current `best` model on the real validation hold-out.
valid_predictions = predict_model(
    estimator=best,
    data=valid_df,
)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform'), probability_threshold=None, encoded_labels=False, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.CLASSIFICATION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:logs:Preloading libraries
INFO:logs:Preparing display monitor


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.956,0,0.3333,0.9139,0.9345,0.0,0.0


In [25]:
# Score the current `best` model on the 20% test split of the synthetic data.
synthetic_predictions = predict_model(
    estimator=best,
    data=synthetic_test_data,
)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform'), probability_threshold=None, encoded_labels=False, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.CLASSIFICATION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:logs:Preloading libraries
INFO:logs:Preparing display monitor


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.912,0,0.25,0.8317,0.87,0.0,0.0
