## Prep and Input API Key

In [20]:
%%capture
!pip install -U gretel-client
!pip install fileupload
! pip jupyter labextension install @jupyter-widgets/jupyterlab-manager


In [21]:
# Authenticate against Gretel. A free account (and API key) is available at
# https://gretel.ai/

import pandas as pd
from gretel_client import configure_session

# Do not truncate wide text columns when DataFrames are displayed.
pd.set_option("max_colwidth", None)

# api_key="prompt" presumably asks for the key interactively when no cached
# credentials are found -- confirm against the gretel-client docs.
configure_session(
    api_key="prompt",
    cache="yes",
    validate=True,
)


Found cached Gretel credentials
Using endpoint https://api.gretel.cloud
Logged in as andrewterrell.nec@gmail.com ✅


In [22]:
# Create the Gretel project named "synthetic-data" (or, judging by the helper's
# name, fetch it if it already exists -- confirm against the gretel-client docs).
# Later cells use `project` to create the model.

from gretel_client.projects import create_or_get_unique_project
project = create_or_get_unique_project(name="synthetic-data")


## Create the synthetic data configuration

In [39]:
import json

from gretel_client.projects.models import read_model_config

# Start from the "synthetics/default" configuration template.
config = read_model_config("synthetics/default")

# Override the epoch count on the first model entry; 50 epochs is recommended.
synthetics_params = config["models"][0]["synthetics"]["params"]
synthetics_params["epochs"] = 50

# To inspect the full model configuration, uncomment the next line.
#print(json.dumps(config, indent=2))


## Load and preview the source dataset


In [24]:
# Upload your CSV data file: run this cell, then use the rendered "Upload"
# button to pick the file BEFORE running the next cell, which reads the
# uploaded bytes from `upload`.
from ipywidgets import FileUpload
upload = FileUpload()
upload

FileUpload(value={}, description='Upload')

In [26]:
# Save the first uploaded file to disk as input.csv.
# ipywidgets 7.x exposes `upload.value` as a dict keyed by filename, while 8.x
# exposes a tuple of dicts (and no longer has `upload.data`). In both versions
# each entry carries the raw file bytes under "content".
uploaded_files = upload.value
if not uploaded_files:
    raise ValueError(
        "No file has been uploaded yet: use the Upload button above, then re-run this cell."
    )

if isinstance(uploaded_files, dict):
    first_upload = next(iter(uploaded_files.values()))
else:
    first_upload = uploaded_files[0]

# Write-only binary mode is enough; the file is never read back through this handle.
with open("input.csv", "wb") as input_file:
    input_file.write(first_upload["content"])

In [38]:
# Read the uploaded CSV and re-save it as the file used for training
import pandas as pd

dataset_path = "input.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Written without the row index so only the original columns are used for training.
df.to_csv(path_or_buf="training_data.csv", index=False)

# To preview the DataFrame that will train the synthetic model, uncomment the final line.
#df

## Train the synthetic model

In this step, we will task the worker running in the Gretel cloud, or locally, to train a synthetic model on the source dataset.


In [28]:
from gretel_client.helpers import poll

# Build a model object from the configuration and the training CSV written earlier.
model = project.create_model_obj(
    model_config=config,
    data_source="training_data.csv",
)

# Submit the job to run in the Gretel cloud, then poll it so status and
# training logs are printed in the cell output.
model.submit_cloud()
poll(model)


[32mINFO: [0mStarting poller


{
    "uid": "626c3a5b333a9a8e7adc956c",
    "guid": "model_28U8qcCkUsQ5HRzS9UkSuz4Q7d2",
    "model_name": "cute-excited-badger",
    "runner_mode": "cloud",
    "user_id": "60372497bff62132eb7280c6",
    "user_guid": "user_26hm1MeAUnFtwqF2byw4YmilxOS",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "626b04389f8f635a7ab81004",
    "project_guid": "proj_28RXvIjvXrnsj7r5ejvsEtjfjc5",
    "status_history": {
        "created": "2022-04-29T19:19:55.233357Z"
    },
    "last_modified": "2022-04-29T19:19:55.336891Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:f83dc37b903af806931b9e6ae083331fbfcfdc5b2bcc542aad557504561db1d0",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {

[32mINFO: [0mStatus is created. Model creation has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin model creation.
[32mINFO: [0mStatus is active. A worker has started creating your model!
2022-04-29T19:20:13.475981Z  Starting synthetic model training
2022-04-29T19:20:13.478893Z  Loading training data
2022-04-29T19:20:13.655186Z  Training data loaded, detected format: 'csv'
2022-04-29T19:20:13.658688Z  Training data loaded
{
    "record_count": 5000,
    "field_count": 15,
    "upsample_count": 5000
}
2022-04-29T19:20:17.051657Z  Creating semantic validators and preparing training data
2022-04-29T19:20:27.981164Z  Beginning ML model training
2022-04-29T19:20:27.981803Z  Running training on 1 batches.
{
    "batch_sizes": "[15]"
}
2022-04-29T19:20:29.261741Z  Tokenizing input data
2022-04-29T19:20:29.774537Z  Shuffling input data
2022-04-29T19:20:31.208321Z  Initializing synthetic model
2022-04-29T19:20:40.383268Z  Training epoch compl

In [33]:
# Load the model's gzip-compressed "data_preview" artifact and display it

preview_link = model.get_artifact_link("data_preview")
synthetic_df = pd.read_csv(preview_link, compression="gzip")

synthetic_df


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,18,?,137367.0,11th,7,Never-married,?,Own-child,Black,Female,0,0,30,Jamaica,<=50K
1,37,?,110643.0,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,40,United-States,<=50K
2,49,?,286750.0,Preschool,1,Married-civ-spouse,?,Wife,Black,Female,0,0,35,United-States,>50K
3,46,Private,97969.0,1st-4th,2,Married-civ-spouse,Sales,Other-relative,White,Female,0,0,20,United-States,>50K
4,43,Private,199058.0,10th,6,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,31,Private,301743.0,HS-grad,9,Never-married,Transport-moving,Not-in-family,White,Male,0,0,40,United-States,<=50K
4996,32,Private,237498.0,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Female,15024,0,50,United-States,>50K
4997,41,Self-emp-inc,33522.0,11th,7,Never-married,Adm-clerical,Own-child,White,Male,0,0,40,United-States,<=50K
4998,27,Private,216999.0,HS-grad,9,Never-married,Craft-repair,Own-child,White,Female,0,0,40,United-States,<=50K


## View the synthetic data quality report


In [37]:
# Display the report that compares the statistical performance of the synthetic data against the training data

import IPython
# Imported under an alias so the built-in open() used by other cells is not
# shadowed for the rest of the kernel session.
from smart_open import open as smart_open_file

# Read the report artifact inside a context manager so the stream is closed
# as soon as its contents have been read.
with smart_open_file(model.get_artifact_link("report")) as report_stream:
    report_html = report_stream.read()

IPython.display.HTML(data=report_html)


KeyboardInterrupt: ignored

## Generate unlimited synthetic data

You can now use the trained synthetic model to generate as much synthetic data as you like.


In [34]:
# Ask the trained model for additional synthetic records

generation_params = {"num_records": 100, "max_invalid": 500}
record_handler = model.create_record_handler_obj(params=generation_params)

# Submit the generation job to the Gretel cloud and poll it so progress is
# printed in the cell output.
record_handler.submit_cloud()
poll(record_handler)


[32mINFO: [0mStarting poller


{
    "uid": "626c472769a56143ca8fe6e4",
    "guid": "model_run_28UFUHjB3eg8HPLiGDlhcXRpzct",
    "model_name": null,
    "runner_mode": "cloud",
    "user_id": "60372497bff62132eb7280c6",
    "user_guid": "user_26hm1MeAUnFtwqF2byw4YmilxOS",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "626b04389f8f635a7ab81004",
    "project_guid": "proj_28RXvIjvXrnsj7r5ejvsEtjfjc5",
    "status_history": {
        "created": "2022-04-29T20:14:31.687000Z"
    },
    "last_modified": "2022-04-29T20:14:31.811000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:f83dc37b903af806931b9e6ae083331fbfcfdc5b2bcc542aad557504561db1d0",
    "model_id": "626c3a5b333a9a8e7adc956c",
    "model_guid": "model_28U8qcCkUsQ5HRzS9UkSuz4Q7d2",
    "action": "generate",
    "config": {
        

[32mINFO: [0mStatus is created. A Record generation job has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin generating synthetic records.
[32mINFO: [0mStatus is active. A worker has started!
2022-04-29T20:15:36.047518Z  Loading model to worker
2022-04-29T20:15:36.910730Z  Checking for synthetic smart seeds
2022-04-29T20:15:36.911103Z  No smart seeds provided, will attempt generation without them
2022-04-29T20:15:36.911878Z  Loading model
2022-04-29T20:15:39.236999Z  Generating records
{
    "num_records": 100
}
2022-04-29T20:15:44.243095Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2022-04-29T20:15:49.249934Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2022-04-29T20:15:53.254878

In [36]:
# Download the generated records and write them to a local CSV file

synthetic_df = pd.read_csv(record_handler.get_artifact_link("data"), compression="gzip")

# index=False keeps the meaningless row index out of the file (it would come
# back as an "Unnamed: 0" column on reload), matching how training_data.csv
# was written.
synthetic_df.to_csv('synth_data.csv', index=False)

# Kept as the last expression of the cell so the DataFrame is actually displayed.
synthetic_df

