# Step 1 - Installing the required dependencies 
Before we can begin, we need to make sure all the required dependencies are installed in our notebook kernel. You will also want to ensure that you have configured the correct runtime for the notebook (e.g. GPU or CPU).

In [None]:
# Install the three packages this notebook relies on, each pinned to an exact
# version so that a newer release cannot silently break the cells below.
# The pins will age: bump them deliberately when you need a newer release.
# Adapted from
# https://github.com/gretelai/gretel-synthetics/blob/master/examples/record_factory.ipynb
!pip install gretel-synthetics==0.19.0
!pip install pandas-profiling==3.6.2
!pip install matplotlib==3.6.3

# Restart the runtime after installing so the updated matplotlib version is picked up

# Step 2 - The Setup 
Now that we have a place to put all our data and persist checkpoints, let's start by reading in the data and converting our date column in preparation for training.

In [None]:
import pandas as pd
from gretel_synthetics.batch import DataFrameBatch

# Load the training set directly from its public CSV URL.
train_csv_url = (
    "https://gretel-public-website.s3-us-west-2.amazonaws.com"
    "/tests/synthetics/data/USAdultIncome14K.csv"
)
train_df = pd.read_csv(train_csv_url)

# Step 3 - Training the model
We are now ready to configure the model and begin the training using DGAN and batch training of the dataframe. 

In [None]:
from pathlib import Path

# Checkpoint location: a "test-model-2" folder under the current working
# directory, stored as a plain string.
checkpoint_dir = str(Path.cwd().joinpath("test-model-2"))

# Settings handed to DataFrameBatch as its `config` argument.
config_template = dict(
    epochs=10000,
    max_line_len=2048,
    vocab_size=200000,
    field_delimiter=",",
    overwrite=True,
    checkpoint_dir=checkpoint_dir,
)

# NOTE(review): batch_size=5 presumably means five columns per batch —
# confirm against the DataFrameBatch documentation.
batcher = DataFrameBatch(config=config_template, df=train_df, batch_size=5)


In [None]:
# Two-step training on the batcher; keep the calls in this order.
# NOTE(review): presumably the first call prepares the per-batch training
# data that the second call then trains on — confirm in the DataFrameBatch docs.
batcher.create_training_data()
batcher.train_all_batches()


In [None]:
# Request as many generated lines as the training set has rows, so the
# synthetic output can be compared like-for-like with the original later on.
row_count = len(train_df.index)
status = batcher.generate_all_batch_lines(num_lines=row_count)


In [None]:
# Collect the generated batch output into one DataFrame; it is profiled
# against train_df in the evaluation step.
synthetic_df = batcher.batches_to_df()


# Step 4 - Model Evaluation - How did we do?
Now that we have both our initial training set and our generated set, let's do a side-by-side comparison with pandas_profiling.

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport

# Build one profiling report per dataset. The titles label each side of the
# comparison.
synthetic_report = ProfileReport(df=synthetic_df, title="Synthetic Data")
original_report = ProfileReport(df=train_df, title="Original Data")

# Compare original (left) against synthetic (right), save the result as an
# HTML file, and also render it inline in the notebook.
comparison_report = original_report.compare(synthetic_report)
comparison_report.to_file("original_vs_transformed.html")
comparison_report.to_notebook_iframe()