# Step 1 - Installing the required dependencies 
Before we can begin, we need to make sure all the required dependencies are installed in our notebook kernel. You will also want to ensure that you have configured the correct runtime in the notebook (e.g. GPU or CPU).

In [None]:
# Install the packages this notebook depends on.
# The versions are frozen to avoid future dependency issues, which means you
# may have to bump these pins as time goes by and new releases become
# available.
# From https://github.com/gretelai/gretel-synthetics/blob/master/examples/timeseries_dgan.ipynb
!pip install gretel-synthetics==0.19.0
!pip install gdown==4.6.0
!pip install pandas-profiling==3.6.2
!pip install matplotlib==3.6.3 

# Be sure to restart the kernel after these installs.

# Step 2 - Persisting models and accessing training data
We need a way to persist our models, along with an easy way to pull the training set without having to upload/download it for every new runtime. This will save a lot of headaches and give us the ability to run inference with the model later.

In [None]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive/')

# Create the two working directories if they do not already exist
import os
from os import path

# new_paths[0] holds the model checkpoints, new_paths[1] the training data.
new_paths = ['/content/drive/MyDrive/synthetic_data_checkpoints/','/content/drive/MyDrive/synthetic_model_training_data/']
for p in new_paths:
    # exist_ok=True makes this a no-op when the directory is already there,
    # and makedirs also creates any missing parent directories. If the path
    # exists but is a regular file this raises FileExistsError instead of
    # silently continuing.
    os.makedirs(p, exist_ok=True)

# IMPORTANT: At this point you will need to upload a text file containing your training data
# to the /content/drive/MyDrive/synthetic_model_training_data directory with the name training_set_time_series.csv.
# You only have to do this once unless you want to use new training data.

In [None]:
# Optional: download the example training set into the training-data
# directory. Skip this step if you are using your own data.
import gdown

url = "https://drive.google.com/file/d/1-6T9a8kCfF0LilygJWUtTvbSBPJRf4aY/view?usp=share_link"
gdown.download(
    url,
    new_paths[1] + 'training_set_time_series.csv',
    quiet=False,
    fuzzy=True,
)

In [None]:
import pandas as pd

# Read the training set from the training-data directory and, in the same
# step, parse the 'date' column (YYYY-MM-DD strings) into datetimes so it
# can be used for analysis.
train_df = pd.read_csv(new_paths[1] + 'training_set_time_series.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d')
)

In [None]:
# File (inside the checkpoint directory) the trained model is saved to
checkpoint = f"{new_paths[0]}synthetic_data_model.bin"

# Numerical-only columns used for training.
# No need to change these if you use the example data set provided.
column_list = [
    'mis_and_disinformation',
    'mis_and_disinformation_male',
    'mis_and_disinformation_female',
    'myths',
    'myths_female',
    'myths_male',
    'new_vaccinations_smoothed',
]


# Step 3 - The Setup 
Now that we have a place to put all our data and persist checkpoints, let's start by reading in the data and converting our date column in preparation for training.

In [None]:
import pandas as pd
import numpy as np

import torch

from gretel_synthetics.timeseries_dgan.dgan import DGAN
from gretel_synthetics.timeseries_dgan.config import DGANConfig, OutputType


In [None]:
# We only need a subset of the columns: keep the ones listed in column_list
# and drop every row that has a NaN in any of them. The result is the
# feature set used for training.
features = train_df[column_list].dropna()

# Step 4 - Feature Engineering
We now need to prepare our data set by extracting features and reshaping for the dGAN model training. 

In [None]:
# Depending on your use case you may/may not want to include the date column
# If you do include this it will also look for seasonal patterns, but remember
# to convert the date to a number as dGAN only accepts numerical fields
features = features.to_numpy()

# Observations per day in the set. Here there is one observation every day;
# a sensor data set, for example, could have many per day.
obs_per_day = 1

# Number of complete days available. Using floor division means trailing
# observations that do not fill a whole day are dropped by the slice below,
# so the reshape cannot fail when the row count is not a multiple of
# obs_per_day.
n = features.shape[0] // obs_per_day
features = features[:(n*obs_per_day),:].reshape(-1, obs_per_day, features.shape[1])

# Shape is now (# examples, # time points, # features)
print(features.shape)

# Step 5 - Training the model
We are now ready to configure the model and begin the training using DGAN and batch training of the dataframe. 

In [None]:
# Training on a GPU is recommended. This cell displays whether PyTorch
# reports CUDA as available in the current runtime.
torch.cuda.is_available()

In [None]:
# Configure the DGAN model.
dgan_config = DGANConfig(
    # one training example spans features.shape[1] time points
    max_sequence_len=features.shape[1],
    # NOTE(review): the DGANConfig docs describe sample_len as needing to be a
    # divisor of max_sequence_len - confirm before changing obs_per_day
    sample_len=1,
    # never ask for a batch larger than the number of training examples
    batch_size=min(1000, features.shape[0]),
    apply_feature_scaling=True,
    apply_example_scaling=False,
    use_attribute_discriminator=False,
    generator_learning_rate=1e-4,
    discriminator_learning_rate=1e-4,
    # Only 100 epochs are used here. You will likely have to experiment with
    # far more (say 10,000), but that will take some time to train (~2 hours).
    epochs=100,
)
model = DGAN(dgan_config)

# Train on the 3D feature array, declaring every feature as continuous.
continuous_feature_types = [OutputType.CONTINUOUS] * features.shape[2]
model.train_numpy(features, feature_types=continuous_feature_types)

# Generate as many synthetic examples as there are training examples.
# Only the second element of the returned pair (the features) is kept.
_, synthetic_features = model.generate_numpy(features.shape[0])

# Step 6 - Model Evaluation - How did we do?
Now that we have both our initial training set and our generated set, let's do a side-by-side comparison with pandas_profiling.

In [None]:
# Flatten the 3D synthetic array into 2D rows (one row per time point) and
# label the columns with our original column names.
feature_count = synthetic_features.shape[2]
synthetic_df = pd.DataFrame(
    synthetic_features.reshape(-1, feature_count),
    columns=column_list,
)


In [None]:
import pandas as pd
from pandas_profiling import ProfileReport

# Same columns as the synthetic frame, taken from the original training data
original_features_df = train_df.loc[:, column_list]

# Profile each data set on its own ...
original_report = ProfileReport(original_features_df, title="Train Data")
synthetic_report = ProfileReport(synthetic_df, title="Synthetic Data")

# ... then combine the two profiles into a single comparison report
comparison_report = original_report.compare(synthetic_report)


In [None]:
# Save a copy of the comparison report (as an HTML file) and the trained
# model (to the checkpoint path) so we can take a look at the comparison and
# reuse the model later.

comparison_report.to_file("original_vs_transformed.html") 
model.save(checkpoint)

In [None]:
# Now let's have a look at the results: render the comparison report inline
# in the notebook.
comparison_report.to_notebook_iframe()