## Generating Synthetic Data (CTGAN)

## Install CTGAN

In [None]:
# os / sys are used by the install cell below (symlink creation and sys.path setup).
import os, sys
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read from and write to paths under it.
drive.mount('/content/drive')

In [None]:
# Install CTGAN, generative model for tabular data.
my_path = '/content/notebooks'
os.symlink('/content/drive/MyDrive/Colab Notebooks/my_env', my_path)
sys.path.insert(0, my_path)
!pip install --target=$pack_path ctgan

## Load Modules

In [None]:
# Check whether PyTorch can see a CUDA GPU.
# This only reports availability (True/False); it does not select or set a device.
import torch
torch.cuda.is_available()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import time
import pickle
from collections import defaultdict
from itertools import islice, combinations
from datetime import datetime as dt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt

from ctgan import CTGAN


## Preprocess Source Data

In [None]:
%cd drive/MyDrive/Customs-Declaration-Datasets-en/

In [None]:
# Load the training data; utf-8-sig decoding tolerates a leading byte-order mark.
df_raw = pd.read_csv(
    './data/df_syn_eng.csv',
    encoding='utf-8-sig',
)

In [None]:
# Columns kept for synthesis, in the order they should appear in df_org.
selected_columns = [
    "Declaration ID",
    "Date",
    "Office ID",
    "Process Type",
    "Import Type",
    "Import Use",
    "Payment Type",
    "Mode of Transport",
    "Declarant ID",
    "Importer ID",
    "Seller ID",
    "Courier ID",
    "HS10 Code",
    "Country of Departure",
    "Country of Origin",
    "Tax Rate",
    "Tax Type",
    "Country of Origin Indicator",
    "Net Mass",
    "Item Price",
    "Fraud",
    "Critical Fraud",
]
df_org = df_raw[selected_columns]

In [None]:
# Since CTGAN cannot handle a large input size, train on a random subset of the
# source data instead of the full table.
N_SAMPLES = 3000  # number of rows to draw for training
SEED = 42         # fixed seed so the same subset is drawn on every run

# Cap the request at the table size: sampling without replacement raises
# ValueError when more rows are requested than exist.
df_sample = df_org.sample(
    n=min(N_SAMPLES, len(df_org)),
    replace=False,
    random_state=SEED,
)

# Keep a copy of the exact training subset next to the source data.
df_sample.to_csv('./data/df_sample.csv', index=False, encoding='utf-8-sig')

In [None]:
# Aggregate related columns: fuse the fields listed in `cols` into a single
# '^'-separated string per row, stored in a new 'Aggregated' column.
cols = [
    'HS10 Code',
    'Country of Departure',
    'Country of Origin',
    'Tax Rate',
    'Tax Type',
    'Net Mass',
    'Fraud',
    'Critical Fraud',
]


def _join_row(row):
    """Return the values of `row`, each cast to str, joined with '^'."""
    return '^'.join(row.values.astype(str))


df_sample['Aggregated'] = df_sample[cols].apply(_join_row, axis=1)

# Remove the fused source columns, and 'Item Price' along with them.
df_sample = df_sample.drop(columns=cols + ['Item Price'])

In [None]:
# 'Date' is listed among the categorical columns further down, so store it as text.
date_as_text = df_sample['Date'].astype(str)
df_sample['Date'] = date_as_text

In [None]:
# Display the preprocessed training sample that will be passed to CTGAN.
df_sample

## Run CTGAN

In [None]:
# Columns CTGAN should model as discrete values; any training column not listed
# here is passed to CTGAN without being declared discrete.
# NOTE(review): 'Office ID', 'Import Type', 'Import Use', 'Payment Type' and
# 'Mode of Transport' are not listed — confirm they are meant to be continuous.
categorical_columns = [
    'Declaration ID',
    'Date',
    'Process Type',
    'Declarant ID',
    'Importer ID',
    'Seller ID',
    'Courier ID',
    'Country of Origin Indicator',
    'Aggregated',
]

In [None]:
# Train CTGAN on the preprocessed sample.
# It will take around 5~10 min to train for 100 epochs.
# The epoch count goes to the constructor: passing `epochs` to fit() is
# deprecated in ctgan and slated for removal.
ctgan = CTGAN(epochs=100, verbose=True)
ctgan.fit(df_sample, discrete_columns=categorical_columns)

In [None]:
# Generate as many synthetic rows as there are rows in the training sample.
count_row = len(df_sample)
df_syn = ctgan.sample(count_row)

In [None]:
# Display the raw CTGAN output (still containing the 'Aggregated' column).
df_syn

In [None]:
# Split the aggregated column back into its original fields.
# The order here must match the order in which the fields were joined with '^'
# when 'Aggregated' was built.
aggregated_fields = [
    'HS10 Code',
    'Country of Departure',
    'Country of Origin',
    'Tax Rate',
    'Tax Type',
    'Net Mass',
    'Fraud',
    'Critical Fraud',
]

# Split once and reuse the result instead of re-splitting for every field.
aggregated_parts = df_syn["Aggregated"].str.split('^')
for position, field in enumerate(aggregated_fields):
    # Every restored field holds the text piece at its position in the split.
    df_syn[field] = aggregated_parts.str[position]

# The fused column is no longer needed once its parts are restored.
df_syn = df_syn.drop(['Aggregated'], axis=1)

In [None]:
# Left-pad the HS10 code with zeros to 10 characters, then cut off the last
# four characters and rename the column to 'HS6 Code'.
hs10_padded = df_syn["HS10 Code"].astype(str).str.zfill(10)
df_syn["HS10 Code"] = hs10_padded.map(lambda code: code[:-4])
df_syn = df_syn.rename(columns={'HS10 Code': 'HS6 Code'})

In [None]:
# This is the generated data: the synthetic table after the aggregated column
# has been split back out and the HS code column renamed to 'HS6 Code'.
df_syn

## Save Generated Data

In [None]:
# Write the synthetic table to disk without the index column.
# NOTE(review): the other CSV reads/writes in this notebook pass
# encoding='utf-8-sig'; this one uses the default — confirm that is intended.
output_path = './data/df_syn_example.csv'
df_syn.to_csv(output_path, index=False)