## Generating Synthetic Data (CTGAN)

## Install CTGAN

In [1]:
import os
import sys

from google.colab import drive

# Mount Google Drive at /content/drive so later cells can read and write files there.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install CTGAN, generative model for tabular data.
my_path = '/content/notebooks'
os.symlink('/content/drive/MyDrive/Colab Notebooks/my_env', my_path)
sys.path.insert(0, my_path)
!pip install --target=$pack_path ctgan

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ctgan
  Downloading ctgan-0.6.0-py2.py3-none-any.whl (24 kB)
Collecting rdt<2.0,>=1.2.0
  Downloading rdt-1.2.1-py2.py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 413 kB/s 
Collecting pyyaml<6,>=5.4.1
  Downloading PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl (662 kB)
[K     |████████████████████████████████| 662 kB 46.0 MB/s 
[?25hCollecting Faker>=10
  Downloading Faker-15.3.4-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 63.7 MB/s 
Collecting psutil<6,>=5.7
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[K     |████████████████████████████████| 280 kB 81.2 MB/s 
Installing collected packages: pyyaml, psutil, Faker, rdt, ctgan
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 6.0
    Uninstalling PyYAML-6.0:


## Load Modules

In [3]:
# Check whether a CUDA GPU is visible to PyTorch.
# Note: this only reports availability; it does not select or set a device.
import torch
torch.cuda.is_available()

True

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import time
import pickle
from collections import defaultdict
from itertools import islice, combinations
from datetime import datetime as dt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt

from ctgan import CTGAN


## Preprocess Source Data

In [5]:
%cd drive/MyDrive/Customs-Declaration-Datasets-en/

/content/drive/MyDrive/Customs-Declaration-Datasets-en


In [6]:
# Read the source declarations table that CTGAN will be trained on.
# 'utf-8-sig' transparently drops a leading byte-order mark if the file has one.
df_raw = pd.read_csv(
    './data/df_syn_eng.csv',
    encoding='utf-8-sig',
)

In [7]:
# Columns of the source table that are kept for the rest of the notebook.
selected_columns = [
    "Declaration ID", "Date", "Office ID", "Process Type", "Import Type",
    "Import Use", "Payment Type", "Mode of Transport",
    "Declarant ID", "Importer ID", "Seller ID", "Courier ID",
    "HS10 Code", "Country of Departure", "Country of Origin",
    "Tax Rate", "Tax Type", "Country of Origin Indicator",
    "Net Mass", "Item Price", "Fraud", "Critical Fraud",
]
df_org = df_raw[selected_columns]

In [8]:
# CTGAN cannot handle a large input size, so train on a random subset of the
# source data instead of the whole table.
SAMPLE_SIZE = 3000   # number of source rows used for training
SAMPLE_SEED = 42     # fixed seed so Restart & Run All draws the same rows

df_sample = df_org.sample(
    n=min(SAMPLE_SIZE, len(df_org)),  # sampling without replacement cannot exceed the table size
    replace=False,
    random_state=SAMPLE_SEED,
)
# Persist the exact training subset next to the source data.
df_sample.to_csv('./data/df_sample.csv', index=False, encoding='utf-8-sig')

In [9]:
# Aggregate related columns: the fields listed in `cols` are joined, row by row,
# into one '^'-delimited string stored in a new "Aggregated" column. The
# individual fields and "Item Price" are then removed from the frame.
cols = ['HS10 Code', 'Country of Departure', 'Country of Origin', 'Tax Rate', 'Tax Type', 'Net Mass','Fraud', 'Critical Fraud']
df_sample = (
    df_sample
    .assign(
        Aggregated=df_sample[cols].apply(
            lambda record: '^'.join(record.values.astype(str)),
            axis=1,
        )
    )
    .drop(columns=cols + ['Item Price'])
)

In [10]:
# Store "Date" as plain strings (it is listed among the categorical columns for CTGAN).
df_sample = df_sample.astype({'Date': 'str'})

In [11]:
# Display the preprocessed training frame (individual fields replaced by "Aggregated").
df_sample

Unnamed: 0,Declaration ID,Date,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,Courier ID,Country of Origin Indicator,Aggregated
30477,29542911,2020-10-20,20,B,11,21,11,10,ZGNH4UK,XON4795,OTRVIBL,,Y,303899070^CL^CL^10.0^A^10.0^0^0
10827,26638975,2020-04-06,30,B,11,21,11,10,WBQKENF,85LF707,O9QXSRZ,,G,6217100000^CN^CN^13.0^A^11.0^1^1
16699,21320132,2020-06-03,30,B,11,21,11,10,A7POIBB,Y9UPOXS,QZT1H9Q,,E,6204430000^CN^CN^13.0^A^2.0^0^0
53959,61927098,2021-06-30,40,B,11,21,11,10,B6C4G68,KQY40SW,159UTXO,,Y,710807000^CN^CN^27.0^A^100000.0^0^0
24361,70428758,2020-08-14,40,B,11,21,11,10,GO1EBDS,S0MPEPK,SVODT0Z,,G,3002129040^CN^CN^0.0^A^0.5^0^0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4954,78069992,2020-02-07,33,B,11,21,11,10,F463TSU,2R3XA57,0TTDXW0,,E,6910103000^CN^CN^8.0^A^24000.0^0^0
29522,10744321,2020-10-12,20,B,11,21,11,10,R5P6P4N,3F88WYA,4VMRI9B,,E,6802930000^CN^CN^8.0^A^18800.0^0^0
35066,89055174,2020-12-04,13,B,11,21,43,40,84WFPKU,47PJGLB,21ZA9NV,W6UCD9,S,3926909000^CN^CN^3.4^FCN1^665.0^0^0
32143,11526604,2020-11-06,20,B,11,21,11,10,P21JCS7,7Y5DNPU,GMO44NQ,,S,3926909000^CN^CN^3.4^FCN1^1083.0^0^0


# Run CTGAN

In [12]:
# Columns CTGAN must treat as discrete. Besides the string-valued columns this
# includes the integer-coded ones (Office ID, Import Type, Import Use,
# Payment Type, Mode of Transport): they are category codes, and if they are
# left out CTGAN models them as continuous numbers and can emit code values
# that never occur in the source data.
# NOTE(review): confirm against the dataset's code book that these five
# columns are categorical codes rather than true quantities.
categorical_columns = [
    'Declaration ID', 'Date', 'Office ID', 'Process Type',
    'Import Type', 'Import Use', 'Payment Type', 'Mode of Transport',
    'Declarant ID', 'Importer ID', 'Seller ID',
    'Courier ID', 'Country of Origin Indicator', 'Aggregated',
]

In [13]:
# Train CTGAN on the sampled data.
# It will take around 5~10 min for training 100 epochs.
TRAIN_SEED = 42
# Seed the global NumPy and PyTorch generators for more repeatable training.
# NOTE(review): best effort only - GPU kernels may still be non-deterministic.
np.random.seed(TRAIN_SEED)
torch.manual_seed(TRAIN_SEED)

# `epochs` belongs to the constructor; passing it to fit() is deprecated in ctgan.
ctgan = CTGAN(epochs=100, verbose=True)
ctgan.fit(df_sample, discrete_columns=categorical_columns)

Epoch 1, Loss G:  5.6168,Loss D: -0.1284
Epoch 2, Loss G:  5.4019,Loss D: -0.2202
Epoch 3, Loss G:  5.4992,Loss D: -0.2568
Epoch 4, Loss G:  5.5647,Loss D: -0.2643
Epoch 5, Loss G:  5.2840,Loss D: -0.3149
Epoch 6, Loss G:  5.4727,Loss D: -0.3653
Epoch 7, Loss G:  5.4506,Loss D: -0.4327
Epoch 8, Loss G:  5.4517,Loss D: -0.4999
Epoch 9, Loss G:  5.6839,Loss D: -0.5936
Epoch 10, Loss G:  5.3273,Loss D: -0.3963
Epoch 11, Loss G:  5.0795,Loss D: -0.3406
Epoch 12, Loss G:  5.0538,Loss D: -0.1795
Epoch 13, Loss G:  5.3748,Loss D: -0.1864
Epoch 14, Loss G:  5.1902,Loss D: -0.2036
Epoch 15, Loss G:  4.9271,Loss D: -0.0487
Epoch 16, Loss G:  4.7632,Loss D:  0.1840
Epoch 17, Loss G:  4.8691,Loss D:  0.1274
Epoch 18, Loss G:  5.1061,Loss D:  0.0906
Epoch 19, Loss G:  4.9557,Loss D:  0.0532
Epoch 20, Loss G:  4.8867,Loss D:  0.1443
Epoch 21, Loss G:  4.6090,Loss D:  0.0864
Epoch 22, Loss G:  4.9212,Loss D:  0.0876
Epoch 23, Loss G:  4.8644,Loss D:  0.0629
Epoch 24, Loss G:  5.1441,Loss D:  0.0432
E

In [14]:
# Generate exactly as many synthetic rows as there are rows in the training sample.
count_row = len(df_sample)
df_syn = ctgan.sample(count_row)

In [15]:
# Display the raw CTGAN output; the "Aggregated" column is still a single '^'-joined string here.
df_syn

Unnamed: 0,Declaration ID,Date,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,Courier ID,Country of Origin Indicator,Aggregated
0,37508957,2020-02-13,13,B,11,21,11,41,EAO05UQ,0MX22NJ,5G06UT6,,E,6307909000^CN^CN^10.0^A^20.0^0^0
1,40789470,2020-03-05,39,B,11,25,15,41,P0PG8TK,0H3KUE6,WRC110G,,E,8525802090^US^CN^0.0^A^11.8^1^1
2,77409982,2020-01-26,30,B,10,21,45,42,4QADEO0,3I7CR54,PMN0COE,,G,710807000^CN^CN^27.0^A^100000.0^0^0
3,72078802,2021-03-24,31,B,11,21,19,11,VCV1EJF,1VHJG03,XBU3AO0,MWIDNS,B,8542311000^US^MX^0.0^CIT^0.1^0^0
4,82802220,2020-05-04,29,B,10,21,12,41,VBOQV8H,XIDTSU2,2100L9Z,,G,3908103000^JP^JP^6.5^C^20.0^0^0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,49957284,2020-10-14,20,B,11,21,45,11,OZB7KED,W0RGTOU,SVODT0Z,,E,6304990000^CN^CN^5.2^FCN1^600.0^0^0
2996,31063869,2021-03-07,20,B,11,21,44,10,FAG093P,SU59N3V,IWSDF6W,,B,4911100000^JP^JP^0.0^C^0.5^0^0
2997,82189653,2020-11-24,30,B,10,21,44,55,QZE38LM,5EJ36OK,6ZQTY16,,E,6214900000^CN^CN^3.2^FCN1^53.0^1^1
2998,21998474,2020-06-10,30,B,11,21,44,11,CIN0OY4,D8DTWCP,J52UCA9,MWIDNS,B,8538909000^US^DE^8.0^A^1.0^1^1


In [16]:
# Split the '^'-joined "Aggregated" column back into its original fields.
agg_cols = ['HS10 Code', 'Country of Departure', 'Country of Origin', 'Tax Rate',
            'Tax Type', 'Net Mass', 'Fraud', 'Critical Fraud']

# Split once into one column per field instead of re-splitting the same
# strings for every field.
agg_parts = df_syn['Aggregated'].str.split('^', expand=True)
# Raises if the split did not produce exactly len(agg_cols) fields, rather
# than silently mislabelling columns.
agg_parts.columns = agg_cols

# str.split only yields strings; restore numeric dtypes so that e.g. the fraud
# labels are numbers again rather than '0' / '1' text.
# NOTE(review): assumes these fields are numeric in the source table, as the
# previews suggest - confirm for "HS10 Code" in particular.
for numeric_col in ['HS10 Code', 'Tax Rate', 'Net Mass', 'Fraud', 'Critical Fraud']:
    agg_parts[numeric_col] = pd.to_numeric(agg_parts[numeric_col])

# Same resulting column order as before: remaining columns first, then the
# restored fields in `agg_cols` order.
df_syn = pd.concat([df_syn.drop(columns=['Aggregated']), agg_parts], axis=1)

In [17]:
# This is the generated data: display the synthetic frame `df_syn`.
df_syn

Unnamed: 0,Declaration ID,Date,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,...,Courier ID,Country of Origin Indicator,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Net Mass,Fraud,Critical Fraud
0,37508957,2020-02-13,13,B,11,21,11,41,EAO05UQ,0MX22NJ,...,,E,6307909000,CN,CN,10.0,A,20.0,0,0
1,40789470,2020-03-05,39,B,11,25,15,41,P0PG8TK,0H3KUE6,...,,E,8525802090,US,CN,0.0,A,11.8,1,1
2,77409982,2020-01-26,30,B,10,21,45,42,4QADEO0,3I7CR54,...,,G,710807000,CN,CN,27.0,A,100000.0,0,0
3,72078802,2021-03-24,31,B,11,21,19,11,VCV1EJF,1VHJG03,...,MWIDNS,B,8542311000,US,MX,0.0,CIT,0.1,0,0
4,82802220,2020-05-04,29,B,10,21,12,41,VBOQV8H,XIDTSU2,...,,G,3908103000,JP,JP,6.5,C,20.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,49957284,2020-10-14,20,B,11,21,45,11,OZB7KED,W0RGTOU,...,,E,6304990000,CN,CN,5.2,FCN1,600.0,0,0
2996,31063869,2021-03-07,20,B,11,21,44,10,FAG093P,SU59N3V,...,,B,4911100000,JP,JP,0.0,C,0.5,0,0
2997,82189653,2020-11-24,30,B,10,21,44,55,QZE38LM,5EJ36OK,...,,E,6214900000,CN,CN,3.2,FCN1,53.0,1,1
2998,21998474,2020-06-10,30,B,11,21,44,11,CIN0OY4,D8DTWCP,...,MWIDNS,B,8538909000,US,DE,8.0,A,1.0,1,1


# Save Generated Data

In [18]:
# Write the synthetic table to CSV without the row index.
output_csv = './data/df_syn_example.csv'
df_syn.to_csv(output_csv, index=False)