## This jupyter-notebook contains the evaluation of synthetic data generated using CTAB-GAN for the Adult dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/CTAB-GAN/

/content/drive/MyDrive/CTAB-GAN


In [3]:
!pip install dython==0.6.4.post1

Collecting dython==0.6.4.post1
  Downloading dython-0.6.4.post1-py3-none-any.whl.metadata (1.8 kB)
Downloading dython-0.6.4.post1-py3-none-any.whl (18 kB)
Installing collected packages: dython
Successfully installed dython-0.6.4.post1


In [4]:
# Importing the model
from model.ctabgan import CTABGAN
# Importing the evaluation metrics
#from model.eval.evaluation import get_utility_metrics,stat_sim,privacy_metrics
# Importing standard libraries

import torch
import torch.nn as nn

import numpy as np
import pandas as pd
import glob

import tqdm

is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

In [4]:
device

device(type='cuda')

In [5]:
# Specifying the replication number
num_exp = 1
# Specifying the name of the dataset used
dataset = "Fraud_data"
# Specifying the path of the dataset used
real_path = "Real_Datasets/Fraud_data.csv"
# Specifying the root directory for storing generated data
fake_file_root = "Fake_Datasets"

In [12]:
# Initializing the synthesizer object and specifying input parameters
# Notice: If you have continuous variable, you do not need to explicitly assign it. It will be treated like
# that by default
# Initializing the synthesizer object and specifying input parameters

synthesizer = CTABGAN(
    raw_csv_path=real_path,
    test_ratio=0.20,
    categorical_columns=[
        'Customer_Gender', 'Customer_personal_identifier', 'Customer_identification_number', 'Customer_credit_rating', 'Customer_loan_type',
        'Account_account_number', 'Account_account_type', 'Channel', 'Operating_System', 'Error_Code', 'Type_General_Automatic',
        'IP_Address', 'MAC_Address', 'Access_Medium', 'Location', 'Recipient_Account_Number', 'Fraud_Type'
    ],
    log_columns=[],  # If there are no log-transformed columns, leave it as an empty list
    mixed_columns={
        'Account_one_month_std_dev': [0.0],
        'Account_one_month_max_amount': [0.0],
        'Account_dawn_one_month_max_amount': [0.0],
        'Account_dawn_one_month_std_dev': [0.0]
    },
    integer_columns=[
        'Customer_Birthyear', 'Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
        'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4',
        'Customer_rooting_jailbreak_indicator', 'Customer_mobile_roaming_indicator', 'Customer_VPN_Indicator',
        'Customer_flag_terminal_malicious_behavior_1', 'Customer_flag_terminal_malicious_behavior_2',
        'Customer_flag_terminal_malicious_behavior_3', 'Customer_flag_terminal_malicious_behavior_4',
        'Customer_flag_terminal_malicious_behavior_5', 'Customer_flag_terminal_malicious_behavior_6',
        'Customer_inquery_atm_limit', 'Customer_increase_atm_limit', 'Account_initial_balance', 'Account_balance',
        'Account_indicator_release_limit_excess', 'Account_amount_daily_limit', 'Account_indicator_Openbanking',
        'Account_remaining_amount_daily_limit_exceeded', 'Account_release_suspention', 'Transaction_Amount', 'Transaction_Failure_Status',
        'Transaction_num_connection_failure', 'Another_Person_Account', 'Unused_terminal_status',
        'Flag_deposit_more_than_tenMillion', 'Unused_account_status', 'Recipient_account_suspend_status',
        'Number_of_transaction_with_the_account', 'Transaction_history_with_the_account',
        'First_time_iOS_by_vulnerable_user','Customer_registration_datetime','Account_creation_datetime','Transaction_Datetime',
        'Last_atm_transaction_datetime','Last_bank_branch_transaction_datetime','Transaction_resumed_date','Time_difference_seconds'
    ],

    problem_type={"Classification": 'Fraud_Type'},  # Adjust according to your classification target
    epochs=1
)

model_save_path = f"{fake_file_root}/{dataset}/ctabgan_model.pth"

# Fitting the synthesizer to the training dataset and generating synthetic data
for i in range(num_exp):
    synthesizer.fit()
    # 모델 전체 저장
    #torch.save(synthesizer.synthesizer, model_save_path)
    #print(f"Model saved to {model_save_path}")
    syn = synthesizer.generate_samples()
    syn_df = pd.DataFrame(syn, columns=synthesizer.data_prep.df.columns)

    syn_df.to_csv(f"{fake_file_root}/{dataset}/{dataset}_fake_{i}.csv", index=False)



100%|██████████| 1/1 [00:37<00:00, 37.49s/it]


Finished training in 53.688679933547974  seconds.
(1700, 63)
Fraud_Type: a, Count: 100
Fraud_Type: j, Count: 100
Fraud_Type: h, Count: 100
Fraud_Type: k, Count: 100
Fraud_Type: c, Count: 100
Fraud_Type: g, Count: 100
Fraud_Type: i, Count: 100
Fraud_Type: b, Count: 100
Fraud_Type: f, Count: 100
Fraud_Type: d, Count: 100
Fraud_Type: e, Count: 100
Fraud_Type: l, Count: 100
Fraud_Type: m, Count: 500


IndexError: index 1190 is out of bounds for axis 0 with size 1176

In [None]:
import torch
from model.synthesizer.ctabgan_synthesizer import CTABGANSynthesizer  # Ensure this import matches your module structure

# Define the path to the saved model
model_load_path = f"{fake_file_root}/{dataset}/ctabgan_model.pth"

# Create a new CTABGAN instance with the same parameters as before
synthesizer = CTABGAN(
    raw_csv_path="Real_Datasets/Fraud_data.csv",
    test_ratio=0.20,
    categorical_columns=[
        'Customer_Gender', 'Customer_personal_identifier', 'Customer_identification_number', 'Customer_credit_rating', 'Customer_loan_type',
        'Account_account_number', 'Account_account_type', 'Channel', 'Operating_System', 'Error_Code', 'Type_General_Automatic',
        'IP_Address', 'MAC_Address', 'Access_Medium', 'Location', 'Recipient_Account_Number', 'Fraud_Type'
    ],
    log_columns=[],  # If there are no log-transformed columns, leave it as an empty list
    mixed_columns={
        'Account_one_month_std_dev': [0.0],
        'Account_one_month_max_amount': [0.0],
        'Account_dawn_one_month_max_amount': [0.0],
        'Account_dawn_one_month_std_dev': [0.0]
    },
    integer_columns=[
        'Customer_Birthyear', 'Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
        'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4',
        'Customer_rooting_jailbreak_indicator', 'Customer_mobile_roaming_indicator', 'Customer_VPN_Indicator',
        'Customer_flag_terminal_malicious_behavior_1', 'Customer_flag_terminal_malicious_behavior_2',
        'Customer_flag_terminal_malicious_behavior_3', 'Customer_flag_terminal_malicious_behavior_4',
        'Customer_flag_terminal_malicious_behavior_5', 'Customer_flag_terminal_malicious_behavior_6',
        'Customer_inquery_atm_limit', 'Customer_increase_atm_limit', 'Account_initial_balance', 'Account_balance',
        'Account_indicator_release_limit_excess', 'Account_amount_daily_limit', 'Account_indicator_Openbanking',
        'Account_remaining_amount_daily_limit_exceeded', 'Account_release_suspention', 'Transaction_Amount', 'Transaction_Failure_Status',
        'Transaction_num_connection_failure', 'Another_Person_Account', 'Unused_terminal_status',
        'Flag_deposit_more_than_tenMillion', 'Unused_account_status', 'Recipient_account_suspend_status',
        'Number_of_transaction_with_the_account', 'Transaction_history_with_the_account',
        'First_time_iOS_by_vulnerable_user', 'Customer_registration_datetime', 'Account_creation_datetime', 'Transaction_Datetime',
        'Last_atm_transaction_datetime', 'Last_bank_branch_transaction_datetime', 'Transaction_resumed_date', 'Time_difference_seconds'
    ],
    problem_type={"Classification": 'Fraud_Type'},  # Adjust according to your classification target
    epochs=1
)

# Load the saved state dictionary into the synthesizer
synthesizer.synthesizer.load_state_dict(torch.load(model_load_path))

# Now use the CTABGAN instance to generate samples
syn = synthesizer.generate_samples()
syn_df = pd.DataFrame(syn, columns=synthesizer.data_prep.df.columns)
syn_df.to_csv(f"{fake_file_root}/{dataset}/{dataset}_fake_{i}.csv", index=False)
print(f"Generated synthetic data saved to {fake_file_root}/{dataset}/{dataset}_fake_{i}.csv")


In [None]:
# Collecting the paths to all corresponding generated datasets for evaluation
fake_paths = glob.glob(fake_file_root+"/"+dataset+"/"+"*")

#### ML Utility Evaluation

In [None]:
# Specifying the list of classifiers to conduct ML utility evaluation
classifiers_list = ["lr","dt","rf","mlp","svm"]

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
result_df

Unnamed: 0,Acc,AUC,F1_Score
lr,1.064592,0.009517,0.061383
dt,6.285188,0.063739,0.071529
rf,2.589825,0.027153,0.040049
mlp,2.763845,0.013811,0.126618
svm,2.896919,0.049234,0.124052


#### Statistical Similarity Evaluation

In [None]:
# Specifying the categorical columns of the dataset used
adult_categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,adult_categorical)
    stat_res_avg.append(stat_res)

stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]
stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
stat_results

Unnamed: 0,Average WD (Continuous Columns,Average JSD (Categorical Columns),Correlation Distance
0,0.009362,0.1204,0.761534


#### Nearest Neighbour Privacy Analysis

In [None]:
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)

privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
privacy_results

Unnamed: 0,DCR between Real and Fake (5th perc),DCR within Real(5th perc),DCR within Fake (5th perc),NNDR between Real and Fake (5th perc),NNDR within Real (5th perc),NNDR within Fake (5th perc)
0,0.485676,0.216545,0.22867,0.632722,0.442052,0.431408


In [None]:
# Storing generated data for future use if needed
syn.to_csv(fake_file_root+"/"+dataset+"/"+ dataset+"_fake_{exp}.csv".format(exp=i), index= False)