In [1]:
%env XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/lib/cuda/"

env: XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/lib/cuda/"


In [2]:
from time import sleep
import pandas as pd
import tracemalloc
# import numpy as np
from ganblr.models import GANBLR
from data_utils import (
    transfrom_dataframe_discrete,
    preprocess_superstore,
    preprocess_credit_risk,
    preprocess_mushroom
)
from logger_utils import CSVLogger
from sklearn.model_selection import train_test_split
from metric_utils import get_trtr_metrics, get_sdv_metrics
from datetime import datetime
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from pathlib import Path

import os
import gc


2023-11-01 17:45:39.602340: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-01 17:45:39.602374: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-01 17:45:39.602406: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-01 17:45:39.609568: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hyper-parameter grids handed to process_dataset: training epochs and GANBLR k values.
EPOCHS = [150]
K = [1]

# CSV log for this run; the file name carries the time at which this cell was executed.
overall_logfile = Path("./new_logs") / f"log_{datetime.now():%Y%m%d-%H%M%S}.csv"

In [4]:
# Set XLA_FLAGS from Python as well: this assignment stores the bare flag string
# (no surrounding quotes) and replaces any value the variable held before.
os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/usr/lib/cuda/' 

# Identifier for this run; passed to process_dataset, which embeds it in the
# names of the synthetic-data files it writes.
timestamp_id = datetime.now().strftime("%Y%m%d-%H%M%S")

# open(..., "w") does not create missing directories, so make sure the log
# folder exists before the header row is written.
overall_logfile.parent.mkdir(parents=True, exist_ok=True)

with open(overall_logfile, "w") as f:
    # write with ; as delimiter
    f.write("Event;Model;Epochs;K;Dataset;Test;Metric;Value\n")

In [5]:
def process_dataset(dataset_name, X, y, df_enc, encoders, X_train, X_test, y_train, y_test, epochs, K, timestamp_id, overall_logfile, n_runs=3):
    """Train GANBLR and CTGAN on one dataset, save their samples and log metrics.

    For every value in ``epochs`` this trains ``n_runs`` GANBLR models per value
    in ``K`` and afterwards ``n_runs`` CTGAN models. Every trained model is
    sampled for ``X.shape[0]`` rows. The sample is decoded through ``encoders``
    and written as CSV to ``./synth_data/``; the still-encoded sample is passed
    to ``get_trtr_metrics`` and ``get_sdv_metrics``.

    Parameters
    ----------
    dataset_name : str
        Label used in the output file names and forwarded to the metric helpers.
    X, y
        Features and target GANBLR is fitted on; ``X.shape[0]`` is the number of
        synthetic rows drawn from every model.
    df_enc : pandas.DataFrame
        Encoded frame. Supplies the column names of the synthetic frames, is the
        CTGAN training data and the ``real_data`` given to ``get_sdv_metrics``.
    encoders : mapping
        ``encoders[col].inverse_transform`` is applied to each column of the
        synthetic data before it is saved.
    X_train, X_test, y_train, y_test
        Forwarded unchanged to ``get_trtr_metrics``.
    epochs : iterable
        Epoch counts to train with.
    K : iterable
        ``k`` values passed to ``GANBLR.fit``. May be empty, in which case only
        CTGAN is trained.
    timestamp_id : str
        Prefix of the output file names; also forwarded to ``get_sdv_metrics``.
    overall_logfile
        Forwarded to both metric helpers.
    n_runs : int, optional
        Number of models trained per configuration (default 3). Runs are
        numbered from 1 in file names and in the ``i`` argument of
        ``get_sdv_metrics``.

    Returns
    -------
    None

    Notes
    -----
    CTGAN has no ``k``: it is reported with ``k = 0`` in the metrics, and the
    same value is used in its file names.
    """
    # NOTE(review): both generators are fitted on the full X / y / df_enc. If the
    # X_test rows are part of that data (as in this notebook's callers) the
    # test split is seen during generator training — confirm this is intended.

    # to_csv does not create missing directories.
    output_dir = Path("./synth_data")
    output_dir.mkdir(parents=True, exist_ok=True)

    def _export_and_score(synth_data, model_name, epoch, k, run):
        """Decode and save one synthetic sample, then log its metrics."""
        # decode the categorical columns
        decoded = synth_data.copy()
        for col in df_enc.columns:
            decoded[col] = encoders[col].inverse_transform(
                synth_data[[col]].astype(int)
            )

        decoded.to_csv(
            output_dir
            / f"{timestamp_id}_{model_name.lower()}_synth_data_{dataset_name}_{epoch}_{k}_{run}.csv"
        )

        # The metric helpers receive the encoded (not decoded) sample.
        get_trtr_metrics(
            X_train,
            X_test,
            y_train,
            y_test,
            synth_data,
            dataset_name,
            model_name,
            overall_logfile,
            epoch,
            k,
        )

        get_sdv_metrics(
            real_data=df_enc,
            synth_data=synth_data,
            dataset_name=dataset_name,
            model=model_name,
            overall_logfile=overall_logfile,
            epochs=epoch,
            k=k,
            timestamp=timestamp_id,
            i=run,
        )

    # Placeholder k for CTGAN. Using a fixed value (rather than whatever the
    # GANBLR loop last bound to `k`) keeps file names and logged metrics in
    # agreement and avoids a NameError when K is empty.
    ctgan_k = 0
    n_samples = X.shape[0]  # sample as many rows as the original dataset

    for epoch in epochs:
        for k in K:
            for run in range(1, n_runs + 1):
                ganblr = GANBLR()
                ganblr.fit(X, y, epochs=epoch, k=k)

                # assumes the sampled columns are ordered like df_enc.columns — TODO confirm
                synth_data = pd.DataFrame(
                    ganblr.sample(n_samples),
                    columns=df_enc.columns,
                )
                _export_and_score(synth_data, "GANBLR", epoch, k, run)

                # Release the model before the next one is trained.
                del ganblr
                gc.collect()

        for run in range(1, n_runs + 1):
            metadata = SingleTableMetadata()
            metadata.detect_from_dataframe(data=df_enc)
            ctgan = CTGANSynthesizer(metadata, epochs=epoch)
            ctgan.fit(df_enc)

            synth_data_ctgan = pd.DataFrame(
                ctgan.sample(n_samples),
                columns=df_enc.columns,
            )
            _export_and_score(synth_data_ctgan, "CTGAN", epoch, ctgan_k, run)

            del ctgan
            gc.collect()

In [6]:
# --- Locations of the raw CSV files ------------------------------------------
SUPERSTORE_PATH = Path("datasets/SampleSuperstore.csv")
CREDIT_RISK_PATH = Path("datasets/credit_risk_dataset.csv")
MUSHROOMS_PATH = Path("datasets/mushrooms.csv")

# --- Read the three raw tables -------------------------------------------------
SUPERSTORE_DF, CREDIT_RISK_DF, MUSHROOMS_DF = (
    pd.read_csv(csv_path)
    for csv_path in (SUPERSTORE_PATH, CREDIT_RISK_PATH, MUSHROOMS_PATH)
)

# --- Dataset-specific preprocessing (replaces the raw frames) -----------------
SUPERSTORE_DF = preprocess_superstore(SUPERSTORE_DF)
CREDIT_RISK_DF = preprocess_credit_risk(CREDIT_RISK_DF)
MUSHROOMS_DF = preprocess_mushroom(MUSHROOMS_DF)

# --- Encode every table; the helper also returns the encoders it used ---------
SUPERSTORE_DF_ENC, SUPERSTORE_ENCODERS = transfrom_dataframe_discrete(SUPERSTORE_DF)
CREDIT_RISK_DF_ENC, CREDIT_RISK_ENCODERS = transfrom_dataframe_discrete(CREDIT_RISK_DF)
MUSHROOMS_DF_ENC, MUSHROOMS_ENCODERS = transfrom_dataframe_discrete(MUSHROOMS_DF)

# A cast of the encoded frames to the "category" dtype
# (`<NAME>_DF_ENC.astype("category")`) is currently disabled.


def _features_target_and_split(encoded_df, target_column):
    """Split an encoded frame into X, y and a seeded 80/20 train/test partition.

    Returns ``(X, y, X_train, X_test, y_train, y_test)``.
    """
    features = encoded_df.drop(columns=target_column)
    target = encoded_df[target_column]
    partition = train_test_split(features, target, test_size=0.2, random_state=42)
    return (features, target, *partition)


(
    X_super,
    y_super,
    X_super_train,
    X_super_test,
    y_super_train,
    y_super_test,
) = _features_target_and_split(SUPERSTORE_DF_ENC, "Profit")

(
    X_credit,
    y_credit,
    X_credit_train,
    X_credit_test,
    y_credit_train,
    y_credit_test,
) = _features_target_and_split(CREDIT_RISK_DF_ENC, "loan_status")

(
    X_mushrooms,
    y_mushrooms,
    X_mushrooms_train,
    X_mushrooms_test,
    y_mushrooms_train,
    y_mushrooms_test,
) = _features_target_and_split(MUSHROOMS_DF_ENC, "class")

  return fit_method(estimator, *args, **kwargs)


In [None]:
# Run process_dataset on the Superstore data (target column: "Profit").
process_dataset(
    dataset_name="superstore",
    X=X_super,
    y=y_super,
    df_enc=SUPERSTORE_DF_ENC,
    encoders=SUPERSTORE_ENCODERS,
    X_train=X_super_train,
    X_test=X_super_test,
    y_train=y_super_train,
    y_test=y_super_test,
    epochs=EPOCHS,
    K=K,
    timestamp_id=timestamp_id,
    overall_logfile=overall_logfile,
)

In [None]:
# Run process_dataset on the credit-risk data (target column: "loan_status").
process_dataset(
    dataset_name="credit_risk",
    X=X_credit,
    y=y_credit,
    df_enc=CREDIT_RISK_DF_ENC,
    encoders=CREDIT_RISK_ENCODERS,
    X_train=X_credit_train,
    X_test=X_credit_test,
    y_train=y_credit_train,
    y_test=y_credit_test,
    epochs=EPOCHS,
    K=K,
    timestamp_id=timestamp_id,
    overall_logfile=overall_logfile,
)

In [None]:
# Run process_dataset on the mushrooms data (target column: "class").
process_dataset(
    dataset_name="mushrooms",
    X=X_mushrooms,
    y=y_mushrooms,
    df_enc=MUSHROOMS_DF_ENC,
    encoders=MUSHROOMS_ENCODERS,
    X_train=X_mushrooms_train,
    X_test=X_mushrooms_test,
    y_train=y_mushrooms_train,
    y_test=y_mushrooms_test,
    epochs=EPOCHS,
    K=K,
    timestamp_id=timestamp_id,
    overall_logfile=overall_logfile,
)

In [8]:
import psutil

# Print the resident set size (RSS) of this kernel process, in MiB (bytes / 1024**2).
process = psutil.Process(os.getpid())
rss_bytes = process.memory_info().rss
print(rss_bytes / 1024 ** 2)

9287.19921875


In [10]:
%memit

UsageError: Line magic function `%memit` not found.
