# Analysis
This notebook was used to create analytics for the corresponding master thesis. 

## Aggregating results to DataFrame
This code section loads the final evaluation experiments results and creates a pandas dataframe, displaying all results in a nice overview.

To load the data, create a `secrets.json` file in the following format:

```
{
    "Experiment_Folder": "PATH/TO/YOUR/EXPERIMENT_RESULTS"
}
```


In [1]:
import os
from tabsynth import lib
import numpy as np
import pandas as pd
import json
from pathlib import Path

base_path = os.path.join(json.load(open("secrets.json", "r"))["Experiment_Folder"],"adult")

method2exp = {      
    "real":                         "REAL_baseline/outputs/exp/adult/ddpm_real/final_eval/",
    "tvae":                         "TVAE_identity_ml/outputs/exp/adult/tvae/final_eval/",
    "smote":                        "SMOTE_identity/outputs/exp/adult/smote/final_eval/",
    "ctabgan":                      "CTABGAN_identity_ml/outputs/exp/adult/ctabgan/final_eval/",
    "ctabgan+":                     "CTABGAN_Plus_identity_ml/outputs/exp/adult/ctabgan-plus/final_eval/",
    "tab-ddpm":                     "TabDDPM_identity_ml_q/outputs/exp/adult/ddpm_identity_best/final_eval/",
    "tab-ddpm-bgm":                 "TabDDPM_bgm_ml_q/outputs/exp/adult/ddpm_bgm_best/final_eval/",
    "tab-ddpm-ft" :                 "TabDDPM_ft_ml_q/outputs/exp/adult/ddpm_ft_best/final_eval/",
    "ctabgan_simTune":              "CTABGAN_identity_s/outputs/exp/adult/ctabgan/final_eval",
    "ctabgan+_simTune":             "CTABGAN_Plus_identity_s/outputs/exp/adult/ctabgan-plus/final_eval",#
    "tvae_simTune":                 "TVAE_identity_s/outputs/exp/adult/tvae/final_eval",
    "tab-ddpm-simTune":             "TabDDPM_identity_s_q/outputs/exp/adult/ddpm_identity_sim_tune_best/final_eval/",
    "tab-ddpm-bgm-simTune" :        "TabDDPM_bgm_s_q/outputs/exp/adult/ddpm_bgm_sim_tune_best/final_eval/",
    "tab-ddpm-ft-simTune":          "TabDDPM_ft_s_q/outputs/exp/adult/ddpm_ft_sim_tune_quantile_best/final_eval/",
    "tab-ddpm-simTune-minmax":      "TabDDPM_identity_s_m/outputs/exp/adult/ddpm_identity_sim_tune_minmax_best/final_eval/",
    "tab-ddpm-bgm-simTune-minmax":  "TabDDPM_bgm_s_m/outputs/exp/adult/ddpm_bgm_sim_tune_minmax_best/final_eval/",
    "tab-ddpm-bgm-simTune-none":     "TabDDPM_bgm_s_n/outputs/exp/adult/ddpm_bgm_sim_tune_none_best/final_eval/",#
} 
for k,v in method2exp.items():
    method2exp[k] = Path(os.path.join(base_path, v))

eval_file = "eval_catboost.json"
sim_file = "results_similarity.json"
eval_sim = "eval_similarity.json"
show_std = False
columns = ["method"] 
# df = pd.DataFrame(columns=["method"] + [_[:3].upper() for _ in DATASETS])
df = []
for algo in method2exp: 
    base_df = pd.DataFrame([algo], columns=["method"])
    metric_df = pd.DataFrame()
    metrics=["acc","f1","roc_auc"]

    if not os.path.exists(method2exp[algo] / eval_file):
        print(f"File {eval_file} not found for {algo}")
        metric_df = pd.DataFrame([["---"]*len(metrics)], columns=metrics)

    else:
        res_dict = lib.load_json(method2exp[algo] / eval_file)
        for metric in metrics:
            if algo == "real":
                res = f'{res_dict["real"]["test"][metric + "-mean"]:.4f}' 
                if show_std: res += f'+-{res_dict["real"]["test"][metric + "-std"]:.4f}'
                metric_df[metric] = [res]
            else:
                res = f'{res_dict["synthetic"]["test"][metric + "-mean"]:.4f}'
                if show_std: res += f'+-{res_dict["synthetic"]["test"][metric + "-std"]:.4f}'
                metric_df[metric] = [res]

    sim_metrics = ["score", "basic_score","corr_score", "ml_score","sup_score", "pmse_score"]

    if os.path.exists(method2exp[algo] / eval_sim):
        sim_res_dict = lib.load_json(method2exp[algo] / eval_sim)
        tmp = {}
        for k,v in sim_res_dict.items():
            if "count" in k: continue
            if "std" in k and show_std: 
                if show_std:
                    v = f'+-{float(v):.4f}'
                continue
            # if "mean" in k remove "-mean" from key
            if "mean" in k: 
                k = k.replace("-mean", "")
                v = f'{float(v):.4f}'
            tmp[k] = v 	    
            
        sim_df = pd.DataFrame([tmp], columns=sim_metrics)

    else:
        sim_res_dict = lib.load_json(method2exp[algo] / sim_file)
        sim_df = pd.DataFrame([sim_res_dict["sim_score"]], columns=sim_metrics)
    # rename score to sim_score
    sim_df = sim_df.rename(columns={"score": "sim_score"})    
    # format all floats to :.4f
    sim_df = sim_df.applymap(lambda x: f'{x:.4f}' if isinstance(x, float) else x)

    base_df = pd.concat([base_df, metric_df, sim_df], axis=1)
    
    df.append(base_df)


calculate_diff = False
results = pd.concat(df, axis=0)
metrics = results.columns.tolist()
metrics.remove("method")
cols= ["method"]
if calculate_diff:
    for metric in metrics:
        results[metric] = pd.to_numeric(results[metric])
        results[f"{metric}-diff"] = results[metric] - results.loc[results["method"] == "real", metric].values[0]
        cols.append(metric)
        cols.append(f"{metric}-diff")
    results=results.reindex(cols, axis=1)

results.reset_index(drop=True, inplace=True)
results

Unnamed: 0,method,acc,f1,roc_auc,sim_score,basic_score,corr_score,ml_score,sup_score,pmse_score
0,real,0.8742,0.8152,0.9276,0.9598,0.9922,0.9433,0.9975,0.9839,0.882
1,tvae,0.845,0.7805,0.9003,0.6575,0.8544,0.8139,0.962,0.6572,0.0
2,smote,0.8582,0.7912,0.9104,0.7228,0.9528,0.8651,0.9925,0.8038,0.0
3,ctabgan,0.8499,0.775,0.8995,0.7406,0.9397,0.8321,0.9845,0.9468,0.0
4,ctabgan+,0.8547,0.7747,0.907,0.7503,0.9692,0.8818,0.9902,0.8915,0.0186
5,tab-ddpm,0.8598,0.7941,0.9128,0.7586,0.973,0.9189,0.9923,0.8741,0.0349
6,tab-ddpm-bgm,0.8632,0.7985,0.9165,0.7418,0.9642,0.9183,0.9955,0.8307,0.0004
7,tab-ddpm-ft,0.7849,0.5516,0.8212,0.5951,0.495,0.6482,0.8691,0.9633,0.0
8,ctabgan_simTune,0.85,0.7756,0.8999,0.7405,0.9385,0.8328,0.9841,0.9474,0.0
9,ctabgan+_simTune,0.8507,0.7683,0.9018,0.7839,0.9701,0.8884,0.9866,0.9075,0.1668


### Latex
export above results to latex tables

In [3]:
# round all numeric values in the table
# transform all numeric strings to float
results = results.applymap(lambda x: float(x) if isinstance(x, str) and x.replace(".","",1).isdigit() else x)
results=results.round(3)

In [8]:
# subset of real, tvae, ctabgan, ctabgan+, smote, ddpm
results_ = results[results["method"].isin(["tab-ddpm-bgm-simTune-none"])]
# results_ = results
# subser sim_score, basic_score, corr_score, ml_score, sup_score, pmse_score as latex table
# output=results_[["method", "sim_score", "basic_score", "corr_score", "ml_score", "sup_score", "pmse_score"]].round(3).to_latex(index=False)
output=results_[["method", "acc","f1","roc_auc"]].round(3).to_latex(index=False)
# remove any "\n" in the table and replace with " " and any \\ to \
output = output.replace("\n", " ").replace("\\\\", "\\").replace("\\\\", "\\")
output


  output=results_[["method", "acc","f1","roc_auc"]].round(3).to_latex(index=False)


'\\begin{tabular}{lrrr} \\toprule                    method &   acc &    f1 &  roc\\_auc \\ \\midrule tab-ddpm-bgm-simTune-none & 0.855 & 0.784 &    0.907 \\ \\bottomrule \\end{tabular} '

In [4]:
# return all tvae and ctabgan+ results separeted by a "&"" for latex table
results.round(3).to_latex(index=False, escape=False, column_format="lrrrrrr")

  results.round(3).to_latex(index=False, escape=False, column_format="lrrrrrr")


'\\begin{tabular}{lrrrrrr}\n\\toprule\n                     method &   acc &    f1 &  roc_auc &  sim_score &  basic_score &  corr_score &  ml_score &  sup_score &  pmse_score \\\\\n\\midrule\n                       real & 0.874 & 0.815 &    0.928 &      0.960 &        0.992 &       0.943 &     0.998 &      0.984 &       0.882 \\\\\n                   tab-ddpm & 0.860 & 0.794 &    0.913 &      0.759 &        0.973 &       0.919 &     0.992 &      0.874 &       0.035 \\\\\n               tab-ddpm-bgm & 0.863 & 0.798 &    0.916 &      0.742 &        0.964 &       0.918 &     0.996 &      0.831 &       0.000 \\\\\n           tab-ddpm-simTune & 0.856 & 0.782 &    0.908 &      0.852 &        0.976 &       0.921 &     0.991 &      0.952 &       0.420 \\\\\n       tab-ddpm-bgm-simTune & 0.859 & 0.792 &    0.911 &      0.857 &        0.982 &       0.858 &     0.991 &      0.920 &       0.532 \\\\\n    tab-ddpm-simTune-minmax & 0.856 & 0.778 &    0.910 &      0.869 &        0.938 &       0.930 &

## Plots
These scripts are used to create the plots for all synthetic datasets and save them (or certain parts of it) in specific folders.

In [2]:
import os
from PIL import Image, ImageDraw, ImageFont
# Not used at the end
def combine_images(filename, images_dict, numb_rows, numb_cols, rect_x, rect_y, rect_w, rect_h, horiz_space=0, vert_space=0, name_pos=None, font_path=None, font_size=16):
    """
        Combines rectangular parts of multiple images from a dictionary into a single new image with the specified number of rows and columns.
    
    Parameters:
    -----------
    filename : str
        The name of the image file to extract from each image in the dictionary.
    images_dict : dict
        A dictionary containing the names and paths of the images to extract rectangular parts from.
    numb_rows : int
        The desired number of rows in the new image.
    numb_cols : int
        The desired number of columns in the new image.
    rect_x : int
        The x-coordinate of the top-left corner of the rectangular part to extract from each image.
    rect_y : int
        The y-coordinate of the top-left corner of the rectangular part to extract from each image.
    rect_w : int
        The width of the rectangular part to extract from each image.
    rect_h : int
        The height of the rectangular part to extract from each image.
    horiz_space : int, optional
        The horizontal space to insert between the rectangular parts in the new image (default is 0).
    vert_space : int, optional
        The vertical space to insert between the rectangular parts in the new image (default is 0).
    name_pos : str, optional
        The position to write the name of each image on the new image. Can be either "top" or "bottom" (default is None).
    font_path : str, optional
        The path to the font file to use for writing the names (default is None).
    font_size : int, optional
        The size of the font to use for writing the names (default is 16).
    
    Returns:
    --------
    new_img : PIL.Image.Image
        A new image with the rectangular parts of all the specified images combined into a single image.
    """
    # Initialize an empty list to store all the image parts
    image_parts = []

    # Iterate over all the entries in the dictionary
    for name, path in images_dict.items():
        # Construct the full path to the image file
        full_path = os.path.join(path, filename)

        # Load the image and extract the rectangular part
        with Image.open(full_path) as img:
            rect = img.crop((rect_x, rect_y, rect_x+rect_w, rect_y+rect_h))
            # Append the extracted part to the list of image parts along with its name
            image_parts.append((rect, name))

    # Calculate the total number of image parts and the required number of rows and columns
    total_parts = len(image_parts)
    rows = min(numb_rows, total_parts)
    cols = min(numb_cols, total_parts)

    # Calculate the dimensions of the new image including space between the image parts
    part_w, part_h = image_parts[0][0].size
    new_w = cols * part_w + (cols-1) * horiz_space
    new_h = rows * part_h + (rows-1) * vert_space

    # Create a new empty image with the calculated dimensions
    new_img = Image.new('RGB', (new_w, new_h), color="white")

    # Create a font object for writing the names on the image if specified
    if name_pos is not None and font_path is not None:
        font = ImageFont.truetype(font_path, font_size)

    # Iterate over all the image parts and paste them onto the new image
    for i, (part, name) in enumerate(image_parts):
        row = i // cols
        col = i % cols
        x = col * (part_w + horiz_space)
        y = row * (part_h + vert_space)
        new_img.paste(part, (x, y))

        # Write the name on the image if specified and at the correct position
        if name_pos is not None:
            draw = ImageDraw.Draw(new_img)
            name_w, name_h = draw.textsize(name, font=font)
            if name_pos == "top":
                text_x = x + (part_w - name_w) // 2
                text_y = y - name_h - font_size // 2
            elif name_pos == "bottom":
                text_x = x + (part_w - name_w) // 2
                text_y = y + part_h + font_size // 2
            draw.text((text_x, text_y), name, font=font, fill='white')

    return new_img

In [3]:
import os
from datetime import datetime
from PIL import Image, ImageDraw

def save_extracted_images(filename, img_dict, rect_x, rect_y, rect_w, rect_h, output_dir, max_h=False, max_w=False, space=20):
    """
    Extracts a rectangular portion of an image based on provided coordinates and saves the processed image.

    Parameters
    ----------
    filename : str
        Name of the file to be processed.
    img_dict : dict
        Dictionary where keys are names and values are paths to the image files.
    rect_x : int
        The x-coordinate of the top-left corner of the rectangular portion to be extracted.
    rect_y : int
        The y-coordinate of the top-left corner of the rectangular portion to be extracted.
    rect_w : int
        The width of the rectangular portion to be extracted.
    rect_h : int
        The height of the rectangular portion to be extracted.
    output_dir : str
        Path to the directory where the processed images will be saved.
    max_h : bool, optional
        If True, replaces rect_h with the maximum height of the image, by default False.
    max_w : bool, optional
        If True, replaces rect_w with the maximum width of the image, by default False.
    space : int, optional
        The amount of space (in pixels) to be added to the top, bottom, left, and right of the extracted image, by default 0.

    Returns
    -------
    None
    """
    # Create output directory with timestamped name
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    timestamp = f"{filename[:-4]}_{timestamp}"
    output_dir = os.path.join(output_dir, timestamp)
    os.makedirs(output_dir, exist_ok=True)

    # Iterate through each entry in the dict
    for name, path in img_dict.items():
        # Load the image
        img = Image.open(os.path.join(path, filename))

        # Get the maximum width and height of the image
        max_width, max_height = img.size

        # If max_h is True, replace rect_h with max_height
        if max_h:
            rect_h = max_height

        # If max_w is True, replace rect_w with max_width
        if max_w:
            rect_w = max_width

        # Crop the image based on the provided rectangle coordinates
        cropped_img = img.crop((rect_x, rect_y, rect_x+rect_w, rect_y+rect_h))

        # Add vertical and horizontal white space to the cropped image
        new_width = cropped_img.width + space*2
        new_height = cropped_img.height + space*2
        new_img = Image.new(mode='RGB', size=(new_width, new_height), color=(255, 255, 255))
        new_img.paste(cropped_img, (space, space))

        # Save the processed image in the output directory
        output_path = os.path.join(output_dir, f"{name}.jpg")
        new_img.save(output_path)


In [4]:
base_path = json.load(open("secrets.json", "r"))["Experiment_Folder"]

img_dict = {
    "real": "changed_plots/real/plots",
    "tab-ddpm": "changed_plots/tab-ddpm/plots",
    "tab-ddpm-bgm": "changed_plots/tab-ddpm-bgm/plots",
    "tab-ddpm-simTune": "changed_plots/tab-ddpm-simTune/plots",
    "tab-ddpm-bgm-simTune" : "changed_plots/tab-ddpm-bgm-simTune/plots",
    "tab-ddpm-simTune-minmax": "changed_plots/tab-ddpm-simTune-minmax/plots",
    "tab-ddpm-bgm-simTune-minmax": "changed_plots/tab-ddpm-bgm-simTune-minmax/plots",
    "tab-ddpm-bgm-simTune-none": "changed_plots/tab-ddpm-bgm-simTune-none/plots",#
    "tab-ddpm-ft" : "changed_plots/tab-ddpm-ft/plots",
    "tab-ddpm-ft-simTune": "changed_plots/tab-ddpm-ft-simTune/plots",
    "smote": "changed_plots/smote/plots",
    "ctabgan+": "changed_plots/ctabgan+/plots",
    "ctabgan+_simTune": "changed_plots/ctabgan+_simTune/plots",#
    "ctabgan": "changed_plots/ctabgan/plots",
    "ctabgan_simTune": "changed_plots/ctabgan_simTune/plots",
    "tvae": "changed_plots/tvae/plots",
    "tvae_simTune": "changed_plots/tvae_simTune/plots",
} 
for k,v in img_dict.items():
    img_dict[k] = Path(os.path.join(base_path, v))
    img_dict[k].mkdir(parents=True, exist_ok=True)

extract certain image parts:

In [5]:
out_dir= os.path.join(base_path, "out_images")

#  correlation difference
# extract_dict={"filename":"correlation_difference.png",
#               "rect_x":1600, 
#               "rect_y":0, 
#               "rect_w":800, 
#               "rect_h":1000,
#                 "space":0,
#                "max_h":True,
#                "max_w":False}

# # distribution-education
extract_dict={"filename":"distributions.png",
              "rect_x":540, 
              "rect_y":80, 
              "rect_w":530, 
              "rect_h":540,
                "space":0,
               "max_h":False,
               "max_w":False}

# # distribution-age
# extract_dict={"filename":"distributions.png",
#               "rect_x":1065, 
#               "rect_y":1280, 
#               "rect_w":530, 
#               "rect_h":450,
#                 "space":0,
#                "max_h":False,
#                "max_w":False}

# # distribution-hours-per-week
# extract_dict={"filename":"distributions.png",
#               "rect_x":540, 
#               "rect_y":2480, 
#               "rect_w":530, 
#               "rect_h":450,
#                 "space":0,
#                "max_h":False,
#                "max_w":False}

# PCA - fake
# extract_dict={"filename":"pca.png",
#               "rect_x":580, 
#               "rect_y":40, 
#               "rect_w":530, 
#               "rect_h":550,
#                 "space":0,
#                "max_h":False,
#                "max_w":False}

# # # distribution-full
# extract_dict={"filename":"distributions.png",
#               "rect_x":0, 
#               "rect_y":0, 
#               "rect_w":530, 
#               "rect_h":450,
#                 "space":0,
#                "max_h":True,
#                "max_w":True}


# # # # cumsum-full
# extract_dict={"filename":"cumsums.png",
#               "rect_x":0, 
#               "rect_y":0, 
#               "rect_w":530, 
#               "rect_h":450,
#                 "space":0,
#                "max_h":True,
#                "max_w":True}

# #cumsum-native-country
# extract_dict={"filename":"cumsums.png",
#               "rect_x":540, 
#               "rect_y":1290, 
#               "rect_w":530, 
#               "rect_h":560,
#                 "space":0,
#                "max_h":False,
#                "max_w":False}

# #cumsum-hoursperweek
# extract_dict={"filename":"cumsums.png",
#               "rect_x":540, 
#               "rect_y":2470, 
#               "rect_w":530, 
#               "rect_h":460,
#                 "space":0,
#                "max_h":False,
#                "max_w":False}

save_extracted_images(img_dict=img_dict, **extract_dict, output_dir=out_dir)

## Folder Structure of the Project


In [2]:


import os

def print_directory_contents(path, ignore_list=None, allowed_extensions=None, keep_closed=None, level=0):
    if ignore_list is None:
        ignore_list = []

    if allowed_extensions is None:
        allowed_extensions = ['.py', '.md']

    if keep_closed is None:
        keep_closed = []

    if os.path.exists(path):
        # Add indentation for sub-directories/files
        indent = '|   ' * level

        for item in os.listdir(path):
            item_path = os.path.join(path, item)

            # Check if the item should be ignored
            if item in ignore_list:
                continue

            if os.path.isdir(item_path):
                # Print directory name
                print(f"{indent}+---📁{item}")

                # Check if the directory is in the keep_closed list
                if item not in keep_closed:
                    # Recursively call the function for sub-directories
                    print_directory_contents(item_path, ignore_list, allowed_extensions, keep_closed, level + 1)
            else:
                continue
                # Only print files with allowed extensions
                if any(item.endswith(ext) for ext in allowed_extensions):
                    # Print file name
                    print(f"{indent}+--- {item}")
    else:
        print("The given path does not exist.")

if __name__ == "__main__":
    # Use the current working directory as the starting path
    cwd = os.getcwd()

    # Specify the folders/files to ignore
    ignore_list = ['.git', 
                   '.vscode', 
                   '__pycache__', 
                   ".amlignore",
                   ".gitignore",
                   ".github",
                   ".gitmodules",
                   ".pytest_cache",
                   "tmp",
                   "__init__.py",
                   "catboost_info",
                   "legacy"]

    # Specify the allowed file types
    allowed_extensions = ['.py', '.md']

    # Specify the keep_closed list
    keep_closed = ['exp', 
                   "data",
                   "CTAB-GAN",
                   "CTAB-GAN-Plus",
                   "CTGAN",
                   "legacy",
                   "smote",
                   "CTGAN",
                   ]

    print(f"Contents of the current working directory ({cwd}):")
    print_directory_contents(cwd, ignore_list, allowed_extensions, keep_closed)


Contents of the current working directory (c:\Users\SvenG\Documents\Git_Repos\Tabular-Data-Synthesis-Repos\tab-ddpm):
+---📁outputs
|   +---📁src
|   |   +---📁tabsynth
|   |   |   +---📁exp
+---📁processor_state
+---📁src
|   +---📁tabsynth
|   |   +---📁CTABGAN
|   |   |   +---📁model
|   |   |   |   +---📁eval
|   |   |   |   +---📁pipeline
|   |   |   |   +---📁synthesizer
|   |   +---📁CTABGAN_Plus
|   |   |   +---📁model
|   |   |   |   +---📁eval
|   |   |   |   +---📁pipeline
|   |   |   |   +---📁privacy_utils
|   |   |   |   +---📁synthesizer
|   |   |   +---📁model copy
|   |   |   |   +---📁eval
|   |   |   |   +---📁pipeline
|   |   |   |   +---📁privacy_utils
|   |   |   |   +---📁synthesizer
|   |   +---📁CTGAN
|   |   +---📁data
|   |   +---📁evaluation
|   |   +---📁exp
|   |   +---📁lib
|   |   +---📁processor_state
|   |   +---📁scripts
|   |   +---📁smote
|   |   +---📁tabular_processing
|   |   |   +---📁bgm_utils
|   |   |   +---📁ft_utils
|   |   +---📁tab_ddpm
|   |   +---📁tuned_models
|   |   | 