## Aggregating results to DataFrame

In [1]:
import os
import lib
import numpy as np
import pandas as pd
import json
from pathlib import Path

base_path = json.load(open("secrets.json", "r"))["Experiment_Folder"]

method2exp = {
    "real": "adult/20_12_2022-REAL-BASELINE/outputs/exp/adult/ddpm_real/final_eval/",
    "tab-ddpm": "adult/21_12_2022-identity-50optuna-ts26048-catboost-tune-CatboostAndSimilarityEval-syntheticEval/outputs/exp/adult/ddpm_identity_best/final_eval/",
    "tab-ddpm-bgm": "adult/20_12_2022-bgm-50optuna-ts26048-catboost-tune-CatboostAndSimilarityEval-syntheticEval/outputs/exp/adult/ddpm_bgm_best/final_eval/",
    "tab-ddpm-simTune": "adult/12_02_2023-identity_sim_tune-50optuna-ts26048-catboost-tune-CatboostAndSimilarityEval-syntheticEval/outputs/exp/adult/ddpm_identity_sim_tune_best/final_eval/",
    "tab-ddpm-bgm-simTune" : "adult/12_02_2023-bgm_sim_tune-50optuna-ts26048-catboost-tune-CatboostAndSimilarityEval-syntheticEval/outputs/exp/adult/ddpm_bgm_sim_tune_best/final_eval/",
    "tab-ddpm-simTune-minmax": "adult/02_03_2023-identity_sim_tune_min_max/outputs/exp/adult/ddpm_identity_sim_tune_minmax_best/final_eval/",
    "tab-ddpm-bgm-simTune-minmax": "adult/02_03_2023-bgm_sim_tune_min_max/outputs/exp/adult/ddpm_bgm_sim_tune_minmax_best/final_eval/",
    "tab-ddpm-ft" : "adult/08_02_2023-ft-50optuna-ts26048-catboost-tune-CatboostAndSimilarityEval-syntheticEval/outputs/exp/adult/ddpm_ft_best/final_eval/",
    "tab-ddpm-ft-simTune": "adult/15_03_2023-ft-tabddpm-SimTune/outputs/exp/adult/ddpm_ft_sim_tune_quantile_best/final_eval/",
    "smote": "adult/02_01_2023-identity-SMOTE/outputs/exp/adult/smote/final_eval/",
    "ctabgan+": "adult/05_01_2023-identity-CTABGAN-Plus/outputs/exp/adult/ctabgan-plus/final_eval/",
    "ctabgan": "adult/02_01_2023-identity-CTABGAN/outputs/exp/adult/ctabgan/final_eval/",
    "ctabgan_simTune": "adult/08_03_2023-identity-CTABGAN-simtune/outputs/exp/adult/ctabgan/final_eval",
    "tvae": "adult/03_01_2023-identity-TVAE/outputs/exp/adult/tvae/final_eval/",
    "tvae_simTune": "adult/13_02_2023-TVAE_sim_tune-identity-50optuna-ts26048-catboost-tune-CatboostAndSimilarityEval-syntheticEval/outputs/exp/adult/tvae/final_eval",
} 
for k,v in method2exp.items():
    method2exp[k] = Path(os.path.join(base_path, v))

eval_file = "eval_catboost.json"
sim_file = "results_similarity.json"
eval_sim = "eval_similarity.json"
show_std = False
columns = ["method"] 
# df = pd.DataFrame(columns=["method"] + [_[:3].upper() for _ in DATASETS])
df = []
for algo in method2exp: 
    base_df = pd.DataFrame([algo], columns=["method"])
    metric_df = pd.DataFrame()
    metrics=["acc","f1","roc_auc"]

    if not os.path.exists(method2exp[algo] / eval_file):
        print(f"File {eval_file} not found for {algo}")
        metric_df = pd.DataFrame([["---"]*len(metrics)], columns=metrics)

    else:
        res_dict = lib.load_json(method2exp[algo] / eval_file)
        for metric in metrics:
            if algo == "real":
                res = f'{res_dict["real"]["test"][metric + "-mean"]:.4f}' 
                if show_std: res += f'+-{res_dict["real"]["test"][metric + "-std"]:.4f}'
                metric_df[metric] = [res]
            else:
                res = f'{res_dict["synthetic"]["test"][metric + "-mean"]:.4f}'
                if show_std: res += f'+-{res_dict["synthetic"]["test"][metric + "-std"]:.4f}'
                metric_df[metric] = [res]

    sim_metrics = ["score", "basic_score","corr_score", "ml_score","sup_score", "pmse_score"]

    if os.path.exists(method2exp[algo] / eval_sim):
        sim_res_dict = lib.load_json(method2exp[algo] / eval_sim)
        tmp = {}
        for k,v in sim_res_dict.items():
            if "count" in k: continue
            if "std" in k and show_std: 
                if show_std:
                    v = f'+-{float(v):.4f}'
                continue
            # if "mean" in k remove "-mean" from key
            if "mean" in k: 
                k = k.replace("-mean", "")
                v = f'{float(v):.4f}'
            tmp[k] = v 	    
            
        sim_df = pd.DataFrame([tmp], columns=sim_metrics)

    else:
        sim_res_dict = lib.load_json(method2exp[algo] / sim_file)
        sim_df = pd.DataFrame([sim_res_dict["sim_score"]], columns=sim_metrics)
    # rename score to sim_score
    sim_df = sim_df.rename(columns={"score": "sim_score"})    
    # format all floats to :.4f
    sim_df = sim_df.applymap(lambda x: f'{x:.4f}' if isinstance(x, float) else x)

    base_df = pd.concat([base_df, metric_df, sim_df], axis=1)
    
    df.append(base_df)


calculate_diff = False
results = pd.concat(df, axis=0)
metrics = results.columns.tolist()
metrics.remove("method")
cols= ["method"]
if calculate_diff:
    for metric in metrics:
        results[metric] = pd.to_numeric(results[metric])
        results[f"{metric}-diff"] = results[metric] - results.loc[results["method"] == "real", metric].values[0]
        cols.append(metric)
        cols.append(f"{metric}-diff")
    results=results.reindex(cols, axis=1)

results.reset_index(drop=True, inplace=True)
results

Unnamed: 0,method,acc,f1,roc_auc,sim_score,basic_score,corr_score,ml_score,sup_score,pmse_score
0,real,0.8742,0.8152,0.9276,0.9598,0.9922,0.9433,0.9975,0.9839,0.882
1,tab-ddpm,0.8598,0.7941,0.9128,0.7586,0.973,0.9189,0.9923,0.8741,0.0349
2,tab-ddpm-bgm,0.8632,0.7985,0.9165,0.7418,0.9642,0.9183,0.9955,0.8307,0.0004
3,tab-ddpm-simTune,0.8556,0.7823,0.9078,0.852,0.9764,0.921,0.991,0.9522,0.4196
4,tab-ddpm-bgm-simTune,0.8586,0.7917,0.9109,0.8567,0.9823,0.8579,0.9913,0.9197,0.5323
5,tab-ddpm-simTune-minmax,0.8561,0.7779,0.91,0.8686,0.9375,0.9296,0.9901,0.9282,0.5575
6,tab-ddpm-bgm-simTune-minmax,0.8568,0.7871,0.9088,0.8555,0.9812,0.9127,0.9921,0.9151,0.4763
7,tab-ddpm-ft,0.7849,0.5516,0.8212,0.5951,0.495,0.6482,0.8691,0.9633,0.0
8,tab-ddpm-ft-simTune,0.7665,0.4506,0.7116,0.5883,0.5125,0.619,0.8182,0.992,0.0
9,smote,0.8582,0.7912,0.9104,0.7228,0.9528,0.8651,0.9925,0.8038,0.0


# Latex

In [2]:
# round all numeric values in the table
# transform all numeric strings to float
results = results.applymap(lambda x: float(x) if isinstance(x, str) and x.replace(".","",1).isdigit() else x)
results=results.round(3)

In [3]:
# subset of real, tvae, ctabgan, ctabgan+, smote, ddpm
results_ = results[results["method"].isin(["tab-ddpm-simTune-minmax", "tab-ddpm-bgm-simTune-minmax"])]
# subser sim_score, basic_score, corr_score, ml_score, sup_score, pmse_score as latex table
a=results_[["method", "sim_score", "basic_score", "corr_score", "ml_score", "sup_score", "pmse_score"]].round(3).to_latex(index=False)
# a=results_[["method", "acc","f1","roc_auc"]].round(3).to_latex(index=False)
# remove any "\n" in the table and replace with " " and any \\ to \
a = a.replace("\n", " ").replace("\\\\", "\\").replace("\\\\", "\\")
a


  a=results_[["method", "sim_score", "basic_score", "corr_score", "ml_score", "sup_score", "pmse_score"]].round(3).to_latex(index=False)


'\\begin{tabular}{lrrrrrr} \\toprule                      method &  sim\\_score &  basic\\_score &  corr\\_score &  ml\\_score &  sup\\_score &  pmse\\_score \\ \\midrule     tab-ddpm-simTune-minmax &      0.869 &        0.938 &       0.930 &     0.990 &      0.928 &       0.558 \\ tab-ddpm-bgm-simTune-minmax &      0.856 &        0.981 &       0.913 &     0.992 &      0.915 &       0.476 \\ \\bottomrule \\end{tabular} '

In [19]:
# return all tvae and ctabgan+ results separeted by a "&"" for latex table
results.round(3).to_latex(index=False, escape=False, column_format="lrrrrrr")

  results.round(3).to_latex(index=False, escape=False, column_format="lrrrrrr")


'\\begin{tabular}{lrrrrrr}\n\\toprule\n                     method &   acc &    f1 &  roc_auc &  sim_score &  basic_score &  corr_score &  ml_score &  sup_score &  pmse_score \\\\\n\\midrule\n                       real & 0.874 & 0.815 &    0.928 &      0.960 &        0.992 &       0.943 &     0.998 &      0.984 &       0.882 \\\\\n                   tab-ddpm & 0.860 & 0.794 &    0.913 &      0.759 &        0.973 &       0.919 &     0.992 &      0.874 &       0.035 \\\\\n               tab-ddpm-bgm & 0.863 & 0.798 &    0.916 &      0.742 &        0.964 &       0.918 &     0.996 &      0.831 &       0.000 \\\\\n           tab-ddpm-simTune & 0.856 & 0.782 &    0.908 &      0.852 &        0.976 &       0.921 &     0.991 &      0.952 &       0.420 \\\\\n       tab-ddpm-bgm-simTune & 0.859 & 0.792 &    0.911 &      0.857 &        0.982 &       0.858 &     0.991 &      0.920 &       0.532 \\\\\n    tab-ddpm-simTune-minmax & 0.856 & 0.778 &    0.910 &      0.869 &        0.938 &       0.930 &

# Images

In [21]:
import os
from PIL import Image, ImageDraw, ImageFont

def combine_images(filename, images_dict, numb_rows, numb_cols, rect_x, rect_y, rect_w, rect_h, horiz_space=0, vert_space=0, name_pos=None, font_path=None, font_size=16):
    # Initialize an empty list to store all the image parts
    image_parts = []

    # Iterate over all the entries in the dictionary
    for name, path in images_dict.items():
        # Construct the full path to the image file
        full_path = os.path.join(path, filename)

        # Load the image and extract the rectangular part
        with Image.open(full_path) as img:
            rect = img.crop((rect_x, rect_y, rect_x+rect_w, rect_y+rect_h))
            # Append the extracted part to the list of image parts along with its name
            image_parts.append((rect, name))

    # Calculate the total number of image parts and the required number of rows and columns
    total_parts = len(image_parts)
    rows = min(numb_rows, total_parts)
    cols = min(numb_cols, total_parts)

    # Calculate the dimensions of the new image including space between the image parts
    part_w, part_h = image_parts[0][0].size
    new_w = cols * part_w + (cols-1) * horiz_space
    new_h = rows * part_h + (rows-1) * vert_space

    # Create a new empty image with the calculated dimensions
    new_img = Image.new('RGB', (new_w, new_h), color="white")

    # Create a font object for writing the names on the image if specified
    if name_pos is not None and font_path is not None:
        font = ImageFont.truetype(font_path, font_size)

    # Iterate over all the image parts and paste them onto the new image
    for i, (part, name) in enumerate(image_parts):
        row = i // cols
        col = i % cols
        x = col * (part_w + horiz_space)
        y = row * (part_h + vert_space)
        new_img.paste(part, (x, y))

        # Write the name on the image if specified and at the correct position
        if name_pos is not None:
            draw = ImageDraw.Draw(new_img)
            name_w, name_h = draw.textsize(name, font=font)
            if name_pos == "top":
                text_x = x + (part_w - name_w) // 2
                text_y = y - name_h - font_size // 2
            elif name_pos == "bottom":
                text_x = x + (part_w - name_w) // 2
                text_y = y + part_h + font_size // 2
            draw.text((text_x, text_y), name, font=font, fill='white')

    return new_img

In [6]:
import os
from datetime import datetime
from PIL import Image, ImageDraw

def save_extracted_images(filename, img_dict, rect_x, rect_y, rect_w, rect_h, output_dir, max_h=False, max_w=False, space=20):
    """
    Extracts a rectangular portion of an image based on provided coordinates and saves the processed image.

    Args:
        filename (str): Name of the file to be processed.
        img_dict (dict): Dictionary where keys are names and values are paths to the image files.
        rect_x (int): The x-coordinate of the top-left corner of the rectangular portion to be extracted.
        rect_y (int): The y-coordinate of the top-left corner of the rectangular portion to be extracted.
        rect_w (int): The width of the rectangular portion to be extracted.
        rect_h (int): The height of the rectangular portion to be extracted.
        output_dir (str): Path to the directory where the processed images will be saved.
        max_h (bool): If True, replaces rect_h with the maximum height of the image.
        max_w (bool): If True, replaces rect_w with the maximum width of the image.
        space (int): The amount of space (in pixels) to be added to the top, bottom, left, and right of the extracted image.

    Returns:
        None
    """
    # Create output directory with timestamped name
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    timestamp = f"{filename[:-4]}_{timestamp}"
    output_dir = os.path.join(output_dir, timestamp)
    os.makedirs(output_dir, exist_ok=True)

    # Iterate through each entry in the dict
    for name, path in img_dict.items():
        # Load the image
        img = Image.open(os.path.join(path, filename))

        # Get the maximum width and height of the image
        max_width, max_height = img.size

        # If max_h is True, replace rect_h with max_height
        if max_h:
            rect_h = max_height

        # If max_w is True, replace rect_w with max_width
        if max_w:
            rect_w = max_width

        # Crop the image based on the provided rectangle coordinates
        cropped_img = img.crop((rect_x, rect_y, rect_x+rect_w, rect_y+rect_h))

        # Add vertical and horizontal white space to the cropped image
        new_width = cropped_img.width + space*2
        new_height = cropped_img.height + space*2
        new_img = Image.new(mode='RGB', size=(new_width, new_height), color=(255, 255, 255))
        new_img.paste(cropped_img, (space, space))

        # Save the processed image in the output directory
        output_path = os.path.join(output_dir, f"{name}.jpg")
        new_img.save(output_path)


In [27]:
base_path = json.load(open("secrets.json", "r"))["Experiment_Folder"]

img_dict = {
    "real": "changed_plots/real/plots",
    "tab-ddpm": "changed_plots/tab-ddpm/plots",
    "tab-ddpm-bgm": "changed_plots/tab-ddpm-bgm/plots",
    "tab-ddpm-simTune": "changed_plots/tab-ddpm-simTune/plots",
    "tab-ddpm-bgm-simTune" : "changed_plots/tab-ddpm-bgm-simTune/plots",
    "tab-ddpm-simTune-minmax": "changed_plots/tab-ddpm-simTune-minmax/plots",
    "tab-ddpm-bgm-simTune-minmax": "changed_plots/tab-ddpm-bgm-simTune-minmax/plots",
    "tab-ddpm-ft" : "changed_plots/tab-ddpm-ft/plots",
    "tab-ddpm-ft-simTune": "changed_plots/tab-ddpm-ft-simTune/plots",
    "smote": "changed_plots/smote/plots",
    "ctabgan+": "changed_plots/ctabgan+/plots",
    "ctabgan": "changed_plots/ctabgan/plots",
    "ctabgan_simTune": "changed_plots/ctabgan_simTune/plots",
    "tvae": "changed_plots/tvae/plots",
    "tvae_simTune": "changed_plots/tvae_simTune/plots",
} 
for k,v in img_dict.items():
    img_dict[k] = Path(os.path.join(base_path, v))

In [30]:
out_dir= os.path.join(base_path, "out_images")

#  correlation difference
# extract_dict={"filename":"correlation_difference.png",
#               "rect_x":1600, 
#               "rect_y":0, 
#               "rect_w":800, 
#               "rect_h":1000,
#                 "space":0,
#                "max_h":True,
#                "max_w":False}

# # distribution-education
# extract_dict={"filename":"distributions.png",
#               "rect_x":550, 
#               "rect_y":80, 
#               "rect_w":540, 
#               "rect_h":540,
#                 "space":0,
#                "max_h":False,
#                "max_w":False}

# # distribution-age
# extract_dict={"filename":"distributions.png",
#               "rect_x":1070, 
#               "rect_y":1280, 
#               "rect_w":530, 
#               "rect_h":450,
#                 "space":0,
#                "max_h":False,
#                "max_w":False}

# PCA - fake
extract_dict={"filename":"pca.png",
              "rect_x":580, 
              "rect_y":40, 
              "rect_w":530, 
              "rect_h":550,
                "space":0,
               "max_h":False,
               "max_w":False}


save_extracted_images(img_dict=img_dict, **extract_dict, output_dir=out_dir)