In [23]:
import sys
from pathlib import Path
# Extend the import search path so project-local modules resolve:
# first the directory two levels above the current working directory ...
project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))
# ... then its direct parent. After this line `project_root` refers to the
# direct parent only; both directories stay on sys.path.
project_root = Path.cwd().parent
sys.path.append(str(project_root))
from visualization.utils import defaultColors
from helpers import load_from_path, save_pdf

In [24]:

from scipy.stats import norm

def calculate_var_pos(mean, std_dev, alpha=0.9):
    """Shift ``mean`` downwards by a normal-quantile multiple of ``std_dev``.

    Parameters
    ----------
    mean : scalar or array-like
        Centre value(s).
    std_dev : scalar or array-like
        Spread value(s); combined with ``mean`` element-wise.
    alpha : float, default 0.9
        Probability level handed to ``scipy.stats.norm.ppf``.

    Returns
    -------
    ``mean - norm.ppf(alpha) * std_dev``
    """
    # Standard-normal quantile that corresponds to the requested level.
    quantile = norm.ppf(alpha)

    return mean - quantile * std_dev


In [25]:
import torch
import pandas as pd
from pathlib import Path
import numpy as np
import yaml

In [26]:
import seaborn as sns

# Seaborn colormap built from "purple"; not referenced by the other cells shown here.
cm = sns.light_palette("purple", as_cmap=True)

# Experiment identifier: passed to get_dfs and used as the first component of
# the output location when figures are saved.
exp = "GAS-001"
# Label used only as a path component of the saved figures.
# agg_type = "agg"
agg_type = "raw"

def get_dfs(exp, n_initial=12, base_dir="./final"):
    """Load one experiment and derive per-``id`` summary statistics.

    Reads ``<base_dir>/<exp>/XRD+synthsis_data.xlsx``, loads the result
    dictionary found under ``<base_dir>/<exp>/unroll_new/0/`` via
    ``load_from_path`` and joins its ``"eval_samples"`` entry onto the
    spreadsheet rows through an ``id`` column taken from the
    ``eval_samples`` index.

    Parameters
    ----------
    exp : str
        Experiment folder name below ``base_dir``.
    n_initial : int, default 12
        Rows whose ``id`` is strictly below this value are flagged in the
        ``initial_sampling`` column of both returned frames.
    base_dir : str, default "./final"
        Root directory that contains the experiment folders.

    Returns
    -------
    df : pandas.DataFrame
        Merged row-level data plus an ``initial_sampling`` column.
    df_mean_std : pandas.DataFrame
        One row per ``id``. For every numeric column whose per-``id``
        standard deviation has a positive mean there are ``<col>_mean``,
        ``<col>_std`` and ``<col>_VaR`` columns; the remaining numeric
        columns keep their original name and hold the per-``id`` mean.
        Also contains ``id_idx``, ``initial_sampling`` and a constant
        ``zero`` column.
    """
    df = pd.read_excel(f"{base_dir}/{exp}/XRD+synthsis_data.xlsx")
    path = Path(f"{base_dir}/{exp}/unroll_new/0/")

    # load the data
    res_dict = load_from_path(path)
    eval_samples = res_dict["eval_samples"]
    # assign() works on a copy, so the object held by res_dict is left untouched.
    # NOTE(review): assumes eval_samples is a DataFrame — confirm against load_from_path.
    eval_samples = eval_samples.assign(id=eval_samples.index)

    # Drop every spreadsheet column that contains at least one missing value.
    df = df.dropna(how="any", axis=1)

    # merge eval_samples onto df on id
    df = pd.merge(df, eval_samples, on="id").reset_index(drop=True)

    # Only float64/int64 columns are aggregated; "id" has to be one of them,
    # otherwise the groupby key is filtered out here.
    numeric_by_id = df.select_dtypes(include=["float64", "int64"]).groupby("id")
    df_mean = numeric_by_id.mean()
    df_std = numeric_by_id.std()

    # only std non-zero columns
    df_std = df_std.loc[:, df_std.mean() > 0]

    # Every df_std column also exists in df_mean, so the right-hand copies
    # receive the "_std" suffix while the means keep the bare name for now.
    df_mean_std = pd.merge(
        df_mean, df_std, left_index=True, right_index=True, suffixes=("", "_std")
    )

    for col in df_std.columns:
        df_mean_std[col + "_VaR"] = calculate_var_pos(
            mean=df_mean[col],
            std_dev=df_std[col],
        )
        # Renamed one column at a time so the result matches a sequential
        # rename even if one column name is a prefix variant of another.
        df_mean_std = df_mean_std.rename(columns={col: col + "_mean"})

    # Keep the id (currently the index) as a regular column before it is dropped.
    df_mean_std["id_idx"] = df_mean_std.index

    df_mean_std["initial_sampling"] = df_mean_std["id_idx"] < n_initial
    df["initial_sampling"] = df["id"] < n_initial
    df_mean_std["zero"] = 0

    # reset index
    df_mean_std = df_mean_std.reset_index(drop=True)

    return df, df_mean_std


In [27]:
# Load the selected experiment: `df` is the merged row-level frame,
# `df_mean_std` the per-id summary frame used by the plotting cells.
df, df_mean_std = get_dfs(exp=exp)

In [28]:

def is_float_column(column, df):
    """Return True when ``df[column]`` has dtype float64 or float32."""
    column_dtype = df[column].dtype
    accepted = (np.float64, np.float32)
    return any(column_dtype == float_type for float_type in accepted)


def get_color(column, df):
    """Build one colour entry per row of ``df`` from the values in ``column``.

    Float columns (as decided by ``is_float_column``) are scaled between
    their minimum and maximum and mapped through the viridis colormap.
    Every other column is treated as categorical: the sorted unique values
    are assigned consecutive tab10 entries.
    """
    values = df[column]

    if is_float_column(column, df):
        # Continuous case: rescale to the observed value range, then colour-map.
        scaler = plt.Normalize(values.min(), values.max())
        return plt.cm.viridis(scaler(values))

    # Discrete case: one palette slot per distinct value, in sorted order.
    palette = plt.cm.tab10
    lookup = {}
    for position, value in enumerate(np.unique(values)):
        lookup[value] = palette(position)

    return np.array([lookup[value] for value in values])


def _tex_safe_label(text):
    """Return ``text`` as a string, escaping underscores when usetex is active."""
    text = str(text)
    if plt.rcParams["text.usetex"]:
        # A bare "_" is a subscript operator for LaTeX and fails outside math mode.
        return text.replace("_", r"\_")
    return text


def scatter_plot(
    df,
    x,
    y,
    cc,
    mc,
    filter_data="",
    filter_column="C_OH_C_Zn",
    filter_threshold=1.6,
    legend_title="Initial Sampling",
):
    """Scatter ``df[x]`` against ``df[y]`` with colour and marker encodings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``x``, ``y``, ``cc`` and ``mc`` (and
        ``filter_column`` when filtering is requested).
    x, y : str
        Columns plotted on the horizontal / vertical axis.
    cc : str
        Column that drives the point colour (see ``get_color``). A colourbar
        is added when ``is_float_column`` reports it as a float column.
    mc : str
        Column that drives the marker shape; distinct values alternate
        between ``"o"`` and ``"d"`` and are listed in the legend.
    filter_data : str, default ""
        When equal to ``"filter"``, only rows with
        ``df[filter_column] > filter_threshold`` are plotted.
    filter_column : str, default "C_OH_C_Zn"
        Column used by the optional row filter.
    filter_threshold : float, default 1.6
        Exclusive lower bound used by the optional row filter.
    legend_title : str, default "Initial Sampling"
        Title shown above the marker legend.

    Returns
    -------
    (fig, ax) : the created matplotlib figure and axes.
    """
    if filter_data == "filter":
        df = df[df[filter_column] > filter_threshold]

    # Colours are computed after filtering so they line up with the plotted rows.
    colors = get_color(cc, df)

    markers_all = ["o", "d"]
    markers_dict = {
        label: markers_all[idx % len(markers_all)]
        for idx, label in enumerate(df[mc].unique())
    }
    markers = [markers_dict[label] for label in df[mc]]

    fig, ax = plt.subplots(1, 1, figsize=(10, 7))

    # One plot call per point because marker and colour vary from row to row.
    for x_value, y_value, color, marker in zip(df[x], df[y], colors, markers):
        ax.plot(
            x_value,
            y_value,
            marker=marker,
            color=color,
            markersize=10,
            markeredgewidth=0.5,
            markeredgecolor="black",
            alpha=1.0,
        )

    # Empty proxy artists: they only provide the legend entries for the markers.
    for label, marker in markers_dict.items():
        ax.plot([], [], marker=marker, color="k", label=label, linestyle="None")

    ax.legend(fontsize=12, title_fontsize=12, title=legend_title)

    # custom colorbar
    if is_float_column(cc, df):
        # Add a colorbar if 'cc' is a float column
        sm = plt.cm.ScalarMappable(
            cmap=plt.cm.viridis,
            norm=plt.Normalize(vmin=df[cc].min(), vmax=df[cc].max()),
        )
        sm.set_array([])
        fig.colorbar(sm, ax=ax, label=_tex_safe_label(cc))

    ax.set_xlabel(_tex_safe_label(x))
    ax.set_ylabel(_tex_safe_label(y))
    ax.grid(True)

    return fig, ax


In [31]:
# %config InlineBackend.figure_format='svg'
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.lines import Line2D

# Global matplotlib styling; applies to every figure created after this runs.
plt.rcParams.update(
    {
        # Render all figure text through LaTeX.
        # NOTE(review): presumably requires a working LaTeX installation, and
        # labels containing "_" may need escaping — confirm figures render.
        "text.usetex": True,
        "font.family": "serif",
        # NOTE(review): Helvetica is registered as the serif face here — confirm intended.
        "font.serif": ["Helvetica"],
        # font size
        "font.size": 16,
    }
)

# Figures to generate. Each entry is [x, y, cc, mc, filter_data]:
#   x, y        -> columns on the horizontal / vertical axis
#   cc          -> column that controls the point colour
#   mc          -> column that controls the marker shape
#   filter_data -> "" for all rows, "filter" to apply scatter_plot's row filter
# Entries are unique; a repeated entry would only render and overwrite the
# same output file a second time.
plots_list = [
    ["C_OH_C_Zn", "C_Zn", "Peak Ratio_VaR", "initial_sampling", ""],
    ["C_OH_C_Zn", "C_Zn", "Peak Ratio_VaR", "initial_sampling", "filter"],
    ["Peak Ratio_mean", "Aspect Ratio_mean", "C_Zn", "initial_sampling", ""],
    ["Peak Ratio_mean", "Aspect Ratio_mean", "C_Zn", "initial_sampling", "filter"],
    ["Peak Ratio_VaR", "Aspect Ratio_VaR", "C_Zn", "initial_sampling", ""],
    ["Peak Ratio_VaR", "Aspect Ratio_VaR", "C_Zn", "initial_sampling", "filter"],
    ["C_OH_C_Zn", "C_Zn", "Aspect Ratio_VaR", "initial_sampling", ""],
    ["C_OH_C_Zn", "C_Zn", "Aspect Ratio_VaR", "initial_sampling", "filter"],
    ["Aspect Ratio_mean", "Aspect Ratio_std", "C_OH_C_Zn", "initial_sampling", "filter"],
    ["Peak Ratio_mean", "Peak Ratio_std", "C_OH_C_Zn", "initial_sampling", "filter"],
    ["Aspect Ratio_mean", "Aspect Ratio_std", "C_OH", "initial_sampling", "filter"],
    ["Aspect Ratio_mean", "Aspect Ratio_std", "C_Zn", "initial_sampling", "filter"],
    ["Peak Ratio_mean", "Peak Ratio_std", "C_Zn", "initial_sampling", "filter"],
    ["Aspect Ratio_mean", "Aspect Ratio_std", "initial_sampling", "initial_sampling", "filter"],
    ["Peak Ratio_std", "Aspect Ratio_std", "initial_sampling", "initial_sampling", "filter"],
    ["Peak Ratio_mean", "Aspect Ratio_mean", "N_ZnO", "initial_sampling", "filter"],
    ["Peak Ratio_std", "Aspect Ratio_std", "N_ZnO", "initial_sampling", "filter"],
    ["Peak Ratio_VaR", "Aspect Ratio_VaR", "N_ZnO", "initial_sampling", "filter"],
    ["Aspect Ratio_mean", "Aspect Ratio_std", "Q_AC", "initial_sampling", "filter"],
    ["Peak Ratio_mean", "Peak Ratio_std", "Q_AC", "initial_sampling", "filter"],
    ["Peak Ratio_mean", "Peak Ratio_std", "Q_AIR", "initial_sampling", "filter"],
]


# Render and save one scatter figure per configuration.
for config in plots_list:
    x, y, cc, mc, filter_data = config

    # Skip configurations that reference columns missing from this experiment.
    # Only the first four entries are column names; filter_data is a mode flag
    # ("" / "filter") and must not take part in the check, otherwise the test
    # can never pass and every configuration is skipped.
    if any(column not in df_mean_std.columns for column in (x, y, cc, mc)):
        continue

    fig, ax = scatter_plot(df_mean_std, x, y, cc, mc, filter_data)
    save_pdf(
        fig,
        Path("."),
        f"{exp}/{agg_type}/{cc}/{mc}",
        # "/" is replaced because the joined column names form the file name.
        f"{x}+{y}+{cc}+{mc}+{filter_data}".replace("/", "_"),
    )
