In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as  sns
import numpy as np

In [None]:
root_folder = "tunings"


# Collect every tuning-run directory. Layout walked here: <root_folder>/<dataset>/<run>/
all_paths = []
for dataset_folder in os.listdir(root_folder):
    dataset_path = os.path.join(root_folder, dataset_folder)
    if not os.path.isdir(dataset_path):
        continue
    for tune_folder in os.listdir(dataset_path):
        tune_path = os.path.join(dataset_path, tune_folder)
        # Skip stray files inside a dataset folder: opening "<file>/cfg.json" can raise
        # NotADirectoryError, which the FileNotFoundError handlers below do not catch.
        if not os.path.isdir(tune_path):
            continue
        all_paths.append(tune_path)

# Load the config, best parameters and scores of every run.
all_cfgs = []
all_best_params = []
all_scores = []
for tune_path in all_paths:
    try:
        # cfg.json, best_params.csv and scores.csv are required; a run missing any of them
        # is reported by the outer handler and skipped entirely.
        with open(os.path.join(tune_path, "cfg.json"), "r") as f:
            cfg = json.load(f)
        best_params = pd.read_csv(os.path.join(tune_path, "best_params.csv"), index_col=0)
        scores = pd.read_csv(os.path.join(tune_path, "scores.csv"), index_col=0)

        # metrics_mean_clipped.csv is optional: when present, its first data column is merged
        # into cfg, keyed by the row labels of the file.
        try:
            clipped_means = pd.read_csv(os.path.join(tune_path, "metrics_mean_clipped.csv"), index_col=0)
            clipped_means = clipped_means.T.iloc[0].to_dict()
            cfg.update(clipped_means)
        except FileNotFoundError:
            print(f"Could not load all metrics in {tune_path}")

        # Overwrite config entries that were tuned with the value from the first row of
        # best_params.csv; parameters that are not already in cfg are ignored.
        for param in best_params:
            if param in cfg:
                cfg[param] = best_params[param].values[0]

        # Add every score column (first row of scores.csv) to the run's record.
        for key in scores:
            cfg[key] = scores[key].values[0]
        # store
        all_cfgs.append(cfg)
        all_best_params.append(best_params)
        all_scores.append(scores)
    except FileNotFoundError:
        print(f"Could not load all files in {tune_path}")

In [None]:
# Stack the per-run best-parameter frames into one frame.
# NOTE(review): all_best_params_df is not used again in the visible notebook.
all_best_params_df = pd.concat(all_best_params)#.drop(columns=["fill_type"])

# One row per run, built from the merged config dicts; keep only runs with opt_steps == 5.
df = pd.DataFrame(all_cfgs)
df = df.loc[df["opt_steps"] == 5]

# Earlier approach, kept for reference (merge cfg and best params, avoiding reindexing error):
#all_df = pd.concat([all_cfgs_df.reset_index(drop=True), all_best_params_df.reset_index(drop=True)], axis=1)

# Drop all columns that hold at most one distinct value, then order by tuning score.
unique_counts = df.nunique()
constant_columns = unique_counts[unique_counts <= 1].index
df = df.drop(columns=constant_columns).sort_values("cross_val_score_tune")

# set reduction factor NaNs to 16
#all_df.loc[all_df["reduction_factor"] == "NaN", "reduction_factor"] = 16

In [None]:
# Best model size per (db_name, model_type): average the numeric columns over all runs of
# each model_size and keep the size with the lowest mean cross_val_score_tune.
# numeric_only=True is needed on pandas >= 2.0, where mean() raises TypeError on string
# columns (db_name, model_type, ...) instead of silently dropping them.
best_model = df.groupby(["db_name", "model_type"]).apply(
    lambda x: x.groupby("model_size").mean(numeric_only=True).sort_values("cross_val_score_tune").iloc[0]
)

In [None]:
# test_score of the selected model size for each (db_name, model_type) group.
best_model["test_score"]

In [None]:
# cross_val_score_tune of the selected model size for each (db_name, model_type) group.
best_model["cross_val_score_tune"]

In [None]:
# Full df, sorted ascending by cross_val_score_tune.
df.sort_values("cross_val_score_tune")

In [None]:
# NOTE(review): df_db is first assigned inside the model-size plotting loop further down,
# so on a fresh top-to-bottom run this cell raises NameError — confirm intended position.
df_db

In [None]:
# Column plotted on the y-axis.
metric = "cross_val_score_tune"
# alternatives:
#metric = "bs"
#metric = "test_score"
#metric = "r2"
#metric = "rmse"

# Restrict the comparison to transformer and rnn runs.
dl_df = df[df["model_type"].isin(["transformer", "rnn"])]
#dl_df = df

# Fixed left-to-right order of the model sizes on the x-axis.
size_order = ['xt', 'tiny', "small", "base", "large", "xl"]

# eval model size: one point plot per database, one line per model type
for db in dl_df["db_name"].unique():
    print(db)

    df_db = dl_df[dl_df["db_name"] == db]
    #df_db = df

    sns.catplot(
        data=df_db,
        x="model_size",
        y=metric,
        order=size_order,
        hue="model_type",
        linestyles=["-", "--"],
        kind="point",
    )
    plt.title(f"{db}")
    plt.show()

    ## test results - only used for debugging, not model selection!
    #sns.catplot(data=df_db, x="model_size", y="test_score",
    #                            order=['tiny', "small", "base", "large"],#, "xl"]
    #            hue="model_type",
    #            linestyles=["-", "--"],
    #            kind="point"
    #           )
    #plt.title(f"{db} test")
    #plt.show()

In [None]:
# Rows of df with db_name "UKE", sorted ascending by cross_val_score_tune.
df[df["db_name"] == "UKE"].sort_values("cross_val_score_tune")

In [None]:
# All rows of df whose model_type is not "xgb".
df[df["model_type"] != "xgb"]

In [None]:
# rnn runs only, with every column that holds at most one distinct value removed.
lstms = df[df["model_type"] == "rnn"]
lstm_unique_counts = lstms.nunique()
lstms = lstms.drop(columns=lstm_unique_counts[lstm_unique_counts <= 1].index)
lstms

In [None]:
# Joint plot of the "cross_val_score_tune" column against the "test_score" column.
x = "cross_val_score_tune"
y = "test_score"

import seaborn as sns
#p = sns.regplot(x=x, y=y, data=all_df)

# NOTE(review): all_df is not assigned anywhere in the visible notebook (only a commented-out
# construction exists), so this cell presumably relies on leftover kernel state — confirm.
sns.jointplot(x=x, y=y, data=all_df)


In [None]:
# NOTE(review): this cell reassigns all_df and drops columns, so running it a second time
# raises KeyError on the already-removed columns.
all_df = all_df.drop(columns=["tree_method", "gpu"])

# For each flag column: replace the literal string "NaN" with 0, discard the rows holding the
# excluded value, then drop the column.
#   freeze_nan_embed: rows equal to 1 are discarded
#   norm_nan_embed:   rows equal to 0 are discarded
for nan_embed_col, excluded_value in (("freeze_nan_embed", 1), ("norm_nan_embed", 0)):
    all_df.loc[all_df[nan_embed_col] == "NaN", nan_embed_col] = 0
    all_df = all_df[all_df[nan_embed_col] != excluded_value].drop(columns=[nan_embed_col])

In [None]:
# Keep only the runs with max_epochs == 30 and opt_steps == 200.
final_df = all_df[(all_df["max_epochs"] == 30) & (all_df["opt_steps"] == 200)]

In [None]:
# Write final_df to a CSV inside the tunings folder. The loading cell only descends into
# directories below the root, so this file is not mistaken for a dataset folder.
final_df.to_csv("tunings/final_df.csv")

In [None]:
# Column names available in final_df.
final_df.columns

In [None]:
# Columns kept for the score comparison: identifiers, mean/std scores and the
# hyper-parameters inspected in the cells below.
score_columns = [
    "model_type",
    "db_name",
    "val_score_mean",
    "test_score_mean",
    "nan_embed_size",
    "pretrained",
    "hidden_size",
    "fill_type",
    "flat_block_size",
    "val_score_std",
    "test_score_std",
    "gpt_name",
    "max_len",
    "block_size",
]
score_comp = final_df[score_columns]

In [None]:
# Display the score-comparison subset of final_df.
score_comp

In [None]:
# Keys of the first loaded run record (cfg.json entries plus the merged metrics and scores).
all_cfgs[0].keys()

In [None]:
# Column names available in final_df (same expression as an earlier cell).
final_df.columns

In [None]:
# mlp runs without filling that have a nan_embed_size and norm_nan_embed == 1.
# NOTE(review): the earlier all_df filtering cell drops norm_nan_embed / freeze_nan_embed, so
# this cell appears to need the unfiltered frame — confirm the intended run order.
nan_embeds = all_df.dropna(subset=["nan_embed_size"])
is_selected = (
    (nan_embeds["model_type"] == "mlp")
    & (nan_embeds["fill_type"] == "none")
    & (nan_embeds["norm_nan_embed"] == 1)
)
nan_embeds = nan_embeds[is_selected]

reduced_columns = [
    "db_name",
    "fill_type",
    "nan_embed_size",
    "norm_nan_embed",
    "freeze_nan_embed",
    "val_score_mean",
]
reduced = nan_embeds[reduced_columns]
# Highest validation score first; returns a sorted copy for display.
reduced.sort_values(by=["val_score_mean"], ascending=False)

In [None]:
# Column names available in all_df.
all_df.columns

In [None]:
db = "UKE"
fill_type = "ffill"# "none", "median"
param = "param_count"   # hidden_size, num_transformer_blocks, param_count
#"num_transformer_blocks"

# Column whose values are averaged over before plotting.
# NOTE(review): `groupby` is not assigned anywhere else in this notebook, so the cell raised
# NameError on a fresh kernel. Grouping by the x-axis column is an assumption — confirm.
groupby = param


val_name = "cross_val_score_tune"
test_name = "test_score"

# Runs of the chosen database and fill type.
plot_df = all_df[all_df["db_name"] == db].sort_values("model_type").drop(columns=["db_name"])

first_models = plot_df[plot_df["fill_type"] == fill_type]

import matplotlib.pyplot as plt

ax = plt.figure(figsize=(8,5)).gca()



# rnn: mean validation / test score per `groupby` value.
# "param_count" is a size proxy computed here as hidden_size * number of layers.
rnn_df = first_models[first_models["model_type"] == "rnn"].sort_values("hidden_size").copy()
rnn_df["param_count"] = rnn_df["hidden_size"] * rnn_df["rnn_layers"]
# numeric_only=True: string columns (model_type, fill_type, ...) cannot be averaged and make
# mean() raise TypeError on pandas >= 2.0.
rnn_df = rnn_df.groupby(groupby).mean(numeric_only=True).reset_index()
ax1 = rnn_df.plot(x=param, y=val_name, kind="line", label="rnn val", style="-X", ax=ax)
ax = rnn_df.plot(x=param, y=test_name, kind="line", label="rnn test", style="-X", ax=ax)

# transformer: same aggregation, with hidden_size * number of transformer blocks as size proxy.
transformer_df = first_models[first_models["model_type"] == "transformer"].sort_values("hidden_size").copy()
transformer_df["param_count"] = transformer_df["hidden_size"] * transformer_df["num_transformer_blocks"]
transformer_df = transformer_df.groupby(groupby).mean(numeric_only=True).reset_index()
ax1 = transformer_df.plot(x=param, y=val_name, kind="line", label="trans val", style="-X", ax=ax)
ax = transformer_df.plot(x=param, y=test_name, kind="line", label="trans test", style="-X", ax=ax)


plt.grid(True)
plt.ylabel("MSE")
plt.xlabel(param)
plt.title(f"Score changes in dependence of hidden size for {db}")
# x-axis in log2 scale
#plt.xscale('log', base=2)
# put legend at top
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2),
            fancybox=True, shadow=True, ncol=5)
#rnn_df[["hidden_size", "val_score_mean", "val_score_std", "test_score_mean", "test_score_std"]]


In [None]:
# rnn_df from the plotting cell, sorted ascending by cross_val_score_tune.
rnn_df.sort_values("cross_val_score_tune")

In [None]:
# Rows of all_df with db_name "UKE", sorted ascending by cross_val_score_tune.
all_df[all_df["db_name"] == "UKE"].sort_values("cross_val_score_tune")

In [None]:
# The "UKE" row with the lowest cross_val_score_tune, shown as a column -> value dict.
all_df[all_df["db_name"] == "UKE"].sort_values("cross_val_score_tune").iloc[0].to_dict()

In [None]:
db = "UKE"
fill_type = "median"  # "none", "median"

# Score-comparison rows of the chosen database and fill type.
plot_df = score_comp[score_comp["db_name"] == db].sort_values("model_type").drop(columns=["db_name"])

first_models = plot_df[plot_df["fill_type"] == fill_type]

import matplotlib.pyplot as plt

ax = plt.figure(figsize=(8,5)).gca()
# Legend labels carry the model name so the rnn and mlp curves can be told apart
# (same convention as the "rnn val" / "trans val" labels of the rnn-vs-transformer plot).
# rnn: validation / test score over hidden size
rnn_df = first_models[first_models["model_type"] == "rnn"].sort_values("hidden_size").copy()
ax1 = rnn_df.plot(x="hidden_size", y="val_score_mean", kind="line", label="rnn val", style="-X", ax=ax)
ax = rnn_df.plot(x="hidden_size", y="test_score_mean", kind="line", label="rnn test", style="-X", ax=ax)

# mlp: validation / test score over hidden size
mlp_df = first_models[first_models["model_type"] == "mlp"].sort_values("hidden_size").copy()
mlp_df.plot(x="hidden_size", y="val_score_mean", kind="line", label="mlp val", style="-o", ax=ax)
mlp_df.plot(x="hidden_size", y="test_score_mean", kind="line", label="mlp test", style="-o", ax=ax)

plt.grid(True)
plt.ylabel("R2 score")
plt.xlabel("Hidden Size")
plt.title(f"Score changes in dependence of hidden size for {db}")
# x-axis in log2 scale
#plt.xscale('log', base=2)
# put legend at top
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2),
            fancybox=True, shadow=True, ncol=5)
#rnn_df[["hidden_size", "val_score_mean", "val_score_std", "test_score_mean", "test_score_std"]]


In [None]:
def clean_df(df):
    """Return a copy of ``df`` without uninformative or repeated columns.

    Two steps, in this order:
      1. drop every column label whose column holds at most one distinct value
         (as counted by ``DataFrame.nunique``),
      2. where a column name still appears more than once, keep only its first occurrence.

    The input frame is not modified.
    """
    distinct_counts = df.nunique()
    constant_columns = distinct_counts[distinct_counts <= 1].index
    varying = df.drop(columns=constant_columns)

    first_occurrence = ~varying.columns.duplicated()
    return varying.loc[:, first_occurrence]


In [None]:
# investigate block size performance
rnn_models = all_df[all_df["model_type"] == "rnn"].sort_values("block_size")
rnn_models = rnn_models[rnn_models["db_name"] == "MIMIC"]
rnn_models = rnn_models[rnn_models["fill_type"] == "none"]

rnn_models = clean_df(rnn_models)
best = rnn_models.groupby("block_size").apply(lambda x: x.nlargest(1, "test_score_mean")).reset_index(drop=True)
print(best)
print()

mlp_models = all_df[all_df["model_type"] == "mlp"].sort_values("block_size")
mlp_models = mlp_models[mlp_models["db_name"] == "UKE"]
#rnn_models = rnn_models[rnn_models["fill_type"] == "none"]
# sort out rows where, if the fill type is none and the nan_embed_size different from 512
mlp_models = mlp_models[(mlp_models["fill_type"] == "median") | (mlp_models["nan_embed_size"] == 512)]
mlp_models = clean_df(mlp_models)
best = mlp_models.groupby("block_size").apply(lambda x: x.nlargest(1, "test_score_mean")).reset_index(drop=True)
print(best)
mlp_models



In [None]:
# all_df sorted ascending by block_size.
all_df.sort_values("block_size")

In [None]:
# plot_df (from the most recently run plotting cell) sorted ascending by block_size.
plot_df.sort_values("block_size")

In [None]:
# plot gpt sizes
gpt_models = all_df[all_df["model_type"] == "gpt"]
gpt_models = gpt_models[gpt_models["pretrained"] == 1]
gpt_models = gpt_models[gpt_models["db_name"] == "UKE"]
gpt_models = gpt_models[gpt_models["fill_type"] == "median"]
# drop columsn that only have one unique value
gpt_models = gpt_models.drop(columns=gpt_models.nunique()[gpt_models.nunique()<=1].index)
# for each column name that appears twice, only keep one
gpt_models = gpt_models.loc[:,~gpt_models.columns.duplicated()]



gpt_sizes = gpt_models[gpt_models["reduction_factor"] == 16]
gpt_sizes = gpt_sizes[gpt_sizes["max_len"] == 512].sort_values("reduction_factor")

ax = gpt_sizes.plot(x="gpt_name", y="test_score_mean", kind="line", label="test", style="-X")
ax = gpt_sizes.plot(x="gpt_name", y="val_score_mean", kind="line", label="val", style="-X", ax=ax)

# set correct x-ticks with gpt names
plt.xticks(range(len(gpt_sizes)), gpt_sizes["gpt_name"])
plt.grid(True)
#plt.yticks(np.arange(0.52, 0.55, 0.01))

gpt_sizes


In [None]:
# gpt2 runs at max_len 512, ordered by reduction factor.
gpt_reduction_facs = gpt_models[gpt_models["gpt_name"] == "gpt2"]
gpt_reduction_facs = gpt_reduction_facs[gpt_reduction_facs["max_len"] == 512].sort_values("reduction_factor")

# Each legend label names the column it plots: test_score_mean -> "test", val_score_mean -> "val".
ax = gpt_reduction_facs.plot(x="reduction_factor", y="test_score_mean", kind="line", label="test", style="-X")
gpt_reduction_facs.plot(x="reduction_factor", y="val_score_mean", kind="line", label="val", style="-X", ax=ax)

# enabled grid 
plt.grid(True)
# make nicer
plt.ylabel("R2 score")
plt.xlabel("Reduction Factor")
# `db` is the notebook-level database name set in an earlier cell.
plt.title(f"Score changes in dependence of reduction factor for {db}")
# only show relevant y ticks
import numpy as np
#plt.yticks(np.arange(0.4, 0.55, 0.01))

# gpt_reduction_facs


In [None]:
# Display the gpt2 / max_len 512 rows used for the reduction-factor plot.
gpt_reduction_facs

In [None]:
# plot for best model per model type
best_models = first_models.copy().sort_values("model_type")
#best_models = best_models[best_models[(best_models["hidden_size"] == 2048) & (best_models["hidden_size"] == 2048)]]
#best_models = best_models[best_models["gpt_name"] == "gpt2"]

best_models = best_models[best_models["block_size"] == 128]

#best_models = best_models[(best_models["pretrained"] == 0) | (best_models["model_type"] != "gpt")]
#best_models = best_models.groupby("model_type").mean().reset_index().sort_values("test_score_mean", ascending=False)
# capitalize model_type
best_models["model_type"] = best_models["model_type"].str.upper()
ax1 = best_models.plot(x="model_type", y="val_score_mean", kind="line", label="val", style="-X")
ax2 = best_models.plot(x="model_type", y="test_score_mean", kind="line", label="test", ax=ax1, style="-X")
plt.ylabel("R2 score")
plt.xlabel("Model Type")
plt.title(f"Score changes in dependence of model type for {db}")
# enabled grid
plt.grid(True)


#sns.barplot(x="model_type", y="val_score_mean", data=best_models)
# plot with std as error bar
#sns.barplot(x="model_type", y="val_score_mean", data=best_models, yerr="val_score_std")
best_models


In [None]:
# Rows of plot_df with hidden_size 2048, ordered by model type; used below to compare fill types.
fill_type_df = plot_df[plot_df["hidden_size"] == 2048].sort_values("model_type")

fill_type_df
#best_models = plot_df.groupby("model_type").mean().reset_index().sort_values("test_score_mean", ascending=False)



In [None]:
# For every (model_type, fill_type) combination in fill_type_df, pick the row with the highest
# val_score_mean, then show the picks ordered by descending test_score_mean.
# NOTE: the loop variables overwrite the notebook-level names `fill_type` and `best` that
# earlier cells assign.
rows = []
for model_type in fill_type_df["model_type"].unique():
    model_type_df = fill_type_df[fill_type_df["model_type"] == model_type]
    for fill_type in model_type_df["fill_type"].unique():
        sub_df = model_type_df[model_type_df["fill_type"] == fill_type]
        # nlargest(1, ...) returns a one-row frame (empty when sub_df is empty)
        best = sub_df.nlargest(1, "val_score_mean")
        rows.append(best)

pd.concat(rows).sort_values("test_score_mean", ascending=False)

#fill_type_df.groupby("model_type").apply(lambda model_df: model_df.groupby("fill_type").apply(lambda x: x.nlargest(1, "val_score_mean")))

In [None]:
# Mean of the numeric columns per model type, best mean test score first.
# numeric_only=True: fill_type_df still carries string columns (fill_type, gpt_name), on which
# mean() raises TypeError with pandas >= 2.0.
fill_type_df.groupby("model_type").mean(numeric_only=True).reset_index().sort_values("test_score_mean", ascending=False)