In [None]:
import os
import pandas as pd

In [None]:
def read_output_run_many(fp):
    """Load a comma-separated metrics file (zero-shot output) into a DataFrame.

    Each line is stripped of surrounding whitespace and split on commas.
    Lines that do not contain at least one comma (blank lines, free-text
    log lines) are ignored. Values are kept as strings; no numeric
    conversion is performed.

    Args:
        fp: Path of the UTF-8 text file to read.

    Returns:
        pandas.DataFrame with the columns
        ``dataset, mr, mrr, hits@1, hits@3, hits@10``.
    """
    columns = ["dataset", "mr", "mrr", "hits@1", "hits@3", "hits@10"]

    with open(fp, 'r', encoding='utf-8') as handle:
        candidates = [raw.strip().split(',') for raw in handle]

    # Keep only the lines that actually split into several fields.
    records = [fields for fields in candidates if len(fields) > 1]
    return pd.DataFrame(records, columns=columns)

def add_info_dataset(row):
    """Decode the flags embedded in ``row["dataset"]`` into separate fields.

    For each of ``prop``, ``subevent``, ``role`` and ``causation`` the field
    is set to 1 when the capitalised name followed by ``1`` (e.g. ``Prop1``)
    occurs in the dataset name, and to 0 otherwise. ``syntax`` receives the
    text that follows the first ``Syntax`` marker in the dataset name.

    Args:
        row: Mapping-like object (dict or DataFrame row) with a string
            ``"dataset"`` entry. It is modified in place.

    Returns:
        The same ``row`` object, with the extra fields set.

    Raises:
        IndexError: If the dataset name contains no ``Syntax`` marker.
    """
    dataset_name = row["dataset"]

    for flag in ("prop", "subevent", "role", "causation"):
        marker = f"{flag.capitalize()}1"
        row[flag] = int(marker in dataset_name)

    row["syntax"] = dataset_name.split("Syntax")[1]
    return row

In [None]:
import re
import os
import subprocess
from tqdm import tqdm

def get_info_folder_name(folder):
    """Extract training parameters encoded in a run folder name.

    The name is parsed with plain string splitting: for each ``<key>_``
    token, the value is the text between that token and the next ``_``
    (or the end of the string). ``ckpt_``, ``epochs_`` and ``bs_`` are
    mandatory; ``bpe_`` is optional.

    Args:
        folder: Folder name, e.g. ``"ckpt_best_epochs_10_bpe_500_bs_32"``.

    Returns:
        dict with the keys
        ``checkpoint`` (str), ``epochs`` (int), ``batch_per_epoch`` (int,
        0 when the ``bpe_`` token is missing or not an integer) and
        ``batch_size`` (int).

    Raises:
        IndexError: If ``ckpt_``, ``epochs_`` or ``bs_`` is missing.
        ValueError: If the epochs or batch-size value is not an integer.
    """
    try:
        bpe = int(folder.split("bpe_")[1].split("_")[0])
    except (IndexError, ValueError):
        # Folder names without a usable ``bpe_`` token fall back to 0.
        bpe = 0

    return {
        'checkpoint': folder.split("ckpt_")[1].split("_")[0],
        'epochs': int(folder.split("epochs_")[1].split("_")[0]),
        # Use the guarded integer parsed above instead of re-parsing the
        # raw string, so the fallback applies and the value is numeric
        # like the other counts.
        'batch_per_epoch': bpe,
        'batch_size': int(folder.split("bs_")[1].split("_")[0])
    }

def read_all(folder):
    """Collect the ``results.csv`` of every run under ``folder``.

    The expected layout is ``<folder>/<mode>/<setting>/results.csv``. When a
    setting directory has no ``results.csv`` yet, ``get_model_results.py`` is
    run on it first. The parameters parsed from the setting folder name and
    the mode name are added as constant columns to each run's frame.

    Args:
        folder: Root directory containing one sub-directory per mode.

    Returns:
        pandas.DataFrame: concatenation of all per-run result frames.

    Raises:
        subprocess.CalledProcessError: If ``get_model_results.py`` exits with
            a non-zero status.
        ValueError: If no run was found (``pd.concat`` on an empty list).
    """
    data = []
    for m in os.listdir(folder):
        mode_dir = os.path.join(folder, m)
        # Ignore stray files (e.g. .DS_Store) sitting next to the mode folders.
        if not os.path.isdir(mode_dir):
            continue
        settings = [
            s for s in os.listdir(mode_dir)
            if os.path.isdir(os.path.join(mode_dir, s))
        ]
        for s in tqdm(settings):
            params = get_info_folder_name(s)
            pf = os.path.join(mode_dir, s)
            results_path = os.path.join(pf, "results.csv")
            if not os.path.exists(results_path):
                # Argument list instead of a shell string: paths containing
                # spaces or shell metacharacters are passed through intact.
                # check=True surfaces a failing script here rather than as a
                # missing-file error on the read below.
                subprocess.run(["python", "get_model_results.py", pf], check=True)
            df = pd.read_csv(results_path, index_col=0)
            for k, v in params.items():
                df[k] = v
            df["mode"] = m
            data.append(df)
    return pd.concat(data)


In [None]:
# Gather every run found under the inductive experiments folder into one
# frame, then preview its first rows.
df = read_all(folder="experiments/inductive")
df.head()

In [None]:
import plotly.express as px

# A parallel-coordinates plot draws continuous axes, so each categorical
# column gets an integer-coded twin (codes follow order of first appearance).
# The code -> label mappings are kept and printed so the axis ticks can be
# decoded by the reader.
mappings = {}
for col in ["syntax", "mode"]:
    categories = df[col].unique()
    mapping = {cat: i for i, cat in enumerate(categories)}
    df[f"{col}_numeric"] = df[col].map(mapping)
    mappings[col] = mapping

for k, v in mappings.items():
    print(f"{k}: {v}")

# Axes of the plot, in display order. This list is what is actually passed to
# plotly; the earlier call to `prepare_dimensions` referenced a helper that is
# not defined in this notebook and whose result was never used.
dimensions = [
    "mode_numeric",
    "prop", "subevent", "role", "causation",
    "syntax_numeric",
    "epochs", "batch_per_epoch", "batch_size",
]
fig = px.parallel_coordinates(df, color="test_mrr", dimensions=dimensions)
fig.show()