In [1]:
import os, sys
import pandas as pd

# Column layout of the aggregated table: the leading names are filled from the
# '_'-separated tokens of each run-folder name, the trailing three from the
# metrics TSV found inside that folder.
col_names = ['task', 'dataset', 'eval_type', 'data_setting', 'learning-rate', 'batch_size', 'epochs', 'seed', 'f1-micro', 'f1-macro', 'accuracy']
metric_cols = ['f1-micro', 'f1-macro', 'accuracy']
n_name_fields = len(col_names) - len(metric_cols)
runs = []

RESULTS_PATH = 'Crowd/' # Experts, LLM
folder_names = [
    folder for folder in os.listdir(RESULTS_PATH)
    if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints'
]

# The metrics file to read depends on the task, i.e. the first folder-name token;
# any task other than 'acd' / 'tasd' falls back to 'metrics_asp_pol.tsv'.
metrics_file_by_task = {'acd': 'metrics_asp.tsv', 'tasd': 'metrics_phrases.tsv'}

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    # A folder whose name does not split into the expected number of fields
    # would either break the DataFrame construction below or shift values into
    # the wrong columns, so skip it explicitly and say so.
    if len(cond_parameters) != n_name_fields:
        print(f"Skipping {folder_name}: expected {n_name_fields} '_'-separated fields, got {len(cond_parameters)}")
        continue

    metrics_file = metrics_file_by_task.get(cond_parameters[0], 'metrics_asp_pol.tsv')
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, metrics_file), sep = '\t')
        # The first column holds the row labels ('Micro-AVG', 'Macro-AVG', ...).
        df = df.set_index(df.columns[0])
        metrics = [
            df.loc['Micro-AVG', 'f1'],
            df.loc['Macro-AVG', 'f1'],
            df.loc['Micro-AVG', 'accuracy'],
        ]
    except Exception as e:
        # Best effort: a missing or malformed metrics file drops only this run,
        # but the reason is reported instead of being swallowed silently.
        print(f"Skipping {folder_name}: {e}")
        continue

    runs.append(cond_parameters + metrics)

results_all = pd.DataFrame(runs, columns = col_names)
# Coerce every metric column (not only f1-micro) so later means are numeric.
for metric in metric_cols:
    results_all[metric] = pd.to_numeric(results_all[metric], errors="coerce")


# Names must match the columns of results_all ('learning-rate', 'epochs').
config_cols = ["dataset", "data_setting", "eval_type", "learning-rate", "epochs"]

# 2. Averaging of 'orig-o' + 'orig-d' for each seed + further configs
# -> e.g. ["dataset", "eval_type", "learning-rate", "epochs", "task", "batch_size", "seed"]
# NOTE(review): group_cols_for_merge is only built here, not used in this cell.
group_cols_for_merge = [col for col in config_cols if col != 'data_setting'] + ['task', 'batch_size']
if 'seed' in results_all.columns:
    group_cols_for_merge.append('seed')
print(RESULTS_PATH)
results_all

Crowd/


Unnamed: 0,task,dataset,eval_type,data_setting,learning-rate,batch_size,epochs,seed,f1-micro,f1-macro,accuracy
0,tasd,crowd-tasd,test,orig-o,0.0001,16,15,10,0.5068,0.5016,0.3394
1,tasd,crowd-tasd,test,orig-o,0.0001,16,15,15,0.5187,0.5068,0.3502
2,tasd,crowd-tasd,test,orig-o,0.0001,16,15,20,0.5246,0.5206,0.3555
3,tasd,crowd-tasd,test,orig-o,0.0001,16,15,25,0.4985,0.4914,0.332
4,tasd,crowd-tasd,test,orig-o,0.0001,16,15,5,0.5157,0.5084,0.3474


In [8]:
import numpy as np

# Selection criteria for the runs summarised in this cell.
task = 'tasd'
data_setting = 'orig-o'
eval_type = 'test'

# Number of rows a configuration group must contain to be kept:
# 1 for 'dev', 5 for every other eval_type.
eval_count = 1 if eval_type == 'dev' else 5

# Columns that together identify one configuration.
config_cols = ["data_setting", 'learning-rate', 'batch_size', 'epochs', "eval_type"]

# Keep only the rows matching the selected task / data setting / eval type.
df_filtered = results_all[
    (results_all['task'] == task)
    & (results_all['data_setting'] == data_setting)
    & (results_all['eval_type'] == eval_type)
]

# Drop every configuration group that does not have exactly eval_count rows.
df_grouped = df_filtered.groupby(config_cols).filter(lambda group: len(group) == eval_count)

# Average the metrics over the remaining groups only.
df_best_per_lang = (
    df_grouped
    .groupby(config_cols)[["f1-micro", "f1-macro", "accuracy"]]
    .mean()
    .reset_index()
)

# Express both F1 scores as percentages with two decimals (accuracy is left as is).
df_best_per_lang['f1-micro'] = df_best_per_lang['f1-micro'].map(lambda score: round(score * 100, 2))
df_best_per_lang['f1-macro'] = df_best_per_lang['f1-macro'].map(lambda score: round(score * 100, 2))
print(RESULTS_PATH, task)
print("Method: MvP")
df_best_per_lang

Experts/ tasd
Method: MvP


Unnamed: 0,data_setting,learning-rate,batch_size,epochs,eval_type,f1-micro,f1-macro,accuracy
0,orig-o,0.0001,16,15,test,64.01,59.42,0.47074


In [1]:
import os, sys, json
import pandas as pd

# Only keep relevant columns: the leading names are filled from the
# '_'-separated tokens of each run-folder name, the last two from config.json.
col_names = ['task', 'dataset', 'eval_type', 'data_setting', 'learning-rate', 'batch_size', 'epochs', 'seed', 'train_runtime', 'gpu_util']
n_name_fields = len(col_names) - 2
runs = []

RESULTS_PATH = 'LLM/'
folder_names = [
    folder for folder in os.listdir(RESULTS_PATH) 
    if os.path.isdir(os.path.join(RESULTS_PATH, folder)) 
    and folder != '.ipynb_checkpoints'
]

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    # A folder name with the wrong number of fields would either break the
    # DataFrame construction below or shift values into the wrong columns.
    if len(cond_parameters) != n_name_fields:
        print(f"Skipping {folder_name}: expected {n_name_fields} '_'-separated fields, got {len(cond_parameters)}")
        continue

    # Load config.json
    config_path = os.path.join(RESULTS_PATH, folder_name, 'config.json')
    try:
        # Explicit encoding so the read does not depend on the platform locale.
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        # Append train_runtime and gpu_util; a missing key is stored as None
        cond_parameters.append(config.get("train_runtime"))
        cond_parameters.append(config.get("gpu_util"))
    except Exception as e:
        # Best effort: report the problem and drop only this run.
        print(f"Skipping {folder_name}: {e}")
        continue

    runs.append(cond_parameters)

# Build DataFrame
results_all = pd.DataFrame(runs, columns=col_names)

print(results_all.head())


   task   dataset eval_type data_setting learning-rate batch_size epochs seed  \
0  tasd  llm-tasd      test       orig-o        0.0001         16     15   10   
1  tasd  llm-tasd      test       orig-o        0.0001         16     15   15   
2  tasd  llm-tasd      test       orig-o        0.0001         16     15   20   
3  tasd  llm-tasd      test       orig-o        0.0001         16     15   25   
4  tasd  llm-tasd      test       orig-o        0.0001         16     15    5   

   train_runtime  gpu_util  
0    3523.328309    11.781  
1    3523.546426    11.781  
2    3524.520263    11.781  
3    3527.028939    11.781  
4    3584.843706    11.906  


In [2]:
import numpy as np

def _format_hhmm(seconds):
    """Format a duration given in seconds as 'hh:mm'; return 'n/a' for NaN.

    The loader stores None when config.json has no 'train_runtime', so a group
    mean can be NaN, and int(NaN) would raise ValueError.
    """
    if np.isnan(seconds):
        return "n/a"
    return f"{int(seconds//3600):02d}:{int((seconds%3600)//60):02d}"


# Selection criteria for the runs summarised in this cell.
task = 'tasd'
data_setting = 'orig-o'
eval_type = 'test'

# Number of rows a configuration group must contain to be kept:
# 1 for 'dev', 5 for every other eval_type.
if eval_type == 'dev':
    eval_count = 1
else:
    eval_count = 5

# 'train_runtime' and 'gpu_util' are the aggregated values, so they must not
# be part of the grouping columns.
# NOTE(review): 'batch_size' is not a grouping column here; runs that differ
# only in batch size fall into the same group -- confirm this is intended.
config_cols = ["data_setting", "eval_type", "learning-rate", "epochs"]

# Filter relevant rows
df_filtered = results_all[np.logical_and.reduce([
    results_all['task'] == task, 
    results_all['data_setting'] == data_setting,
    results_all['eval_type'] == eval_type
])]

# Keep only groups with exactly eval_count entries
df_grouped = df_filtered.groupby(config_cols).filter(lambda x: len(x) == eval_count)

# Aggregate runtime + gpu util
df_best_per_lang = (
    df_grouped.groupby(config_cols)[["train_runtime", "gpu_util"]]
    .mean()
    .reset_index()
)

# Convert train_runtime (seconds) -> hh:mm format
df_best_per_lang['train_runtime'] = df_best_per_lang['train_runtime'].apply(_format_hhmm)

# Round gpu_util to 2 decimals
df_best_per_lang['gpu_util'] = df_best_per_lang['gpu_util'].round(2)

print(RESULTS_PATH)
print("Method: MvP")
print(df_best_per_lang)


LLM/
Method: MvP
  data_setting eval_type learning-rate epochs train_runtime  gpu_util
0       orig-o      test        0.0001     15         00:58     11.81
