In [1]:
import os, sys
import pandas as pd

# Column layout of one result row: the leading fields are parsed from the run
# folder name (split on '_'); the trailing metric fields are read from the
# run's metrics TSV file.
col_names = ['task', 'eval_type', 'data_setting', 'learning_rate', 'batch_size', 'epoch', 'seed', 'f1-micro', 'f1-macro', 'accuracy']
metric_cols = ['f1-micro', 'f1-macro', 'accuracy']
runs = []

RESULTS_PATH = 'Students/'
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

# Number of fields the folder name itself must provide.
n_name_fields = len(col_names) - len(metric_cols)

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')

        # A folder name with a different number of fields would produce a
        # ragged row that pandas either rejects or pads with None, shifting
        # the metrics into the wrong columns -- skip such folders explicitly.
        if len(cond_parameters) != n_name_fields:
            raise ValueError(
                f"expected {n_name_fields} '_'-separated fields in folder name, got {len(cond_parameters)}"
            )

        # Runs whose task field is 'acd' store their metrics in
        # 'metrics_asp.tsv'; every other task uses 'metrics_asp_pol.tsv'.
        metrics_file = 'metrics_asp.tsv' if cond_parameters[0] == 'acd' else 'metrics_asp_pol.tsv'
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, metrics_file), sep = '\t')

        # The first column holds the row labels ('Micro-AVG', 'Macro-AVG', ...).
        df = df.set_index(df.columns[0])

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except Exception as e:
        # Best effort: an unreadable or incomplete run must not abort the
        # collection, but it is reported instead of being silently dropped.
        print(f"Skipping {folder_name}: {type(e).__name__}: {e}")

results_all = pd.DataFrame(runs, columns = col_names)

# Make sure every metric column is numeric so that later averaging works;
# values that cannot be parsed become NaN.
for metric in metric_cols:
    results_all[metric] = pd.to_numeric(results_all[metric], errors="coerce")

# NOTE(review): 'dataset' is not one of the columns in col_names, and
# group_cols_for_merge is not used by any cell visible here -- confirm whether
# these two definitions are still needed.
config_cols = ["dataset", "data_setting", "eval_type", "learning_rate", "epoch"]

# 2. Averaging of 'orig-o' + 'orig-d' for each seed + further configs
# -> e.g. ["dataset", "task", "eval_type", "seed", "learning_rate", "epoch"]
group_cols_for_merge = [col for col in config_cols if col != 'data_setting'] + ['task', 'batch_size']
if 'seed' in results_all.columns:
    group_cols_for_merge.append('seed')
print(RESULTS_PATH)
results_all

Students/


Unnamed: 0,task,eval_type,data_setting,learning_rate,batch_size,epoch,seed,f1-micro,f1-macro,accuracy
0,acsaRedo,test,orig-o,2e-05,16,40,10,0.776,0.7484,0.634
1,acsaRedo,test,orig-o,2e-05,16,40,15,0.7662,0.7311,0.6211
2,acsaRedo,test,orig-o,2e-05,16,40,20,0.7694,0.741,0.6252
3,acsaRedo,test,orig-o,2e-05,16,40,25,0.7875,0.7552,0.6495
4,acsaRedo,test,orig-o,2e-05,16,40,5,0.7913,0.7617,0.6547


In [3]:
import numpy as np

# Selection criteria for the summary table.
task = 'acsaRedo'
data_setting = 'orig-o'
eval_type = 'test'

# Number of runs a configuration must have to be reported:
# 1 for 'dev', 5 for everything else.
eval_count = 1 if eval_type == 'dev' else 5

config_cols = ["data_setting", "eval_type", "learning_rate", "epoch"]

# Keep only the rows matching the selected task, data setting and eval type.
row_mask = (
    (results_all['task'] == task)
    & (results_all['data_setting'] == data_setting)
    & (results_all['eval_type'] == eval_type)
)
df_filtered = results_all[row_mask]

# Drop every configuration group that does not have exactly eval_count rows.
df_grouped = df_filtered.groupby(config_cols).filter(lambda group: len(group) == eval_count)

# Average the metrics over the rows of each remaining configuration.
df_best_per_lang = (
    df_grouped
    .groupby(config_cols)[["f1-micro", "f1-macro", "accuracy"]]
    .mean()
    .reset_index()
)

# Report both F1 scores as percentages rounded to two decimals
# (accuracy is left on its original scale).
for score_col in ('f1-micro', 'f1-macro'):
    df_best_per_lang[score_col] = df_best_per_lang[score_col].apply(lambda x: round(x*100,2))

print(RESULTS_PATH)
print("Method: BERT-CLF")
df_best_per_lang

Students/
Method: BERT-CLF


Unnamed: 0,data_setting,eval_type,learning_rate,epoch,f1-micro,f1-macro,accuracy
0,orig-o,test,2e-05,40,77.81,74.75,0.6369


In [8]:
import os, sys, json
import pandas as pd

# Column layout of one row: the leading fields are parsed from the run folder
# name (split on '_'); the trailing fields are read from the run's config.json.
col_names = ['task', 'eval_type', 'data_setting','learning-rate', 'batch_size', 'epochs', 'seed', 'train_runtime', 'gpu_util']
config_keys = ['train_runtime', 'gpu_util']
runs = []

RESULTS_PATH = 'LLM/'
folder_names = [
    folder for folder in os.listdir(RESULTS_PATH) 
    if os.path.isdir(os.path.join(RESULTS_PATH, folder)) 
    and folder != '.ipynb_checkpoints'
]

# Number of fields the folder name itself must provide.
n_name_fields = len(col_names) - len(config_keys)

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')

        # A folder name with a different number of fields would produce a
        # ragged row that pandas either rejects or pads with None, shifting
        # train_runtime / gpu_util into the wrong columns -- skip it instead.
        if len(cond_parameters) != n_name_fields:
            raise ValueError(
                f"expected {n_name_fields} '_'-separated fields in folder name, got {len(cond_parameters)}"
            )

        # Load config.json
        config_path = os.path.join(RESULTS_PATH, folder_name, 'config.json')
        with open(config_path, "r") as f:
            config = json.load(f)

        # Append each configured value, or None when the key is absent
        for key in config_keys:
            cond_parameters.append(config.get(key, None))

        runs.append(cond_parameters)

    except Exception as e:
        # Best effort: report the run that could not be read and carry on.
        print(f"Skipping {folder_name}: {e}")

# Build DataFrame
results_all = pd.DataFrame(runs, columns=col_names)

print(results_all.head())


   task eval_type data_setting learning-rate batch_size epochs seed  \
0  acsa      test       orig-o         2e-05         16     40   10   
1  acsa      test       orig-o         2e-05         16     40   15   
2  acsa      test       orig-o         2e-05         16     40   20   
3  acsa      test       orig-o         2e-05         16     40   25   
4  acsa      test       orig-o         2e-05         16     40    5   

   train_runtime  gpu_util  
0     382.255409     2.654  
1     381.811307     2.654  
2     381.440980     2.654  
3     380.979937     2.654  
4     378.051189     2.654  


In [9]:
import numpy as np

# Selection criteria for the summary table.
task = 'acsa'
data_setting = 'orig-o'
eval_type = 'test'

# Number of runs a configuration must have to be reported:
# 1 for 'dev', 5 for everything else.
eval_count = 1 if eval_type == 'dev' else 5

# Group by configuration columns only; 'train_runtime' and 'gpu_util' are the
# values being averaged and therefore must not be part of the grouping key.
config_cols = ["data_setting", "eval_type", "learning-rate", "epochs"]

# Keep only the rows matching the selected task, data setting and eval type.
row_mask = (
    (results_all['task'] == task)
    & (results_all['data_setting'] == data_setting)
    & (results_all['eval_type'] == eval_type)
)
df_filtered = results_all[row_mask]

# Drop every configuration group that does not have exactly eval_count rows.
df_grouped = df_filtered.groupby(config_cols).filter(lambda group: len(group) == eval_count)

# Mean runtime and GPU utilisation per remaining configuration.
df_best_per_lang = df_grouped.groupby(config_cols)[["train_runtime", "gpu_util"]].mean().reset_index()


def _seconds_to_hhmm(seconds):
    """Format a duration given in seconds as a zero-padded 'hh:mm' string (seconds are truncated)."""
    hours, remainder = divmod(seconds, 3600)
    return f"{int(hours):02d}:{int(remainder // 60):02d}"


# Show the averaged runtime as hh:mm and the GPU utilisation with 2 decimals.
df_best_per_lang['train_runtime'] = df_best_per_lang['train_runtime'].apply(_seconds_to_hhmm)
df_best_per_lang['gpu_util'] = df_best_per_lang['gpu_util'].round(2)

print(RESULTS_PATH)
print("Method: BERT-CLF")
print(df_best_per_lang)


LLM/
Method: BERT-CLF
  data_setting eval_type learning-rate epochs train_runtime  gpu_util
0       orig-o      test         2e-05     40         00:06      2.65
