In [6]:
import wandb
import pandas as pd
import numpy as np

def get_best_epoch_metrics(run, metric_name):
    """Return the history row of ``run`` where ``metric_name`` is highest.

    Args:
        run: Object exposing ``history(samples=...)`` and a ``url``
            attribute (in this file, a run returned by the W&B API).
            ``history`` is assumed to return a pandas DataFrame -- TODO
            confirm against the W&B version in use.
        metric_name: Name of the history column to maximise.

    Returns:
        A tuple ``(best_metrics, best_epoch)`` where ``best_metrics`` is the
        complete history row at the maximum and ``best_epoch`` is that row's
        index label.
        NOTE(review): the index label is whatever index the history
        DataFrame carries; it may be a row/step counter rather than a
        training epoch -- verify before reporting it as an epoch.

    Raises:
        KeyError: If ``metric_name`` is not a column of the history, or if
            the column holds no non-NaN value.
    """
    # NOTE(review): ``samples`` caps how many history rows are requested;
    # presumably longer histories are subsampled, so the true maximum could
    # be missed -- confirm against the W&B documentation.
    history = run.history(samples=10000)

    # Fail with a message that names the run instead of a bare pandas
    # KeyError when the metric was never logged (or the history is empty).
    if metric_name not in history.columns:
        raise KeyError(
            f"Metric {metric_name!r} not found in history of run {run.url}"
        )

    # Rows where the metric is NaN cannot be the best row. Dropping them up
    # front also avoids calling idxmax() on an all-NaN column, whose result
    # differs between pandas versions.
    metric_values = history[metric_name].dropna()
    if metric_values.empty:
        raise KeyError(
            f"Metric {metric_name!r} has no non-NaN values in history of run {run.url}"
        )

    # Find the row with the best metric value
    best_epoch = metric_values.idxmax()
    best_metrics = history.loc[best_epoch]

    print(f"Run {run.url} - Best epoch: {best_epoch} - {metric_name}: {best_metrics[metric_name]}")

    return best_metrics, best_epoch

def gather_metrics_for_experiment_group(
    experiment_group,
    *,
    project="crop-classification/messis",
    metrics=None,
    selection_metric='val_f1_tier3_majority',
    ddof=0,
):
    """Summarise best-row validation metrics per setup for an experiment group.

    Fetches every run of ``project`` whose ``config.experiment_group`` equals
    ``experiment_group``, groups the runs by ``run.config['name']``, picks
    each run's best history row via ``get_best_epoch_metrics`` (maximising
    ``selection_metric``) and aggregates the requested metrics over the runs
    of each setup.

    Args:
        experiment_group: Value matched against ``config.experiment_group``.
        project: W&B project path passed to ``wandb.Api().runs``.
        metrics: Iterable of history column names to aggregate. ``None``
            selects the six tier-1/2/3 majority F1 and weighted-accuracy
            validation metrics.
        selection_metric: Metric whose maximum selects each run's best row.
        ddof: Delta degrees of freedom forwarded to ``np.std``. The default
            ``0`` is the population standard deviation; pass ``1`` for the
            sample standard deviation (not meaningful for a single run).

    Returns:
        A ``pandas.DataFrame`` with one column per metric and two rows per
        setup, labelled ``'<setup> Mean'`` and ``'<setup> Std Dev'``.

    Raises:
        KeyError: If a run's config has no ``'name'`` entry or a run's best
            row lacks one of the requested metrics (assuming dict-like
            config and pandas rows -- TODO confirm).
    """
    if metrics is None:
        metrics = [
            'val_f1_tier1_majority',
            'val_f1_tier2_majority',
            'val_f1_tier3_majority',
            'val_weighted_accuracy_tier1_majority',
            'val_weighted_accuracy_tier2_majority',
            'val_weighted_accuracy_tier3_majority',
        ]
    else:
        # Materialise once: the metric list is iterated several times below.
        metrics = list(metrics)

    # Initialize a W&B API instance
    api = wandb.Api()

    # Fetch runs matching the experiment group
    runs = api.runs(project, filters={"config.experiment_group": experiment_group})

    print(f"Found {len(runs)} runs for experiment group '{experiment_group}'")

    # Organize runs by setup name (insertion order fixes the row order of
    # the resulting table).
    setups = {}
    for run in runs:
        setups.setdefault(run.config['name'], []).append(run)

    # Print setup names with number of runs
    for setup_name, setup_runs in setups.items():
        print(f"Setup '{setup_name}' has {len(setup_runs)} runs")

    # Column-oriented table: metric -> {row label -> aggregated value}.
    data = {metric: {} for metric in metrics}

    for setup_name, setup_runs in setups.items():
        # Only the best row is needed here; the row's index label is unused.
        best_rows = [
            get_best_epoch_metrics(run, selection_metric)[0]
            for run in setup_runs
        ]

        for metric in metrics:
            # NOTE(review): if the best row carries NaN for a metric (e.g.
            # metrics logged at different steps), the mean/std become NaN.
            metric_values = [row[metric] for row in best_rows]
            data[metric][f'{setup_name} Mean'] = np.mean(metric_values)
            data[metric][f'{setup_name} Std Dev'] = np.std(metric_values, ddof=ddof)

    return pd.DataFrame(data)

# Run the function for a given experiment group
# Note: this call queries the W&B API (see wandb.Api() in
# gather_metrics_for_experiment_group) and prints one line per run.
experiment_group = "exp-9-dropout2d"
metrics_df = gather_metrics_for_experiment_group(experiment_group)

# Bare expression as the final statement: presumably relies on the notebook
# displaying the value of the last expression in a cell; in a plain script
# this line has no visible effect.
metrics_df

Found 16 runs for experiment group 'exp-9-dropout2d'
Setup 'p-0.5' has 1 runs
Setup 'p-0.2' has 5 runs
Setup 'refhead-zuericrop' has 5 runs
Setup 'p-0.1' has 5 runs
Best epoch: 4858 - val_f1_tier3_majority: 0.1980932652950287
Best epoch: 2446 - val_f1_tier3_majority: 0.18668609857559204
Best epoch: 2697 - val_f1_tier3_majority: 0.19291472434997559
Best epoch: 3250 - val_f1_tier3_majority: 0.19807904958724976
Best epoch: 3300 - val_f1_tier3_majority: 0.20723950862884521
Best epoch: 3300 - val_f1_tier3_majority: 0.2096947729587555
Best epoch: 2999 - val_f1_tier3_majority: 0.19841836392879486
Best epoch: 3350 - val_f1_tier3_majority: 0.1991298943758011
Best epoch: 3099 - val_f1_tier3_majority: 0.20475436747074127
Best epoch: 2496 - val_f1_tier3_majority: 0.21412593126296997
Best epoch: 2496 - val_f1_tier3_majority: 0.20715390145778656
Best epoch: 3551 - val_f1_tier3_majority: 0.19764737784862518
Best epoch: 3401 - val_f1_tier3_majority: 0.2019387036561966
Best epoch: 3953 - val_f1_tier3_m

Unnamed: 0,val_f1_tier1_majority,val_f1_tier2_majority,val_f1_tier3_majority,val_weighted_accuracy_tier1_majority,val_weighted_accuracy_tier2_majority,val_weighted_accuracy_tier3_majority
p-0.5 Mean,0.361608,0.277691,0.198093,0.931084,0.821537,0.787611
p-0.5 Std Dev,0.0,0.0,0.0,0.0,0.0,0.0
p-0.2 Mean,0.484903,0.314039,0.198923,0.932252,0.824115,0.780164
p-0.2 Std Dev,0.038136,0.010112,0.008623,0.002456,0.003953,0.005442
refhead-zuericrop Mean,0.49284,0.317704,0.204716,0.931945,0.824158,0.778112
refhead-zuericrop Std Dev,0.058787,0.016909,0.005751,0.001627,0.003603,0.00886
p-0.1 Mean,0.487323,0.31484,0.205222,0.93254,0.82581,0.779976
p-0.1 Std Dev,0.050225,0.011993,0.00522,0.003776,0.005709,0.010209
