In [1]:
import os

import numpy as np
import pandas as pd
import wandb

def get_best_epoch_metrics(run, metric_name):
    """Return the history row at which ``metric_name`` is highest for one run.

    Args:
        run: Run-like object exposing ``history(samples=...)`` (expected to
            return a pandas DataFrame) and a ``url`` attribute.
        metric_name: Name of the history column to maximise.

    Returns:
        Tuple ``(best_metrics, best_epoch)``. ``best_metrics`` is the history
        row holding the maximum of ``metric_name``; ``best_epoch`` is that
        row's index label in the history frame (whether this label is a step
        or an epoch depends on how the run was logged -- TODO confirm).

    Raises:
        KeyError: If the history has no ``metric_name`` column.
        ValueError: If every value of ``metric_name`` is NaN.
    """
    history = run.history(samples=10000)

    if metric_name not in history.columns:
        raise KeyError(f"Metric '{metric_name}' not found in the history of run {run.url}")

    # Drop NaN entries up front so idxmax always yields a real index label
    # instead of NaN (or a pandas-version-dependent error) for an unlogged metric.
    metric_values = history[metric_name].dropna()
    if metric_values.empty:
        raise ValueError(f"Metric '{metric_name}' has no logged values in run {run.url}")

    # Find the epoch with the best metric value
    best_epoch = metric_values.idxmax()
    best_metrics = history.loc[best_epoch]

    print(f"Run {run.url} - Best epoch: {best_epoch} - {metric_name}: {best_metrics[metric_name]}")

    return best_metrics, best_epoch

def gather_metrics_for_experiment_group(
    experiment_group,
    project="crop-classification/messis",
    selection_metric='val_f1_tier3_majority',
    metrics=None,
):
    """Summarise the best-checkpoint metrics of every setup in an experiment group.

    Runs are fetched from W&B, grouped by ``run.config['name']`` (the setup
    name), and for each run the history row maximising ``selection_metric`` is
    looked up via ``get_best_epoch_metrics``. The values of ``metrics`` at that
    row are then aggregated per setup.

    Args:
        experiment_group: Value matched against ``config.experiment_group``.
        project: W&B project path passed to ``Api.runs``.
        selection_metric: Metric whose maximum selects the row of each run.
        metrics: Iterable of metric names to aggregate. ``None`` selects the
            six ``val_f1_*`` / ``val_weighted_accuracy_*`` majority metrics.

    Returns:
        pandas DataFrame with one row per setup (in order of first appearance)
        and the columns ``setup``, ``num_runs``, ``run_handles``,
        ``best_steps`` plus ``<metric>_mean`` / ``<metric>_std`` per metric.
        The std is ``np.std`` with its default ``ddof=0`` (population std).
    """
    if metrics is None:
        metrics = [
            'val_f1_tier1_majority',
            'val_f1_tier2_majority',
            'val_f1_tier3_majority',
            'val_weighted_accuracy_tier1_majority',
            'val_weighted_accuracy_tier2_majority',
            'val_weighted_accuracy_tier3_majority',
        ]
    else:
        # Materialise once so generators can be iterated for every setup.
        metrics = list(metrics)

    api = wandb.Api()

    # Fetch runs matching the experiment group
    runs = api.runs(project, filters={"config.experiment_group": experiment_group})
    print(f"Found {len(runs)} runs for experiment group '{experiment_group}'")

    # Organize the runs by setup name (dict insertion order fixes the row order)
    runs_by_setup = {}
    for run in runs:
        runs_by_setup.setdefault(run.config['name'], []).append(run)

    # Print setup names with number of runs
    for setup_name, setup_runs in runs_by_setup.items():
        print(f"Setup '{setup_name}' - {len(setup_runs)} runs")

    # Gather metrics for each setup and build one summary record per setup
    data = []
    for setup_name, setup_runs in runs_by_setup.items():
        best_steps = []
        values_by_metric = {metric: [] for metric in metrics}
        for run in setup_runs:
            best_metrics, best_epoch = get_best_epoch_metrics(run, selection_metric)
            best_steps.append(best_epoch)
            for metric in values_by_metric:
                values_by_metric[metric].append(best_metrics[metric])

        metrics_summary = {
            'setup': setup_name,
            'num_runs': len(setup_runs),
            'run_handles': [run.id for run in setup_runs],
            'best_steps': best_steps,
        }
        for metric, values in values_by_metric.items():
            metrics_summary[f'{metric}_mean'] = np.mean(values)
            metrics_summary[f'{metric}_std'] = np.std(values)
        data.append(metrics_summary)

    return pd.DataFrame(data)

# Run the function for a given experiment group
experiment_group = "exp-9-dropout2d"
metrics_df = gather_metrics_for_experiment_group(experiment_group)

# Save the DataFrame to a CSV file. The output directory is created first so
# that the export also works on a fresh checkout where it does not exist yet.
os.makedirs("experiment_results", exist_ok=True)
metrics_df.to_csv(f"experiment_results/eval_{experiment_group}.csv", index=False)

metrics_df

Found 16 runs for experiment group 'exp-9-dropout2d'
Setup 'p-0.5' - 1 runs
Setup 'p-0.2' - 5 runs
Setup 'refhead-zuericrop' - 5 runs
Setup 'p-0.1' - 5 runs
Run https://wandb.ai/crop-classification/messis/runs/cm0hlw6k - Best epoch: 4858 - val_f1_tier3_majority: 0.1980932652950287
Run https://wandb.ai/crop-classification/messis/runs/rouv9lch - Best epoch: 2446 - val_f1_tier3_majority: 0.18668609857559204
Run https://wandb.ai/crop-classification/messis/runs/ym9requx - Best epoch: 2697 - val_f1_tier3_majority: 0.19291472434997559
Run https://wandb.ai/crop-classification/messis/runs/6hsyjfm2 - Best epoch: 3250 - val_f1_tier3_majority: 0.19807904958724976
Run https://wandb.ai/crop-classification/messis/runs/qhqbesf6 - Best epoch: 3300 - val_f1_tier3_majority: 0.20723950862884521
Run https://wandb.ai/crop-classification/messis/runs/d3zidx38 - Best epoch: 3300 - val_f1_tier3_majority: 0.2096947729587555
Run https://wandb.ai/crop-classification/messis/runs/6vdvcpsa - Best epoch: 2999 - val_f1

Unnamed: 0,setup,num_runs,run_handles,best_steps,val_f1_tier1_majority_mean,val_f1_tier1_majority_std,val_f1_tier2_majority_mean,val_f1_tier2_majority_std,val_f1_tier3_majority_mean,val_f1_tier3_majority_std,val_weighted_accuracy_tier1_majority_mean,val_weighted_accuracy_tier1_majority_std,val_weighted_accuracy_tier2_majority_mean,val_weighted_accuracy_tier2_majority_std,val_weighted_accuracy_tier3_majority_mean,val_weighted_accuracy_tier3_majority_std
0,p-0.5,1,[cm0hlw6k],[4858],0.361608,0.0,0.277691,0.0,0.198093,0.0,0.931084,0.0,0.821537,0.0,0.787611,0.0
1,p-0.2,5,"[rouv9lch, ym9requx, 6hsyjfm2, qhqbesf6, d3zid...","[2446, 2697, 3250, 3300, 3300]",0.484903,0.038136,0.314039,0.010112,0.198923,0.008623,0.932252,0.002456,0.824115,0.003953,0.780164,0.005442
2,refhead-zuericrop,5,"[6vdvcpsa, voz39aqw, tzbb1fpf, yfrzmegx, mhanp...","[2999, 3350, 3099, 2496, 2496]",0.49284,0.058787,0.317704,0.016909,0.204716,0.005751,0.931945,0.001627,0.824158,0.003603,0.778112,0.00886
3,p-0.1,5,"[lrt47pea, sft8i5l2, hflukrac, il64btbr, 9nyrx...","[3551, 3401, 3953, 2496, 2345]",0.487323,0.050225,0.31484,0.011993,0.205222,0.00522,0.93254,0.003776,0.82581,0.005709,0.779976,0.010209


In [35]:
# Order the summary rows by setup name (ascending); the original index labels are kept
metrics_df = metrics_df.sort_values('setup')

In [38]:
# Display the per-setup summary table
metrics_df

Unnamed: 0,setup,num_runs,run_handles,best_steps,val_f1_tier1_majority_mean,val_f1_tier1_majority_std,val_f1_tier2_majority_mean,val_f1_tier2_majority_std,val_f1_tier3_majority_mean,val_f1_tier3_majority_std,val_weighted_accuracy_tier1_majority_mean,val_weighted_accuracy_tier1_majority_std,val_weighted_accuracy_tier2_majority_mean,val_weighted_accuracy_tier2_majority_std,val_weighted_accuracy_tier3_majority_mean,val_weighted_accuracy_tier3_majority_std
3,p-0.1,5,"[lrt47pea, sft8i5l2, hflukrac, il64btbr, 9nyrx...","[3551, 3401, 3953, 2496, 2345]",0.487323,0.050225,0.31484,0.011993,0.205222,0.00522,0.93254,0.003776,0.82581,0.005709,0.779976,0.010209
1,p-0.2,5,"[rouv9lch, ym9requx, 6hsyjfm2, qhqbesf6, d3zid...","[2446, 2697, 3250, 3300, 3300]",0.484903,0.038136,0.314039,0.010112,0.198923,0.008623,0.932252,0.002456,0.824115,0.003953,0.780164,0.005442
0,p-0.5,1,[cm0hlw6k],[4858],0.361608,0.0,0.277691,0.0,0.198093,0.0,0.931084,0.0,0.821537,0.0,0.787611,0.0
2,refhead-zuericrop,5,"[6vdvcpsa, voz39aqw, tzbb1fpf, yfrzmegx, mhanp...","[2999, 3350, 3099, 2496, 2496]",0.49284,0.058787,0.317704,0.016909,0.204716,0.005751,0.931945,0.001627,0.824158,0.003603,0.778112,0.00886


In [36]:
# Show the setup names together with the frame's index labels
metrics_df['setup']

3                p-0.1
1                p-0.2
0                p-0.5
2    refhead-zuericrop
Name: setup, dtype: object

In [37]:
# Pair each setup name with its position in the frame's current row order.
# Reads metrics_df: `df` is only assigned in a later cell, so referencing it
# here raises NameError on a top-to-bottom run.
list(enumerate(metrics_df['setup']))

[(0, 'p-0.1'), (1, 'p-0.2'), (2, 'p-0.5'), (3, 'refhead-zuericrop')]

In [34]:
# Show the setup column. A list comprehension over the same column used to
# precede this line, but its result was discarded (only the last expression of
# a cell is displayed), so it has been removed.
metrics_df['setup']

3                p-0.1
1                p-0.2
0                p-0.5
2    refhead-zuericrop
Name: setup, dtype: object

In [41]:
import pandas as pd
import texttable as tt
import latextable

# Load the saved summary from disk, ordered by setup name. Only
# `experiment_group` is needed from earlier cells; the in-memory metrics_df is
# deliberately not used, so the table reflects exactly what was exported.
df = pd.read_csv(f"experiment_results/eval_{experiment_group}.csv").sort_values(by='setup')

# Mapping from logged metric names to the short row labels used in the table.
# The key order defines the row order of the generated table.
metric_name_mapping = {
    'val_f1_tier1_majority': 'F1 Tier 1',
    'val_f1_tier2_majority': 'F1 Tier 2',
    'val_f1_tier3_majority': 'F1 Tier 3',
    'val_weighted_accuracy_tier1_majority': 'Accuracy Tier 1',
    'val_weighted_accuracy_tier2_majority': 'Accuracy Tier 2',
    'val_weighted_accuracy_tier3_majority': 'Accuracy Tier 3'
}

# Function to create a LaTeX table with bold best scores
def create_latex_table(
    df,
    caption="Metrics Comparison (majority and val details)",
    label="tab:metrics_comparison",
):
    """Render the per-setup metric summary as a LaTeX table, best mean in bold.

    The table has one row per key of the module-level ``metric_name_mapping``
    and one column per row of ``df``. Each cell reads ``mean% ± std%`` with the
    values multiplied by 100 and shown with one decimal. A cell is set in bold
    when its (unrounded) mean equals the column-wise maximum for that metric.

    Args:
        df: DataFrame with the columns ``setup`` and ``num_runs`` plus
            ``<metric>_mean`` / ``<metric>_std`` for every mapped metric.
        caption: Caption passed to ``latextable.draw_latex``.
        label: Label passed to ``latextable.draw_latex``.

    Returns:
        The LaTeX source as a string, with the content of the ``center``
        environment wrapped in ``\\resizebox{\\textwidth}{!}{...}``.
    """
    metrics = list(metric_name_mapping)

    # Determine the best score for each metric
    best_scores = {metric: df[f'{metric}_mean'].max() for metric in metrics}

    # Initialize the Texttable object
    table = tt.Texttable()

    # One header per setup. Columns are read directly instead of via iterrows(),
    # which does not preserve dtypes and could render the run count as a float.
    headers = ['Metric'] + [
        f"{setup} ({num_runs}-fold)"
        for setup, num_runs in zip(df['setup'], df['num_runs'])
    ]
    table.header(headers)

    # Set alignment for columns
    table.set_cols_align(["c"] * len(headers))

    # Populate the table with data: one row per metric, one cell per setup
    for metric in metrics:
        row_data = [metric_name_mapping[metric]]
        for raw_mean, raw_std in zip(df[f'{metric}_mean'], df[f'{metric}_std']):
            mean = raw_mean * 100
            std = raw_std * 100
            value = f'{mean:.1f}\\% ± {std:.1f}\\%'
            # Exact comparison is intentional: both sides come from the same
            # stored value scaled by the same factor.
            if mean == best_scores[metric] * 100:
                value = f'\\textbf{{{value}}}'
            row_data.append(value)
        table.add_row(row_data)

    # Generate the LaTeX table
    latex_table = latextable.draw_latex(
        table,
        caption=caption,
        label=label,
        use_booktabs=True
    )

    # Add \resizebox to the LaTeX table
    latex_table = latex_table.replace('\\begin{center}', '\\begin{center}\\resizebox{\\textwidth}{!}{')
    latex_table = latex_table.replace('\\end{center}', '}\\end{center}')

    return latex_table

# Generate the LaTeX table from the sorted summary frame and print its raw
# source (print is used so the backslashes appear unescaped)
latex_table = create_latex_table(df)
print(latex_table)

\begin{table}
	\begin{center}\resizebox{\textwidth}{!}{
		\begin{tabular}{ccccc}
			\toprule
			Metric & p-0.1 (5-fold) & p-0.2 (5-fold) & p-0.5 (1-fold) & refhead-zuericrop (5-fold) \\
			\midrule
			F1 Tier 1 & 48.7\% ± 5.0\% & 48.5\% ± 3.8\% & 36.2\% ± 0.0\% & \textbf{49.3\% ± 5.9\%} \\
			F1 Tier 2 & 31.5\% ± 1.2\% & 31.4\% ± 1.0\% & 27.8\% ± 0.0\% & \textbf{31.8\% ± 1.7\%} \\
			F1 Tier 3 & \textbf{20.5\% ± 0.5\%} & 19.9\% ± 0.9\% & 19.8\% ± 0.0\% & 20.5\% ± 0.6\% \\
			Accuracy Tier 1 & \textbf{93.3\% ± 0.4\%} & 93.2\% ± 0.2\% & 93.1\% ± 0.0\% & 93.2\% ± 0.2\% \\
			Accuracy Tier 2 & \textbf{82.6\% ± 0.6\%} & 82.4\% ± 0.4\% & 82.2\% ± 0.0\% & 82.4\% ± 0.4\% \\
			Accuracy Tier 3 & 78.0\% ± 1.0\% & 78.0\% ± 0.5\% & \textbf{78.8\% ± 0.0\%} & 77.8\% ± 0.9\% \\
			\bottomrule
		\end{tabular}
	}\end{center}
	\caption{Metrics Comparison (majority and val details)}
	\label{tab:metrics_comparison}
\end{table}
