In [8]:
import wandb
import pandas as pd
import numpy as np
import texttable as tt
import latextable

In [14]:
# Value of `config.experiment_group` used to select the W&B runs to evaluate;
# also used in the results CSV file name and in the LaTeX caption/label.
EXPERIMENT_GROUP = "exp-3-timeranges"

# Setup that is listed first in the LaTeX table and marked with a dagger.
BASELINE_SETUP = "march-september"

# Number of evaluations to average around the best one (e.g. 5).
# Use 0 for no averaging; any other value must be odd.
AVERAGE_EPOCHS = 0

# True: report the `val_weighted_accuracy_*` metrics, False: the `val_accuracy_*` metrics.
HAS_WEIGHTED_ACCURACY = False

# Only the first N entries of the metric name mapping go into the LaTeX table
# (6 = F1 and accuracy for the three tiers, as used for the experiment results section).
FIRST_N_METRICS = 6

# Rows to exclude from the LaTeX table, given by their index label in the results
# DataFrame, e.g. [0, 1]. Leave empty to keep every setup.
EXCLUDED_SETUPS = []

In [20]:
def get_best_epoch_metrics(run, metric_name, average_epochs=0):
    """Return a run's logged metrics at (or averaged around) its best `metric_name` value.

    The complete history is read through `run.scan_history()` into a DataFrame with
    one row per history entry. The "best epoch" reported by this function is the
    DataFrame row index at which `metric_name` reaches its maximum.

    Args:
        run: Object providing `scan_history()` (an iterable of dicts) and `url`.
        metric_name: History key whose maximum selects the best row.
        average_epochs: 0 returns the single best row unchanged. A positive odd
            number averages all columns over a window of that many rows centred
            on the best row, counting only rows in which `metric_name` has a
            numeric value. The window is cut off at the first/last such row, so
            fewer rows are averaged when the best row lies near either end.

    Returns:
        Tuple `(metrics, best_epoch)`: a pandas Series indexed by history key and
        the row index of the best value.

    Raises:
        ValueError: If `average_epochs` is a positive even number, or if
            `metric_name` has no numeric value in the history.
        KeyError: If `metric_name` is not a column of the history.
    """
    # An even window cannot be centred: (n - 1) // 2 rows on each side would silently
    # average n - 1 rows while the output still claims n.
    if average_epochs > 0 and average_epochs % 2 == 0:
        raise ValueError(f"average_epochs must be 0 or an odd number, got {average_epochs}")

    history = pd.DataFrame(list(run.scan_history()))

    # Coerce before searching: non-numeric entries (e.g. NaN serialised as a string)
    # would otherwise make the column object-typed and break the comparison.
    metric_values = pd.to_numeric(history[metric_name], errors='coerce').dropna()
    if metric_values.empty:
        raise ValueError(f"Run {run.url} has no numeric values for '{metric_name}'")

    # Find the epoch with the best metric value
    best_epoch = metric_values.idxmax()
    best_metrics = history.loc[best_epoch]

    if average_epochs > 0:
        # Rows in which the metric holds a usable value
        valid_indices = metric_values.index

        # Position of the best row within those rows
        best_index_position = valid_indices.get_loc(best_epoch)

        # Symmetric window around the best row, clipped to the available rows
        idx_adjustment = (average_epochs - 1) // 2
        start_index = max(0, best_index_position - idx_adjustment)
        end_index = min(len(valid_indices), best_index_position + idx_adjustment + 1)
        selected_indices = valid_indices[start_index:end_index]

        # Convert every column to numeric (non-numeric entries become NaN) before
        # averaging; DataFrame.mean() raises on non-numeric columns in pandas >= 2.
        window = history.loc[selected_indices].apply(pd.to_numeric, errors='coerce')
        averaged_metrics = window.mean()

        print(f"Run {run.url} - Best epoch: {best_epoch} (averaged over {average_epochs}) - {metric_name}: {averaged_metrics[metric_name]}")
        return averaged_metrics, best_epoch

    print(f"Run {run.url} - Best epoch: {best_epoch} - {metric_name}: {best_metrics[metric_name]}")
    return best_metrics, best_epoch

def gather_metrics_for_experiment_group(experiment_group, average_epochs=0, has_weighted_accuracy=True,
                                        project="crop-classification/messis",
                                        selection_metric='val_f1_tier3_majority'):
    """Summarise the validation metrics of all finished W&B runs of an experiment group.

    Runs are fetched from `project`, filtered by `config.experiment_group`, and
    grouped by setup name (`config['name']`, falling back to the last
    '-'-separated token of `config['experiment_name']`). For every run the metrics
    at its best `selection_metric` value are taken from `get_best_epoch_metrics`,
    then aggregated per setup.

    Args:
        experiment_group: Value of `config.experiment_group` to filter runs by.
        average_epochs: Forwarded to `get_best_epoch_metrics` (0 = no averaging).
        has_weighted_accuracy: If True the `val_weighted_accuracy_*` metrics are
            collected, otherwise the `val_accuracy_*` metrics.
        project: W&B project path in the form "entity/project".
        selection_metric: History key whose maximum selects the best row of a run.

    Returns:
        DataFrame with one row per setup and the columns `setup`, `num_runs`,
        `average_epochs`, `run_handles`, `best_steps`, plus `<metric>_mean` and
        `<metric>_std` for every collected metric. The std is `np.std` with its
        default settings (population standard deviation).
    """
    api = wandb.Api(timeout=60)

    # Fetch runs matching the experiment group
    runs = api.runs(project, filters={"config.experiment_group": experiment_group})
    print(f"Found {len(runs)} runs for experiment group '{experiment_group}'")

    # Collected metrics: five metric families x three tiers. The order defines the
    # column order of the resulting DataFrame.
    accuracy_prefix = 'val_weighted_accuracy' if has_weighted_accuracy else 'val_accuracy'
    metric_names = [
        f'{prefix}_tier{tier}_majority'
        for prefix in ('val_f1', accuracy_prefix, 'val_precision', 'val_recall', 'val_cohen_kappa')
        for tier in (1, 2, 3)
    ]

    # Organize the runs by setup name
    setups = {}
    for run in runs:
        if run.state != 'finished':
            print(f"Run {run.url} - Skipping (not finished, state: {run.state})")
            continue

        if 'name' in run.config:
            setup_name = run.config['name']
        elif 'experiment_name' in run.config:
            setup_name = run.config['experiment_name'].split('-')[-1]
            print(f"Run {run.url} - Missing 'name' in config, getting it from 'experiment_name' instead: {setup_name}")
        else:
            print(f"Run {run.url} - Missing 'name' in config, skipping")
            continue

        if setup_name not in setups:
            setups[setup_name] = {
                'runs': [],
                'metrics': {metric: [] for metric in metric_names},
                'best_steps': [],
            }
        setups[setup_name]['runs'].append(run)

    # Print setup names with number of runs
    for setup_name, setup_data in setups.items():
        print(f"Setup '{setup_name}' - {len(setup_data['runs'])} runs")

    # Gather metrics for each setup (one history download per run)
    for setup_data in setups.values():
        for run in setup_data['runs']:
            best_metrics, best_epoch = get_best_epoch_metrics(run, selection_metric, average_epochs)
            setup_data['best_steps'].append(best_epoch)
            for metric, values in setup_data['metrics'].items():
                values.append(best_metrics[metric])

    # Prepare data for DataFrame
    data = []
    for setup_name, setup_data in setups.items():
        metrics_summary = {
            'setup': setup_name,
            'num_runs': len(setup_data['runs']),
            'average_epochs': average_epochs,
            'run_handles': [run.id for run in setup_data['runs']],
            'best_steps': setup_data['best_steps'],
        }
        for metric, values in setup_data['metrics'].items():
            metrics_summary[f'{metric}_mean'] = np.mean(values)
            metrics_summary[f'{metric}_std'] = np.std(values)
        data.append(metrics_summary)

    return pd.DataFrame(data)

# Collect the per-setup metric summary for the configured experiment group
metrics_df = gather_metrics_for_experiment_group(
    experiment_group=EXPERIMENT_GROUP,
    average_epochs=AVERAGE_EPOCHS,
    has_weighted_accuracy=HAS_WEIGHTED_ACCURACY,
)

# Persist the summary as CSV (without the index column)
results_csv_path = f"experiment_results/eval_{EXPERIMENT_GROUP}.csv"
metrics_df.to_csv(results_csv_path, index=False)

metrics_df

Found 39 runs for experiment group 'exp-2-optimizer-lr'
Run https://wandb.ai/crop-classification/messis/runs/d4kd08mb - Skipping (not finished, state: crashed)
Setup 'sgd-momentum' - 5 runs
Setup 'adam-1e-3' - 5 runs
Setup 'adam-1e-4' - 5 runs
Setup 'adam-1e-2' - 5 runs
Setup 'lion-10' - 5 runs
Setup 'lion' - 5 runs
Setup 'adamw-1e-3-f1-es' - 2 runs
Setup 'adamw-1e-3' - 5 runs
Setup 'adam-kappa-stopping' - 1 runs
Run https://wandb.ai/crop-classification/messis/runs/vss1692r - Best epoch: 4700 (averaged over 5) - val_f1_tier3_majority: 0.12185872942209244
Run https://wandb.ai/crop-classification/messis/runs/igave2ie - Best epoch: 4061 (averaged over 5) - val_f1_tier3_majority: 0.10692615956068038
Run https://wandb.ai/crop-classification/messis/runs/ordpr3fk - Best epoch: 5240 (averaged over 5) - val_f1_tier3_majority: 0.13082700669765474
Run https://wandb.ai/crop-classification/messis/runs/0jha9ehe - Best epoch: 3471 (averaged over 5) - val_f1_tier3_majority: 0.10031244158744812
Run htt

Unnamed: 0,setup,num_runs,average_epochs,run_handles,best_steps,val_f1_tier1_majority_mean,val_f1_tier1_majority_std,val_f1_tier2_majority_mean,val_f1_tier2_majority_std,val_f1_tier3_majority_mean,...,val_recall_tier2_majority_mean,val_recall_tier2_majority_std,val_recall_tier3_majority_mean,val_recall_tier3_majority_std,val_cohen_kappa_tier1_majority_mean,val_cohen_kappa_tier1_majority_std,val_cohen_kappa_tier2_majority_mean,val_cohen_kappa_tier2_majority_std,val_cohen_kappa_tier3_majority_mean,val_cohen_kappa_tier3_majority_std
0,sgd-momentum,5,5,"[vss1692r, igave2ie, ordpr3fk, 0jha9ehe, odrt8...","[4700, 4061, 5240, 3471, 5486]",0.403812,0.047248,0.235201,0.020974,0.116294,...,0.237162,0.016646,0.124685,0.009809,0.844476,0.0086,0.690901,0.013768,0.626034,0.016044
1,adam-1e-3,5,5,"[kgqs6otp, xx3w65m0, 9qhow77v, y9bwdhxz, kp7m5...","[1408, 1654, 1113, 1113, 1408]",0.449174,0.038801,0.267284,0.011381,0.137794,...,0.268747,0.011023,0.142457,0.008175,0.830671,0.018151,0.670397,0.021892,0.620016,0.021363
2,adam-1e-4,5,5,"[23xr5gr7, hbehqfj2, d6nf53n1, 46bsbr0t, 83tvs...","[2047, 2096, 2440, 1998, 2587]",0.427646,0.015627,0.280511,0.004915,0.168784,...,0.271621,0.007173,0.166121,0.011781,0.842892,0.018621,0.701662,0.017888,0.657728,0.019906
3,adam-1e-2,5,5,"[5q7ikf6i, pk0o64ye, amn954lv, crabx30m, f4w58...","[1850, 1850, 1949, 1703, 1752]",0.459044,0.04607,0.275685,0.01187,0.139601,...,0.286855,0.008929,0.148277,0.00386,0.828885,0.007794,0.668308,0.012446,0.612669,0.014646
4,lion-10,5,5,"[txx2oelc, az3sekqd, e3nam8rz, yn488zbq, wfmt0...","[1429, 1140, 1103, 959, 1356]",0.404674,0.037083,0.227612,0.022132,0.10049,...,0.248542,0.015208,0.113416,0.011054,0.822759,0.015126,0.6494,0.027125,0.568927,0.038153
5,lion,5,5,"[xnpljstk, e8fb1rrl, 8p50mzsr, xooo49sk, od1h4...","[573, 622, 720, 770, 770]",0.238516,0.081281,0.068481,0.022969,0.023701,...,0.091739,0.027482,0.03296,0.010792,0.22351,0.269403,0.109724,0.131581,0.096603,0.112439
6,adamw-1e-3-f1-es,2,5,"[0mvgrj0s, a7vxi8rp]","[2833, 2489]",0.432674,0.019951,0.280266,0.004848,0.175741,...,0.279851,0.004652,0.172274,0.001902,0.846742,0.01128,0.699969,0.011274,0.65334,0.008805
7,adamw-1e-3,5,5,"[mnqp73v9, spsmu62b, 5wu1u7qv, 3wr3vh8a, xcib0...","[995, 1031, 995, 887, 851]",0.425895,0.051678,0.248275,0.009706,0.125435,...,0.253053,0.010511,0.132044,0.006273,0.828672,0.015055,0.657562,0.023646,0.597098,0.018033
8,adam-kappa-stopping,1,5,[0zh7qi1q],[2296],0.383676,0.0,0.257168,0.0,0.160165,...,0.254259,0.0,0.157757,0.0,0.845352,0.0,0.682426,0.0,0.632603,0.0


In [21]:
# Short display names for the logged validation metrics.
# Insertion order matters: the most relevant metric families (F1, accuracy) come
# first, further metrics are added after them.
_accuracy_family = (
    ('val_weighted_accuracy', 'W. Acc.') if HAS_WEIGHTED_ACCURACY else ('val_accuracy', 'Acc.')
)
_metric_families = [
    ('val_f1', 'F1'),
    _accuracy_family,
    ('val_precision', 'Precision'),
    ('val_recall', 'Recall'),
    ('val_cohen_kappa', 'Kappa'),
]

# One entry per metric family and tier, e.g. 'val_f1_tier1_majority' -> 'F1 $T_1$'
metric_name_mapping = {
    f'{key_prefix}_tier{tier}_majority': f'{label} $T_{tier}$'
    for key_prefix, label in _metric_families
    for tier in (1, 2, 3)
}

# Function to create a LaTeX table with bold best scores and underlined second-best scores
def create_latex_table(df, baseline_setup, first_n_metrics=None, experiment_group=None):
    """Render the per-setup metric summary as a LaTeX table (booktabs, resized to text width).

    Each row is one metric from `metric_name_mapping`, each column one setup. Cells
    show `mean% ± std%`; per metric, the highest mean is bold and the second-highest
    distinct mean is underlined. The baseline setup is moved to the first column and
    marked with a dagger.

    Args:
        df: DataFrame with the columns `setup`, `num_runs`, `average_epochs` and
            `<metric>_mean` / `<metric>_std` for every metric that is rendered.
        baseline_setup: Name of the setup to treat as baseline. If it is not
            present in `df`, the column order is left unchanged.
        first_n_metrics: If truthy, only the first N entries of
            `metric_name_mapping` are rendered; otherwise all of them.
        experiment_group: Name used in the caption and label. Defaults to the
            notebook-level `EXPERIMENT_GROUP`.

    Returns:
        The LaTeX source of the table as a string.
    """
    if experiment_group is None:
        experiment_group = EXPERIMENT_GROUP

    metrics = list(metric_name_mapping.keys())
    if first_n_metrics:
        metrics = metrics[:first_n_metrics]

    # Determine the best and second-best scores for each metric. NaN means are
    # dropped first because sorting values that contain NaN is not reliable.
    best_scores = {}
    second_best_scores = {}
    for metric in metrics:
        ranked = sorted(df[f'{metric}_mean'].dropna().unique(), reverse=True)
        best_scores[metric] = ranked[0] if len(ranked) > 0 else None
        second_best_scores[metric] = ranked[1] if len(ranked) > 1 else None

    # Reorder the dataframe to have the baseline setup first
    df = df.set_index('setup')
    if baseline_setup in df.index:
        df = df.loc[[baseline_setup] + [idx for idx in df.index if idx != baseline_setup]].reset_index()
    else:
        df = df.reset_index()

    # Initialize the Texttable object
    table = tt.Texttable()

    # Define the header. The dagger marker is kept outside the f-string expression
    # because backslashes inside f-string expressions are a SyntaxError before Python 3.12.
    baseline_marker = '$\\dagger$ '
    headers = ['Metric']
    for _, row in df.iterrows():
        prefix = baseline_marker if row['setup'] == baseline_setup else ''
        headers.append(f"{prefix}{row['setup']} ({row['num_runs']}-fold)")
    table.header(headers)

    # Set alignment for columns: metric names left-aligned, values centred
    cols_align = ["c"] * len(headers)
    cols_align[0] = "l"
    table.set_cols_align(cols_align)

    # Populate the table with data
    for metric in metrics:
        best = best_scores[metric]
        second_best = second_best_scores[metric]
        row_data = [metric_name_mapping[metric]]
        for _, row in df.iterrows():
            score = row[f'{metric}_mean']
            mean = score * 100
            std = row[f'{metric}_std'] * 100
            value = f'{mean:.1f}\\% ± {std:.1f}\\%'
            # Explicit None checks: a legitimate score of 0.0 must still be highlighted
            if best is not None and score == best:
                value = f'\\textbf{{{value}}}'
            elif second_best is not None and score == second_best:
                value = f'\\underline{{{value}}}'
            row_data.append(value)
        table.add_row(row_data)

    # Generate the LaTeX table
    average_epochs_used = df['average_epochs'].iloc[0]
    caption_averaged = "" if average_epochs_used == 0 else f" averaged over {average_epochs_used} epochs"
    latex_table = latextable.draw_latex(
        table,
        caption=f"Results for experiment {experiment_group}. All metrics calculated on field majority during validation phase{caption_averaged}, with mean and std reported. Best scores bold, second best underlined, $\\dagger$ is baseline.",
        label=f"tab:metrics_comparison_{experiment_group}",
        use_booktabs=True,
        position="h"
    )

    # Add \resizebox to the LaTeX table
    latex_table = latex_table.replace('\\begin{center}', '\\begin{center}\\resizebox{\\textwidth}{!}{')
    latex_table = latex_table.replace('\\end{center}', '}\\end{center}')

    return latex_table

# Load the stored summary, sort it by setup name and optionally drop setups.
# `drop` works on index labels, which `sort_values` leaves untouched, so
# EXCLUDED_SETUPS refers to the row numbers of the CSV file, not to sorted positions.
df = (
    pd.read_csv(f"experiment_results/eval_{EXPERIMENT_GROUP}.csv")
    .sort_values(by='setup')
    .drop(EXCLUDED_SETUPS)
)

# Generate and show the LaTeX table
latex_table = create_latex_table(df, baseline_setup=BASELINE_SETUP, first_n_metrics=FIRST_N_METRICS)
print(latex_table)

\begin{table}[h]
	\begin{center}\resizebox{\textwidth}{!}{
		\begin{tabular}{lccc}
			\toprule
			Metric & $\dagger$ adam-1e-3 (5-fold) & adam-1e-2 (5-fold) & adam-1e-4 (5-fold) \\
			\midrule
			F1 $T_1$ & \underline{44.9\% ± 3.9\%} & \textbf{45.9\% ± 4.6\%} & 42.8\% ± 1.6\% \\
			F1 $T_2$ & 26.7\% ± 1.1\% & \underline{27.6\% ± 1.2\%} & \textbf{28.1\% ± 0.5\%} \\
			F1 $T_3$ & 13.8\% ± 0.7\% & \underline{14.0\% ± 0.3\%} & \textbf{16.9\% ± 1.4\%} \\
			Acc. $T_1$ & \underline{45.9\% ± 3.6\%} & \textbf{50.0\% ± 4.7\%} & 41.0\% ± 1.9\% \\
			Acc. $T_2$ & 26.9\% ± 1.1\% & \textbf{28.7\% ± 0.9\%} & \underline{27.2\% ± 0.7\%} \\
			Acc. $T_3$ & 14.2\% ± 0.8\% & \underline{14.8\% ± 0.4\%} & \textbf{16.6\% ± 1.2\%} \\
			\bottomrule
		\end{tabular}
	}\end{center}
	\caption{Results for experiment exp-2-optimizer-lr. All metrics calculated on field majority during validation phase averaged over 5 epochs, with mean and std reported. Best scores bold, second best underlined, $\dagger$ is baseline