In [None]:
import pandas as pd
import glob
import json
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def load_json(fname):
    """Read a JSON file and return its parsed content.

    Parameters
    ----------
    fname : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        The decoded JSON value, or an empty list when the file cannot be
        opened or does not contain valid JSON (best-effort loading, so one
        broken run does not abort the whole sweep).
    """
    import warnings

    try:
        with open(fname, 'r', encoding='utf-8') as f:
            return json.load(f)
    # OSError: missing/unreadable file. ValueError: covers json.JSONDecodeError
    # and UnicodeDecodeError. Anything else (e.g. KeyboardInterrupt) propagates
    # instead of being silently swallowed by a bare ``except``.
    except (OSError, ValueError) as err:
        warnings.warn(f'could not load {fname!r}: {err}')
        return []

In [None]:
def plot_with_err(x, y_mean, y_std, linestyle=None, label=None, color='blue', marker='.',ax=None, alpha=0.1):
    """Plot a mean curve with a shaded ``mean ± std`` band, sorted by ``x``.

    Parameters
    ----------
    x, y_mean, y_std : array-like
        Equal-length sequences of x positions, mean values and standard
        deviations. They are jointly reordered so that ``x`` is ascending.
    linestyle, label, color, marker
        Forwarded to ``ax.plot``; ``color`` is also used for the band.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
    alpha : float, optional
        Opacity of the shaded band (default 0.1).

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        # The default used to crash with AttributeError on ``None.plot``.
        ax = plt.gca()

    # Convert first so pandas Series inputs are ordered by position, not label.
    x_arr = np.asarray(x)
    mean_arr = np.asarray(y_mean)
    std_arr = np.asarray(y_std)

    order = np.argsort(x_arr, kind='stable')
    x_sorted = x_arr[order]
    y_mean_sorted = mean_arr[order]
    y_std_sorted = std_arr[order]

    y_upper = y_mean_sorted + y_std_sorted
    y_lower = y_mean_sorted - y_std_sorted

    ax.plot(x_sorted, y_mean_sorted, label=label, linestyle=linestyle, color=color, marker=marker)
    ax.fill_between(
        x_sorted,
        y_lower,
        y_upper,
        alpha=alpha,
        color=color
    )
    return ax


def plot_with_err_(x, y_mean, y_std, linestyle=None, label=None, color='blue', marker='.', alpha=0.1, ax=None):
    """Plot a mean curve with a shaded ``mean ± std`` band, keeping input order.

    Unlike ``plot_with_err`` the points are NOT sorted by ``x``.

    Parameters
    ----------
    x : array-like
        X positions, passed to matplotlib unchanged.
    y_mean, y_std : array-like
        Mean values and standard deviations, same length as ``x``.
    linestyle, label, color, marker
        Forwarded to ``ax.plot``; ``color`` is also used for the band.
    alpha : float, optional
        Opacity of the shaded band (default 0.1).
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        # The default used to crash with AttributeError on ``None.plot``.
        ax = plt.gca()

    # Plain lists would concatenate under ``+`` and fail under ``-``;
    # arrays give the intended element-wise arithmetic.
    mean_arr = np.asarray(y_mean)
    std_arr = np.asarray(y_std)
    y_upper = mean_arr + std_arr
    y_lower = mean_arr - std_arr

    ax.plot(
        x,
        mean_arr,
        label=label,
        linestyle=linestyle,
        color=color,
        marker=marker
    )
    ax.fill_between(
        x,
        y_lower,
        y_upper,
        alpha=alpha,
        color=color
    )
    return ax

In [None]:
# Per-wrapper matplotlib line style; each entry is unpacked as keyword
# arguments into the plotting helpers.
# NOTE(review): 'tempscale' and 'tfb' share colour and linestyle and differ
# only by marker -- confirm this is intended if both appear in one figure.
style_dict = {
    'laplace': dict(color='black', linestyle='--', marker='.'),
    'mle': dict(color='red', linestyle=':', marker='v'),
    'tempscale': dict(color='blue', linestyle='dashdot', marker='o'),
    'blob': dict(color='purple', linestyle='--', marker='s'),
    'scalabl': dict(color='green', linestyle='solid', marker='^'),
    'tfb': dict(color='blue', linestyle='dashdot', marker='^'),
    'mcdropout': dict(color='orange', linestyle='dashdot', marker='v'),
    # TODO: candidate entries still to add: deepens, sgld?, map, zeroshot?
}

# Arrow shown next to each metric name on axis labels
# (up = higher is better, down = lower is better).
metric2arrow = dict(
    ACC='↑',
    ECE='↓',
    NLL='↓',
    Brier='↓',
    peak_memory='↓',
    latency='↓',
)

# Wrapper key -> display name used in legends.
wrapper2label = dict(
    mle='MLE',
    blob='BLoB',
    scalabl='ScalaBL',
    laplace='Laplace',
    tfb='TFB',
    mcdropout='MCDropout',
    tempscale='TempScale',
)

In [None]:
# Columns that together identify one experimental configuration; the ID
# metrics are grouped by these before aggregating over seeds.
exp_keys = [
    'model',
    'quant',
    'wrapper',
    'rank',
    'prompt_type',
    'dataset',
    'num_base',
    'num_trainable_params',
    'num_total_params',
]

# Metric columns that get a mean/std aggregate.
metric_keys = [
    'ACC',
    'ECE',
    'NLL',
    'Brier',
    'peak_memory',
    'latency',
]

# Log directory. The second assignment wins; the first is kept as the
# alternative location. The trailing '/' matters: the loading cells strip
# `root` from each path and index the remaining components by position.
root = '/workspace1/csamplawski/src/BayesAdapt/logs/'
root = '/project/synthesis/bayesadapt/logs/'


In [None]:
# --- Load in-distribution (ID) metrics ---------------------------------------
# Every run keeps its ID results in <run dir>/id/metrics.json.
json_fnames = glob.glob(f'{root}/**/id/metrics.json', recursive=True)

# One entry per directory that holds a metrics.json. Sorted so the row order
# is identical on every run (a bare set() has no stable order).
expdirs = sorted({fname.rsplit('/', 1)[0] for fname in json_fnames})
if not expdirs:
    raise FileNotFoundError(f'no id/metrics.json files found under {root!r}')

# After stripping `root`, path components are read by position: tokens[0] is
# skipped and tokens[1:] are paired with `keys`.
keys = ['model', 'quant', 'wrapper', 'rank', 'prompt_type', 'seed', 'dataset']
rows = []
for edir in expdirs:
    tokens = edir.replace(root, '').split('/')
    row = dict(zip(keys, tokens[1:]))
    row['rank'] = int(tokens[4].replace('rank', ''))
    # Parse ALL trailing digits of the seed component; taking only the last
    # character would turn e.g. seed 10 into 0.
    seed_token = tokens[6]
    row['seed'] = int(seed_token[len(seed_token.rstrip('0123456789')):])
    row['results'] = load_json(f'{edir}/metrics.json')
    rows.append(row)
df = pd.DataFrame(rows)

# load_json returns [] for unreadable files; drop those runs here so they do
# not turn into all-NaN rows after explode().
df = df[df['results'].map(bool)]

# One row per entry of each run's results list, then flatten each entry into
# metric columns. The 'seed' parsed from the path is kept; a 'seed' field
# inside the JSON (if present) is dropped to avoid a duplicate column.
df_exploded = df.explode('results').reset_index(drop=True)
metrics_df = pd.json_normalize(df_exploded['results']).drop(columns=['seed'], errors='ignore')
id_df_seeds = pd.concat([df_exploded.drop(columns=['results']), metrics_df], axis=1)

# Aggregate over seeds: mean and std of every metric per configuration.
id_df = id_df_seeds.groupby(exp_keys)[metric_keys].agg(['mean', 'std'])
id_df

In [None]:
# Inspect the flattened metrics table built by the ID loading cell above.
# Note: the name `metrics_df` is reassigned by the OOD loading cell further
# down, so what is shown depends on which loading cell ran last.
metrics_df

In [None]:
def query(df, model=None, dataset=None, wrapper=None, prompt_type='instruct', quant='16bit', rank=8):
    """Select the rows of ``df`` that match the given experiment settings.

    ``prompt_type``, ``quant`` and ``rank`` are always filtered on; ``model``,
    ``dataset`` and ``wrapper`` only when they are not ``None``. The filter is
    applied with ``DataFrame.query``, so each name may be either a column or
    an index level of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns or index levels include the filtered names.
    model, dataset, wrapper : optional
        Values to match; ``None`` disables that filter.
    prompt_type, quant, rank
        Values to match (always applied).

    Returns
    -------
    pandas.DataFrame
        The matching rows with the index reset (the previous index is moved
        into columns).
    """
    # Values are referenced as @locals rather than interpolated into the
    # expression, so strings containing quotes cannot break the query.
    clauses = [
        "prompt_type == @prompt_type",
        "quant == @quant",
        "rank == @rank",
    ]
    if model is not None:
        clauses.append("model == @model")
    if dataset is not None:
        clauses.append("dataset == @dataset")
    if wrapper is not None:
        clauses.append("wrapper == @wrapper")
    return df.query(" and ".join(clauses)).reset_index()

In [None]:
# Spot check: per-seed ID rows for a single model / dataset / wrapper.
query(
    id_df_seeds,
    model='Qwen3-VL-2B-Instruct',
    dataset='slake',
    wrapper='mle',
    prompt_type='vlm',
)

In [None]:
# --- Load out-of-distribution (OOD) metrics ----------------------------------
# OOD results are matched by the pattern <root>/**/ood/**/metrics.json.
json_fnames = glob.glob(f'{root}/**/ood/**/metrics.json', recursive=True)

# One entry per directory that holds a metrics.json. Sorted so the row order
# is identical on every run (a bare set() has no stable order).
expdirs = sorted({fname.rsplit('/', 1)[0] for fname in json_fnames})
if not expdirs:
    raise FileNotFoundError(f'no ood/**/metrics.json files found under {root!r}')

# After stripping `root`, path components are read by position: tokens[0] and
# the last two components are excluded from the zip with `keys`; the final
# component is stored as the dataset name.
keys = ['model', 'quant', 'wrapper', 'rank', 'prompt_type', 'seed']
rows = []
for edir in expdirs:
    tokens = edir.replace(root, '').split('/')
    row = dict(zip(keys, tokens[1:-2]))
    row['rank'] = int(tokens[4].replace('rank', ''))
    # Parse ALL trailing digits of the seed component; taking only the last
    # character would turn e.g. seed 10 into 0.
    seed_token = tokens[6]
    row['seed'] = int(seed_token[len(seed_token.rstrip('0123456789')):])
    row['dataset'] = tokens[-1]
    row['results'] = load_json(f'{edir}/metrics.json')
    rows.append(row)
df = pd.DataFrame(rows)

# load_json returns [] for unreadable files; drop those runs here so they do
# not turn into all-NaN rows after explode().
df = df[df['results'].map(bool)]

# One row per entry of each run's results list, then flatten each entry into
# metric columns. The 'seed' parsed from the path is kept; a 'seed' field
# inside the JSON (if present) is dropped to avoid a duplicate column.
df_exploded = df.explode('results').reset_index(drop=True)
metrics_df = pd.json_normalize(df_exploded['results']).drop(columns=['seed'], errors='ignore')
ood_df = pd.concat([df_exploded.drop(columns=['results']), metrics_df], axis=1)

In [None]:
# Robustness to input noise: one panel per metric, metric value vs. noise level.
plt.rcParams.update({'font.size': 12})  # set before the figure is created
fig, axes = plt.subplots(1, 4, figsize=(25, 5), sharey=False)
metrics = ['ACC', 'ECE', 'NLL', 'Brier']

noise_stds = [0,1,2,4,8,16,32,64,128]

dataset = 'slake'
prompt_type = 'vlm'
quant = '16bit'
rank = 8
model = 'Qwen3-VL-8B-Instruct'

for ax, metric in zip(axes, metrics):
    arrow = metric2arrow[metric]

    for wrapper in ['mle', 'scalabl','blob']:
        label = wrapper2label[wrapper]
        y_mean, y_std = [], []
        for std in noise_stds:
            # Noise level 0 is the clean ID run; every other level is an OOD
            # dataset named noisy_<dataset><std>. Both sources must be
            # PER-SEED tables: the aggregated `id_df` has (metric, stat)
            # columns, so selecting `[metric]` from it yields a two-column
            # frame and a non-scalar mean.
            if std == 0:
                eval_dataset, seeds_df = dataset, id_df_seeds
            else:
                eval_dataset, seeds_df = f'noisy_{dataset}{std}', ood_df

            metric_vals = query(
                seeds_df,
                model=model,
                dataset=eval_dataset,
                wrapper=wrapper,
                prompt_type=prompt_type,
                quant=quant,
                rank=rank,
            )[metric]
            # Mean and spread over the matching per-seed rows.
            y_mean.append(metric_vals.mean())
            y_std.append(metric_vals.std())
        ax = plot_with_err(noise_stds, y_mean, y_std, **style_dict[wrapper], label=label, ax=ax)

    ax.set_xlabel('Noise STD (pixel units)')
    ax.set_ylabel(f"{metric} ({arrow})")
    ax.legend(
        loc='upper center',          # point of the legend box that gets anchored
        bbox_to_anchor=(0.5, -0.15), # anchor in axes coordinates: centred, below the plot
        ncols=2,                     # lay the entries out in two columns
        frameon=True                 # draw a frame around the legend
    )

    #ax.set_xscale('log', base=2)
    ax.grid()


In [None]:
# Metric vs. model size: one panel per metric, one curve per wrapper.
plt.rcParams.update({'font.size': 12})  # set before the figure is created
fig, axes = plt.subplots(1, 4, figsize=(25, 5), sharey=False)
metrics = ['ACC', 'ECE', 'NLL', 'Brier']

dataset = 'slake'
prompt_type = 'vlm'
quant = '16bit'
rank = 8

for ax, metric in zip(axes, metrics):
    arrow = metric2arrow[metric]
    for wrapper in ['mle', 'scalabl', 'blob', 'mcdropout', 'laplace', 'tfb']:
        label = wrapper2label[wrapper]
        # Pass quant/rank explicitly: they were previously defined in this
        # cell but never forwarded, so query() silently used its own defaults.
        q = query(id_df, prompt_type=prompt_type, wrapper=wrapper, dataset=dataset, quant=quant, rank=rank)
        ax = plot_with_err(q['num_base'], q[(metric, 'mean')], q[(metric, 'std')], **style_dict[wrapper], label=label, ax=ax)

    # NOTE(review): the x values are `num_base` while the label says
    # "Base + LoRA" -- confirm whether `num_total_params` was intended.
    ax.set_xlabel('# Parameters (Base + LoRA)')
    ax.set_ylabel(f"{metric} ({arrow})")
    ax.legend(
        loc='upper center',          # point of the legend box that gets anchored
        bbox_to_anchor=(0.5, -0.15), # anchor in axes coordinates: centred, below the plot
        ncols=2,                     # lay the entries out in two columns
        frameon=True                 # draw a frame around the legend
    )
    #ax.set_title(f'Qwen3 Family | {prompt_type} | rank = {rank} | {dataset}')
    ax.grid()


In [None]:
# Metric vs. training-set size: one panel per metric, one curve per wrapper.
plt.rcParams.update({'font.size': 12})  # set before the figure is created
fig, axes = plt.subplots(1, 4, figsize=(25, 5), sharey=False)
metrics = ['ACC', 'ECE', 'NLL', 'Brier']

# Dataset-name suffixes and the x position plotted for each of them.
dataset_sizes = ['xs','s','m','l']
x_vals = [160,640,2558,10234]
prompt_type = 'instruct'
quant = '16bit'
model = 'Qwen3-8B'
rank = 8

for ax, metric in zip(axes, metrics):
    arrow = metric2arrow[metric]
    for wrapper in ['mle', 'scalabl', 'blob', 'mcdropout', 'laplace','tfb']:
        label = wrapper2label[wrapper]
        y_mean, y_std = [], []
        for size in dataset_sizes:
            # Same filter as before, but built through the shared query() helper.
            q = query(
                id_df,
                model=model,
                dataset=f'winogrande_{size}',
                wrapper=wrapper,
                prompt_type=prompt_type,
                quant=quant,
                rank=rank,
            )
            # Exactly one aggregated row is expected per point; say which
            # configuration is missing/duplicated instead of the opaque
            # error raised by .item().
            if len(q) != 1:
                raise ValueError(
                    f"expected exactly one aggregated row for wrapper={wrapper!r}, "
                    f"dataset='winogrande_{size}', found {len(q)}"
                )
            y_mean.append(q[(metric, 'mean')].item())
            y_std.append(q[(metric, 'std')].item())
        ax = plot_with_err(x_vals, y_mean, y_std, **style_dict[wrapper], label=label, ax=ax)
    ax.grid()
    ax.set_ylabel(f"{metric} ({arrow})")
    ax.set_xlabel('Training Set Size (# of Instances)')
    ax.legend(
        loc='upper center',          # point of the legend box that gets anchored
        bbox_to_anchor=(0.5, -0.15), # anchor in axes coordinates: centred, below the plot
        ncols=2,                     # lay the entries out in two columns
        frameon=True                 # draw a frame around the legend
    )