In [None]:
import json 
import os 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Each key is a results directory handed to plot() as `base_dir`; the value is
# the (task, score_name) pair used to look the score up inside a results file.
score_names = dict([
    ("evaluations/milu", ("milu", "acc,none")),
    ("evaluations/igb/xsum", ("igb_xsum", "chrf,none")),
    ("evaluations/igb/xquad", ("igb_xquad", "f1,none")),
    ("evaluations/igb/xorqa", ("igb_xorqa", "f1,none")),
    ("evaluations/igb/flores_xxen", ("igb_flores_xxen", "chrf,none")),
    ("evaluations/igb/flores_enxx", ("igb_flores_enxx", "chrf,none")),
])

# Short model keys found in custom run-directory names -> display names.
model_names = dict(
    llama="Llama-3.1-8B-Instruct",
    gemma="gemma-3-12b-it",
)

In [None]:
# Marker shapes, cycled per model family.
_MARKERS = ['o', 's', 'p', 'h', 'P', 'X', '^', 'v', '>', '<']

# Fill colors, cycled per individual model.
_COLORS = [
    '#4285F4',   # Google Blue
    '#EA4335',   # Google Red
    '#FBBC04',   # Google Yellow
    '#34A853',   # Google Green
    '#9C27B0',   # Purple (Material Design)
    '#FF6F00',   # Deep Orange (Material Design)
    '#00ACC1',   # Cyan (Material Design)
    '#AB47BC',   # Light Purple (Material Design)
    '#FF5722',   # Orange-Red (Material Design)
    '#009688',   # Teal (Material Design)
    '#795548',   # Brown (Material Design)
    '#607D8B',   # Blue Grey (Material Design)
]


def _parse_run_dir(dirname):
    """Split a result-directory name into model metadata.

    Names starting with "__" are treated as custom runs: of their
    "__"-separated parts, the third-from-last is the dataset and the
    second-from-last is a key into ``model_names``. Any other name is treated
    as a non-custom model whose name is the last "__"-separated part.

    Returns:
        ``(model_name, is_custom, full_model, dataset)``, or ``None`` when a
        custom name has too few parts to hold both a dataset and a model key.
    """
    parts = dirname.split("__")
    if not dirname.startswith("__"):
        model_name = parts[-1]
        # Non-custom models are all grouped under one pseudo-dataset.
        return model_name, False, model_name, "Proprietary"

    if len(parts) < 3:
        return None

    dataset_part, model_key = parts[-3], parts[-2]
    # Fall back to the raw key so an unknown model never shows up as "None".
    model_name = model_names.get(model_key, model_key)
    full_model = f"{dataset_part}/{model_name}"
    dataset = dataset_part.split('/')[0]
    return model_name, True, full_model, dataset


def _collect_rows(base_dir, task, score_name):
    """Gather one (model_name, score, is_custom, full_model, dataset) row per results file."""
    rows = []
    # Sorted so draw order and legend order do not depend on filesystem order.
    for dirname in sorted(os.listdir(base_dir)):
        dirpath = os.path.join(base_dir, dirname)
        if not os.path.isdir(dirpath):
            # Stray files next to the run directories cannot be listed.
            continue

        parsed = _parse_run_dir(dirname)
        if parsed is None:
            print(f"Skipping '{dirname}': cannot parse dataset/model from its name.")
            continue
        model_name, is_custom, full_model, dataset = parsed

        for fname in sorted(os.listdir(dirpath)):
            if not fname.startswith("results_"):
                continue
            with open(os.path.join(dirpath, fname), encoding="utf-8") as f:
                data = json.load(f)
            score = data["results"][task][score_name]
            rows.append((model_name, score, is_custom, full_model, dataset))
    return rows


def _model_family(model):
    """Return the family prefix of a model name (e.g. "Llama" from "Llama-3.1-8B-Instruct")."""
    if '-' in model:
        return model.split('-')[0]
    # No hyphen: take everything before the first digit, if there is one.
    first_digit = next((i for i, char in enumerate(model) if char.isdigit()), None)
    if first_digit is not None:
        return model[:first_digit].rstrip()
    return model


def _assign_styles(model_to_family):
    """Give every family one marker and every model its own color.

    Returns:
        ``(family_marker_map, model_color_map)``; both palettes wrap around
        when there are more families/models than entries.
    """
    family_members = {}
    for model, family in model_to_family.items():
        family_members.setdefault(family, []).append(model)

    family_marker_map = {}
    model_color_map = {}
    color_idx = 0
    for marker_idx, family in enumerate(sorted(family_members)):
        family_marker_map[family] = _MARKERS[marker_idx % len(_MARKERS)]
        for model in sorted(family_members[family]):
            model_color_map[model] = _COLORS[color_idx % len(_COLORS)]
            color_idx += 1
    return family_marker_map, model_color_map


def plot(base_dir, task, score_name):
    """Scatter-plot one score per results file, grouped by dataset on the x-axis.

    Every sub-directory of ``base_dir`` is scanned for files whose name starts
    with ``results_``; each is loaded as JSON and contributes the value at
    ``data["results"][task][score_name]``. Points share a marker per model
    family and a color per model; a dashed line marks the mean of all scores.

    Args:
        base_dir: Directory holding one sub-directory per evaluated model.
            Non-directory entries and custom names that cannot be parsed are
            skipped.
        task: Key under ``"results"`` in each results file.
        score_name: Metric key under the task; the part before the first comma
            is upper-cased for the y-axis label.

    Returns:
        None. The figure is shown with ``plt.show()``. If no results are
        found, a message is printed and nothing is drawn.

    Raises:
        FileNotFoundError: If ``base_dir`` does not exist.
        KeyError: If a results file lacks the requested task or metric.
    """
    rows = _collect_rows(base_dir, task, score_name)
    if not rows:
        # Avoid a NaN mean line on an empty figure.
        print(f"No results found for task '{task}' in '{base_dir}'; skipping plot.")
        return

    # Sort by dataset name for consistent x-axis ordering.
    unique_datasets = sorted({row[4] for row in rows})
    dataset_to_x = {dataset: i for i, dataset in enumerate(unique_datasets)}

    unique_models = sorted({row[0] for row in rows})
    model_to_family = {model: _model_family(model) for model in unique_models}
    family_marker_map, model_color_map = _assign_styles(model_to_family)

    sns.set_theme(style="whitegrid")
    fig, ax = plt.subplots(figsize=(12, 8))

    # One scatter call per point; only the first point of each model carries
    # a legend label so the legend has a single entry per model.
    plotted_labels = set()
    for model_name, score, _is_custom, _full_model, dataset in rows:
        marker = family_marker_map[model_to_family[model_name]]
        color = model_color_map[model_name]
        ax.scatter(dataset_to_x[dataset], score, c=[color], marker=marker, alpha=0.9,
                   edgecolors='black', s=150, linewidth=1,
                   label=model_name if model_name not in plotted_labels else "")
        plotted_labels.add(model_name)

    # Horizontal reference line at the mean of all plotted scores.
    mean_score = np.mean([row[1] for row in rows])
    ax.axhline(mean_score, color='grey', linestyle='--', linewidth=1.5, alpha=0.7)
    ax.text(len(unique_datasets) - 0.5, mean_score + 0.005, f'Mean: {mean_score:.3f}',
            ha='right', va='bottom', color='grey', fontsize=10)

    # Labels and formatting.
    ax.set_xticks(range(len(unique_datasets)))
    ax.set_xticklabels(unique_datasets, rotation=45, ha='right', fontsize=10)
    ax.set_xlabel("Dataset", fontsize=12)
    ax.set_ylabel(score_name.split(",")[0].upper(), fontsize=12)
    ax.set_title(f"{task.upper()} Performance by Dataset", fontsize=14, fontweight='bold')

    # Place the legend outside the axes on the right, in a single column.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels, loc='center left', bbox_to_anchor=(1.05, 0.5),
              borderaxespad=0., fontsize=10, ncol=1, frameon=True)

    # Grid and styling.
    ax.grid(True, alpha=0.3, linestyle='-', linewidth=0.5)
    ax.set_axisbelow(True)

    # Pad the y-axis by 5% of its range on both sides.
    y_min, y_max = ax.get_ylim()
    y_range = y_max - y_min
    ax.set_ylim(y_min - 0.05 * y_range, y_max + 0.05 * y_range)

    plt.tight_layout()
    plt.show()

In [None]:
# Draw one figure per results directory configured in score_names.
for results_dir, (task_name, metric_key) in score_names.items():
    plot(base_dir=results_dir, task=task_name, score_name=metric_key)