In [None]:
import json 
import os 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Evaluation tasks to plot. Each key is the directory that collect_data() scans;
# each value is (task, score_name), the two keys used to read a score out of a
# results file as data["results"][task][score_name].
# Entries are plotted in insertion order, one subplot per entry.
score_names = {
    "evaluations/milu": ("milu", "acc,none"),
    "evaluations/mmlu": ("mmlu", "acc,none"),
    "evaluations/mmlu_indic": ("mmlu_indic", "acc,none"),
    "evaluations/mmlu_indic_roman": ("mmlu_indic_roman", "acc,none"),
    # Disabled entry: the "include" task is currently not collected or plotted.
    # "evaluations/include": ("include", "acc,none"),
    "evaluations/igb/xsum": ("igb_xsum", "chrf,none"),
    "evaluations/igb/xquad": ("igb_xquad", "f1,none"),
    "evaluations/igb/xorqa": ("igb_xorqa", "f1,none"),
    "evaluations/igb/flores_xxen": ("igb_flores_xxen", "chrf,none"),
    "evaluations/igb/flores_enxx": ("igb_flores_enxx", "chrf,none"),
}

# Display names for the model component of "__"-prefixed run directory names.
# collect_data() falls back to the raw component when it is not listed here.
model_names = {
    "llama": "Llama-3.1-8B-Instruct",
    "gemma": "gemma-3-12b-it"
}

In [None]:
def get_consistent_style_mapping(all_rows):
    """Derive one colour per model and one marker per model family.

    Args:
        all_rows: mapping whose values are lists of row tuples; only the first
            element of each row (the model name) is read.

    Returns:
        A ``(model_color_map, family_marker_map, model_to_family)`` tuple of
        dicts. Families are visited in sorted order and models in sorted order
        within each family; markers and colours are handed out in that order
        and wrap around when the palettes are exhausted.
    """
    marker_cycle = ['o', 's', 'p', 'h', 'P', 'X', '^', 'v', '>', '<', 'D', '*']
    palette = ['#4285F4', '#EA4335', '#FBBC04', '#34A853', '#9C27B0',
               '#FF6F00', '#00ACC1', '#009688', '#795548', '#607D8B',
               '#AB47BC', '#FF5722']

    def family_of(name):
        # Hyphenated names: the family is everything before the first hyphen.
        if '-' in name:
            return name.split('-')[0]
        # Otherwise cut the name at its first digit, if it has one.
        first_digit = next(
            (pos for pos, ch in enumerate(name) if ch.isdigit()), None
        )
        if first_digit is None:
            return name
        return name[:first_digit].rstrip()

    # Every distinct model name seen in any task, in sorted order.
    names = sorted({row[0] for rows in all_rows.values() for row in rows})

    members_by_family = {}
    for name in names:
        members_by_family.setdefault(family_of(name), []).append(name)

    family_marker_map = {}
    model_color_map = {}
    model_to_family = {}

    next_color = 0
    for family_pos, family in enumerate(sorted(members_by_family)):
        family_marker_map[family] = marker_cycle[family_pos % len(marker_cycle)]
        for name in sorted(members_by_family[family]):
            model_color_map[name] = palette[next_color % len(palette)]
            model_to_family[name] = family
            next_color += 1

    return model_color_map, family_marker_map, model_to_family

def collect_data(base_dir, task, score_name):
    """Collect one score row per results file found under ``base_dir``.

    Each sub-directory of ``base_dir`` is treated as one evaluation run, and
    its name is split on ``"__"``:

    * Names starting with ``"__"`` are "custom" runs. The second-to-last
      component is the model key (mapped through ``model_names``, falling back
      to the raw key) and the third-to-last component is the dataset.
      ``updesh_R`` and ``updesh_R_uniform_random_sample`` are relabelled as
      "(Full)" and "(Sample)".
    * Any other name uses its last component as the model name and the
      dataset ``"Proprietary"``.

    Inside each run directory, every file whose name starts with ``results_``
    is parsed as JSON and ``data["results"][task][score_name]`` is read.

    Args:
        base_dir: directory containing one sub-directory per run.
        task: key under ``"results"`` in each results file.
        score_name: metric key under the task entry.

    Returns:
        List of ``(model_name, score, is_custom, model_name, dataset)`` tuples
        (the model name appears twice), ordered by run directory name and then
        by results file name.

    Raises:
        FileNotFoundError: if ``base_dir`` does not exist.
        IndexError: if a ``"__"``-prefixed directory name has fewer than three
            ``"__"``-separated components.
        KeyError: if a results file lacks the requested task or metric.
    """
    dataset_labels = {
        "updesh_R": "updesh_R (Full)",
        "updesh_R_uniform_random_sample": "updesh_R (Sample)",
    }

    rows = []
    # Sorted so the row order (and therefore the draw order) is reproducible;
    # os.listdir() returns entries in arbitrary order.
    for dirname in sorted(os.listdir(base_dir)):
        dirpath = os.path.join(base_dir, dirname)
        # Stray files next to the run directories (e.g. .DS_Store) would make
        # the nested listdir() raise NotADirectoryError, so skip them.
        if not os.path.isdir(dirpath):
            continue

        is_custom = dirname.startswith("__")
        splits = dirname.split("__")
        if is_custom:
            model_name = model_names.get(splits[-2], splits[-2])
            # split('/')[0] returns the whole string when there is no slash.
            dataset = splits[-3].split('/')[0]
            dataset = dataset_labels.get(dataset, dataset)
        else:
            model_name = splits[-1]
            dataset = "Proprietary"

        for fname in sorted(os.listdir(dirpath)):
            if not fname.startswith("results_"):
                continue
            # JSON is UTF-8; do not depend on the platform's default encoding.
            with open(os.path.join(dirpath, fname), encoding="utf-8") as f:
                data = json.load(f)
            score = data["results"][task][score_name]
            rows.append((model_name, score, is_custom, model_name, dataset))

    return rows

def plot_subplot(ax, rows, task, score_name, model_color_map, family_marker_map, model_to_family):
    """Draw one task's scores on ``ax``, one scatter point per row.

    Datasets (row element 4) are placed on the x-axis in sorted order and the
    score (row element 1) on the y-axis. Marker shape comes from the model's
    family and fill colour from the model itself; unknown models fall back to
    a black ``'o'``. A dashed grey line marks the mean of all scores. When
    ``rows`` is empty, only a "No data" placeholder is drawn.
    """
    # Nothing to draw: leave a centred placeholder and strip the ticks.
    if not rows:
        ax.text(0.5, 0.5, f"No data for {task}", ha='center', va='center', transform=ax.transAxes)
        ax.set_xticks([])
        ax.set_yticks([])
        return

    # Alphabetical dataset order gives every subplot the same x layout.
    dataset_names = sorted({entry[4] for entry in rows})
    x_position = dict(zip(dataset_names, range(len(dataset_names))))

    already_labelled = set()
    for model_name, score, _is_custom, _full_model, dataset in rows:
        # Only the first point of each model carries a legend label.
        if model_name in already_labelled:
            legend_label = ""
        else:
            legend_label = model_name
            already_labelled.add(model_name)

        family_key = model_to_family.get(model_name, model_name)
        ax.scatter(
            x_position[dataset], score,
            c=[model_color_map.get(model_name, '#000000')],
            marker=family_marker_map.get(family_key, 'o'),
            alpha=0.9, edgecolors='black', s=100, linewidth=1,
            label=legend_label,
        )

    # Dashed reference line at the mean of every plotted score.
    ax.axhline(np.mean([entry[1] for entry in rows]),
               color='grey', linestyle='--', linewidth=1, alpha=0.7)

    # Axis ticks, labels and title.
    ax.set_xticks(range(len(dataset_names)))
    ax.set_xticklabels(dataset_names, rotation=45, ha='right', fontsize=8)
    metric_label = score_name.split(",")[0].upper()
    ax.set_ylabel(metric_label, fontsize=9)
    ax.set_title(f"{task.upper()}", fontsize=10, fontweight='bold')

    # Light grid behind the points; hide the top and right frame lines.
    ax.grid(True, alpha=0.3, linestyle='-', linewidth=0.5)
    ax.set_axisbelow(True)
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Widen the y-range by 5% of its span on each side.
    lower, upper = ax.get_ylim()
    margin = 0.05 * (upper - lower)
    ax.set_ylim(lower - margin, upper + margin)







# Driver: collect every task's scores, draw one subplot per task in a shared
# figure with a common legend, and save the result under plots/.
os.makedirs("plots", exist_ok=True)

# First, collect all data to ensure consistent styling
all_data = {}
for k, v in score_names.items():
    rows = collect_data(base_dir=k, task=v[0], score_name=v[1])
    all_data[k] = rows

# Get consistent style mappings
model_color_map, family_marker_map, model_to_family = get_consistent_style_mapping(all_data)

# Size the grid from the number of tasks so that none is silently dropped when
# score_names grows past nine entries. With the current nine tasks this is the
# same 3x3, 18x15-inch figure as before.
n_cols = 3
n_rows = max(1, -(-len(score_names) // n_cols))  # ceiling division, at least one row
# squeeze=False keeps `axes` a 2-D array even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)
axes = axes.flatten()

# Plot each task (the grid always has at least len(score_names) axes)
for idx, (k, v) in enumerate(score_names.items()):
    plot_subplot(axes[idx], all_data[k], v[0], v[1],
                 model_color_map, family_marker_map, model_to_family)

# Hide unused subplots
for idx in range(len(score_names), len(axes)):
    axes[idx].axis('off')

# Create a shared legend: one proxy handle per model, using the same family
# marker and model colour as the scatter points.
handles, labels = [], []
for model_name in sorted(model_color_map.keys()):
    family = model_to_family.get(model_name, model_name)
    marker = family_marker_map.get(family, 'o')
    color = model_color_map[model_name]

    handle = plt.Line2D([0], [0], marker=marker, color='w',
                        markerfacecolor=color, markersize=10,
                        markeredgecolor='black', linewidth=0)
    handles.append(handle)
    labels.append(model_name)

# Add legend at the bottom of the figure
fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, -0.05),
           ncol=5, fontsize=10, frameon=True, fancybox=True, shadow=True)

plt.tight_layout()
# plt.subplots_adjust(bottom=0.15)  # Make room for legend
# bbox_inches='tight' keeps the legend, which sits below the axes area, inside
# the saved page.
plt.savefig("plots/all_tasks_combined.pdf", bbox_inches='tight')
plt.show()