# LegalBench Results Analysis

This notebook analyzes the balanced accuracy scores across different models from the LegalBench evaluation results.

In [25]:
import json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import re

In [26]:
# Load all JSON files from the results directory
results_dir = Path('results')
results = []

# Models to compare
target_models = ['gemma-3-12b-it', 'Mistral-7B-Instruct-v0.3']

for json_file in results_dir.glob('*.json'):
    try:
        with open(json_file, 'r') as f:
            data = json.load(f)
            
        # Extract required fields
        if ('dataset' in data and 
            'name' in data['dataset'] and
            'model' in data and 
            'balanced_accuracy' in data):
            
            # Clean model name by removing organization prefix
            model_name = data['model']
            if '/' in model_name:
                model_name = model_name.split('/')[-1]
            
            # Filter for target models only
            if model_name in target_models:
                results.append({
                    'model': model_name,
                    'task': data['dataset']['name'],
                    'balanced_accuracy': data['balanced_accuracy'],
                    'file': json_file.name
                })
    except Exception as e:
        print(f"Error processing {json_file}: {e}")

print(f"Loaded {len(results)} results for target models")

# Create DataFrame
df = pd.DataFrame(results)
print(f"\nResults by model:")
print(df['model'].value_counts())
print(f"\nUnique tasks: {df['task'].nunique()}")
print(df['task'].unique())

Loaded 58 results for target models

Results by model:
model
gemma-3-12b-it              29
Mistral-7B-Instruct-v0.3    29
Name: count, dtype: int64

Unique tasks: 29
['overruling'
 'maud_change_in_law__subject_to_disproportionate_impact_modifier'
 'cuad_affiliate_license-licensor' 'cuad_irrevocable_or_perpetual_license'
 'cuad_governing_law' 'maud_relational_language_(mae)_applies_to'
 'abercrombie' 'cuad_covenant_not_to_sue' 'cuad_cap_on_liability'
 'maud_pandemic_or_other_public_health_event__subject_to_disproportionate_impact_modifier'
 'international_citizenship_questions' 'maud_fls_(mae)_standard'
 'cuad_unlimited-all-you-can-eat-license' 'cuad_license_grant'
 'opp115_international_and_specific_audiences'
 'cuad_revenue-profit_sharing'
 'maud_ability_to_consummate_concept_is_subject_to_mae_carveouts'
 'definition_extraction' 'opp115_do_not_track' 'definition_classification'
 'maud_changes_in_gaap_or_other_accounting_principles__subject_to_disproportionate_impact_modifier'
 'ssla_

## Model Comparison: gemma-3-12b-it vs Mistral-7B-Instruct-v0.3

First, let's compare the performance of gemma-3-12b-it and Mistral-7B-Instruct-v0.3 across all LegalBench tasks.

In [27]:
# Prepare data for comparison chart
# Pivot the data to have tasks as rows and models as columns
comparison_df = df.pivot(index='task', columns='model', values='balanced_accuracy').fillna(0)

# Reset index to make 'task' a regular column
comparison_df = comparison_df.reset_index()

print("Model comparison data:")
print(comparison_df.head(10))

# Calculate difference between models where both have results
comparison_df['difference'] = comparison_df['gemma-3-12b-it'] - comparison_df['Mistral-7B-Instruct-v0.3']
comparison_df['has_both'] = (comparison_df['gemma-3-12b-it'] > 0) & (comparison_df['Mistral-7B-Instruct-v0.3'] > 0)

print(f"\nTasks with results from both models: {comparison_df['has_both'].sum()}")
print(f"Tasks with only gemma-3-12b-it: {(comparison_df['gemma-3-12b-it'] > 0).sum() - comparison_df['has_both'].sum()}")
print(f"Tasks with only Mistral-7B-Instruct-v0.3: {(comparison_df['Mistral-7B-Instruct-v0.3'] > 0).sum() - comparison_df['has_both'].sum()}")

Model comparison data:
model                                   task  Mistral-7B-Instruct-v0.3  \
0                                abercrombie                  0.589474   
1            cuad_affiliate_license-licensee                  0.919192   
2            cuad_affiliate_license-licensor                  0.818182   
3                      cuad_cap_on_liability                  0.886035   
4                   cuad_covenant_not_to_sue                  0.886364   
5                         cuad_governing_law                  0.914384   
6               cuad_ip_ownership_assignment                  0.927083   
7      cuad_irrevocable_or_perpetual_license                  0.975000   
8                         cuad_license_grant                  0.901146   
9                cuad_revenue-profit_sharing                  0.935401   

model  gemma-3-12b-it  
0            0.631579  
1            0.909091  
2            0.920455  
3            0.939807  
4            0.938312  
5            0.986

In [28]:
# Create grouped bar chart comparing the two models across tasks
# Melt the dataframe to have model as a column for plotting
plot_data = pd.melt(
    comparison_df, 
    id_vars=['task'], 
    value_vars=['gemma-3-12b-it', 'Mistral-7B-Instruct-v0.3'],
    var_name='model', 
    value_name='balanced_accuracy'
)

# Remove rows with zero values (no results)
plot_data = plot_data[plot_data['balanced_accuracy'] > 0]

# Sort tasks by average performance for better visualization
task_avg = plot_data.groupby('task')['balanced_accuracy'].mean().sort_values(ascending=True)
plot_data['task'] = pd.Categorical(plot_data['task'], categories=task_avg.index, ordered=True)
plot_data = plot_data.sort_values('task')

fig = px.bar(
    plot_data,
    x='task',
    y='balanced_accuracy',
    color='model',
    barmode='group',
    title='Model Performance Comparison Across LegalBench Tasks<br>gemma-3-12b-it vs Mistral-7B-Instruct-v0.3',
    labels={'balanced_accuracy': 'Balanced Accuracy', 'task': 'Task'},
    color_discrete_map={
        'gemma-3-12b-it': '#1f77b4',
        'Mistral-7B-Instruct-v0.3': '#ff7f0e'
    }
)

# Customize layout
fig.update_layout(
    height=600,
    width=max(1200, len(task_avg) * 60),
    font=dict(size=11),
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    xaxis_tickangle=-45,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

fig.show()

In [29]:
# Create a difference chart showing which model performs better on each task
# Only include tasks where both models have results
both_models_df = comparison_df[comparison_df['has_both']].copy()

if len(both_models_df) > 0:
    # Sort by difference for better visualization
    both_models_df = both_models_df.sort_values('difference', ascending=True)
    
    # Create color based on which model is better
    colors = ['#ff4d4d' if diff < 0 else '#4dff4d' for diff in both_models_df['difference']]
    
    fig = go.Figure()
    
    fig.add_trace(go.Bar(
        x=both_models_df['difference'],
        y=both_models_df['task'],
        orientation='h',
        marker_color=colors,
        text=[f'{diff:.3f}' for diff in both_models_df['difference']],
        textposition='outside',
        name='Performance Difference'
    ))
    
    fig.update_layout(
        title='Performance Difference: gemma-3-12b-it vs Mistral-7B-Instruct-v0.3<br><span style="font-size:12px">Green: gemma-3-12b-it better | Red: Mistral-7B-Instruct-v0.3 better</span>',
        xaxis_title='Balanced Accuracy Difference (gemma - Mistral)',
        yaxis_title='Task',
        height=max(400, len(both_models_df) * 25),
        width=800,
        font=dict(size=11),
        title_font_size=14,
        showlegend=False
    )
    
    # Add vertical line at x=0
    fig.add_vline(x=0, line_dash="dash", line_color="black", opacity=0.5)
    
    fig.show()
else:
    print("No tasks found where both models have results for direct comparison.")

In [30]:
# Summary statistics for the model comparison
print("=== Model Comparison Summary ===\n")

# Overall performance
gemma_results = df[df['model'] == 'gemma-3-12b-it']
mistral_results = df[df['model'] == 'Mistral-7B-Instruct-v0.3']

print(f"gemma-3-12b-it:")
print(f"  - Number of tasks: {len(gemma_results)}")
print(f"  - Mean balanced accuracy: {gemma_results['balanced_accuracy'].mean():.4f}")
print(f"  - Std deviation: {gemma_results['balanced_accuracy'].std():.4f}")
print(f"  - Best task: {gemma_results.loc[gemma_results['balanced_accuracy'].idxmax(), 'task']} ({gemma_results['balanced_accuracy'].max():.4f})")
print(f"  - Worst task: {gemma_results.loc[gemma_results['balanced_accuracy'].idxmin(), 'task']} ({gemma_results['balanced_accuracy'].min():.4f})")

print(f"\nMistral-7B-Instruct-v0.3:")
print(f"  - Number of tasks: {len(mistral_results)}")
print(f"  - Mean balanced accuracy: {mistral_results['balanced_accuracy'].mean():.4f}")
print(f"  - Std deviation: {mistral_results['balanced_accuracy'].std():.4f}")
print(f"  - Best task: {mistral_results.loc[mistral_results['balanced_accuracy'].idxmax(), 'task']} ({mistral_results['balanced_accuracy'].max():.4f})")
print(f"  - Worst task: {mistral_results.loc[mistral_results['balanced_accuracy'].idxmin(), 'task']} ({mistral_results['balanced_accuracy'].min():.4f})")

# Head-to-head comparison
if len(both_models_df) > 0:
    print(f"\n=== Head-to-Head Comparison ({len(both_models_df)} tasks) ===")
    gemma_wins = (both_models_df['difference'] > 0).sum()
    mistral_wins = (both_models_df['difference'] < 0).sum()
    ties = (both_models_df['difference'] == 0).sum()
    
    print(f"gemma-3-12b-it wins: {gemma_wins}")
    print(f"Mistral-7B-Instruct-v0.3 wins: {mistral_wins}")
    print(f"Ties: {ties}")
    print(f"Average difference (gemma - mistral): {both_models_df['difference'].mean():.4f}")
    
    if gemma_wins > 0:
        biggest_gemma_win = both_models_df.loc[both_models_df['difference'].idxmax()]
        print(f"Biggest gemma advantage: {biggest_gemma_win['task']} (+{biggest_gemma_win['difference']:.4f})")
    
    if mistral_wins > 0:
        biggest_mistral_win = both_models_df.loc[both_models_df['difference'].idxmin()]
        print(f"Biggest mistral advantage: {biggest_mistral_win['task']} ({biggest_mistral_win['difference']:.4f})")

=== Model Comparison Summary ===

gemma-3-12b-it:
  - Number of tasks: 29
  - Mean balanced accuracy: 0.7570
  - Std deviation: 0.2255
  - Best task: cuad_governing_law (0.9863)
  - Worst task: maud_fls_(mae)_standard (0.3012)

Mistral-7B-Instruct-v0.3:
  - Number of tasks: 29
  - Mean balanced accuracy: 0.7247
  - Std deviation: 0.2023
  - Best task: cuad_irrevocable_or_perpetual_license (0.9750)
  - Worst task: maud_fls_(mae)_standard (0.2500)

=== Head-to-Head Comparison (29 tasks) ===
gemma-3-12b-it wins: 19
Mistral-7B-Instruct-v0.3 wins: 8
Ties: 2
Average difference (gemma - mistral): 0.0323
Biggest gemma advantage: opp115_do_not_track (+0.3727)
Biggest mistral advantage: ssla_plaintiff (-0.3482)


## Abercrombie Task Analysis - All Models

Now let's look at all models' performance specifically on the Abercrombie trademark classification task.

In [31]:
# Load abercrombie results for all models
abercrombie_results = []

for json_file in results_dir.glob('*.json'):
    try:
        with open(json_file, 'r') as f:
            data = json.load(f)
            
        # Filter for abercrombie dataset only
        if ('dataset' in data and 
            'name' in data['dataset'] and 
            'abercrombie' in data['dataset']['name'].lower() and
            'model' in data and 
            'balanced_accuracy' in data):
            
            # Clean model name by removing organization prefix
            model_name = data['model']
            if '/' in model_name:
                model_name = model_name.split('/')[-1]
            
            abercrombie_results.append({
                'model': model_name,
                'balanced_accuracy': data['balanced_accuracy'],
                'dataset_name': data['dataset']['name'],
                'file': json_file.name
            })
    except Exception as e:
        print(f"Error processing {json_file}: {e}")

print(f"Loaded {len(abercrombie_results)} abercrombie results")

# Create DataFrame for abercrombie
abercrombie_df = pd.DataFrame(abercrombie_results)
print("\nFirst few abercrombie results:")
print(abercrombie_df.head())

Loaded 17 abercrombie results

First few abercrombie results:
                      model  balanced_accuracy dataset_name  \
0            gemma-3-12b-it           0.631579  abercrombie   
1             gemma-3-4b-it           0.389474  abercrombie   
2                SmolLM3-3B           0.410526  abercrombie   
3    hplt2c_eng_checkpoints           0.189474  abercrombie   
4  Mistral-7B-Instruct-v0.3           0.589474  abercrombie   

                                                file  
0  legalbench-abercrombie-gemma-3-12b-it_eng_Latn...  
1  legalbench-abercrombie-gemma-3-4b-it_eng_Latn....  
2    legalbench-abercrombie-SmolLM3-3B_eng_Latn.json  
3  legalbench-abercrombie-hplt2c_eng_checkpoints_...  
4  legalbench-abercrombie-Mistral-7B-Instruct-v0....  


In [32]:
# Group by model and calculate mean balanced accuracy for abercrombie
abercrombie_scores = abercrombie_df.groupby('model')['balanced_accuracy'].agg(['mean', 'count']).reset_index()
abercrombie_scores.columns = ['model', 'mean_balanced_accuracy', 'num_evaluations']
abercrombie_scores = abercrombie_scores.sort_values('mean_balanced_accuracy', ascending=True)

print("Abercrombie model performance summary:")
print(abercrombie_scores)

Abercrombie model performance summary:
                       model  mean_balanced_accuracy  num_evaluations
0               EuroLLM-1.7B                0.168421                1
16    hplt2c_eng_checkpoints                0.189474                1
1                 EuroLLM-9B                0.252632                1
13             gemma-3-1b-pt                0.263158                1
12             gemma-3-1b-it                0.284211                1
15             gemma-3-4b-pt                0.305263                1
14             gemma-3-4b-it                0.389474                1
5   OLMo-2-1124-13B-Instruct                0.400000                1
8                 SmolLM3-3B                0.410526                1
9            SmolLM3-3B-Base                0.421053                1
4            OLMo-2-1124-13B                0.442105                1
6             OLMo-2-1124-7B                0.526316                1
3            Mistral-7B-v0.1                0.53684

In [33]:
# Create horizontal bar chart for abercrombie results
fig = px.bar(
    abercrombie_scores, 
    x='mean_balanced_accuracy', 
    y='model',
    orientation='h',
    title='Abercrombie Task Performance - All Models<br>(Mean Balanced Accuracy)',
    labels={'mean_balanced_accuracy': 'Balanced Accuracy', 'model': 'Model'},
    text=[f'{score:.3f} ({count} evals)' for score, count in zip(abercrombie_scores['mean_balanced_accuracy'], abercrombie_scores['num_evaluations'])],
    color='mean_balanced_accuracy',
    color_continuous_scale='viridis'
)

# Customize layout
fig.update_traces(textposition='outside')
fig.update_layout(
    height=max(400, len(abercrombie_scores) * 30),
    width=900,
    showlegend=False,
    font=dict(size=12),
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14
)

fig.show()

In [34]:
# Alternative vertical bar chart for abercrombie
# Sort by balanced accuracy for better visualization
abercrombie_scores_sorted = abercrombie_scores.sort_values('mean_balanced_accuracy', ascending=True)

fig = px.bar(
    abercrombie_scores_sorted, 
    x='model', 
    y='mean_balanced_accuracy',
    title='Abercrombie Task Performance - All Models<br>(Mean Balanced Accuracy)',
    labels={'mean_balanced_accuracy': 'Balanced Accuracy', 'model': 'Model'},
    text=[f'{score:.3f}<br>({count} evals)' for score, count in zip(abercrombie_scores_sorted['mean_balanced_accuracy'], abercrombie_scores_sorted['num_evaluations'])],
    color='mean_balanced_accuracy',
    color_continuous_scale='plasma'
)

# Customize layout
fig.update_traces(textposition='outside')
fig.update_layout(
    height=600,
    width=max(800, len(abercrombie_scores) * 50),
    showlegend=False,
    font=dict(size=11),
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    xaxis_tickangle=-45
)

fig.show()

In [35]:
# Show detailed statistics for abercrombie
print("\nDetailed Abercrombie Model Statistics:")
print("=" * 50)
for _, row in abercrombie_scores.sort_values('mean_balanced_accuracy', ascending=False).iterrows():
    print(f"{row['model']:<30} {row['mean_balanced_accuracy']:.4f} ({row['num_evaluations']} evaluations)")
    
print(f"\nAbercrombie Overall Statistics:")
print(f"Number of models: {len(abercrombie_scores)}")
print(f"Total evaluations: {abercrombie_scores['num_evaluations'].sum()}")
print(f"Mean balanced accuracy across all models: {abercrombie_scores['mean_balanced_accuracy'].mean():.4f}")
print(f"Best performing model: {abercrombie_scores.loc[abercrombie_scores['mean_balanced_accuracy'].idxmax(), 'model']}")
print(f"Best score: {abercrombie_scores['mean_balanced_accuracy'].max():.4f}")


Detailed Abercrombie Model Statistics:
gemma-3-12b-it                 0.6316 (1 evaluations)
Mistral-7B-Instruct-v0.3       0.5895 (1 evaluations)
OLMo-2-1124-7B-Instruct        0.5579 (1 evaluations)
gemma-3-12b-pt                 0.5368 (1 evaluations)
Mistral-7B-v0.1                0.5368 (1 evaluations)
OLMo-2-1124-7B                 0.5263 (1 evaluations)
OLMo-2-1124-13B                0.4421 (1 evaluations)
SmolLM3-3B-Base                0.4211 (1 evaluations)
SmolLM3-3B                     0.4105 (1 evaluations)
OLMo-2-1124-13B-Instruct       0.4000 (1 evaluations)
gemma-3-4b-it                  0.3895 (1 evaluations)
gemma-3-4b-pt                  0.3053 (1 evaluations)
gemma-3-1b-it                  0.2842 (1 evaluations)
gemma-3-1b-pt                  0.2632 (1 evaluations)
EuroLLM-9B                     0.2526 (1 evaluations)
hplt2c_eng_checkpoints         0.1895 (1 evaluations)
EuroLLM-1.7B                   0.1684 (1 evaluations)

Abercrombie Overall Statistics:
Number of