# Multi-Language LLM Evaluation Analysis

This notebook analyzes and visualizes LLM performance across multiple languages.

**Developed by**: Red Hat AI Customer Adoption and Innovation team (CAI)

## Features
- Load evaluation results from local storage or S3
- Compare model performance across languages (English, Spanish, Japanese)
- Visualize performance metrics with interactive charts
- Generate comprehensive evaluation reports
- Track performance over multiple evaluations

## 1. Setup and Dependencies

In [None]:
# Install required packages (if not already installed)
!pip install -q pandas matplotlib seaborn plotly boto3

In [None]:
import json
import os
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Plot styling: seaborn "whitegrid" theme and a wider default matplotlib
# figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("✅ Dependencies loaded successfully")

## 2. Configuration

Configure where to load evaluation results from:

In [None]:
# ========== CONFIGURATION ==========

# Data source: 'local' or 's3'
DATA_SOURCE = 'local'

# Local results directory
LOCAL_RESULTS_DIR = Path("/workspace/shared-workspace/multilang_evaluation_results")

# S3 Configuration (if using S3).
# Values are read from the environment; only the bucket has a fallback default.
S3_CONFIG = {
    'bucket': os.environ.get('S3_BUCKET', 'llm-evaluation-results'),
    'endpoint_url': os.environ.get('S3_ENDPOINT_URL'),
    'access_key': os.environ.get('AWS_ACCESS_KEY_ID'),
    'secret_key': os.environ.get('AWS_SECRET_ACCESS_KEY'),
}

# Languages to analyze (codes) and the display names used in tables and charts
LANGUAGES = ['en', 'es', 'ja']
LANGUAGE_NAMES = {
    'en': 'English',
    'es': 'Spanish (Español)',
    'ja': 'Japanese (日本語)',
}

print("Configuration:")
print(f"  Data Source: {DATA_SOURCE}")
if DATA_SOURCE not in ('local', 's3'):
    # Any value other than 'local' takes the S3 branch, so flag likely typos
    # instead of silently switching data source.
    print(f"  ⚠️  Unrecognized DATA_SOURCE {DATA_SOURCE!r}; it will be treated as 's3'")
if DATA_SOURCE == 'local':
    print(f"  Local Directory: {LOCAL_RESULTS_DIR}")
else:
    print(f"  S3 Bucket: {S3_CONFIG['bucket']}")

## 3. Data Loading Functions

In [None]:
def load_local_results(results_dir: Path) -> Dict:
    """
    Load evaluation results from a local directory.

    Files read under ``results_dir`` (each one is optional)::

        cross_language_summary.json
        <lang>/results.json          # one per code in LANGUAGES

    Args:
        results_dir: Directory holding the evaluation output. A plain string
            path is accepted as well.

    Returns:
        Dict with two keys: ``'summary'`` (the parsed summary JSON, or ``None``
        when the file is absent) and ``'languages'`` (language code -> parsed
        ``results.json`` for every language whose file exists).

    Raises:
        FileNotFoundError: If ``results_dir`` does not exist.
    """
    results_dir = Path(results_dir)
    print(f"📂 Loading results from: {results_dir}")

    if not results_dir.exists():
        raise FileNotFoundError(f"Results directory not found: {results_dir}")

    results = {
        'summary': None,
        'languages': {},
    }

    # Load cross-language summary
    summary_file = results_dir / "cross_language_summary.json"
    if summary_file.exists():
        # Read as UTF-8 explicitly instead of relying on the platform default,
        # so non-ASCII content loads the same way everywhere.
        with open(summary_file, encoding='utf-8') as f:
            results['summary'] = json.load(f)
        print("  ✓ Loaded summary")
    else:
        print("  ⚠️  Summary not found")

    # Load language-specific results
    for lang in LANGUAGES:
        # Fall back to the raw code for languages without a display name.
        lang_label = LANGUAGE_NAMES.get(lang, lang)
        results_file = results_dir / lang / "results.json"

        if results_file.exists():
            with open(results_file, encoding='utf-8') as f:
                results['languages'][lang] = json.load(f)
            print(f"  ✓ Loaded {lang_label} results")
        else:
            print(f"  ⚠️  {lang_label} results not found")

    return results


def load_s3_results(s3_prefix: str, config: Dict) -> Dict:
    """
    Load evaluation results from S3.

    Objects read from ``config['bucket']`` (each one is optional)::

        <s3_prefix>/cross_language_summary.json
        <s3_prefix>/<lang>/results.json      # one per code in LANGUAGES

    Args:
        s3_prefix: Key prefix of the evaluation run. Surrounding whitespace
            and leading/trailing slashes are ignored.
        config: Dict with ``'bucket'`` and optionally ``'endpoint_url'``,
            ``'access_key'`` and ``'secret_key'``.

    Returns:
        Dict with two keys: ``'summary'`` (parsed summary JSON, or ``None`` if
        it could not be loaded) and ``'languages'`` (language code -> parsed
        ``results.json`` for every language that could be loaded).
    """
    import boto3
    from botocore.exceptions import ClientError

    # A pasted prefix often carries a trailing slash or stray whitespace,
    # which would otherwise produce keys like "prefix//en/results.json".
    prefix = s3_prefix.strip().strip('/')
    print(f"☁️  Loading results from S3: {prefix}")

    # Initialize S3 client; the endpoint and credentials are passed only when
    # configured.
    client_kwargs = {'service_name': 's3'}
    if config.get('endpoint_url'):
        client_kwargs['endpoint_url'] = config['endpoint_url']
    if config.get('access_key') and config.get('secret_key'):
        client_kwargs['aws_access_key_id'] = config['access_key']
        client_kwargs['aws_secret_access_key'] = config['secret_key']

    s3 = boto3.client(**client_kwargs)
    bucket = config['bucket']

    def fetch_json(relative_key, label):
        """Return the parsed JSON object at prefix/relative_key, or None on a client error."""
        key = f"{prefix}/{relative_key}" if prefix else relative_key
        try:
            response = s3.get_object(Bucket=bucket, Key=key)
        except ClientError as err:
            # Best effort: keep going, but separate a missing object from other
            # failures (e.g. access denied) so the message is not misleading.
            code = err.response.get('Error', {}).get('Code', 'Unknown')
            if code in ('NoSuchKey', '404'):
                print(f"  ⚠️  {label} not found")
            else:
                print(f"  ⚠️  {label} could not be loaded (S3 error: {code})")
            return None
        return json.loads(response['Body'].read())

    results = {
        'summary': None,
        'languages': {},
    }

    # Load cross-language summary
    summary = fetch_json("cross_language_summary.json", "Summary")
    if summary is not None:
        results['summary'] = summary
        print("  ✓ Loaded summary")

    # Load language-specific results
    for lang in LANGUAGES:
        # Fall back to the raw code for languages without a display name.
        lang_label = LANGUAGE_NAMES.get(lang, lang)
        lang_results = fetch_json(f"{lang}/results.json", f"{lang_label} results")
        if lang_results is not None:
            results['languages'][lang] = lang_results
            print(f"  ✓ Loaded {lang_label} results")

    return results


print("✅ Data loading functions defined")

## 4. Load Evaluation Results

In [None]:
# Load results based on configuration
if DATA_SOURCE == 'local':
    evaluation_results = load_local_results(LOCAL_RESULTS_DIR)
else:
    # Use the S3_PREFIX environment variable when it is set (lets the notebook
    # run unattended); otherwise prompt for the prefix.
    s3_prefix = os.environ.get('S3_PREFIX') or input(
        "Enter S3 prefix (e.g., evaluations/model_name/version/timestamp): "
    )
    evaluation_results = load_s3_results(s3_prefix, S3_CONFIG)

if not evaluation_results['languages']:
    print("⚠️  No language results were loaded; the analysis below will have no data")

# Display summary
run_summary = evaluation_results['summary']
if run_summary:
    divider = "=" * 60
    print("\n" + divider)
    print("EVALUATION SUMMARY")
    print(divider)
    print(f"Model: {run_summary.get('model_name', 'Unknown')}")
    print(f"Version: {run_summary.get('model_version', 'Unknown')}")
    print(f"Timestamp: {run_summary.get('timestamp', 'Unknown')}")
    print(f"Languages Evaluated: {', '.join(evaluation_results['languages'])}")
    print(divider)

## 5. Data Processing and Analysis

In [None]:
def extract_metrics(results: Dict) -> pd.DataFrame:
    """
    Flatten per-language results into a long-format DataFrame.

    Walks ``results['languages'][<lang>]['results'][<task>][<metric>]`` and
    emits one row per numeric metric value.

    Args:
        results: Dict with a ``'languages'`` mapping of language code to that
            language's results dict (as returned by the loader functions).

    Returns:
        DataFrame with columns ``language``, ``language_name``, ``task``,
        ``metric`` and ``value``. The columns are present even when no metric
        is found, so downstream column access and ``groupby`` keep working.
    """
    columns = ['language', 'language_name', 'task', 'metric', 'value']
    rows = []

    for lang, lang_data in results['languages'].items():
        language_name = LANGUAGE_NAMES.get(lang, lang)
        # Tolerate a missing or null 'results' entry.
        task_results = lang_data.get('results') or {}

        for task, metrics in task_results.items():
            if not isinstance(metrics, dict):
                continue
            for metric_name, value in metrics.items():
                # bool is a subclass of int, but a True/False flag is not a score.
                if isinstance(value, bool) or not isinstance(value, (int, float)):
                    continue
                rows.append({
                    'language': lang,
                    'language_name': language_name,
                    'task': task,
                    'metric': metric_name,
                    'value': value,
                })

    return pd.DataFrame(rows, columns=columns)


# Extract metrics
df_metrics = extract_metrics(evaluation_results)

print(f"\n📊 Extracted {len(df_metrics)} metric values")

if df_metrics.empty:
    # Nothing to preview or aggregate; an empty frame may not even have the
    # 'language_name' column that the groupby below needs.
    print("\n⚠️  No numeric metrics found in the loaded results")
else:
    print("\nSample data:")
    display(df_metrics.head(10))

    # Summary statistics
    print("\n📈 Summary Statistics by Language:")
    summary = df_metrics.groupby('language_name')['value'].agg(['mean', 'std', 'min', 'max', 'count'])
    display(summary)

## 6. Visualization: Performance Comparison Across Languages

In [None]:
# Keep score-like metrics only: accuracy, F1 and exact match.
# - "em" must stand alone (not flanked by letters), so unrelated names that
#   merely contain the letters "em" are not picked up.
# - Standard-error companions (e.g. "acc_stderr") are excluded: they also
#   contain "acc" but are not scores and would distort charts and averages.
if df_metrics.empty:
    df_accuracy = df_metrics.copy()
else:
    metric_names = df_metrics['metric']
    is_score = metric_names.str.contains(
        r'acc|f1|exact_match|(?<![a-z])em(?![a-z])', case=False, na=False
    )
    is_stderr = metric_names.str.contains('stderr', case=False, na=False)
    df_accuracy = df_metrics[is_score & ~is_stderr]

if df_accuracy.empty:
    print("⚠️  No accuracy/F1/exact-match metrics found; skipping bar chart")
else:
    # Create bar chart comparing accuracy across languages
    fig = px.bar(
        df_accuracy,
        x='task',
        y='value',
        color='language_name',
        barmode='group',
        # Show which metric each bar is when a task reports several.
        hover_data=['metric'],
        title='Model Performance Comparison Across Languages',
        labels={'value': 'Score', 'task': 'Benchmark Task', 'language_name': 'Language'},
        height=500
    )

    fig.update_layout(
        xaxis_tickangle=-45,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )

    fig.show()

## 7. Detailed Metrics Table

In [None]:
# Create pivot table for detailed view: one row per (task, metric), one
# column per language. aggfunc='first' keeps the first value if a
# (task, metric, language) combination occurs more than once.
pivot_table = df_accuracy.pivot_table(
    index=['task', 'metric'],
    columns='language_name',
    values='value',
    aggfunc='first'
).round(4)

print("\n📋 Detailed Performance Metrics:")
display(pivot_table)

# Export to CSV (written relative to the current working directory)
output_file = Path("multilang_evaluation_summary.csv")
pivot_table.to_csv(output_file)
print(f"\n💾 Summary exported to: {output_file}")

## 8. Heatmap: Performance Across Tasks and Languages

In [None]:
# Aggregate metrics by task and language: mean over all selected metrics of a
# task, giving one row per task and one column per language.
heatmap_data = df_accuracy.groupby(['task', 'language_name'])['value'].mean().unstack()

if heatmap_data.empty:
    # Guard the empty case explicitly rather than letting the plot call fail.
    print("⚠️  No score metrics available; skipping heatmap")
else:
    # Create heatmap; grow the figure with the number of tasks so that the
    # row labels and annotations stay legible.
    fig_height = max(6, 0.4 * len(heatmap_data))
    plt.figure(figsize=(10, fig_height))
    sns.heatmap(
        heatmap_data,
        annot=True,
        fmt='.3f',
        cmap='RdYlGn',
        cbar_kws={'label': 'Score'},
        # Color scale pinned to [0, 1]; assumes scores are fractions, not
        # percentages. TODO confirm against the evaluation output.
        vmin=0,
        vmax=1
    )
    plt.title('Performance Heatmap: Tasks vs Languages', fontsize=14, pad=20)
    plt.xlabel('Language', fontsize=12)
    plt.ylabel('Benchmark Task', fontsize=12)
    plt.tight_layout()
    plt.show()

## 9. Language Performance Gap Analysis

In [None]:
# Calculate performance gap relative to English:
#   gap % = (language score - English score) / English score * 100
if 'English' in heatmap_data.columns:
    baseline = heatmap_data['English']
    # A relative gap is undefined for a zero baseline; mask zeros to NaN so
    # those tasks are skipped instead of producing +/-inf in plots and means.
    safe_baseline = baseline.where(baseline != 0)

    other_languages = heatmap_data.drop(columns='English')
    performance_gaps = other_languages.sub(baseline, axis=0).div(safe_baseline, axis=0) * 100
    # Always defined in this branch (possibly empty) because the report cell
    # reads it.
    avg_gaps = performance_gaps.mean()

    if performance_gaps.empty:
        # English is the only language, or there are no tasks: nothing to plot.
        print("⚠️  No non-English results available for gap analysis")
    else:
        # Plot performance gaps
        fig, ax = plt.subplots(figsize=(10, 6))
        performance_gaps.plot(kind='bar', ax=ax)
        ax.axhline(y=0, color='black', linestyle='--', linewidth=0.5)
        ax.set_title('Performance Gap Relative to English (%)', fontsize=14, pad=20)
        ax.set_xlabel('Benchmark Task', fontsize=12)
        ax.set_ylabel('Performance Gap (%)', fontsize=12)
        ax.legend(title='Language', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

        print("\n📊 Average Performance Gap (vs English):")
        for lang, gap in avg_gaps.items():
            print(f"  {lang}: {gap:+.2f}%")
else:
    print("⚠️  English baseline not available for gap analysis")

## 10. Export Comprehensive Report

In [None]:
# Generate markdown report as a list of text fragments, written out in one go.
report = []
report.append("# Multi-Language LLM Evaluation Report\n")
report.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

if evaluation_results['summary']:
    summary = evaluation_results['summary']
    report.append("## Model Information\n")
    report.append(f"- **Model**: {summary.get('model_name', 'Unknown')}\n")
    report.append(f"- **Version**: {summary.get('model_version', 'Unknown')}\n")
    report.append(f"- **Evaluation Date**: {summary.get('timestamp', 'Unknown')}\n")

report.append("\n## Performance Summary\n")
try:
    table_text = pivot_table.to_markdown()
except ImportError:
    # DataFrame.to_markdown() needs the optional 'tabulate' package; fall back
    # to a plain-text table in a code fence rather than failing the report.
    table_text = "```\n" + pivot_table.to_string() + "\n```"
report.append(f"\n{table_text}\n")

if 'English' in heatmap_data.columns and len(performance_gaps) > 0:
    report.append("\n## Performance Gap Analysis\n")
    report.append("\nAverage performance gap relative to English:\n")
    for lang, gap in avg_gaps.items():
        report.append(f"- **{lang}**: {gap:+.2f}%\n")

# Save report. The text contains non-ASCII language names, so write UTF-8
# explicitly instead of depending on the platform's default encoding.
report_file = Path("multilang_evaluation_report.md")
with open(report_file, 'w', encoding='utf-8') as f:
    f.writelines(report)

print(f"\n✅ Comprehensive report saved to: {report_file}")
print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)

## 11. Optional: Compare Multiple Evaluation Runs

If you have multiple evaluation runs, you can load and compare them:

In [None]:
# Example: Load multiple evaluation runs for comparison
# Uncomment and modify as needed

# evaluation_dirs = [
#     Path("/workspace/shared-workspace/multilang_evaluation_results_v1"),
#     Path("/workspace/shared-workspace/multilang_evaluation_results_v2"),
# ]

# all_results = {}
# for eval_dir in evaluation_dirs:
#     # The text after the last underscore of the directory name is the run label
#     version = eval_dir.name.split('_')[-1]
#     all_results[version] = load_local_results(eval_dir)

# # Create comparison visualizations
# # ... your comparison code here

print("ℹ️  To compare multiple runs, uncomment and configure the code above in this cell")