In [None]:
# Juror-AI Comparison Analysis
*Comprehensive analysis comparing human juror ratings with AI model predictions*

## Overview
This notebook analyzes how closely three AI models (OpenAI GPT-4.1, Claude Sonnet 4, and Gemini 2.5) align with human juror ratings across four legal scenarios.

### Dataset Information
- **Total Responses**: 1,198 juror ratings
- **Scenarios**: 4 different legal scenarios
- **AI Models**: OpenAI, Claude, Gemini
- **Demographics**: Political affiliation, gender, ethnicity, education, income

### Analysis Goals
1. Evaluate overall AI model performance vs human ratings
2. Identify scenario-specific performance patterns
3. Explore demographic influences on AI-human alignment
4. Provide actionable insights for legal AI applications


In [None]:
# Setup and Imports
# Standard-library modules; `sys` is needed right away for the path tweak below.
import sys
import os
# Add the parent directory to the module search path, presumably so the
# `src.*` imports later in this cell resolve. The entry is relative ('../'),
# so this assumes the kernel's working directory is the notebook's folder.
# NOTE(review): re-running this cell appends a duplicate entry each time.
sys.path.append('../')

# Core libraries
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation and data
# warnings are hidden too — consider narrowing it if results look off.
warnings.filterwarnings('ignore')

# Visualization
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
import matplotlib.pyplot as plt

# Custom modules
from src.data_loader import (
    load_data, validate_data_structure, preprocess_data, 
    print_data_summary, get_sample_data
)
from src.metrics import (
    compute_all_metrics, print_metrics_report, 
    compare_models_pairwise, create_metrics_summary_table
)
from src.plots import (
    setup_plot_style, plot_overall_mae, plot_scenario_heatmap,
    plot_error_distribution, plot_demographic_comparison,
    plot_scatter_matrix, plot_scenario_trends, PALETTE
)
from src.utils import (
    format_metrics_for_display, get_top_performers, 
    create_performance_summary, check_data_quality
)

# Pandas display settings, applied in a single call as (option, value) pairs:
# no cap on displayed columns, no fixed display width, 50-character cell text.
pd.set_option(
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', 50,
)

# Apply the project's plot styling, then select plotly's "notebook" renderer.
# Order is kept as-is in case the styling helper touches renderer defaults.
setup_plot_style()
pio.renderers.default = "notebook"

# Status banner: one line per message, same text and order as before.
print(
    "✅ All modules loaded successfully!",
    f"📊 Color Palette: {PALETTE}",
    "🚀 Ready for analysis!",
    sep="\n",
)


In [None]:
## 📥 Data Loading & Validation

Let's start by loading the dataset and performing comprehensive validation to ensure data quality.


In [None]:
# Read the raw ratings file through the project loader.
raw_df = load_data("../data/datacsv.csv")

# Report the frame's dimensions and column names.
print(f"\n📋 Dataset Shape: {raw_df.shape}")
print(f"🔗 Columns: {list(raw_df.columns)}")

# Run the project's quality check; the report is indexed by
# 'quality_score' and 'issues' below.
quality_report = check_data_quality(raw_df)
print(f"\n🎯 Data Quality Score: {quality_report['quality_score']}/100")

# Either confirm a clean bill of health or list each flagged issue.
found_issues = quality_report['issues']
if not found_issues:
    print("✅ No major data quality issues detected!")
else:
    print("\n⚠️ Data Quality Issues:")
    for issue in found_issues:
        print(f"  • {issue}")

# Preview the first rows with the notebook's rich display.
print("\n📊 Sample Data:")
display(raw_df.head())


In [None]:
# Derive the analysis frame from the raw frame; `raw_df` itself is not reassigned.
df = preprocess_data(raw_df)

# Emit the project's textual summary of the processed frame.
print_data_summary(df)

# Columns shown in the preview: identifiers, the human rating, the three
# model ratings, and two demographic fields.
preview_columns = [
    'JurorID', 'Scenario', 'Human', 'OpenAI', 'Claude', 'Gemini', 'Party', 'Gender',
]

# Display a 10-row sample restricted to the preview columns.
print("\n🔍 Processed Data Sample:")
sample_data = get_sample_data(df, 10)
display(sample_data[preview_columns])


In [None]:
## 📊 Statistical Metrics Computation

Now let's compute comprehensive metrics to evaluate how well each AI model aligns with human juror ratings.


In [None]:
# Compute the full metrics bundle once; later cells reuse `all_metrics`.
print("🧮 Computing comprehensive metrics...")
all_metrics = compute_all_metrics(df)

# Text report of everything in the bundle.
print_metrics_report(all_metrics)

# Overall metrics, formatted for display as a table.
print("\n📋 Overall Performance Table:")
overall_metrics = all_metrics['overall']
overall_formatted = format_metrics_for_display(overall_metrics)
display(overall_formatted)

# Best model per analysis by MAE: print the first entry of each list,
# falling back to 'N/A' when the list is empty.
top_performers = get_top_performers(all_metrics, metric='MAE')
print(f"\n🏆 Top Performers by Analysis:")
for analysis, models in top_performers.items():
    print(f"  • {analysis}: {models[0] if models else 'N/A'}")

# Model-vs-model comparison computed from the processed frame.
print("\n🔄 Pairwise Model Comparison:")
pairwise_results = compare_models_pairwise(df)
display(pairwise_results)


In [None]:
## 📈 Comprehensive Visualizations

Let's create a comprehensive set of visualizations to understand model performance patterns across different dimensions.


In [None]:
# 1. Overall Model Performance
# Build the figure from the overall metrics and render it inline.
print("📊 Creating Overall Performance Visualization...")
fig_overall = plot_overall_mae(all_metrics['overall'])
fig_overall.show()

# Save figure
# Create the output folder first: the writers below open the target path
# directly, so a fresh checkout without results/figs would fail here.
os.makedirs("../results/figs", exist_ok=True)
fig_overall.write_html("../results/figs/overall_performance.html")
# NOTE(review): assuming this is a plotly figure, static PNG export needs
# plotly's image-export backend (kaleido) installed in the kernel environment.
fig_overall.write_image("../results/figs/overall_performance.png", width=800, height=500)


In [None]:
# 2. Scenario Performance Heatmap
# Build the heatmap from the processed frame and render it inline.
print("🔥 Creating Scenario Performance Heatmap...")
fig_heatmap = plot_scenario_heatmap(df)
fig_heatmap.show()

# Save figure
# Create the output folder first: the writers below open the target path
# directly, so a fresh checkout without results/figs would fail here.
os.makedirs("../results/figs", exist_ok=True)
fig_heatmap.write_html("../results/figs/scenario_heatmap.html")
# NOTE(review): assuming this is a plotly figure, static PNG export needs
# plotly's image-export backend (kaleido) installed in the kernel environment.
fig_heatmap.write_image("../results/figs/scenario_heatmap.png", width=800, height=600)


In [None]:
# 3. Error Distribution Analysis
print("📊 Creating Error Distribution Analysis...")
fig_errors = plot_error_distribution(df)
fig_errors.show()

# 4. Demographic Comparison - Political Affiliation
# 'Party' is the demographic column used for the comparison.
print("🏛️ Creating Political Affiliation Analysis...")
fig_party = plot_demographic_comparison(df, 'Party')
fig_party.show()

# 5. Scenario Trends
print("📈 Creating Scenario Trends Analysis...")
fig_trends = plot_scenario_trends(df)
fig_trends.show()

# Save all figures
# Create the output folder first: write_html opens the target path directly,
# so a fresh checkout without results/figs would fail on the first save.
os.makedirs("../results/figs", exist_ok=True)
fig_errors.write_html("../results/figs/error_distribution.html")
fig_party.write_html("../results/figs/party_comparison.html")
fig_trends.write_html("../results/figs/scenario_trends.html")

print("✅ All visualizations created and saved!")


In [None]:
## 🎯 Summary & Key Insights

Let's create a comprehensive summary of our findings.


In [None]:
# Build the textual performance summary from the processed frame and print it.
performance_summary = create_performance_summary(df)
print(performance_summary)

# Write the metrics bundle to CSV and report what the exporter returned.
print("\n💾 Exporting Results...")
from src.utils import export_results_to_csv
exported_files = export_results_to_csv(all_metrics)
print(f"✅ Exported metrics to: {exported_files}")

# Closing section: an 80-character rule frames the heading and the footer.
banner = "="*80
print("\n" + banner, "🔍 KEY FINDINGS & NEXT STEPS", banner, sep="\n")

# NOTE(review): the text below is static. The "PERFORMANCE RANKING" paragraph
# announces a ranking but lists none, and the demographic paragraph mentions
# education level although only 'Party' is plotted in the visible cells —
# consider generating these statements from `all_metrics`.
print("""
📊 PERFORMANCE RANKING:
Based on Mean Absolute Error (MAE), the models rank as follows:
(Lower MAE = Better performance)

📈 SCENARIO INSIGHTS:
Different models excel in different legal scenarios, suggesting
specialized applications may benefit from ensemble approaches.

👥 DEMOGRAPHIC PATTERNS:
Political affiliation and education level show interesting
correlations with AI-human alignment patterns.

🚀 NEXT STEPS:
1. Interactive dashboard for real-time exploration
2. Deep-dive analysis of outlier cases
3. Model ensemble recommendations
4. Legal implications and recommendations

📱 Launch the Streamlit dashboard for interactive exploration:
   streamlit run app/streamlit_app.py
""")

print(banner, "✅ ANALYSIS COMPLETE!", banner, sep="\n")
