In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Global look-and-feel: seaborn-flavoured matplotlib style plus the HUSL palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the results table. The charts below read its 'Type', 'Model', 'Success',
# 'Failure', 'Error' and 'Success Rate' columns.
df = pd.read_csv('data.csv')

# 2x2 dashboard: one quadrant per chart, under a shared bold headline
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
fig.suptitle(
    'Model Performance Comparison',
    fontsize=20,
    fontweight='bold',
    y=0.98,
)

# Colour scheme: a four-colour palette whose first two entries identify the models
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
model_colors = {'gpt-4o': colors[0], 'gemini-2.0-flash': colors[1]}

# 1. Success Rate Comparison by Type (Bar Chart)
# Parse the "NN%" strings into floats *before* pivoting. Writing the parsed
# numbers back through `pivot_success.values[:]` only works when `.values`
# happens to be a writable view (it is read-only under pandas Copy-on-Write),
# leaves the frame with object dtype, and raises on any NaN cell the pivot
# creates for a missing Model/Type combination.
task_rates = df.loc[df['Type'] != 'OVERALL', ['Type', 'Model', 'Success Rate']].copy()
task_rates['Success Rate'] = task_rates['Success Rate'].str.strip('%').astype(float)
pivot_success = task_rates.pivot(index='Type', columns='Model', values='Success Rate')

# Two bars per task type, each offset half a bar-width either side of the tick.
type_positions = np.arange(len(pivot_success.index))
bar_width = 0.4
for offset, model, label in (
    (-bar_width / 2, 'gpt-4o', 'GPT-4o'),
    (bar_width / 2, 'gemini-2.0-flash', 'Gemini-2.0-Flash'),
):
    ax1.bar(type_positions + offset, pivot_success[model], width=bar_width,
            label=label, color=model_colors[model], alpha=0.8)

ax1.set_xlabel('Task Type', fontweight='bold')
ax1.set_ylabel('Success Rate (%)', fontweight='bold')
ax1.set_title('Success Rate by Task Type', fontweight='bold', fontsize=14)
ax1.set_xticks(type_positions)
# Break the long UPPER_SNAKE type names over several lines so they fit under the bars.
ax1.set_xticklabels([t.replace('_', '\n') for t in pivot_success.index], rotation=0)
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.set_ylim(0, 100)

# 2. Total Attempts Distribution (Stacked Bar)
df_filtered = df[df['Type'] != 'OVERALL'].copy()
# Kept for inspection; the chart itself stacks the three outcome columns directly.
df_filtered['Total'] = df_filtered['Success'] + df_filtered['Failure'] + df_filtered['Error']

models = df_filtered['Model'].unique()
types = df_filtered['Type'].unique()
x = np.arange(len(types))
width = 0.35

# Outcome columns in stacking order (bottom to top) with the opacity of each layer.
outcome_layers = (('Success', 0.8), ('Failure', 0.5), ('Error', 0.3))

for i, model in enumerate(models):
    # Align this model's rows to the shared `types` order. Relying on file order
    # would mislabel bars when a model lists its task types differently, and a
    # missing task type would raise a shape mismatch; here it is drawn as zero.
    model_data = (
        df_filtered[df_filtered['Model'] == model]
        .set_index('Type')[['Success', 'Failure', 'Error']]
        .reindex(types, fill_value=0)
    )

    positions = x + i * width
    stack_base = np.zeros(len(types))
    for outcome, layer_alpha in outcome_layers:
        heights = model_data[outcome].to_numpy()
        ax2.bar(positions, heights, width, bottom=stack_base,
                label=f'{model} - {outcome}', color=model_colors[model], alpha=layer_alpha)
        stack_base = stack_base + heights

ax2.set_xlabel('Task Type', fontweight='bold')
ax2.set_ylabel('Number of Attempts', fontweight='bold')
ax2.set_title('Distribution of Outcomes by Task Type', fontweight='bold', fontsize=14)
# Centre each tick under its group of bars; equals x + width/2 for two models.
ax2.set_xticks(x + width * (len(models) - 1) / 2)
ax2.set_xticklabels([t.replace('_', '\n') for t in types])
# Nine-plus legend entries do not fit inside the axes, so park the legend outside.
ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 3. Overall Performance Radar Chart
# A radar chart needs polar axes, but the quadrant created by plt.subplots() is
# Cartesian, which would render the angle/value pairs as an ordinary x/y line.
# Swap it for a polar Axes occupying the same grid slot.
radar_slot = ax3.get_subplotspec()
ax3.remove()
ax3 = fig.add_subplot(radar_slot, projection='polar')

overall_data = df[df['Type'] == 'OVERALL'].copy()
overall_data['Success Rate Numeric'] = overall_data['Success Rate'].str.rstrip('%').astype(float)

# One spoke per metric; the tick labels are listed in the same order.
# NOTE(review): raw counts and a percentage share one radial scale here -
# confirm that this mix is intended.
radar_metrics = ['Success', 'Failure', 'Error', 'Success Rate Numeric']
radar_labels = ['Success', 'Failure', 'Error', 'Success Rate (%)']

angles = np.linspace(0, 2*np.pi, len(radar_metrics), endpoint=False).tolist()
angles += angles[:1]  # Repeat the first angle so the outline closes

for model in overall_data['Model'].unique():
    model_row = overall_data[overall_data['Model'] == model].iloc[0]
    values = [model_row[metric] for metric in radar_metrics]
    values += values[:1]  # Repeat the first value so the outline closes

    ax3.plot(angles, values, 'o-', linewidth=2, label=model, color=model_colors[model])
    ax3.fill(angles, values, alpha=0.25, color=model_colors[model])

ax3.set_xticks(angles[:-1])
ax3.set_xticklabels(radar_labels)
ax3.set_title('Overall Performance Radar Chart', fontweight='bold', fontsize=14, pad=20)
ax3.legend()
ax3.grid(True)

# 4. Success Rate Trend
# Task types in the order they appear along the x axis (labelled as increasing complexity).
types_ordered = ['ENVIRONMENTAL_INTERACTION', 'VISUAL_UNDERSTANDING', 'SEQUENTIAL_REASONING']
task_rows = df[df['Type'] != 'OVERALL']

# Lowest rate seen so far; starts at the default lower y-limit so the axis
# only widens when some rate actually falls below it.
lowest_rate = 50.0

for model in task_rows['Model'].unique():
    # Look the rates up by task type in one indexed step instead of a nested
    # loop with `.iloc[0]`: a task type missing for this model becomes NaN
    # (a gap in the line) rather than an IndexError. The first row wins if a
    # Model/Type pair is duplicated.
    success_rates = (
        task_rows[task_rows['Model'] == model]
        .drop_duplicates(subset='Type')
        .set_index('Type')['Success Rate']
        .str.strip('%')
        .astype(float)
        .reindex(types_ordered)
    )
    if success_rates.notna().any():
        lowest_rate = min(lowest_rate, success_rates.min())

    ax4.plot(range(len(types_ordered)), success_rates.to_numpy(), 'o-', linewidth=3,
             markersize=8, label=model, color=model_colors[model])

ax4.set_xlabel('Task Complexity →', fontweight='bold')
ax4.set_ylabel('Success Rate (%)', fontweight='bold')
ax4.set_title('Performance Across Task Complexity', fontweight='bold', fontsize=14)
ax4.set_xticks(range(len(types_ordered)))
ax4.set_xticklabels(['Environmental\nInteraction', 'Visual\nUnderstanding', 'Sequential\nReasoning'])
ax4.legend()
ax4.grid(True, alpha=0.3)
# Keep the zoomed-in 50-100 window when every rate fits in it; otherwise lower
# the floor (with a small margin) so points below 50% are not clipped.
y_floor = 50 if lowest_rate >= 50 else max(0.0, lowest_rate - 5)
ax4.set_ylim(y_floor, 100)

# Resolve label/title overlaps between the four quadrants, then render.
plt.tight_layout()
plt.show()

IndentationError: expected an indented block after 'for' statement on line 94 (3835717723.py, line 95)