# Results Analysis

This notebook analyzes experimental results and generates visualizations.

**Contents:**
1. Load experimental results
2. Generate mock data for demonstration
3. Accuracy comparison
4. Loss comparison
5. Statistical validation
6. Overall rankings
7. Summary table

In [None]:
import sys
from pathlib import Path
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project's src/ directory importable. The path is resolved as
# <parent of the current working directory>/src, so this only works when the
# notebook is run from a directory that is a sibling of src/.
sys.path.insert(0, str(Path.cwd().parent / 'src'))

from visualization import (
    plot_accuracy_comparison,
    plot_loss_comparison,
    plot_confidence_intervals,
    plot_technique_rankings,
)
from pipeline.statistics import StatisticalValidator

# Plot defaults for the whole notebook: seaborn's 'whitegrid' style and a
# default matplotlib figure size of (12, 6).
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Results

**Note:** This section assumes results from a completed experiment.
If you haven't run an experiment yet, run:
```bash
python main.py run-experiment
```

In [None]:
# Load results (update path if needed)
results_path = '../results/experiment_results.json'


def load_results(path):
    """Read an experiment results JSON file, returning None when it is unusable.

    Args:
        path: Location of the results file (str or path-like).

    Returns:
        The parsed JSON document, or None when the file is missing or cannot
        be decoded as UTF-8 JSON. A short message is printed in both failure
        cases so the notebook keeps running.
    """
    try:
        # Explicit encoding so the file is decoded the same way on every platform.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print("❌ Results file not found. Please run an experiment first.")
        print("   Command: python main.py run-experiment")
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # A truncated or corrupted file should not crash the cell.
        print(f"❌ Results file is not valid JSON: {exc}")
        print("   Command: python main.py run-experiment")
    return None


results = load_results(results_path)

if results is not None:
    print("✅ Results loaded successfully!")
    # Read the config defensively: a results file without a 'config' section
    # (or with missing keys) still produces a summary instead of a KeyError.
    config = results.get('config', {}) if isinstance(results, dict) else {}
    print("\nConfiguration:")
    print(f"  Model: {config.get('llm_model', 'unknown')}")
    print(f"  Techniques: {', '.join(map(str, config.get('techniques', [])))}")

## 2. Mock Data for Demonstration

Since we may not have run actual experiments yet, let's create mock data. Note that all of the analysis below uses this mock data, not the results loaded in Section 1:

In [None]:
# Mock results for demonstration, one row per technique:
# (technique name, mock accuracy, mock loss). Loss is lower-is-better.
_mock_rows = [
    ('baseline', 0.72, 0.38),
    ('chain_of_thought', 0.85, 0.25),
    ('chain_of_thought_plus_plus', 0.89, 0.20),
    ('react', 0.83, 0.27),
    ('tree_of_thoughts', 0.87, 0.23),
    ('role_based', 0.78, 0.32),
    ('few_shot', 0.81, 0.29),
]

# Derive the three lookup structures from the single table so they stay in sync.
techniques = [row[0] for row in _mock_rows]
mock_accuracy = {row[0]: row[1] for row in _mock_rows}
mock_loss = {row[0]: row[2] for row in _mock_rows}

# Seed NumPy's global generator so the sampled distributions are reproducible.
np.random.seed(42)

# 30 samples per technique, drawn from a normal distribution centred on the
# technique's mock accuracy with a standard deviation of 0.05.
mock_accuracy_dist = {
    name: list(np.random.normal(centre, 0.05, 30))
    for name, centre in mock_accuracy.items()
}

print("Mock data generated for demonstration")

## 3. Accuracy Comparison

In [None]:
# Draw the accuracy comparison figure with the project's plotting helper
fig = plot_accuracy_comparison(
    mock_accuracy,
    title="Accuracy Comparison by Prompt Technique",
)
plt.show()

# List techniques from highest to lowest accuracy
accuracy_ranked = sorted(mock_accuracy.items(), key=lambda item: item[1], reverse=True)
print("\nAccuracy Rankings:")
for position, (tech, acc) in enumerate(accuracy_ranked, start=1):
    print(f"  {position}. {tech}: {acc:.3f}")

## 4. Loss Comparison

In [None]:
# Draw the loss comparison figure with the project's plotting helper
fig = plot_loss_comparison(
    mock_loss,
    title="Loss Function Comparison (Lower is Better)",
)
plt.show()

# List techniques from lowest to highest loss (ascending sort: best first)
loss_ranked = sorted(mock_loss.items(), key=lambda item: item[1])
print("\nLoss Rankings (Lower is Better):")
for position, (tech, loss) in enumerate(loss_ranked, start=1):
    print(f"  {position}. {tech}: {loss:.3f}")

## 5. Statistical Validation

In [None]:
# Validator instance (alpha=0.05), reused by the later statistics cells
validator = StatisticalValidator(alpha=0.05)

# Per-technique confidence intervals computed from the sampled accuracies
ci = validator.calculate_confidence_intervals(mock_accuracy_dist, confidence=0.95)

# Report mean, interval bounds, and standard error for every technique
print("95% Confidence Intervals:")
for tech in ci:
    intervals = ci[tech]
    print(f"\n{tech}:")
    print(f"  Mean: {intervals['mean']:.3f}")
    print(f"  CI: [{intervals['lower']:.3f}, {intervals['upper']:.3f}]")
    print(f"  Std Error: {intervals['std_error']:.4f}")

In [None]:
# Split the confidence-interval results into three {technique: value} dicts,
# keeping the order of `techniques`, as expected by the plotting helper.
means, lowers, uppers = (
    {tech: ci[tech][field] for tech in techniques}
    for field in ('mean', 'lower', 'upper')
)

fig = plot_confidence_intervals(
    means,
    lowers,
    uppers,
    title="Accuracy with 95% Confidence Intervals",
)
plt.show()

In [None]:
# Pairwise statistical comparison of the sampled accuracies.
# The validator expects {technique: {metric_name: samples}}.
comparison = validator.compare_techniques(
    {tech: {'accuracy': vals} for tech, vals in mock_accuracy_dist.items()},
    metric='accuracy',
    use_parametric=True
)

print("Pairwise Statistical Tests (with Bonferroni correction):")
print(f"\nAlpha: {comparison['alpha']}")
print(f"Bonferroni-corrected alpha: {comparison['bonferroni_alpha']:.4f}")

# Keep only the pairs the validator flagged as significant.
significant_pairs = {
    pair: result
    for pair, result in comparison['pairwise_tests'].items()
    if result['significant']
}

print("\nSignificant differences:")
if significant_pairs:
    for pair, result in significant_pairs.items():
        print(f"  {pair}: p={result['p_value']:.4f} ✓")
else:
    # Say so explicitly; an empty section is indistinguishable from a cell
    # that silently failed to produce results.
    print("  (none)")

## 6. Overall Rankings

In [None]:
# Composite score per technique: a weighted sum of accuracy (weight 0.6) and
# (1 - loss) (weight 0.4). Loss is inverted so that higher is better for both
# terms; the raw values are combined directly.
composite_scores = {
    tech: 0.6 * mock_accuracy[tech] + 0.4 * (1 - mock_loss[tech])
    for tech in techniques
}

# Rank 1 goes to the highest composite score
ordered_by_score = sorted(composite_scores, key=composite_scores.get, reverse=True)
rankings = {tech: position for position, tech in enumerate(ordered_by_score, start=1)}

# Draw the rankings figure with the project's plotting helper
fig = plot_technique_rankings(
    rankings,
    composite_scores,
    title="Overall Technique Rankings (Composite Score)",
)
plt.show()

## 7. Summary Table

In [None]:
# One record per technique; numeric columns are pre-formatted to three
# decimals as strings so the printed table is uniform.
summary_data = [
    {
        'Technique': tech.replace('_', ' ').title(),
        'Accuracy': f"{mock_accuracy[tech]:.3f}",
        'Loss': f"{mock_loss[tech]:.3f}",
        'Composite Score': f"{composite_scores[tech]:.3f}",
        'Rank': rankings[tech],
    }
    for tech in techniques
]

# Build the frame once from the records and order it best-first
df_summary = pd.DataFrame(summary_data).sort_values('Rank')

# Plain-text report framed by 80-character rules
rule = "=" * 80
print("\n" + rule)
print("EXPERIMENT SUMMARY")
print(rule)
print(df_summary.to_string(index=False))
print(rule)

## Key Findings

Based on the mock data:

1. **Best Performing Technique**: Chain-of-Thought++ (CoT++)
   - Highest accuracy: 89%
   - Lowest loss: 0.20
   - Includes self-verification and confidence scoring

2. **Runners-up**:
   - Tree-of-Thoughts (ToT): 87% accuracy
   - Chain-of-Thought (CoT): 85% accuracy

3. **Baseline Performance**: 72% accuracy
   - All optimization techniques outperform baseline
   - Improvement range: 6-17 percentage points

4. **Statistical Significance**:
   - Top techniques show statistically significant improvements
   - Bonferroni correction applied for multiple comparisons