In [None]:
# -*- coding: utf-8 -*-
"""
3_Evaluation_Comparison.ipynb

This notebook provides a comparative analysis of the evaluation results across different fine-tuning methods.
It loads the ROUGE scores, training metrics, and other relevant data, and uses `src/visualize.py` to generate plots.
"""

# Import necessary libraries
import pandas as pd
import os
import sys

# Add parent directory to path
sys.path.append('..')

from src.config import TABLES_DIR
from src.visualize import (
    plot_rouge_scores, 
    plot_bleu_scores,
    plot_trainable_parameters, 
    plot_training_and_inference_time,
    plot_vram_usage,
    plot_radar_chart,
    plot_comprehensive_comparison
)

# --- Load Evaluation Results ---
def _print_best(df, column, template, pick_max=True):
    """Print the best value of ``column`` and the method that achieved it.

    Does nothing when ``column`` is not present in ``df``. When the column
    holds no non-NaN values a short notice is printed instead, because
    ``idxmax``/``idxmin`` cannot be evaluated on an empty Series.

    Args:
        df: Evaluation results, one row per method.
        column: Name of the metric column to summarize.
        template: ``str.format`` template using ``{value}`` and ``{method}``.
        pick_max: Report the maximum when True, the minimum otherwise.
    """
    if column not in df.columns:
        return
    values = df[column].dropna()
    if values.empty:
        print(f"No values available for '{column}'; skipping.")
        return
    best_idx = values.idxmax() if pick_max else values.idxmin()
    best_value = values.max() if pick_max else values.min()
    # Fall back to the row label when the CSV has no 'Method' column.
    if 'Method' in df.columns:
        method = df.loc[best_idx, 'Method']
    else:
        method = f"row {best_idx}"
    print(template.format(value=best_value, method=method))


print("Loading evaluation results...")
results_path = os.path.join(TABLES_DIR, "evaluation_results.csv")

if os.path.exists(results_path):
    try:
        results_df = pd.read_csv(results_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it like missing results
        # so the later cells can still run against an empty DataFrame.
        print(f"Error: Evaluation results file at {results_path} is empty. Please run `src/evaluate.py` first.")
        results_df = pd.DataFrame()
    else:
        print("Evaluation results loaded successfully.")
        print(f"\nEvaluation Results DataFrame ({len(results_df)} methods):")
        print(results_df.to_string(index=False))

        # Display summary statistics
        print("\n=== Summary Statistics ===")
        if results_df.empty:
            # Header-only CSV: there is no row to pick a best method from.
            print("No rows in evaluation results; nothing to summarize.")
        else:
            _print_best(results_df, 'ROUGE-L', "\nBest ROUGE-L: {value:.4f} ({method})")
            _print_best(results_df, 'BLEU', "Best BLEU: {value:.4f} ({method})")
            _print_best(
                results_df,
                'Training Time (min)',
                "Fastest Training: {value:.2f} min ({method})",
                pick_max=False,
            )
else:
    print(f"Error: Evaluation results file not found at {results_path}. Please run `src/evaluate.py` first.")
    results_df = pd.DataFrame()  # Create empty DataFrame to prevent errors

# --- Generate Visualizations ---
# Nothing to plot when loading produced an empty table; otherwise hand the
# results to each plotting helper in turn (ROUGE, parameters, timings).
if results_df.empty:
    print("Skipping visualization as evaluation results are empty.")
else:
    print("\nGenerating comparative visualizations...")
    for make_plot in (
        plot_rouge_scores,
        plot_trainable_parameters,
        plot_training_and_inference_time,
    ):
        make_plot(results_df)
    print("Visualizations generated and saved to report/figures.")


# --- Analysis and Discussion Points ---
# Prompts for the written report, printed one per line in the order listed.
_discussion_lines = (
    "\n--- Analysis and Discussion Points ---",
    "Based on the generated plots and the `results_df` above, consider the following for your report:",
    "1. **ROUGE Scores Comparison**: Which method achieved the highest ROUGE-1, ROUGE-2, and ROUGE-L scores? Discuss potential reasons. Are there trade-offs?",
    "2. **Trainable Parameters**: Compare the number of trainable parameters for each method. How does this relate to their performance and resource usage?",
    "3. **Training Time & Inference Time**: Analyze the training and inference times. Which methods are most efficient? Are there any unexpected results?",
    "4. **Resource Usage (GPU Memory)**: Although not explicitly plotted here (requires manual logging), discuss the GPU memory footprint of each method based on your observations during training.",
    "5. **Trade-offs**: Summarize the trade-offs between accuracy, training time, memory usage, and model complexity for each fine-tuning approach.",
    "6. **Conclusion for Report**: Which method would you recommend for this task under what constraints (e.g., limited GPU vs. high accuracy requirement)?",
)
for _line in _discussion_lines:
    print(_line)

# Closing pointer to where the figures are expected to be found.
print("Evaluation comparison complete. Review the generated figures in `report/figures/` for your report.")
