In [None]:
pip install --upgrade unbabel-comet

import json
import torch
import pandas as pd
from tqdm import tqdm
import os

# Report once, up front, whether a CUDA device is visible to PyTorch
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    # Device 0 is the one named in the report
    gpu_name = torch.cuda.get_device_name(0)
    print(f"Using GPU: {gpu_name}")

def evaluate_translations(source_file, target_file, sample_size=None,
                          batch_size=8,
                          output_csv="comet_full_evaluation_results.csv"):
    """
    Evaluate translation quality using COMET model in QE mode

    Source and target records are paired by line position: record i of
    ``source_file`` is scored against record i of ``target_file``. Only the
    "instruction" field of each JSON record is used, and pairs where either
    side is missing or empty are skipped.

    Args:
        source_file: Path to a JSONL file holding the source-language records.
        target_file: Path to a JSONL file holding the translated records.
        sample_size: Maximum number of leading line pairs to consider. A
            falsy value (None or 0) means "use every available pair".
        batch_size: Batch size handed to ``model.predict``. Kept small by
            default to limit GPU memory use.
        output_csv: Path of the CSV file the per-sample results are written to.

    Returns:
        A dict with:
            "samples": list of dicts with keys "index", "source",
                "translation" (both truncated to 100 characters for display)
                and "score";
            "average_score": mean of the scores, or 0 when nothing was scored;
            "num_evaluated": number of pairs passed to the model.
    """
    # Imported here so a missing COMET install only fails when this is called
    from comet import download_model, load_from_checkpoint

    def _read_jsonl(path):
        # One JSON object per non-blank line
        with open(path, "r", encoding="utf-8") as f:
            return [json.loads(line.strip()) for line in f if line.strip()]

    def _preview(text):
        # Shorten long texts so the CSV / console output stays readable
        return text[:100] + "..." if len(text) > 100 else text

    # Download the QE model that doesn't require references
    print("Downloading COMET QE model (this may take a minute)...")
    model_path = download_model("Unbabel/wmt20-comet-qe-da")
    model = load_from_checkpoint(model_path)
    print("Model loaded successfully")

    # Load source and target data
    print(f"Loading source file: {source_file}")
    source_data = _read_jsonl(source_file)

    print(f"Loading target file: {target_file}")
    target_data = _read_jsonl(target_file)

    # Pairs are matched by position, so a length mismatch likely means the
    # files are misaligned; surface it instead of silently truncating.
    if len(source_data) != len(target_data):
        print(
            f"Warning: source has {len(source_data)} records but target has "
            f"{len(target_data)}; only the first "
            f"{min(len(source_data), len(target_data))} pairs can be compared"
        )

    # Determine number of samples to evaluate - full dataset by default
    if sample_size:
        eval_size = min(sample_size, len(source_data), len(target_data))
    else:
        eval_size = min(len(source_data), len(target_data))

    print(f"Preparing {eval_size} samples for evaluation...")

    # QE mode only needs src and mt; "index" remembers the original line
    samples = []
    for i in range(eval_size):
        source_text = source_data[i].get("instruction", "")
        target_text = target_data[i].get("instruction", "")

        if source_text and target_text:
            samples.append({
                "src": source_text,
                "mt": target_text,
                "index": i,
            })

    print("Running COMET evaluation...")
    all_scores = []
    if samples:
        # Use the GPU when one is present, otherwise run on CPU (gpus=0)
        # rather than requesting a device that does not exist.
        gpus = 1 if torch.cuda.is_available() else 0
        model_inputs = [{"src": s["src"], "mt": s["mt"]} for s in samples]
        # predict() batches internally according to batch_size, so a single
        # call replaces manual chunking and avoids repeating per-call setup.
        model_output = model.predict(
            model_inputs, batch_size=batch_size, gpus=gpus
        )
        all_scores = list(model_output.scores)

    # Attach each score to the sample it belongs to
    scores = [
        {
            "index": sample["index"],
            "source": _preview(sample["src"]),
            "translation": _preview(sample["mt"]),
            "score": score,
        }
        for sample, score in zip(samples, all_scores)
    ]

    # Calculate statistics
    avg_score = sum(all_scores) / len(all_scores) if all_scores else 0

    # Export to CSV for further analysis; fixed columns keep the header
    # present even when no pair could be scored.
    df = pd.DataFrame(
        scores, columns=["index", "source", "translation", "score"]
    )
    df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")

    return {
        "samples": scores,
        "average_score": avg_score,
        "num_evaluated": len(samples),
    }

# Main execution
def bucket_scores(scores):
    """Count how many scores fall into each fixed quality band.

    Bands are half-open on the right ([low, high)); everything >= 0.8 is
    "Excellent" and everything below 0.2 (including negative values) is
    "Very Poor".
    """
    return {
        "Excellent (0.8-1.0)": sum(1 for s in scores if s >= 0.8),
        "Good (0.6-0.8)": sum(1 for s in scores if 0.6 <= s < 0.8),
        "Average (0.4-0.6)": sum(1 for s in scores if 0.4 <= s < 0.6),
        "Poor (0.2-0.4)": sum(1 for s in scores if 0.2 <= s < 0.4),
        "Very Poor (0-0.2)": sum(1 for s in scores if s < 0.2),
    }


def print_quality_distribution(score_ranges, total):
    """Print each band's count and share of ``total``.

    When ``total`` is zero there is nothing to take a percentage of, so a
    short notice is printed instead of dividing by zero.
    """
    print("\n=== Quality Distribution ===")
    if not total:
        print("No samples were scored.")
        return
    for range_name, count in score_ranges.items():
        percentage = (count / total) * 100
        print(f"{range_name}: {count} samples ({percentage:.2f}%)")


try:
    # Check if files exist
    source_file = "cleaned_train1.jsonl"
    target_file = "final_train1.jsonl"

    if not os.path.exists(source_file):
        print(f"Error: File {source_file} not found")
    elif not os.path.exists(target_file):
        print(f"Error: File {target_file} not found")
    else:
        # Evaluate the entire dataset
        print("Evaluating entire dataset - this may take a while...")
        results = evaluate_translations(
            source_file=source_file,
            target_file=target_file,
            sample_size=None,  # Set to None to evaluate all samples
        )

        # Display results
        print("\n=== EVALUATION SUMMARY ===")
        print(f"Average COMET score: {results['average_score']:.4f}")
        print(f"Number of samples evaluated: {results['num_evaluated']}")

        # Display top examples
        print("\n=== Top 5 Translations ===")
        best_first = sorted(
            results['samples'], key=lambda x: x['score'], reverse=True
        )
        for i, sample in enumerate(best_first[:5]):
            print(f"\nSample {i+1} (Score: {sample['score']:.4f}):")
            print(f"Source: {sample['source']}")
            print(f"Translation: {sample['translation']}")

        # Display worst translations
        print("\n=== Worst 5 Translations ===")
        worst_first = sorted(results['samples'], key=lambda x: x['score'])
        for i, sample in enumerate(worst_first[:5]):
            print(f"\nSample {i+1} (Score: {sample['score']:.4f}):")
            print(f"Source: {sample['source']}")
            print(f"Translation: {sample['translation']}")

        # Generate quality distribution statistics
        # NOTE(review): the bands assume scores roughly in [0, 1]; confirm
        # this matches the output range of the COMET QE model in use.
        scores = [s['score'] for s in results['samples']]
        score_ranges = bucket_scores(scores)
        print_quality_distribution(score_ranges, len(scores))

except Exception as e:
    print(f"Error during evaluation: {e}")

    # Fallback to LaBSE evaluation if COMET fails
    print("\nFalling back to LaBSE similarity evaluation for the entire dataset...")

    from sentence_transformers import SentenceTransformer, util

    # Load model
    print("Loading LaBSE model...")
    # Configure to use GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SentenceTransformer("sentence-transformers/LaBSE", device=device)

    def get_semantic_similarity(text_en, text_ms):
        """Return the cosine similarity of the LaBSE embeddings of two texts."""
        embeddings_en = model.encode(text_en, convert_to_tensor=True)
        embeddings_ms = model.encode(text_ms, convert_to_tensor=True)
        cosine_score = util.cos_sim(embeddings_en, embeddings_ms).item()
        return cosine_score

    # Evaluate with LaBSE on the same files the COMET path was given.
    # source_file / target_file were set (and checked for existence) at the
    # top of the try block, so the fallback scores the same dataset instead
    # of a different, unchecked pair of files.
    with open(source_file, "r", encoding="utf-8") as f:
        source_data = [json.loads(line.strip()) for line in f if line.strip()]

    with open(target_file, "r", encoding="utf-8") as f:
        target_data = [json.loads(line.strip()) for line in f if line.strip()]

    # Evaluate all samples; records are paired by line position
    eval_size = min(len(source_data), len(target_data))
    results = []

    print(f"Evaluating {eval_size} samples with LaBSE...")

    # Process in batches to show progress (each pair is still encoded singly)
    batch_size = 50
    for i in tqdm(range(0, eval_size, batch_size), desc="Evaluating batches"):
        batch_end = min(i + batch_size, eval_size)
        for j in range(i, batch_end):
            source_text = source_data[j].get("instruction", "")
            target_text = target_data[j].get("instruction", "")

            # Skip pairs where either side is missing or empty
            if source_text and target_text:
                score = get_semantic_similarity(source_text, target_text)
                results.append({
                    "index": j,
                    "source": source_text[:100] + "..." if len(source_text) > 100 else source_text,
                    "translation": target_text[:100] + "..." if len(target_text) > 100 else target_text,
                    "score": score
                })

    # Save results to CSV
    df = pd.DataFrame(results)
    df.to_csv("labse_full_evaluation_results.csv", index=False)
    print("Results saved to labse_full_evaluation_results.csv")

    # Calculate average score
    avg_score = sum(item["score"] for item in results) / len(results) if results else 0

    # Display results
    print("\n=== LaBSE EVALUATION SUMMARY ===")
    print(f"Average similarity score: {avg_score:.4f}")
    print(f"Number of samples evaluated: {len(results)}")

    # Generate quality distribution statistics
    scores = [s['score'] for s in results]
    score_ranges = bucket_scores(scores)
    print_quality_distribution(score_ranges, len(scores))