# 03 - Evaluation and Visualization

This notebook:
1. Loads test dataset and indexes
2. Evaluates all four plagiarism detection methods
3. Generates comparison charts
4. Performs ablation studies
5. Analyzes errors and failure patterns

In [1]:
import sys
import os
import json
import numpy as np
from tqdm import tqdm
import time

# Put the current working directory on sys.path so the `src.*` imports below can
# resolve. The path is taken from the kernel's working directory, so this
# presumably expects the notebook to be run from the directory containing `src/`.
sys.path.append(os.path.abspath('.'))

from src.retrieval import DenseRetriever, BM25Retriever, HybridRetriever
from src.llm import GeminiLLM
from src.evaluation import EvaluationMetrics, ErrorAnalyzer
from src.visualization import (
    plot_comparison_chart,
    plot_ablation_study,
    plot_confusion_matrices,
    plot_cost_vs_performance
)
from src.config import RANDOM_SEED, DEFAULT_TOP_K, DEFAULT_ALPHA

# Seed NumPy's global random number generator with the seed imported from src.config.
np.random.seed(RANDOM_SEED)
print("✓ Imports successful")

  from .autonotebook import tqdm as notebook_tqdm


✓ Imports successful


## Load Test Dataset and Indexes

In [2]:
# Load test dataset
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which differs between operating systems.
with open('data/test_dataset.json', 'r', encoding='utf-8') as f:
    test_dataset = json.load(f)

# Count the positive label once; every remaining case is a negative example.
n_positive = sum(1 for tc in test_dataset if tc['is_plagiarism'])
n_negative = len(test_dataset) - n_positive

print(f"Loaded {len(test_dataset)} test cases")
print(f"Positive examples: {n_positive}")
print(f"Negative examples: {n_negative}")

# Load indexes
# NOTE(review): the `.pkl` extension suggests these indexes are pickles; if so,
# only load files produced by a trusted build step — confirm in src.retrieval.
print("\nLoading indexes...")
dense_retriever = DenseRetriever.load("indexes/dense_retriever.pkl")
bm25_retriever = BM25Retriever.load("indexes/bm25_retriever.pkl")
# The hybrid retriever wraps the two retrievers loaded above.
hybrid_retriever = HybridRetriever(dense_retriever, bm25_retriever)
llm = GeminiLLM()
print("✓ All systems loaded")

Loaded 35 test cases
Positive examples: 20
Negative examples: 15

Loading indexes...
✓ All systems loaded


## Define Detection Functions

(Same as 02_interactive.ipynb)

In [3]:
def detect_embedding(query_code, threshold=0.85, top_k=5):
    """Flag `query_code` when the top dense-retrieval score reaches `threshold`.

    Args:
        query_code: Source code to check.
        threshold: Minimum similarity score for a positive verdict.
        top_k: Number of candidates requested from the dense retriever.

    Returns:
        dict with the keys 'is_plagiarism' (bool), 'confidence' (the score
        multiplied by 100) and 'max_similarity' (float). When nothing is
        retrieved the verdict is False and both scores are 0.0.
    """
    results = dense_retriever.retrieve(query_code, top_k=top_k)
    if not results:
        # Return the same keys as the normal path so callers can read
        # 'max_similarity' without special-casing an empty retrieval.
        return {'is_plagiarism': False, 'confidence': 0.0, 'max_similarity': 0.0}
    # Assumes the retriever returns (chunk, score) pairs ordered best-first, so the
    # first score is the maximum — TODO confirm against DenseRetriever.retrieve.
    _top_chunk, max_similarity = results[0]
    return {
        # Coerce to a builtin bool, mirroring the float() casts on the scores, so
        # the verdict stays a plain Python value (e.g. for JSON serialisation).
        'is_plagiarism': bool(max_similarity >= threshold),
        'confidence': float(max_similarity * 100),
        'max_similarity': float(max_similarity)
    }

def detect_llm(query_code, max_context_functions=50):
    """Ask the LLM for a verdict, passing the first `max_context_functions` corpus chunks.

    NOTE(review): the recorded run of this notebook shows every call failing with
    "analyze_plagiarism_direct() takes 2 positional arguments but 3 were given",
    so this call does not match the GeminiLLM method as currently defined.
    Confirm the intended arguments in src.llm before relying on this detector.
    """
    # The context is a plain prefix of the dense retriever's chunk list.
    context_chunks = dense_retriever.chunks[:max_context_functions]
    return llm.analyze_plagiarism_direct(query_code, context_chunks)

def detect_rag(query_code, top_k=10):
    """Retrieve `top_k` dense candidates and have the LLM judge `query_code` against them.

    Returns whatever `llm.analyze_plagiarism_with_context` returns.
    """
    hits = dense_retriever.retrieve(query_code, top_k=top_k)
    # Only the chunks are passed on; the retrieval scores are dropped.
    context = []
    for chunk, _score in hits:
        context.append(chunk)
    return llm.analyze_plagiarism_with_context(query_code, context)

def detect_hybrid_rag(query_code, top_k=10, alpha=0.5):
    """Retrieve `top_k` candidates with the hybrid retriever and have the LLM judge them.

    The retriever is always called with fusion_method='rrf'; `alpha` is forwarded
    unchanged. Returns whatever `llm.analyze_plagiarism_with_context` returns.

    NOTE(review): whether `alpha` has any effect under 'rrf' fusion depends on
    HybridRetriever — confirm, since the alpha ablation relies on it.
    """
    hits = hybrid_retriever.retrieve(
        query_code,
        top_k=top_k,
        alpha=alpha,
        fusion_method='rrf',
    )
    # Only the chunks are passed on; the fused scores are dropped.
    context = []
    for chunk, _score in hits:
        context.append(chunk)
    return llm.analyze_plagiarism_with_context(query_code, context)

print("✓ Detection functions defined")

✓ Detection functions defined


## Evaluate All Systems

In [4]:
def evaluate_system(detection_func, system_name, test_dataset, **kwargs):
    """
    Evaluate a detection system on the test dataset.

    Args:
        detection_func: Callable invoked as ``detection_func(code, **kwargs)``;
            expected to return a dict with an 'is_plagiarism' entry.
        system_name: Label for the progress bar and printed report. If it
            contains 'llm' or 'rag' (case-insensitive), a 0.5 s pause is
            inserted after every test case to ease API rate limits.
        test_dataset: Sequence of dicts with 'code' and 'is_plagiarism' keys
            (an 'id' key is used in error messages when present).
        **kwargs: Forwarded unchanged to ``detection_func``.

    Returns:
        Tuple ``(metrics, y_pred)``: the result of
        ``EvaluationMetrics.calculate_metrics`` and the list of boolean
        predictions in dataset order.

    A test case whose detection call raises is reported, scored as
    "not plagiarism", and counted; a warning is printed before the metrics
    whenever any case failed, because such metrics partly reflect the failures.
    """
    print(f"\nEvaluating {system_name}...")

    # Decide once whether this system needs pauses between calls.
    lowered_name = system_name.lower()
    needs_rate_limit = 'llm' in lowered_name or 'rag' in lowered_name

    y_true = []
    y_pred = []
    failed_ids = []

    for index, test_case in enumerate(tqdm(test_dataset, desc=system_name)):
        query_code = test_case['code']
        ground_truth = test_case['is_plagiarism']

        try:
            result = detection_func(query_code, **kwargs)
            # Normalise to a builtin bool so the prediction list is homogeneous.
            prediction = bool(result.get('is_plagiarism', False))
        except Exception as e:
            # Fall back to the position so a missing 'id' cannot raise from
            # inside the handler and abort the whole evaluation.
            case_id = test_case.get('id', index)
            # tqdm.write keeps the message from being interleaved with the bar.
            tqdm.write(f"\nError on test case {case_id}: {e}")
            failed_ids.append(case_id)
            prediction = False

        y_true.append(ground_truth)
        y_pred.append(prediction)

        # Rate limiting for API calls
        if needs_rate_limit:
            time.sleep(0.5)  # Avoid rate limits

    if failed_ids:
        # Make silent fallbacks visible: these cases were counted as negatives.
        print(
            f"\nWARNING: {len(failed_ids)}/{len(y_pred)} test cases raised an error "
            f"and were scored as 'not plagiarism'; the metrics for {system_name} "
            f"are unreliable."
        )

    # Calculate metrics
    metrics = EvaluationMetrics.calculate_metrics(y_true, y_pred)
    EvaluationMetrics.print_metrics(metrics, system_name)

    return metrics, y_pred

print("✓ Evaluation function defined")

✓ Evaluation function defined


### Evaluate System 1: Pure Embedding Search

In [5]:
# System 1: similarity threshold only, no LLM call.
metrics_embedding, pred_embedding = evaluate_system(
    detect_embedding, "Pure Embedding Search", test_dataset, threshold=0.85
)


Evaluating Pure Embedding Search...


Pure Embedding Search: 100%|██████████| 35/35 [00:15<00:00,  2.32it/s]


evaluation metrics for Pure Embedding Search
precision: 1.0000
recall:    0.8500
f1 score:  0.9189
accuracy:  0.9143

confusion matrix:
  tp:  17  fn:   3
  fp:   0  tn:  15

error rates:
  false positive rate: 0.0000
  false negative rate: 0.1500






### Evaluate System 2: Direct LLM Analysis

In [6]:
# System 2: LLM given a fixed prefix of the corpus as context.
# NOTE(review): in the recorded output of this cell every test case raised a
# TypeError inside detect_llm, so the reported metrics (F1 = 0) reflect that
# failure rather than the model's behaviour.
metrics_llm, pred_llm = evaluate_system(
    detect_llm, "Direct LLM Analysis", test_dataset, max_context_functions=50
)


Evaluating Direct LLM Analysis...


Direct LLM Analysis:   0%|          | 0/35 [00:00<?, ?it/s]


Error on test case 0: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:   3%|▎         | 1/35 [00:00<00:17,  2.00it/s]


Error on test case 1: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:   6%|▌         | 2/35 [00:01<00:16,  2.00it/s]


Error on test case 2: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:   9%|▊         | 3/35 [00:01<00:16,  2.00it/s]


Error on test case 3: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  11%|█▏        | 4/35 [00:02<00:15,  2.00it/s]


Error on test case 4: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  14%|█▍        | 5/35 [00:02<00:15,  2.00it/s]


Error on test case 5: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  17%|█▋        | 6/35 [00:03<00:14,  2.00it/s]


Error on test case 6: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  20%|██        | 7/35 [00:03<00:14,  2.00it/s]


Error on test case 7: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  23%|██▎       | 8/35 [00:04<00:13,  2.00it/s]


Error on test case 8: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  26%|██▌       | 9/35 [00:04<00:13,  2.00it/s]


Error on test case 9: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  29%|██▊       | 10/35 [00:05<00:12,  2.00it/s]


Error on test case 10: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  31%|███▏      | 11/35 [00:05<00:12,  2.00it/s]


Error on test case 11: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  34%|███▍      | 12/35 [00:06<00:11,  2.00it/s]


Error on test case 12: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  37%|███▋      | 13/35 [00:06<00:11,  2.00it/s]


Error on test case 13: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  40%|████      | 14/35 [00:07<00:10,  2.00it/s]


Error on test case 14: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  43%|████▎     | 15/35 [00:07<00:10,  2.00it/s]


Error on test case 15: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  46%|████▌     | 16/35 [00:08<00:09,  2.00it/s]


Error on test case 16: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  49%|████▊     | 17/35 [00:08<00:09,  2.00it/s]


Error on test case 17: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  51%|█████▏    | 18/35 [00:09<00:08,  2.00it/s]


Error on test case 18: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  54%|█████▍    | 19/35 [00:09<00:08,  2.00it/s]


Error on test case 19: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  57%|█████▋    | 20/35 [00:10<00:07,  2.00it/s]


Error on test case 20: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  60%|██████    | 21/35 [00:10<00:07,  2.00it/s]


Error on test case 21: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  63%|██████▎   | 22/35 [00:11<00:06,  2.00it/s]


Error on test case 22: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  66%|██████▌   | 23/35 [00:11<00:06,  2.00it/s]


Error on test case 23: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  69%|██████▊   | 24/35 [00:12<00:05,  2.00it/s]


Error on test case 24: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  71%|███████▏  | 25/35 [00:12<00:05,  2.00it/s]


Error on test case 25: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  74%|███████▍  | 26/35 [00:13<00:04,  2.00it/s]


Error on test case 26: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  77%|███████▋  | 27/35 [00:13<00:04,  2.00it/s]


Error on test case 27: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  80%|████████  | 28/35 [00:14<00:03,  2.00it/s]


Error on test case 28: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  83%|████████▎ | 29/35 [00:14<00:03,  2.00it/s]


Error on test case 29: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  86%|████████▌ | 30/35 [00:15<00:02,  2.00it/s]


Error on test case 30: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  89%|████████▊ | 31/35 [00:15<00:02,  2.00it/s]


Error on test case 31: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  91%|█████████▏| 32/35 [00:16<00:01,  2.00it/s]


Error on test case 32: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  94%|█████████▍| 33/35 [00:16<00:01,  2.00it/s]


Error on test case 33: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis:  97%|█████████▋| 34/35 [00:17<00:00,  2.00it/s]


Error on test case 34: GeminiLLM.analyze_plagiarism_direct() takes 2 positional arguments but 3 were given


Direct LLM Analysis: 100%|██████████| 35/35 [00:17<00:00,  2.00it/s]


evaluation metrics for Direct LLM Analysis
precision: 0.0000
recall:    0.0000
f1 score:  0.0000
accuracy:  0.4286

confusion matrix:
  tp:   0  fn:  20
  fp:   0  tn:  15

error rates:
  false positive rate: 0.0000
  false negative rate: 1.0000






### Evaluate System 3: Standard RAG

In [7]:
# System 3: dense retrieval followed by LLM analysis.
# NOTE(review): the recorded run of this cell hit a 429 quota error and was
# interrupted, so `metrics_rag` / `pred_rag` were not assigned in that session.
metrics_rag, pred_rag = evaluate_system(
    detect_rag, "Standard RAG", test_dataset, top_k=10
)


Evaluating Standard RAG...


Standard RAG:   3%|▎         | 1/35 [00:02<01:21,  2.40s/it]

generation failed (attempt 1): 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 200, model: gemini-2.0-flash
Please retry in 40.145493361s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 200
}
, retry_delay {
  seconds: 40
}
]
generation failed (attempt 2): 429 You exceeded your current quota

Standard RAG:   3%|▎         | 1/35 [00:06<03:53,  6.87s/it]


KeyboardInterrupt: 

### Evaluate System 4: Hybrid RAG

In [None]:
# System 4: hybrid (dense + BM25) retrieval followed by LLM analysis.
metrics_hybrid, pred_hybrid = evaluate_system(
    detect_hybrid_rag, "Hybrid RAG", test_dataset, top_k=10, alpha=0.5
)

## Generate Comparison Chart

In [None]:
# Compile results
# Requires all four evaluation cells above to have completed successfully.
all_results = {
    'Pure Embedding': metrics_embedding,
    'Direct LLM': metrics_llm,
    'Standard RAG': metrics_rag,
    'Hybrid RAG': metrics_hybrid
}

# Create the output directory before anything is written into it: the chart
# below is saved under results/ as well, not only the JSON file.
os.makedirs('results', exist_ok=True)

# Plot comparison chart
plot_comparison_chart(all_results, save_path="results/comparison_chart.png")

# Save results to JSON
with open('results/evaluation_results.json', 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=2)

print("\n✓ Results saved to results/evaluation_results.json")

## Plot Confusion Matrices

In [None]:
# Uses `all_results` from the comparison-chart cell; presumably writes the figure
# to `save_path`, so the results/ directory must already exist.
plot_confusion_matrices(all_results, save_path="results/confusion_matrices.png")

## Ablation Study 1: Varying k in RAG Systems

In [None]:
print("\nAblation Study: Impact of k on Standard RAG")
print("="*50)

k_values = [3, 5, 10, 15, 20]
# One score series per metric, filled in the same order as k_values.
k_results = {
    metric_name: []
    for metric_name in ('precision', 'recall', 'f1_score', 'accuracy')
}

for k in k_values:
    print(f"\nTesting k={k}...")
    # Use subset for speed.
    # NOTE(review): the subset is the first 15 cases as stored; if the dataset is
    # grouped by label this slice may be unbalanced — confirm the ordering.
    metrics, _ = evaluate_system(detect_rag, f"RAG (k={k})", test_dataset[:15], top_k=k)

    # Record every tracked metric for this k.
    for metric_name, series in k_results.items():
        series.append(metrics[metric_name])

# Plot results
plot_ablation_study(
    k_values,
    k_results,
    title="Ablation Study: Impact of k on RAG Performance",
    save_path="results/ablation_k_values.png",
)

print("\n✓ Ablation study complete")

## Ablation Study 2: Fusion Weights in Hybrid RAG

In [None]:
print("\nAblation Study: Fusion Weight (alpha) in Hybrid RAG")
print("="*50)

alphas = [0.0, 0.3, 0.5, 0.7, 1.0]  # 0=pure BM25, 1=pure dense
# One score series per metric, filled in the same order as alphas.
alpha_results = {
    metric_name: []
    for metric_name in ('precision', 'recall', 'f1_score', 'accuracy')
}

for alpha in alphas:
    print(f"\nTesting alpha={alpha}...")
    # Use subset for speed.
    # NOTE(review): the subset is the first 15 cases as stored; if the dataset is
    # grouped by label this slice may be unbalanced — confirm the ordering.
    metrics, _ = evaluate_system(
        detect_hybrid_rag, f"Hybrid RAG (α={alpha})", test_dataset[:15], top_k=10, alpha=alpha
    )

    # Record every tracked metric for this alpha.
    for metric_name, series in alpha_results.items():
        series.append(metrics[metric_name])

# Plot results
plot_ablation_study(
    alphas,
    alpha_results,
    title="Ablation Study: Fusion Weight (α) in Hybrid RAG\n(0=pure BM25, 1=pure Dense)",
    save_path="results/ablation_alpha_values.png",
)

print("\n✓ Ablation study complete")

## Error Analysis

In [None]:
# Analyze errors for each system
# Maps each display name to the prediction list produced by its evaluation cell.
systems = {
    'Pure Embedding': pred_embedding,
    'Direct LLM': pred_llm,
    'Standard RAG': pred_rag,
    'Hybrid RAG': pred_hybrid
}

banner = '=' * 60

for system_name, predictions in systems.items():
    print(f"\n{banner}")
    print(f"Error Analysis: {system_name}")
    print(banner)

    errors = ErrorAnalyzer.analyze_errors(test_dataset, predictions)
    ErrorAnalyzer.print_error_analysis(errors)

    # Show sample false positives (first one only, code truncated to 100 chars)
    if errors['false_positives']:
        first_fp = errors['false_positives'][0]
        print("\nSample False Positive:")
        print(f"Code: {first_fp['test_case']['code'][:100]}...")

    # Show sample false negatives, plus the transformation type when recorded
    if errors['false_negatives']:
        first_fn = errors['false_negatives'][0]
        missed_case = first_fn['test_case']
        print("\nSample False Negative:")
        print(f"Code: {missed_case['code'][:100]}...")
        print(f"Transformation: {missed_case.get('transformation_type', 'N/A')}")

## Cost vs Performance Analysis

In [None]:
# Approximate relative costs (arbitrary units)
systems_list = ['Pure Embedding', 'Standard RAG', 'Hybrid RAG', 'Direct LLM']
# F1 scores looked up in the same order as systems_list so the three lists align.
f1_scores = [all_results[system_label]['f1_score'] for system_label in systems_list]
# Relative costs (embedding < RAG < hybrid < direct LLM); hand-assigned, not measured.
costs = [1, 5, 6, 10]

plot_cost_vs_performance(
    systems_list, f1_scores, costs, save_path="results/cost_vs_performance.png"
)

## Summary and Insights

In [None]:
rule = "=" * 70

print("\n" + rule)
print("EVALUATION SUMMARY")
print(rule)

print("\nPerformance Ranking by F1 Score:")
# Highest F1 first.
ranked = sorted(all_results.items(), key=lambda x: x[1]['f1_score'], reverse=True)
for position, (system, scores) in enumerate(ranked, start=1):
    print(f"{position}. {system}: F1={scores['f1_score']:.4f}, "
          f"Precision={scores['precision']:.4f}, Recall={scores['recall']:.4f}")

print("\n" + rule)
print("KEY INSIGHTS")
print(rule)

# NOTE(review): these insights are fixed text, not derived from the metrics
# computed above; re-check them against the actual ranking before publishing.
insights = [
    "1. Pure Embedding: Fast and cheap, good for exact/minor copies",
    "2. Direct LLM: Best context understanding, but expensive and slow",
    "3. Standard RAG: Balanced approach, depends on retrieval quality",
    "4. Hybrid RAG: Best of both retrieval methods, slight complexity increase",
    "\nTrade-offs:",
    "- Accuracy ↔ Cost: More LLM usage = higher accuracy but higher cost",
    "- Accuracy ↔ Speed: Embedding-only is fastest, LLM methods slower",
    "- Retrieval Quality Impact: RAG performance heavily depends on top-k selection"
]

# One insight per line.
print("\n".join(insights))

print("\n" + rule)
print("EVALUATION COMPLETE - All results saved to results/")
print(rule)