# End-to-End Plagiarism Detection Demo

This notebook demonstrates the complete plagiarism detection pipeline from loading data to generating reports.

In [None]:
# Setup: standard-library imports
import json
import sys
from pathlib import Path

# Add parent directory to path so the local `src` package can be imported
sys.path.insert(0, '..')

from src import (
    load_submissions,
    validate_submissions,
    PlagiarismScorer,
    ExplanationGenerator
)

## Step 1: Load Sample Data

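We'll define three small sample submissions inline (all Python Fibonacci implementations) to demonstrate the system: `s001` and `s002` use the same recursive algorithm with different identifier names, while `s003` is an iterative version.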

In [None]:
# Sample submissions for testing.
# Each (id, source) pair below is expanded into one submission record:
#   s001 - recursive fibonacci
#   s002 - the same recursion with different names and an extra comment
#   s003 - iterative fibonacci
submissions = [
    {'submission_id': sample_id, 'language': 'python', 'code': sample_code}
    for sample_id, sample_code in (
        ('s001', '''
def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)
'''),
        ('s002', '''
def fib(num):
    # Calculate fibonacci
    if num <= 1:
        return num
    return fib(num-1) + fib(num-2)
'''),
        ('s003', '''
def fibonacci(n):
    a, b = 0, 1
    for i in range(n):
        a, b = b, a + b
    return a
'''),
    )
]

print(f"Loaded {len(submissions)} submissions")

## Step 2: Validate Submissions

Ensure all submissions are valid and use the same language.

In [None]:
# Validate the whole batch in non-strict mode; the call hands back a
# success flag together with a collection of error messages.
is_valid, errors = validate_submissions(submissions, strict=False)

if not is_valid:
    # List every reported problem on its own line
    print("✗ Validation errors:")
    for error in errors:
        print(f"  - {error}")
else:
    print("✓ All submissions are valid")

## Step 3: Analyze Plagiarism

Run the plagiarism detection pipeline on all submissions.

In [None]:
# Create the scorer; later cells reuse both `scorer` and `results`
scorer = PlagiarismScorer()

# Run the analysis over the full submission list with normalization enabled
results = scorer.analyze_all(submissions, normalize=True)

# Print a short summary per result, followed by its per-signal breakdown
for result in results:
    print(f"\nSubmission: {result['submission_id']}")
    print(f"  Similarity: {result['similarity_score']:.1f}%")
    print(f"  Most similar to: {result['most_similar_to']}")
    print(f"  Severity: {result['severity']}")
    print("  Breakdown:")
    for signal, percent in result['breakdown'].items():
        print(f"    {signal}: {percent:.1f}%")

## Step 4: Generate Detailed Reports

Create human-readable reports for each submission.

In [None]:
# Initialize report generator
reporter = ExplanationGenerator()

# Index the source code by submission id. Pairing results with submissions
# by list position would attach the wrong code to a report if `results`
# is ever ordered or sized differently from `submissions`.
code_by_id = {
    submission['submission_id']: submission['code']
    for submission in submissions
}

# Generate one report per analysis result
for result in results:
    report = reporter.generate_report(
        submission_id=result['submission_id'],
        similarity_score=result['similarity_score'],
        breakdown=result['breakdown'],
        severity=result['severity'],
        most_similar_to=result['most_similar_to'],
        code=code_by_id[result['submission_id']],
        # 'adjustments' is treated as optional: fall back to an empty list
        adjustments=result.get('adjustments', [])
    )

    # Print formatted report, with a blank line between reports
    print(reporter.format_text_report(report))
    print()

## Step 5: Pairwise Comparison

Compare specific submission pairs in detail.

In [None]:
def print_comparison(title, comparison):
    """Print the outcome of one pairwise comparison.

    Args:
        title: Heading line printed before the scores.
        comparison: Mapping returned by ``scorer.compute_similarity``. The
            keys read here are 'final_score', 'severity' and 'breakdown',
            plus 'structural_breakdown' / 'structural_method' when a
            non-empty structural breakdown is present.
    """
    print(title)
    print(f"Final Score: {comparison['final_score']:.1f}%")
    print(f"Severity: {comparison['severity']}")
    print(f"\nBreakdown:")
    for key, value in comparison['breakdown'].items():
        print(f"  {key}: {value:.1f}%")

    # Structural details are optional; show them for every comparison that
    # has them so both pairs are reported the same way.
    if comparison.get('structural_breakdown'):
        print(f"\nStructural Method: {comparison['structural_method']}")
        print(f"Structural Breakdown:")
        for key, value in comparison['structural_breakdown'].items():
            print(f"  {key}: {value:.1f}%")


# Compare s001 and s002 (expected: high similarity - same algorithm, different names)
comparison_s001_s002 = scorer.compute_similarity(
    submissions[0]['code'],
    submissions[1]['code'],
    language='python',
    normalize=True
)
print_comparison(
    "Comparison: s001 vs s002 (recursive fibonacci, different variable names)",
    comparison_s001_s002
)

print("\n" + "="*80 + "\n")

# Compare s001 and s003 (expected: low similarity - different algorithms)
comparison_s001_s003 = scorer.compute_similarity(
    submissions[0]['code'],
    submissions[2]['code'],
    language='python',
    normalize=True
)
print_comparison(
    "Comparison: s001 vs s003 (recursive vs iterative fibonacci)",
    comparison_s001_s003
)

## Step 6: Save Results

Export results to JSON for further analysis.

In [None]:
# Save results to file
output_path = '../data/results/demo_results.json'

# open(..., 'w') does not create missing parent directories, so make sure
# the results folder exists (e.g. on a fresh checkout) before writing.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding keeps the output independent of the platform default
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"Results saved to {output_path}")

## Conclusion

This demonstration shows:
1. ✓ Data loading and validation
2. ✓ Multi-signal plagiarism detection (lexical + structural + semantic)
3. ✓ Student-safe bias adjustments
4. ✓ Adaptive explanation generation
5. ✓ Severity classification

**Key Observations (expected results):**
- s001 vs s002: High similarity (same recursive algorithm; only the function and parameter names differ, and s002 adds a comment)
- s001 vs s003: Low similarity (different algorithms - recursive vs iterative)

The system correctly identifies algorithmic similarity while distinguishing between different valid approaches.