# 04: Semantic Similarity with Code Embeddings

This notebook demonstrates **semantic similarity** - using ML-based code embeddings to understand algorithmic intent.

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.similarity.semantic import SemanticSimilarity
from src.io import load_submissions
from src.normalization import get_normalizer

# Notebook-wide plotting defaults: seaborn's white-grid theme and a 12x8 inch
# default figure size (cells that pass their own figsize override this).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Initialize Semantic Analyzer

In [None]:
# Initialize semantic similarity
# A single analyzer instance is created here and reused by every later cell.
semantic_analyzer = SemanticSimilarity()

# Informational banner. The text below describes the intended backend
# (CodeBERT with a lexical fallback); which one is actually active is decided
# inside SemanticSimilarity and is not inspected here.
print("🤖 Semantic Similarity Using Code Embeddings")
print("=" * 70)
print("\nThis analyzer uses:")
print("  • CodeBERT (if transformers available)")
print("  • Falls back to lexical similarity if not")
print("\nSemantic similarity captures:")
print("  ✓ Algorithmic intent")
print("  ✓ Problem-solving approach")
print("  ✓ Logic patterns")
print("=" * 70)

## 2. Test: Same Algorithm, Different Implementation

In [None]:
# Two implementations of the same algorithm. They are kept as source strings
# because the analyzer is handed code text, not callables.

# Recursive binary search
binary_search_recursive = '''
def binary_search(arr, target, left, right):
    if left > right:
        return -1
    mid = (left + right) // 2
    if arr[mid] == target:
        return mid
    elif arr[mid] < target:
        return binary_search(arr, target, mid + 1, right)
    else:
        return binary_search(arr, target, left, mid - 1)
'''

# Iterative binary search
binary_search_iterative = '''
def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
'''

# Compute similarity
# NOTE(review): the score is formatted with "%" and compared against 70 below,
# so it is assumed to be on a 0-100 scale - confirm against SemanticSimilarity.
similarity = semantic_analyzer.compute_similarity(
    binary_search_recursive,
    binary_search_iterative
)

print("🧪 TEST: Binary Search - Recursive vs Iterative")
print("=" * 70)
print(f"\nSemantic Similarity: {similarity:.1f}%")
print("\n💡 Semantic analysis should recognize these as the SAME algorithm")
print("   despite different implementation styles!")

# Scores above 70 are treated as "recognized as the same algorithm"; anything
# else is reported as a note rather than a failure.
if similarity > 70:
    print("\n✅ SUCCESS: Recognized same algorithmic intent!")
else:
    print(f"\n⚠️  Note: Score is {similarity:.1f}% (may be using fallback method)")

## 3. Test: Different Algorithms

In [None]:
# Two different algorithms for the same problem (sorting), kept as source
# strings because the analyzer is handed code text.

# Bubble sort
bubble_sort = '''
def sort_array(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n-i-1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]
    return arr
'''

# Quick sort
quick_sort = '''
def sort_array(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return sort_array(left) + middle + sort_array(right)
'''

# NOTE(review): the score is assumed to be on a 0-100 scale (it is printed with
# "%" and bucketed at 40/70 below) - confirm against SemanticSimilarity.
similarity2 = semantic_analyzer.compute_similarity(bubble_sort, quick_sort)

print("🧪 TEST: Bubble Sort vs Quick Sort")
print("=" * 70)
print(f"\nSemantic Similarity: {similarity2:.1f}%")
print("\n💡 These are DIFFERENT algorithms (both solve sorting)")
print("   Should have moderate similarity (same problem, different approach)")

# Buckets: (40, 70) is the expected "related but different" band, >= 70 is
# high, everything else is low. The high branch uses >= so that a score of
# exactly 70 is not misreported as "low".
if 40 < similarity2 < 70:
    print("\n✅ SUCCESS: Correctly identified as related but different!")
elif similarity2 >= 70:
    print("\n⚠️  Similarity is high - may indicate same problem domain")
else:
    print("\n⚠️  Similarity is low - clearly different approaches")

## 4. Real Dataset Analysis

In [None]:
# Load and normalize submissions
submissions = load_submissions('../data/raw/sample_submissions.csv')
normalizer = get_normalizer('python')

normalized_codes = []
for sub in submissions:
    # Counters are reset before every submission - presumably so that the
    # normalization of one submission does not depend on the previous one.
    normalizer.reset_counters()
    normalized_codes.append(normalizer.normalize(sub['code']))

# Compute pairwise semantic similarities
# The matrix is symmetric with 100 on the diagonal (each submission compared
# with itself), so only the upper triangle needs to be computed.
n = len(submissions)
semantic_matrix = np.zeros((n, n))
np.fill_diagonal(semantic_matrix, 100)

print(f"Computing semantic similarity for {n} submissions...\n")

# Visit every unordered pair (i < j) exactly once and mirror the score.
for i in range(n):
    for j in range(i + 1, n):
        sim = semantic_analyzer.compute_similarity(
            normalized_codes[i],
            normalized_codes[j]
        )
        semantic_matrix[i, j] = sim
        semantic_matrix[j, i] = sim
        print(f"  {submissions[i]['submission_id']} ↔ {submissions[j]['submission_id']}: {sim:.1f}%")

print("\n✓ Semantic similarity matrix computed")

## 5. Semantic Similarity Heatmap

In [None]:
# Axis labels for the heatmap, in the same order as the matrix rows/columns.
submission_ids = [sub['submission_id'] for sub in submissions]

# Draw on an explicitly created Axes instead of relying on pyplot's
# "current axes" state.
fig, ax = plt.subplots(figsize=(10, 8))

heatmap_options = dict(
    annot=True,                 # write the score inside every cell
    fmt='.1f',
    cmap='viridis',
    vmin=0,                     # fix the colour scale to the full 0-100 range
    vmax=100,
    xticklabels=submission_ids,
    yticklabels=submission_ids,
    cbar_kws={'label': 'Semantic Similarity (%)'},
    linewidths=0.5,
    linecolor='white',
)
sns.heatmap(semantic_matrix, ax=ax, **heatmap_options)

ax.set_title('Semantic Similarity Heatmap\n(Algorithmic Intent Analysis)',
             fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Submission ID', fontsize=12, fontweight='bold')
ax.set_ylabel('Submission ID', fontsize=12, fontweight='bold')
plt.xticks(rotation=45)
plt.yticks(rotation=0)

plt.tight_layout()
plt.show()

## 6. Top Semantically Similar Pairs

In [None]:
# Find most similar pairs
# One record per unordered pair (i < j), read from the upper triangle of the
# similarity matrix, then ordered from most to least similar.
pairs = [
    {
        'id1': submission_ids[i],
        'id2': submission_ids[j],
        'similarity': semantic_matrix[i][j]
    }
    for i in range(n)
    for j in range(i + 1, n)
]

pairs.sort(key=lambda x: x['similarity'], reverse=True)

TOP_K = 5  # number of pairs shown in the table

print("🔝 Top Semantically Similar Pairs")
print("=" * 70)
print(f"\n{'Rank':<6} {'Pair':<20} {'Semantic Similarity':<20} {'Assessment'}")
print("=" * 70)

for rank, pair in enumerate(pairs[:TOP_K], 1):
    sim = pair['similarity']

    # Bucket the score into a human-readable assessment (thresholds: 90/70/50).
    if sim >= 90:
        assessment = "🚨 Very High - Investigate"
    elif sim >= 70:
        assessment = "⚠️  High - Likely similar"
    elif sim >= 50:
        assessment = "📊 Moderate - Same problem"
    else:
        assessment = "✅ Low - Different approaches"

    pair_str = f"{pair['id1']} ↔ {pair['id2']}"
    print(f"{rank:<6} {pair_str:<20} {sim:<20.1f} {assessment}")

## 7. Semantic vs Lexical Comparison

In [None]:
from src.similarity.lexical import LexicalSimilarity

# Compare semantic vs lexical for top pair
# Skipped entirely when there are no pairs (fewer than two submissions).
if pairs:
    top_pair = pairs[0]
    # NOTE(review): list.index returns the first match, so this lookup assumes
    # submission IDs are unique - confirm against the dataset.
    idx1 = submission_ids.index(top_pair['id1'])
    idx2 = submission_ids.index(top_pair['id2'])

    # Get lexical similarity for the same normalized code the semantic score used
    lexical_analyzer = LexicalSimilarity()
    lexical_sim = lexical_analyzer.compute_similarity(
        normalized_codes[idx1],
        normalized_codes[idx2]
    )

    semantic_sim = top_pair['similarity']

    # Visualize comparison
    fig, ax = plt.subplots(figsize=(10, 6))

    methods = ['Lexical\n(Token-based)', 'Semantic\n(Intent-based)']
    scores = [lexical_sim, semantic_sim]
    colors = ['#3498db', '#e74c3c']

    bars = ax.bar(methods, scores, color=colors, alpha=0.7,
                  edgecolor='black', linewidth=2)

    # Add value labels centred just above each bar
    for bar, score in zip(bars, scores):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{score:.1f}%',
                ha='center', va='bottom', fontsize=14, fontweight='bold')

    ax.set_ylabel('Similarity Score (%)', fontsize=12, fontweight='bold')
    ax.set_title(f'Lexical vs Semantic Similarity\nMost Similar Pair: {top_pair["id1"]} ↔ {top_pair["id2"]}',
                 fontsize=14, fontweight='bold', pad=20)
    ax.set_ylim(0, 110)  # headroom above 100 so the value labels are not clipped
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"\n📊 Comparison for {top_pair['id1']} ↔ {top_pair['id2']}:")
    print(f"   Lexical:  {lexical_sim:.1f}% (surface-level tokens)")
    print(f"   Semantic: {semantic_sim:.1f}% (algorithmic intent)")

    # Keep the sign of the gap: "semantic captures deeper similarity" is only
    # true when the semantic score is the higher one.
    diff = semantic_sim - lexical_sim
    if diff > 20:
        print(f"\n   ⚠️  Large difference ({diff:.1f}%) - semantic captures deeper similarity!")
    elif diff < -20:
        print(f"\n   ⚠️  Large difference ({-diff:.1f}%) - lexical overlap exceeds semantic similarity!")
    else:
        print(f"\n   ✓ Methods agree (difference: {abs(diff):.1f}%)")

## Summary

✅ **Algorithmic Intent**: Semantic similarity understands what the code is trying to do

✅ **Implementation-Agnostic**: Recognizes the same algorithm written in different styles

✅ **Strong Signal**: Gets 40% weight in the final score (tied with structural)

⚠️ **Model Dependent**: Requires CodeBERT/transformers (falls back to lexical if unavailable)

**Key Insight**: Semantic similarity is crucial for detecting plagiarism when students:
- Use the same algorithm but different variable names
- Switch between recursive/iterative implementations
- Reorganize code structure but keep the same logic

**Next Steps**: Proceed to notebook 05 to see how all three signals combine in score fusion!