In [2]:
# imports

import os
import json
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple
from dataclasses import dataclass, asdict
from typing import List
from pathlib import Path
import json
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Resolve every input/output location relative to the repository root,
# which is one level above the directory this notebook runs from.

root_path = Path("..")
results_dir = root_path.joinpath('results')
dataset_path = root_path.joinpath('notebook', 'data', 'test_dataset.json')

# Artefacts written into results_dir.
predictions_csv_path = results_dir.joinpath('predictions.csv')
summary_csv_path = results_dir.joinpath('summary.csv')
ablations_csv_path = results_dir.joinpath('ablations.csv')
chart_path = results_dir.joinpath('comparison_chart.png')

In [None]:
# Run ./03_interactive.ipynb to load all methods into memory.
# The wrapper cell below calls detect_embedding, detect_llm, detect_rag and
# detect_hybrid_rag, none of which are defined in this notebook -- presumably
# they come from the notebook executed here (TODO confirm).
%run ./03_interactive.ipynb

In [None]:
# create local methods and parameters for easier handling

def call_embedding(code: str, top_k: int = 10):
    """Run the embedding-only detector on one code snippet.

    Args:
        code: Source code to check.
        top_k: Forwarded unchanged as the ``top_k`` keyword of ``detect_embedding``.

    Returns:
        Whatever ``detect_embedding`` returns, passed through untouched.
    """
    detector_kwargs = {"top_k": top_k}
    return detect_embedding(code, **detector_kwargs)

def call_llm(code: str, top_n: int = 25):
    """Run the direct-LLM detector on one code snippet.

    Args:
        code: Source code to check.
        top_n: Forwarded unchanged as the ``top_n`` keyword of ``detect_llm``.

    Returns:
        Whatever ``detect_llm`` returns, passed through untouched.
    """
    detector_kwargs = {"top_n": top_n}
    return detect_llm(code, **detector_kwargs)

def call_rag(code: str, top_k: int = 5):
    """Run the RAG detector on one code snippet.

    Args:
        code: Source code to check.
        top_k: Forwarded unchanged as the ``top_k`` keyword of ``detect_rag``.

    Returns:
        Whatever ``detect_rag`` returns, passed through untouched.
    """
    detector_kwargs = {"top_k": top_k}
    return detect_rag(code, **detector_kwargs)

def call_hybrid_rag(code: str, top_k_dense: int = 5, top_k_bm25: int = 5, top_k_fused: int = 5, w_dense: float = 0.5):
    """Run the hybrid-RAG detector on one code snippet.

    Every tuning argument is forwarded by keyword, under the same name,
    to ``detect_hybrid_rag``.

    Args:
        code: Source code to check.
        top_k_dense: Forwarded as ``top_k_dense``.
        top_k_bm25: Forwarded as ``top_k_bm25``.
        top_k_fused: Forwarded as ``top_k_fused``.
        w_dense: Forwarded as ``w_dense``.

    Returns:
        Whatever ``detect_hybrid_rag`` returns, passed through untouched.
    """
    fusion_kwargs = {
        "top_k_dense": top_k_dense,
        "top_k_bm25": top_k_bm25,
        "top_k_fused": top_k_fused,
        "w_dense": w_dense,
    }
    return detect_hybrid_rag(code, **fusion_kwargs)

# Registry of detection methods: label -> wrapper callable.
# Insertion order is the order in which the evaluation loop runs them.
methods = dict(
    pure_embedding=call_embedding,
    direct_llm=call_llm,
    rag=call_rag,
    hybrid_rag=call_hybrid_rag,
)

# Keyword arguments passed to each wrapper, keyed by the same labels as `methods`.
method_params = dict(
    pure_embedding=dict(top_k=10),
    direct_llm=dict(top_n=25),
    rag=dict(top_k=5),
    hybrid_rag=dict(top_k_dense=5, top_k_bm25=5, top_k_fused=5, w_dense=0.5),
)

In [7]:
# load dataset

@dataclass
class CodeSample:
    """One labelled entry of the test dataset.

    Instances are built by unpacking each JSON object of the dataset file,
    so the field names must match the JSON keys exactly.
    """

    # Identifier of the sample; copied into every evaluation row.
    id: str
    # Code snippet handed to the detection methods.
    query_code: str
    # Ground-truth label that predictions are compared against.
    is_positive: bool
    # Free-text dataset metadata; not read by the evaluation code in this notebook.
    source_hint: str
    notes: str

def load_dataset(dataset_path):
    """Read the JSON dataset file and turn each entry into a ``CodeSample``.

    Args:
        dataset_path: Location of a UTF-8 JSON file whose top-level value is
            a sequence of objects; each object is unpacked as keyword
            arguments into ``CodeSample``.

    Returns:
        A list of ``CodeSample`` instances in file order.
    """
    with open(dataset_path, 'r', encoding='utf-8') as handle:
        raw_items = json.load(handle)

    samples = []
    for raw_item in raw_items:
        samples.append(CodeSample(**raw_item))
    return samples

# Load the dataset once; the evaluation loop iterates over `dataset` for every method.
dataset = load_dataset(dataset_path)

In [12]:
# run on dataset and create results

@dataclass
class EvaluationRow:
    """Outcome of running one detection method on one dataset sample.

    Rows are converted with ``dataclasses.asdict`` and become the rows of the
    predictions DataFrame / CSV, so field names double as column names.
    """

    # Identifier and ground-truth label copied from the dataset sample.
    id: str
    is_positive: bool
    # Method label as reported by the detector's result object.
    method: str
    # Detector verdict and its explanation.
    is_plagiarized: bool
    reason: str
    # Evidence payloads are stored as-is; their structure is detector-defined.
    # Annotated with typing.Any -- the lowercase builtin `any` is a function,
    # not a type, and misleads type checkers and readers.
    evidence_mine: Any
    evidence_oai: Any
    # Duration of the detector call in milliseconds.
    ms_elapsed: float

# Run every registered method over every dataset sample and collect one
# EvaluationRow per (method, sample) call.
rows = []

for name, func in methods.items():
    # Methods without an entry in method_params run with the wrapper defaults.
    params = method_params.get(name, {})

    for sample in dataset:
        # perf_counter() is monotonic and high-resolution, so the measured
        # latency cannot be skewed by wall-clock adjustments the way a
        # time.time() difference can.
        start_time = time.perf_counter()
        result = func(sample.query_code, **params)
        end_time = time.perf_counter()

        row = EvaluationRow(
            id = sample.id,
            is_positive = sample.is_positive,
            # The label is taken from the result object, not from the registry key.
            method = result.method,
            is_plagiarized = result.is_plagiarized,
            reason = result.reason,
            evidence_mine = result.evidence_mine,
            evidence_oai = result.evidence_oai,
            ms_elapsed = (end_time - start_time) * 1000  # seconds -> milliseconds
        )

        rows.append(row)

    print(f"{name} finished")

# convert and save
results = pd.DataFrame([asdict(r) for r in rows])
# to_csv does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail after the whole evaluation has run.
predictions_csv_path.parent.mkdir(parents = True, exist_ok = True)
results.to_csv(predictions_csv_path, index = False)

pure_embedding finished
direct_llm finished
direct_llm finished
rag finished
rag finished
hybrid_rag finished
hybrid_rag finished


In [13]:
# evaluate results using metrics and form summary

def confusion_counts(dataframe):
    """Count confusion-matrix cells for one set of predictions.

    Args:
        dataframe: Frame with an ``is_positive`` column (ground truth) and an
            ``is_plagiarized`` column (prediction).

    Returns:
        Tuple ``(tp, fp, tn, fn)`` of plain Python ints.
    """
    # Explicit comparisons against True/False are kept on purpose: a value
    # that equals neither is counted in no cell.
    truth_yes = dataframe["is_positive"] == True
    truth_no = dataframe["is_positive"] == False
    flagged = dataframe["is_plagiarized"] == True
    cleared = dataframe["is_plagiarized"] == False

    def _count(mask):
        return int(mask.sum())

    return (
        _count(truth_yes & flagged),  # true positives
        _count(truth_no & flagged),   # false positives
        _count(truth_no & cleared),   # true negatives
        _count(truth_yes & cleared),  # false negatives
    )

def calculate_metrics(true_positive, false_positive, true_negative, false_negative):
    """Derive precision, recall, F1 and accuracy from confusion-matrix counts.

    Any ratio whose denominator would be zero is reported as 0.0 instead of
    raising.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    flagged_total = true_positive + false_positive
    actual_total = true_positive + false_negative
    sample_total = true_positive + false_positive + true_negative + false_negative

    # Precision: share of flagged cases that really were plagiarism.
    precision = 0.0
    if flagged_total > 0:
        precision = true_positive / flagged_total

    # Recall: share of real plagiarism cases that were flagged.
    recall = 0.0
    if actual_total > 0:
        recall = true_positive / actual_total

    # F1: harmonic mean of precision and recall.
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    # max(..., 1) keeps the division defined when there are no samples.
    accuracy = (true_positive + true_negative) / max(sample_total, 1)

    return dict(precision=precision, recall=recall, f1=f1, accuracy=accuracy)

# Build one summary record per method: confusion counts, quality metrics and
# mean latency of the detector calls.
summary_rows = []
for method_name, dataframe in results.groupby("method"):
    tp, fp, tn, fn = confusion_counts(dataframe)
    scores = calculate_metrics(tp, fp, tn, fn)
    avg_ms = float(dataframe["ms_elapsed"].mean()) if len(dataframe) else 0.0

    summary_rows.append(
        dict(
            method=method_name,
            n=int(len(dataframe)),
            tp=tp,
            fp=fp,
            tn=tn,
            fn=fn,
            precision=scores["precision"],
            recall=scores["recall"],
            f1=scores["f1"],
            accuracy=scores["accuracy"],
            avg_ms=avg_ms,
        )
    )

# Rank methods by F1 (best first), renumber the index, then persist.
summary = pd.DataFrame(summary_rows)
summary = summary.sort_values("f1", ascending=False)
summary = summary.reset_index(drop=True)
summary.to_csv(summary_csv_path, index=False)