In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# imports

import os
import json
import re
from pathlib import Path
import numpy as np
import faiss
import pickle
from openai import OpenAI
from typing import List
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer

In [3]:
# paths

base_dir = Path(".")

# sibling folders under the base directory
data_dir = base_dir.joinpath("data")
indexes_dir = base_dir.joinpath("indexes")

# files read from the data folder
test_dataset_path = data_dir.joinpath("test_dataset.json")

# files read from the indexes folder
dense_index_path = indexes_dir.joinpath("dense_index.faiss")
dense_meta_path = indexes_dir.joinpath("dense_meta.json")
bm25_path = indexes_dir.joinpath("bm25_index.pkl")

In [None]:
# Minimum similarity a retrieved chunk must reach to be recorded as evidence by
# the embedding-based detectors; similarity there is computed as 1 / (1 + distance).
similarity_threshold = 0.65

In [5]:
# load dense index and meta

# Fail early, with a clear message, if either artifact of the dense index is missing.
if not dense_index_path.exists():
    raise FileNotFoundError(f"FAISS index not found at {dense_index_path}")

if not dense_meta_path.exists():
    raise FileNotFoundError(f"dense index metadata not found at {dense_meta_path}")

dense_index = faiss.read_index(str(dense_index_path))

with open(dense_meta_path, "r", encoding="utf-8") as f:
    dense_meta = json.load(f)

# Parallel lists: the detectors look up position i using the row id i returned by dense_index.search.
chunk_ids = dense_meta["chunk_ids"]
chunk_repos = dense_meta["repos"]
chunk_paths = dense_meta["paths"]
chunk_texts = dense_meta["texts"]

# Catch a stale or mismatched metadata file here instead of hitting an IndexError
# (or silently attributing a hit to the wrong chunk) at query time.
_dense_meta_lengths = {
    "chunk_ids": len(chunk_ids),
    "repos": len(chunk_repos),
    "paths": len(chunk_paths),
    "texts": len(chunk_texts),
}

if len(set(_dense_meta_lengths.values())) != 1 or len(chunk_ids) < dense_index.ntotal:
    raise ValueError(
        f"dense metadata does not match the FAISS index: index has {dense_index.ntotal} vectors, "
        f"metadata list lengths are {_dense_meta_lengths}"
    )

In [6]:
# load bm25

# Same early, explicit failure as for the dense index.
if not bm25_path.exists():
    raise FileNotFoundError(f"BM25 index not found at {bm25_path}")

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load a BM25 pickle that was produced by a trusted source.
with open(bm25_path, "rb") as f:
    bm25_pack = pickle.load(f)

bm25 = bm25_pack["bm25"]
bm25_chunks = bm25_pack["chunks"]

In [7]:
# models

emb_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# The OpenAI client is optional: when no API key is set it stays None,
# and llm_call checks for that before making a request.
openai_api_key = os.getenv("OPENAI_API_KEY")
oai_client = OpenAI(api_key=openai_api_key) if openai_api_key else None


In [8]:
# helpers

def embed_code(text):
    """Encode one text with the sentence-transformer model.

    The text is wrapped in a one-element list before encoding, and the
    resulting numpy array is returned cast to float32.
    """
    single_item_batch = [text]
    embedding = emb_model.encode(single_item_batch, convert_to_numpy=True)
    return embedding.astype("float32")

def tokenize_code(text):
    """Return the identifier-like tokens of `text`, in order of appearance.

    A token starts with an ASCII letter or underscore, followed by any number
    of ASCII letters, digits, or underscores. Everything else is skipped.
    """
    identifier_matches = re.finditer(r"[A-Za-z_][A-Za-z0-9_]*", text)
    return [match.group(0) for match in identifier_matches]

def find_text_by_path(path):
    """Return the text of the first BM25 chunk whose "source_path" equals `path`.

    Returns None when no chunk has that path.
    """
    matching_texts = (chunk["text"] for chunk in bm25_chunks if chunk["source_path"] == path)
    return next(matching_texts, None)

# Structured verdict requested from the LLM; llm_call passes this class as the response_format.
# Kept free of a class docstring on purpose: only `#` comments are used so the model definition itself is untouched.
class PlagiarismResult(BaseModel): # Pydantic model for structured output from OpenAI
    # verdict for the query code
    is_plagiarized: bool
    # free-text explanation of the verdict
    reason: str
    # list of strings backing the verdict; defaults to an empty list
    evidence: List[str] = []

class DetectionResult:
    """Common return value of every detection method.

    Attributes:
        method: name of the detector that produced the result.
        is_plagiarized: the detector's verdict.
        reason: explanation text (may be empty).
        evidence_mine: evidence collected locally by the detector; defaults to "".
        evidence_oai: evidence reported by the LLM; defaults to None.
    """

    def __init__(self, method, is_plagiarized, reason, evidence_mine = "", evidence_oai = None):
        # verdict fields
        self.method, self.is_plagiarized, self.reason = method, is_plagiarized, reason
        # supporting evidence, kept separate by origin
        self.evidence_mine, self.evidence_oai = evidence_mine, evidence_oai

def llm_call(prompt) -> PlagiarismResult:
    """Ask the OpenAI model for a structured plagiarism verdict.

    Args:
        prompt: user message sent to the model.

    Returns:
        The parsed PlagiarismResult. A non-plagiarized fallback result is
        returned (with the cause in `reason`) when no OpenAI client is
        configured, or when the response carries no parsed payload.
    """
    if oai_client is None:
        return PlagiarismResult(is_plagiarized = False, reason = "no OPENAI_API_KEY in env, skipping LLM detection", evidence = [])

    oai_response = oai_client.chat.completions.parse(
        model = "gpt-4o-mini",
        messages = [
            {"role": "system", "content": "You are a careful code plagiarism checker."},
            {"role": "user", "content": prompt},
        ],
        response_format = PlagiarismResult,
        temperature = 0.0
    )

    message = oai_response.choices[0].message

    # `parsed` can be None (for example when the model refuses). Returning it as-is
    # would break every caller that reads result.is_plagiarized.
    if message.parsed is None:
        refusal = getattr(message, "refusal", None)
        return PlagiarismResult(
            is_plagiarized = False,
            reason = f"LLM returned no structured result: {refusal or 'unknown reason'}",
            evidence = []
        )

    return message.parsed


In [9]:
# pure embedding search

def detect_embedding(code_query, top_k = 10):
    """Flag the query as plagiarized when any nearest neighbour is similar enough.

    The query is embedded and searched against the dense index. Each returned
    distance is turned into a score with similarity = 1 / (1 + distance):
      - distance 0 gives similarity 1 (identical documents);
      - a large distance gives a similarity well below 1.
    Neighbours scoring at least `similarity_threshold` are collected as evidence.

    NOTE(review): presumably the index returns squared L2 distances over
    unit-length embeddings, in which case D(a, b) = 2 - 2 * cos(θ) - confirm
    against how the index was built.

    Args:
        code_query: source code to check.
        top_k: number of neighbours requested from the index.

    Returns:
        DetectionResult with method "pure_embedding", an empty reason, and the
        matching chunks in evidence_mine.
    """
    distances, indexes = dense_index.search(embed_code(code_query), top_k)
    row_distances, row_indexes = distances[0], indexes[0]

    matches = []
    for distance, chunk_pos in zip(row_distances, row_indexes):
        # -1 marks an unfilled result slot
        if chunk_pos == -1:
            continue

        score = 1.0 / (1.0 + float(distance))
        if not (score >= similarity_threshold):
            continue

        matches.append({
            "chunk_id": chunk_ids[chunk_pos],
            "path": chunk_paths[chunk_pos],
            "text": chunk_texts[chunk_pos],
            "similarity": score
        })

    return DetectionResult(
        method = "pure_embedding",
        is_plagiarized = bool(matches),
        reason = "",
        evidence_mine = matches
    )

In [10]:
# direct LLM analysis

def build_prompt(corpus_snippets, code_query):
    """Assemble the user prompt for the LLM plagiarism check.

    Args:
        corpus_snippets: candidate snippets; each is numbered from 1 and
            wrapped in a go code fence.
        code_query: the code under test, also wrapped in a go code fence.

    Returns:
        The full prompt as a single string.
    """
    numbered_blocks = []
    for number, candidate in enumerate(corpus_snippets, start=1):
        numbered_blocks.append(f"[{number}]\n```go\n{candidate}\n```")
    candidates_section = "\n".join(numbered_blocks)

    sections = [
        "You are checking code plagiarism.\n",
        "You will get a query function and several candidate snippets.\n",
        "Answer with the provided response format.\n\n",
        f"Query code:\n```go\n{code_query}\n```\n\n",
        f"Candidate snippets:\n{candidates_section}\n\n",
        "Think about structure, call order, and control flow, not only names.",
    ]
    return "".join(sections)

def detect_llm(code_query, top_n = 25):
    """Ask the LLM directly, without any retrieval step.

    The candidates are simply the first `top_n` chunk texts of the corpus,
    independent of the query.

    Args:
        code_query: source code to check.
        top_n: how many leading corpus chunks to put in the prompt.

    Returns:
        DetectionResult with method "direct_llm"; verdict, reason and
        evidence_oai come from the LLM response.
    """
    leading_chunks = list(chunk_texts[:top_n])
    verdict = llm_call(build_prompt(leading_chunks, code_query))

    return DetectionResult(
        method = "direct_llm",
        is_plagiarized = verdict.is_plagiarized,
        reason = verdict.reason,
        evidence_oai = verdict.evidence
    )

In [11]:
# standard RAG

def detect_rag(code_query, top_k = 5):
    """Retrieve the nearest chunks from the dense index and let the LLM judge them.

    Every retrieved chunk is passed to the LLM as a candidate. Chunks whose
    similarity (1 / (1 + distance)) reaches `similarity_threshold` are also
    recorded as local evidence.

    Args:
        code_query: source code to check.
        top_k: number of neighbours requested from the dense index.

    Returns:
        DetectionResult with method "rag". The verdict, reason and evidence_oai
        come from the LLM; evidence_mine holds the chunks that met the threshold.
    """
    query_vec = embed_code(code_query)
    distances, indexes = dense_index.search(query_vec, top_k)

    corpus_snippets = []
    evidence = []

    for dist, idx in zip(distances[0], indexes[0]):
        # -1 marks an unfilled result slot
        if idx == -1:
            continue

        path = chunk_paths[idx]
        text = chunk_texts[idx]

        corpus_snippets.append(text)

        similarity = 1.0 / (1.0 + float(dist))

        # inclusive bound, matching detect_embedding, so a chunk sitting exactly
        # on the threshold is treated the same way by both detectors
        if similarity >= similarity_threshold:
            evidence.append({
                "chunk_id": chunk_ids[idx],
                "path": path,
                "text": text,
                "similarity": similarity
            })

    prompt = build_prompt(corpus_snippets, code_query)
    result = llm_call(prompt)

    return DetectionResult(
        method = "rag",
        is_plagiarized = result.is_plagiarized,
        reason = result.reason,
        evidence_mine = evidence,
        evidence_oai = result.evidence
    )

In [12]:
# hybrid RAG

def retrieve_dense(code_query, top_k = 5):
    """Return the dense-index neighbours of the query as a list of dicts.

    Each hit carries the chunk "index", its "dist" from the query, and the
    chunk's "path", "repo" and "text". Unfilled result slots (index -1) are
    dropped.
    """
    distances, indexes = dense_index.search(embed_code(code_query), top_k)

    return [
        {
            "index": int(chunk_pos),
            "dist": float(distance),
            "path": chunk_paths[chunk_pos],
            "repo": chunk_repos[chunk_pos],
            "text": chunk_texts[chunk_pos]
        }
        for distance, chunk_pos in zip(distances[0], indexes[0])
        if chunk_pos != -1
    ]

def retrieve_bm25(code_query, top_k = 5):
    """Return the `top_k` best BM25 matches for the query as a list of dicts.

    The query is split into identifier tokens, scored against every chunk,
    and the highest-scoring chunks are returned best first. Each hit carries
    the chunk "index", its BM25 "score", and the chunk's "path", "repo" and "text".
    """
    scores = bm25.get_scores(tokenize_code(code_query))
    best_first = np.argsort(scores)[::-1][:top_k]

    hits = []
    for chunk_pos in best_first:
        chunk = bm25_chunks[chunk_pos]
        hits.append({
            "index": int(chunk_pos),
            "score": float(scores[chunk_pos]),
            "path": chunk["source_path"],
            "repo": chunk["repo"],
            "text": chunk["text"]
        })

    return hits

def _normalize_dense_hits(dense_hits):
    """Min-max scale dense distances into 0..1 scores keyed by chunk index (closest hit gets 1.0)."""
    if not dense_hits:
        return {}

    dists = [h["dist"] for h in dense_hits]
    d_max = max(dists)
    d_min = min(dists)

    scores = {}
    for h in dense_hits:
        if d_max == d_min:
            # a single hit, or all-equal distances: nothing to rank, treat each as a full match
            s = 1.0
        else:
            s = 1.0 - (h["dist"] - d_min) / (d_max - d_min)
        scores[h["index"]] = s

    return scores


def _normalize_bm25_hits(bm25_hits):
    """Scale BM25 scores by the best score so the top hit gets 1.0; every score is 0.0 when the best is not positive."""
    b_max = max((h["score"] for h in bm25_hits), default = 0.0)
    return {h["index"]: (h["score"] / b_max if b_max > 0 else 0.0) for h in bm25_hits}


def detect_hybrid_rag(code_query, top_k_dense = 5, top_k_bm25 = 5, top_k_fused = 5, w_dense = 0.5):
    """Fuse dense and BM25 retrieval, then let the LLM judge the fused candidates.

    Both retrievers' scores are normalized to 0..1 and combined per chunk index as
    w_dense * dense_score + (1 - w_dense) * bm25_score. A chunk found by only one
    retriever gets no contribution from the other.

    NOTE(review): fusion assumes a BM25 chunk index and a dense chunk index refer
    to the same chunk (the fused index is looked up in the dense metadata lists) -
    confirm both indexes were built from the same chunk ordering.

    Args:
        code_query: source code to check.
        top_k_dense: number of dense neighbours to retrieve; a value <= 0 skips dense retrieval.
        top_k_bm25: number of BM25 matches to retrieve; a value <= 0 skips BM25 retrieval.
        top_k_fused: number of fused candidates passed to the LLM (capped at
            top_k_dense + top_k_bm25).
        w_dense: weight of the dense score in the fused score.

    Returns:
        DetectionResult with method "hybrid_rag". The verdict, reason and
        evidence_oai come from the LLM; evidence_mine lists the fused candidates
        with their per-retriever and fused scores.
    """
    top_k_fused = min(top_k_fused, top_k_dense + top_k_bm25)

    # retrieve from both methods; a retriever with a non-positive top_k is skipped
    # so that one side can be switched off without crashing
    dense_hits = retrieve_dense(code_query, top_k = top_k_dense) if top_k_dense > 0 else []
    bm25_hits = retrieve_bm25(code_query, top_k = top_k_bm25) if top_k_bm25 > 0 else []

    # normalize both score sets to 0..1 (empty hit lists yield empty score maps)
    dense_scores = _normalize_dense_hits(dense_hits)
    bm25_scores = _normalize_bm25_hits(bm25_hits)

    # fuse based on weight
    fused = {}

    for idx, s in dense_scores.items():
        fused[idx] = fused.get(idx, 0.0) + w_dense * s

    for idx, s in bm25_scores.items():
        fused[idx] = fused.get(idx, 0.0) + (1.0 - w_dense) * s

    # sort fused scores and take top 'top_k_fused'
    fused_sorted = sorted(fused.items(), key=lambda x: x[1], reverse=True)
    top_indices = [idx for idx, _ in fused_sorted[:top_k_fused]]

    corpus_snippets = []
    evidence = []

    for idx in top_indices:
        text = chunk_texts[idx]
        path = chunk_paths[idx]
        repo = chunk_repos[idx]

        corpus_snippets.append(text)

        evidence.append(
            {
                "index": int(idx),
                "repo": repo,
                "path": path,
                "dense_score": dense_scores.get(idx),
                "bm25_score": bm25_scores.get(idx),
                "fused_score": fused.get(idx)
            }
        )

    prompt = build_prompt(corpus_snippets, code_query)
    result = llm_call(prompt)

    return DetectionResult(
        method = "hybrid_rag",
        is_plagiarized = result.is_plagiarized,
        reason = result.reason,
        evidence_mine = evidence,
        evidence_oai = result.evidence
    )