In [107]:
import json
import pickle
import numpy as np
import faiss
import re
from dotenv import load_dotenv
import os
from google import genai
import json
import os
from sentence_transformers import SentenceTransformer

In [108]:
# Locations of the retrieval artifacts loaded by this cell.
INDEX_PATH = "indexes/reference.index"
META_PATH = "indexes/chunks_metadata.pkl"
CONFIG_PATH = "indexes/index_config.json"
BM25_PATH = "indexes/bm25.pkl"

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that come from a trusted source.
with open(BM25_PATH, "rb") as f:
    bm25 = pickle.load(f)

# Vector index queried by the detectors via index.search(...).
index = faiss.read_index(INDEX_PATH)

# Per-chunk metadata; the detectors look entries up with the ids returned by
# index.search, so it is presumably aligned 1:1 with the index -- TODO confirm.
with open(META_PATH, "rb") as f:
    chunked_docs = pickle.load(f)

# Read the JSON config as UTF-8 explicitly so parsing does not depend on the
# platform's default locale encoding.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

# The embedding model name is taken from the config file.
embedder = SentenceTransformer(config["embedding_model"])

In [109]:
# Load variables from a local .env file (if any) into the environment.
load_dotenv()

# The key must be present and non-empty; fail early otherwise.
api_key = os.environ.get("LLM_HW_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("API Key not set")

# `model` is the client's `models` attribute; the detector functions call
# `model.generate_content(model=model_name, ...)` on it.
client = genai.Client(api_key=api_key)
model = client.models
model_name = "gemini-2.0-flash"

In [110]:
# Load the pickled corpus; detect_llm() iterates over `documents` to build
# its prompt.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open("indexes/documents.pkl", "rb") as f:
    documents = pickle.load(f)

# Report how many corpus documents were loaded.
print("Loaded", len(documents), "documents from corpus")

Loaded 18 documents from corpus


In [111]:
def detect_embedding(
    code_snippet: str,
    top_k: int = 5,
    plagiarism_threshold: float = 0.42
):
    """Flag a snippet as plagiarized based on its nearest reference chunks.

    The snippet is encoded with the module-level ``embedder`` (normalized
    embeddings) and searched against the module-level FAISS ``index``. The
    best score among the returned chunks is compared with
    ``plagiarism_threshold``.

    Args:
        code_snippet: Source code to check.
        top_k: Number of nearest chunks requested from the index.
        plagiarism_threshold: Minimum best score for a positive verdict.
            Scores are treated as similarities (higher means closer);
            presumably inner product over normalized vectors -- confirm
            against how the index was built.

    Returns:
        dict with keys:
            ``plagiarism`` (bool), ``max_similarity`` (best score rounded to
            4 decimals, 0.0 when nothing was retrieved), ``verdict`` (1 or 0)
            and ``matches`` (one dict per retrieved chunk with ``similarity``,
            ``file_name``, ``chunk_type`` and ``matched_text_preview``).
    """
    query_embedding = embedder.encode(
        [code_snippet],
        normalize_embeddings=True
    ).astype("float32")

    scores, ids = index.search(query_embedding, top_k)

    matches = []
    for score, idx in zip(scores[0], ids[0]):
        # FAISS pads the result with id -1 when it finds fewer than top_k
        # neighbours. Indexing chunked_docs with -1 would silently report
        # the LAST chunk as a match, so padded slots are skipped.
        if idx < 0:
            continue

        meta = chunked_docs[idx]
        matches.append({
            "similarity": float(score),
            "file_name": meta["file_name"],
            "chunk_type": meta["chunk_type"],
            # Only the first 300 characters are kept as a preview.
            "matched_text_preview": meta["text"][:300]
        })

    # Best score over real hits only; 0.0 when the index returned nothing.
    max_sim = max((m["similarity"] for m in matches), default=0.0)

    # No retrieved chunk means there is nothing to be plagiarized from.
    plagiarism = bool(matches) and max_sim >= plagiarism_threshold

    return {
        "plagiarism": plagiarism,
        "max_similarity": round(max_sim, 4),
        "verdict": int(plagiarism),
        "matches": matches
    }

In [112]:
def detect_llm(student_code: str):
    """Ask the LLM for a plagiarism verdict using the whole corpus as context.

    Every entry of the module-level ``documents`` list is pasted into the
    prompt together with ``student_code``; no retrieval step is involved.

    Args:
        student_code: The submission to analyze.

    Returns:
        dict with keys ``verdict``, ``explanation`` and ``score``.
        On success ``verdict`` is 1 (plagiarized) or 0, ``explanation`` is the
        model's text ("" if it gave none) and ``score`` is the model's
        ``confidence_score`` (0.0 if absent). If the request or the parsing
        fails, the error is printed and ``verdict`` is the string "Error",
        ``explanation`` is the error message and ``score`` is 0.0.
    """

    corpus_texts = []
    for doc in documents:
        fname = doc.get('file_name', 'unknown_file')
        # Prefer the raw code when the document carries it.
        content = doc.get('raw_code', doc.get('text', ''))
        corpus_texts.append(f"--- START FILE: {fname} ---\n{content}\n--- END FILE ---\n")

    full_repo_context = "\n".join(corpus_texts)

    prompt = f"""
    You are a code plagiarism detection system.
    
    REFERENCE REPOSITORY (The Source of Truth):
    {full_repo_context}
    
    SUSPICIOUS CODE (Student Submission):
    {student_code}
    
    TASK:
    Analyze the Suspicious Code. determine if it is plagiarized from the Reference Repository.
    - Plagiarism includes: direct copying, variable renaming, removing comments, or restructuring while keeping logic.
    - It is NOT plagiarism if the code uses a completely different algorithm.
    
    OUTPUT FORMAT:
    Return valid JSON only. No markdown formatting.
    {{
        "is_plagiarized": true/false,
        "confidence_score": 0.0 to 1.0,
        "explanation": "Brief reason citing specific file names from reference if found."
    }}
    """

    # Deliberately best-effort: any failure (network, empty response, invalid
    # JSON, missing verdict) is reported through the returned dict instead of
    # propagating to the caller.
    try:
        response = model.generate_content(
            model=model_name,
            contents=prompt,
            config={"response_mime_type": "application/json"}
        )

        data = json.loads(response.text)

        # The verdict is mandatory; a missing key falls through to the
        # error branch below.
        flagged = data["is_plagiarized"]
        if isinstance(flagged, str):
            # A string such as "false" is truthy in Python; interpret it.
            flagged = flagged.strip().lower() in ("true", "1", "yes")

        return {
            "verdict": 1 if flagged else 0,
            # The explanation is optional: its absence must not discard an
            # otherwise valid verdict.
            "explanation": data.get("explanation", ""),
            "score": data.get("confidence_score", 0.0)
        }

    except Exception as e:
        print(f"LLM Error: {e}")
        return {
            "verdict": "Error",
            "explanation": str(e),
            "score": 0.0
        }

In [113]:
def detect_rag(
    student_code: str,
    top_k: int = 5,
    max_output_tokens: int = 1024
):
    """Retrieve the nearest reference chunks and ask the LLM for a verdict.

    The submission is encoded with the module-level ``embedder``, the
    ``top_k`` nearest chunks are fetched from the FAISS ``index`` and passed
    to the model as context.

    Args:
        student_code: The submission to analyze.
        top_k: Number of chunks requested from the index.
        max_output_tokens: Upper bound on the length of the model's reply,
            forwarded in the generation config.

    Returns:
        The model's parsed JSON object (expected keys ``verdict`` and
        ``explanation``) extended with:
            ``plagiarism``: 1 when the model gave a verdict other than
                "Original", else 0. An unparseable or missing verdict is
                undetermined and yields 0, never an accusation.
            ``retrieved_chunks``: one dict per retrieved chunk with
                ``similarity``, ``file_name``, ``chunk_type``,
                ``matched_text``.
        If the reply is not a JSON object, ``verdict`` is "Unknown" and
        ``explanation`` holds the raw reply.
    """
    query_emb = embedder.encode([student_code], normalize_embeddings=True).astype("float32")

    scores, ids = index.search(query_emb, top_k)

    retrieved_chunks = []
    context_texts = []
    for score, idx in zip(scores[0], ids[0]):
        # FAISS pads with id -1 when fewer than top_k neighbours exist;
        # chunked_docs[-1] would silently inject the last chunk as context.
        if idx < 0:
            continue
        meta = chunked_docs[idx]
        # Use the original code for the LLM when the chunk carries it.
        code = meta.get("original_code", meta["text"])
        retrieved_chunks.append({
            "similarity": float(score),
            "file_name": meta["file_name"],
            "chunk_type": meta["chunk_type"],
            "matched_text": code
        })
        context_texts.append(f"File: {meta['file_name']}\n{code}\n")

    context = "\n".join(context_texts)

    prompt = f"""
You are a code plagiarism detection assistant.
You are given the following relevant reference code snippets:

{context}

Determine if the following student's code is plagiarized:

{student_code}

Instructions:
- Provide a short verdict: "Plagiarized", "Partially Plagiarized", or "Original".
- Explain briefly why.
- Do not hallucinate file names.
- Return in JSON format like:
{{"verdict": "...", "explanation": "..."}}
"""

    response = model.generate_content(
        model=model_name,
        contents=prompt,
        config={
            "response_mime_type": "application/json",
            # Previously accepted by the function but never sent to the model.
            "max_output_tokens": max_output_tokens,
        }
    )

    # Parse the complete reply; JSON may legitimately span several lines.
    content = response.text

    try:
        result = json.loads(content)
    except (TypeError, ValueError):
        # TypeError: empty reply (None); ValueError: invalid JSON.
        result = None
    if not isinstance(result, dict):
        result = {"verdict": "Unknown", "explanation": content}

    verdict = str(result.get("verdict", "")).strip().lower()
    # A missing or unparseable verdict is undetermined, not plagiarism.
    undetermined = verdict in ("", "unknown")
    result["plagiarism"] = 0 if undetermined or verdict == "original" else 1
    result["retrieved_chunks"] = retrieved_chunks

    return result

In [114]:
def detect_hybrid_rag(
    student_code: str,
    top_k: int = 5,
    bm25_weight: float = 0.5,
    embedding_weight: float = 0.5
):
    """Retrieve context with fused embedding + BM25 scores and ask the LLM.

    Embedding scores come from the ``top_k * 3`` nearest chunks in the FAISS
    ``index`` (chunks outside that candidate set count as 0). BM25 scores
    come from the module-level ``bm25`` object and are min-max scaled to
    [0, 1] before fusion, so that ``bm25_weight`` and ``embedding_weight``
    actually control the balance between the two signals instead of the
    unbounded raw BM25 values dominating the sum.

    Args:
        student_code: The submission to analyze.
        top_k: Number of fused top chunks passed to the model.
        bm25_weight: Weight of the scaled BM25 score.
        embedding_weight: Weight of the embedding score.

    Returns:
        The model's parsed JSON object (expected keys ``verdict`` and
        ``explanation``) extended with:
            ``plagiarism``: 1 when the model gave a verdict other than
                "Original", else 0. An unparseable or missing verdict is
                undetermined and yields 0, never an accusation.
            ``retrieved_chunks``: one dict per selected chunk with
                ``file_name``, ``chunk_type``, ``similarity`` (the fused
                score) and ``matched_text``.
        If the reply is not a JSON object, ``verdict`` is "Unknown" and
        ``explanation`` holds the raw reply.
    """
    query_emb = embedder.encode([student_code], normalize_embeddings=True).astype("float32")
    scores, ids = index.search(query_emb, top_k*3)

    # FAISS pads with id -1 when fewer neighbours exist; drop those slots.
    embedding_candidates = {
        int(idx): float(score)
        for score, idx in zip(scores[0], ids[0])
        if idx >= 0
    }

    # NOTE(review): assumes the BM25 corpus was tokenized the same way
    # (plain \w+ tokens, case preserved) -- confirm against the index build.
    tokenized_query = re.findall(r"\w+", student_code)
    bm25_raw = [float(s) for s in bm25.get_scores(tokenized_query)]

    # Min-max scale BM25 into [0, 1]; all-equal scores carry no ranking
    # information and become 0.
    if bm25_raw:
        lowest, highest = min(bm25_raw), max(bm25_raw)
    else:
        lowest = highest = 0.0
    span = highest - lowest
    if span > 0:
        bm25_scaled = [(s - lowest) / span for s in bm25_raw]
    else:
        bm25_scaled = [0.0] * len(bm25_raw)

    fused_scores = {}
    for idx in range(len(chunked_docs)):
        emb_score = embedding_candidates.get(idx, 0.0)
        lex_score = bm25_scaled[idx] if idx < len(bm25_scaled) else 0.0
        fused_scores[idx] = embedding_weight*emb_score + bm25_weight*lex_score

    top_indices = sorted(fused_scores, key=fused_scores.get, reverse=True)[:top_k]

    retrieved_chunks = []
    context_texts = []
    for idx in top_indices:
        meta = chunked_docs[idx]
        # Use the original code for the LLM when the chunk carries it.
        code = meta.get("original_code", meta["text"])
        retrieved_chunks.append({
            "file_name": meta["file_name"],
            "chunk_type": meta["chunk_type"],
            "similarity": float(fused_scores[idx]),
            "matched_text": code
        })
        context_texts.append(f"File: {meta['file_name']}\n{code}\n")

    context = "\n".join(context_texts)
    prompt = f"""
You are a code plagiarism detection assistant.
You are given the following top candidate reference code snippets:

{context}

Determine if the following student's code is plagiarized:

{student_code}

Instructions:
- Provide a short verdict: "Plagiarized", "Partially Plagiarized", or "Original".
- Explain briefly why.
- Do not hallucinate file names.
- Return in JSON format like:
{{"verdict": "...", "explanation": "..."}}
"""

    response = model.generate_content(
        model=model_name,
        contents=prompt,
        config={"response_mime_type": "application/json"}
    )

    # Parse the complete reply; JSON may legitimately span several lines.
    content = response.text

    try:
        result = json.loads(content)
    except (TypeError, ValueError):
        # TypeError: empty reply (None); ValueError: invalid JSON.
        result = None
    if not isinstance(result, dict):
        result = {"verdict": "Unknown", "explanation": content}

    verdict = str(result.get("verdict", "")).strip().lower()
    # A missing or unparseable verdict is undetermined, not plagiarism.
    undetermined = verdict in ("", "unknown")
    result["plagiarism"] = 0 if undetermined or verdict == "original" else 1
    result["retrieved_chunks"] = retrieved_chunks

    return result