In [2]:
!pip install transformers torch




In [4]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never hardcode an access token in a notebook. A token was previously
# committed in this cell, so it must be treated as leaked and revoked in the
# Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable when set; otherwise
# it is requested interactively without echoing it to the cell output.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token, add_to_git_credential=True, new_session=False)

Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
!pip install networkx




In [23]:
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast

class CodePlagiarismDetector:
    """Scores how similar a Python submission is to a set of reference snippets.

    Two signals are produced:

    * an embedding similarity: cosine similarity between mean-pooled hidden
      states of a RoBERTa-style model (``microsoft/codebert-base`` by default);
    * a structural similarity: the fraction of coarse AST counters (functions,
      classes, loops, conditionals) that are exactly equal.

    All inputs must be syntactically valid Python because they are parsed with
    ``ast``. ``preprocess_code`` relies on ``ast.unparse`` (Python 3.9+).
    """

    def __init__(self, model_name="microsoft/codebert-base"):
        """Load the tokenizer and model identified by ``model_name``.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.model = RobertaModel.from_pretrained(model_name)
        # The model is only used for inference, so switch it to evaluation mode.
        self.model.eval()

    def preprocess_code(self, code):
        """Normalize source text before it is embedded.

        The code is round-tripped through ``ast.parse`` / ``ast.unparse``,
        which drops ``#`` comments and normalizes formatting (docstrings are
        string expressions and are kept). All whitespace runs, including
        newlines and indentation, are then collapsed to single spaces, so the
        result is a one-line string that is no longer valid Python.

        Args:
            code: Python source text.

        Returns:
            The normalized, single-line string.

        Raises:
            SyntaxError: If ``code`` cannot be parsed.
        """
        tree = ast.parse(code)
        clean_code = ast.unparse(tree)
        return ' '.join(clean_code.split())

    def get_code_embedding(self, code):
        """Embed one code snippet as a NumPy vector.

        The preprocessed code is tokenized (truncated to at most 512 tokens,
        so anything beyond that is ignored), run through the model without
        gradient tracking, and the last hidden state is averaged over
        dimension 1.

        Args:
            code: Python source text.

        Returns:
            NumPy array holding the mean-pooled last hidden state with
            singleton dimensions squeezed out.

        Raises:
            SyntaxError: If ``code`` cannot be parsed.
        """
        preprocessed_code = self.preprocess_code(code)
        inputs = self.tokenizer(preprocessed_code, return_tensors="pt", truncation=True, max_length=512, padding=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def calculate_similarity(self, code1, code2):
        """Return the cosine similarity of the embeddings of two snippets.

        Args:
            code1: First Python source text.
            code2: Second Python source text.

        Returns:
            Cosine similarity as a Python ``float``.
        """
        embedding1 = self.get_code_embedding(code1)
        embedding2 = self.get_code_embedding(code2)
        return float(cosine_similarity([embedding1], [embedding2])[0][0])

    def detect_plagiarism(self, submission, reference_codes, threshold=0.85):
        """Compare a submission against every reference by embedding similarity.

        Args:
            submission: Python source text under test.
            reference_codes: Iterable of reference Python source texts.
            threshold: The submission is flagged when the best similarity is
                strictly greater than this value.

        Returns:
            Dict with native Python values:
            ``is_plagiarized`` (bool), ``similarity_score`` (float, the highest
            similarity found) and ``most_similar_index`` (int, position of the
            first reference reaching that score).

        Raises:
            ValueError: If ``reference_codes`` is empty.
            SyntaxError: If the submission or a reference cannot be parsed.
        """
        reference_codes = list(reference_codes)
        if not reference_codes:
            raise ValueError("reference_codes must contain at least one reference snippet")

        # The submission embedding does not depend on the reference, so it is
        # computed once instead of once per comparison.
        submission_embedding = self.get_code_embedding(submission)
        reference_embeddings = [self.get_code_embedding(ref_code) for ref_code in reference_codes]

        # One row (the submission) against all references at once.
        similarities = cosine_similarity([submission_embedding], reference_embeddings)[0]

        # Convert NumPy scalars to built-in types so the result can be
        # serialized (e.g. with json) and compared without surprises.
        best_index = int(np.argmax(similarities))
        best_score = float(similarities[best_index])

        return {
            "is_plagiarized": best_score > threshold,
            "similarity_score": best_score,
            "most_similar_index": best_index
        }

    def analyze_code_structure(self, code):
        """Count coarse structural elements anywhere in the code's AST.

        Nested definitions and statements are included. ``async def`` and
        ``async for`` are counted as functions and loops respectively.

        Args:
            code: Python source text.

        Returns:
            Dict with integer counters ``num_functions``, ``num_classes``,
            ``num_loops`` (``for``/``while``) and ``num_conditionals``
            (``if`` statements; ``elif`` is represented as a nested ``if``).

        Raises:
            SyntaxError: If ``code`` cannot be parsed.
        """
        counts = {
            "num_functions": 0,
            "num_classes": 0,
            "num_loops": 0,
            "num_conditionals": 0
        }
        # Single traversal; the node categories are mutually exclusive.
        for node in ast.walk(ast.parse(code)):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                counts["num_functions"] += 1
            elif isinstance(node, ast.ClassDef):
                counts["num_classes"] += 1
            elif isinstance(node, (ast.For, ast.AsyncFor, ast.While)):
                counts["num_loops"] += 1
            elif isinstance(node, ast.If):
                counts["num_conditionals"] += 1
        return counts

    @staticmethod
    def _structure_overlap(submission_structure, ref_structure):
        """Return the fraction of counters in the submission that the reference matches exactly."""
        matching = sum(1 for key, value in submission_structure.items() if ref_structure.get(key) == value)
        return matching / len(submission_structure)

    def detect_advanced_plagiarism(self, submission, reference_codes, threshold=0.85):
        """Extend ``detect_plagiarism`` with a structural comparison.

        Note:
            ``is_plagiarized`` is decided by the embedding similarity alone.
            ``structural_similarity`` is the best structural match over all
            references, which is not necessarily the reference reported in
            ``most_similar_index``; ``combined_score`` averages the two best
            values regardless of which reference produced each.

        Args:
            submission: Python source text under test.
            reference_codes: Iterable of reference Python source texts.
            threshold: Forwarded to ``detect_plagiarism``.

        Returns:
            The dict returned by ``detect_plagiarism`` plus
            ``structural_similarity`` (float in [0, 1]) and ``combined_score``
            (mean of ``similarity_score`` and ``structural_similarity``).

        Raises:
            ValueError: If ``reference_codes`` is empty.
            SyntaxError: If the submission or a reference cannot be parsed.
        """
        # Materialize once: the references are iterated twice below, which
        # would silently exhaust a generator.
        reference_codes = list(reference_codes)

        basic_result = self.detect_plagiarism(submission, reference_codes, threshold)
        submission_structure = self.analyze_code_structure(submission)

        structural_similarities = [
            self._structure_overlap(submission_structure, self.analyze_code_structure(ref))
            for ref in reference_codes
        ]
        max_structural_similarity = max(structural_similarities)

        return {
            **basic_result,
            "structural_similarity": max_structural_similarity,
            "combined_score": (basic_result['similarity_score'] + max_structural_similarity) / 2
        }

# Example usage: score one submission against two reference snippets and
# print a short report.
detector = CodePlagiarismDetector()

# Submission under test: a function built around repeated addition in a loop.
submission = """
def calc_product(num1, num2):
    result = 1
    for i in range(num2):
        result += num1
    return result - num1
"""

# Known snippets the submission is compared against.
reference_codes = [
    """
def multiply(a, b):
    return a * b
    """,
    """
def square(x):
    return x*x
squares = list(map(square, range(10)))
    """
]

result = detector.detect_advanced_plagiarism(submission, reference_codes)

# Assemble the report first, then emit it in a single print call.
report = "\n".join([
    f"Is plagiarized: {result['is_plagiarized']}",
    f"Similarity score: {result['similarity_score']:.2f}",
    f"Structural similarity: {result['structural_similarity']:.2f}",
    f"Combined score: {result['combined_score']:.2f}",
    f"Most similar reference code index: {result['most_similar_index']}",
])
print(report)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Is plagiarized: True
Similarity score: 0.97
Structural similarity: 0.75
Combined score: 0.86
Most similar reference code index: 0
