In [1]:
# Install required packages
!pip install tree-sitter
!pip install tree-sitter-java
!pip install tree-sitter-python
!pip install tree-sitter-cpp

import os
import numpy as np
import json
from tqdm import tqdm
from tree_sitter import Language, Parser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import torch
from transformers import RobertaTokenizer, RobertaModel

Collecting tree-sitter
  Downloading tree_sitter-0.23.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.8 kB)
Downloading tree_sitter-0.23.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (560 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m560.8/560.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tree-sitter
Successfully installed tree-sitter-0.23.1
Collecting tree-sitter-java
  Downloading tree_sitter_java-0.23.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading tree_sitter_java-0.23.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree-sitter-java
Successfully installed tree-sitter-java-0.23.2


In [2]:
# Load Java, Python, and C++ languages
import tree_sitter_java
import tree_sitter_python
import tree_sitter_cpp

JAVA_LANGUAGE = Language(tree_sitter_java.language())
PYTHON_LANGUAGE = Language(tree_sitter_python.language())
CPP_LANGUAGE = Language(tree_sitter_cpp.language())

java_parser = Parser(JAVA_LANGUAGE)
python_parser = Parser(PYTHON_LANGUAGE)
cpp_parser = Parser(CPP_LANGUAGE)


In [3]:
# Load CodeBERT model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [4]:
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, np.bool_):
            return bool(obj)
        return json.JSONEncoder.default(self, obj)

In [5]:
def tree_to_sequence(code, language):
    if language == 'java':
        parser = java_parser
    elif language == 'python':
        parser = python_parser
    elif language == 'cpp':
        parser = cpp_parser
    else:
        raise ValueError("Unsupported language")

    tree = parser.parse(bytes(code, "utf8"))

    def traverse(node, depth=0):
        if node.type != 'string' and node.type != 'comment':
            yield f"{node.type}_{depth}"
            for child in node.children:
                yield from traverse(child, depth + 1)

    return ' '.join(traverse(tree.root_node))

In [6]:
def preprocess_code(code, language):
    # Remove comments
    if language == 'java':
        code = re.sub(r'//.*?\n|/\*.*?\*/', '', code, flags=re.DOTALL)
    elif language == 'python':
        code = re.sub(r'#.*?\n|\'\'\'.*?\'\'\'|""".*?"""', '', code, flags=re.DOTALL)

    # Remove string literals
    code = re.sub(r'".*?"', '""', code)

    # Remove import statements
    if language == 'java':
        code = re.sub(r'import\s+[\w.]+;', '', code)
    elif language == 'python':
        code = re.sub(r'import\s+[\w.]+|from\s+[\w.]+\s+import\s+[\w.]+', '', code)

    # Remove package declarations (Java only)
    if language == 'java':
        code = re.sub(r'package\s+[\w.]+;', '', code)

    # Remove whitespace
    code = re.sub(r'\s+', ' ', code).strip()
    return code

In [7]:
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

In [8]:
def normalized_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [9]:
def get_file_language(filename):
    extension = os.path.splitext(filename)[1].lower()
    if extension in ['.java']:
        return 'java'
    elif extension in ['.py', '.pyw']:
        return 'python'
    elif extension in ['.cpp', '.cxx', '.cc', '.c++', '.hpp', '.hxx', '.hh', '.h++', '.h']:
        return 'cpp'
    else:
        raise ValueError(f"Unsupported file type: {filename}")


In [10]:
def get_codebert_embedding(code):
    try:
        inputs = tokenizer(code, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    except Exception as e:
        print(f"Error generating CodeBERT embedding: {str(e)}")
        return None

In [11]:
def process_files(directory):
    submissions = {}
    for root, _, files in os.walk(directory):
        for file in files:
            try:
                language = get_file_language(file)
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    try:
                        code = f.read()
                        preprocessed_code = preprocess_code(code, language)
                        tree_sequence = tree_to_sequence(preprocessed_code, language)
                        codebert_embedding = get_codebert_embedding(tree_sequence)
                        submission = {
                            'sequence': tree_sequence,
                            'language': language,
                            'embedding': codebert_embedding,
                            'tokens': set(tree_sequence.split())
                        }
                        submissions[file] = submission
                    except UnicodeDecodeError:
                        print(f"Error reading {file_path}. Skipping.")
            except ValueError as e:
                print(f"Skipping file {file}: {str(e)}")
    return submissions


In [12]:
def compute_similarities(submissions):
    filenames = list(submissions.keys())
    n = len(filenames)
    codebert_similarities = np.zeros((n, n))
    jaccard_similarities = np.zeros((n, n))
    tfidf_similarities = np.zeros((n, n))

    # Prepare TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([sub['sequence'] for sub in submissions.values()])

    embeddings = np.array([sub['embedding'] for sub in submissions.values()])

    for i in range(n):
        for j in range(i+1, n):
            # CodeBERT similarity
            codebert_sim = normalized_similarity(embeddings[i], embeddings[j])
            codebert_similarities[i][j] = codebert_similarities[j][i] = codebert_sim * 100

            # Jaccard similarity
            jaccard_sim = jaccard_similarity(submissions[filenames[i]]['tokens'], submissions[filenames[j]]['tokens'])
            jaccard_similarities[i][j] = jaccard_similarities[j][i] = jaccard_sim * 100

            # TF-IDF similarity
            tfidf_sim = cosine_similarity(tfidf_matrix[i], tfidf_matrix[j])[0][0]
            tfidf_similarities[i][j] = tfidf_similarities[j][i] = tfidf_sim * 100

    return codebert_similarities, jaccard_similarities, tfidf_similarities

In [13]:
def check_plagiarism(directory, threshold=80):
    submissions = process_files(directory)
    codebert_similarities, jaccard_similarities, tfidf_similarities = compute_similarities(submissions)

    filenames = list(submissions.keys())
    n = len(filenames)

    results = []
    for i in range(n):
        file_result = {"file": filenames[i], "comparisons": {}}
        for j in range(n):
            if i != j:
                codebert_sim = codebert_similarities[i][j]
                jaccard_sim = jaccard_similarities[i][j]
                tfidf_sim = tfidf_similarities[i][j]

                # Calculate combined similarity (you can adjust the weights as needed)
                combined_sim = (codebert_sim + jaccard_sim + tfidf_sim) / 3

                file_result["comparisons"] = {
                    "filename": [filenames[j]] ,
                    "combined_similarity": combined_sim,
                    "potential_plagiarism": combined_sim > threshold
                }
        results.append(file_result)

    return results

In [14]:
# Example usage
directory = '/kaggle/input/ir-plag-dataset'
plagiarism_results = check_plagiarism(directory)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
# Save to JSON file using the custom encoder
with open('plagiarism_results.json', 'w') as f:
    json.dump(plagiarism_results, f, indent=2, cls=NumpyEncoder)

print("Results have been saved to 'plagiarism_results.json'")