What the test data looks like:

test1.cpp - original submission from a student

test2.cpp - exact copy of test1

test3.cpp - original submission from a student

test4.cpp - original submission from a student

test5.cpp - original submission from a student

test6.cpp - changed multiple variable and function names from test1



In [1]:
import re
import os

def remove_comments_and_headers(code):
    """Strip comments, ``#include`` lines, ``using namespace std;`` and semicolons.

    Comments are removed in one left-to-right pass that also recognises
    string and character literals. A ``//`` or ``/*`` that sits inside a
    literal (for example ``"http://example.com"``) is therefore kept, and a
    ``//`` inside a block comment can no longer swallow the closing ``*/``.

    Args:
        code: C++ source text.

    Returns:
        The text with the constructs above deleted. Remaining whitespace and
        line breaks are left exactly as they were.
    """
    comment_or_literal = (
        r'("(?:\\.|[^"\\\n])*"'      # string literal (kept)
        r"|'(?:\\.|[^'\\\n])*')"     # character literal (kept)
        r'|//[^\n]*'                 # line comment (dropped)
        r'|/\*[\s\S]*?\*/'           # block comment (dropped)
    )

    def _keep_literal(match):
        # Group 1 only participates when a literal matched; for comments it
        # is None and the whole match is replaced by nothing.
        return match.group(1) or ''

    code = re.sub(comment_or_literal, _keep_literal, code)
    code = re.sub(r'#include\s*<.*>', '', code)
    code = re.sub(r'#include\s*".*"', '', code)
    code = re.sub(r'using\s+namespace\s+std;', '', code)
    # Statement terminators carry no signal for the similarity comparison.
    code = re.sub(r';', '', code)
    return code

def normalize_whitespace(code):
    """Trim *code* and collapse every run of whitespace into one space.

    Line breaks count as whitespace, so the result is always a single line.

    Args:
        code: Text to normalise.

    Returns:
        The text without leading or trailing whitespace and with each
        internal whitespace run replaced by a single space.
    """
    # No separate newline-squeezing pass is needed: ``\s+`` already matches
    # newlines, so none can survive this substitution.
    return re.sub(r'\s+', ' ', code.strip())


In [2]:
def preprocess_code(file_path):
    """Read one source file and return its cleaned, whitespace-normalised text.

    The file is decoded as UTF-8, passed through
    ``remove_comments_and_headers`` and then ``normalize_whitespace``.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    return normalize_whitespace(remove_comments_and_headers(raw_text))

In [3]:
def preprocess_files(directory):
    """Preprocess every ``.cpp`` file directly inside *directory*.

    Args:
        directory: Path of the folder to scan (not searched recursively).

    Returns:
        A dict mapping each ``.cpp`` file name to its preprocessed text,
        with keys inserted in sorted file-name order.
    """
    preprocessed_files = {}
    # os.listdir gives names in arbitrary order; sorting keeps the dict, and
    # everything printed from it, in the same order on every run.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.cpp'):
            file_path = os.path.join(directory, filename)
            preprocessed_files[filename] = preprocess_code(file_path)
    return preprocessed_files

In [4]:
# Folder scanned for .cpp submissions.
directory = '/content/data/tests'
# Maps each .cpp file name to its preprocessed source text.
preprocessed_files = preprocess_files(directory)

In [5]:
# Show the cleaned text of every file between a header and a footer rule.
for filename, preprocessed_code in preprocessed_files.items():
    print(
        f"--- Preprocessed code for {filename} ---",
        preprocessed_code,
        "------------------------------------------",
        sep="\n",
    )

--- Preprocessed code for test4.cpp ---
int solve(vector<int> ar, int n, int i, vector<int> dp) { if(i == n) return 1 if(dp[i] != -1) return dp[i] int include = 0, exclude = 0 if(ar[i] > ar[i-1]) include = 1 + solve(ar, n, i+1, dp) exclude = 0 + solve(ar, n, i+1, dp) return dp[i] = max(include, exclude) } int main() { vector<int> ar1 = {10,20,30,35,40,45} int n1 = ar1.size() vector<int> dp1(n1, -1) cout << "Test case 1 : " << solve(ar1, n1, 1, dp1) vector<int> ar2 = {5,4,3,2,1} int n2 = ar2.size() vector<int> dp2(n2, -1) cout << "\nTest case 2 : " << solve(ar2, n2, 1, dp2) return 0 }
------------------------------------------
--- Preprocessed code for test6.cpp ---
int longestInc(vector<int>& arr,int curr,int prev,vector<vector<int>>dp) { if(curr==arr.size()) return 0 if(dp[curr][prev+1]!=-1) return dp[curr][prev+1] int exclude = 0 + longest_inc(arr, curr+1, prev, dp) int include=0 if(prev==-1||arr[curr]>arr[prev]){ include=1+longest_inc(arr,curr+1,curr,dp) } dp[curr][prev+1] = max(inc

In [6]:
def tokenize_code(code):
    """Return the distinct whitespace-separated pieces of *code* as a set."""
    return {*code.split()}


In [7]:
def calculate_jaccard_similarity(tokens1, tokens2):
    """Return the Jaccard index |A & B| / |A | B| of two token sets.

    When both sets are empty the union is empty as well and 0.0 is returned
    rather than dividing by zero.
    """
    shared = tokens1.intersection(tokens2)
    combined = tokens1.union(tokens2)
    if not combined:
        return 0.0
    return len(shared) / len(combined)

In [8]:
# Tokenise every file once up front; the pair loop below only reuses the sets
# instead of re-splitting the same text for each pair it takes part in.
token_sets = {
    filename: tokenize_code(code)
    for filename, code in preprocessed_files.items()
}
for filename1, tokens1 in token_sets.items():
    for filename2, tokens2 in token_sets.items():
        # Strict "<" visits each unordered pair once and skips self-comparison.
        if filename1 < filename2:
            jaccard_similarity = calculate_jaccard_similarity(tokens1, tokens2)
            print(f"Jaccard similarity between {filename1} and {filename2}: {jaccard_similarity}")


Jaccard similarity between test4.cpp and test6.cpp: 0.14942528735632185
Jaccard similarity between test4.cpp and test5.cpp: 0.24742268041237114
Jaccard similarity between test5.cpp and test6.cpp: 0.17582417582417584
Jaccard similarity between test3.cpp and test4.cpp: 0.11627906976744186
Jaccard similarity between test3.cpp and test6.cpp: 0.09243697478991597
Jaccard similarity between test3.cpp and test5.cpp: 0.11851851851851852
Jaccard similarity between test1.cpp and test4.cpp: 0.13043478260869565
Jaccard similarity between test1.cpp and test6.cpp: 0.5254237288135594
Jaccard similarity between test1.cpp and test5.cpp: 0.1326530612244898
Jaccard similarity between test1.cpp and test3.cpp: 0.10743801652892562
Jaccard similarity between test1.cpp and test2.cpp: 1.0
Jaccard similarity between test2.cpp and test4.cpp: 0.13043478260869565
Jaccard similarity between test2.cpp and test6.cpp: 0.5254237288135594
Jaccard similarity between test2.cpp and test5.cpp: 0.1326530612244898
Jaccard simi

In [9]:
# Similarity cut-off: pairs scoring strictly above this value are reported.
threshold = 0.45

In [10]:
def identify_plagiarized_code(preprocessed_files, threshold):
    """Find file pairs whose token-set Jaccard similarity exceeds *threshold*.

    Args:
        preprocessed_files: Dict mapping file name to preprocessed source text.
        threshold: Similarity a pair must strictly exceed to be reported.

    Returns:
        A list of ``(filename1, filename2, similarity)`` tuples with
        ``filename1 < filename2``, in the iteration order of
        *preprocessed_files*.
    """
    # Tokenise each file once rather than once for every pair it belongs to.
    token_sets = {
        filename: tokenize_code(code)
        for filename, code in preprocessed_files.items()
    }
    plagiarized_pairs = []
    for filename1, tokens1 in token_sets.items():
        for filename2, tokens2 in token_sets.items():
            # Strict "<" visits each unordered pair once and skips
            # comparing a file with itself.
            if filename1 < filename2:
                jaccard_similarity = calculate_jaccard_similarity(tokens1, tokens2)
                if jaccard_similarity > threshold:
                    plagiarized_pairs.append((filename1, filename2, jaccard_similarity))
    return plagiarized_pairs

In [11]:
# Collect every file pair whose similarity is above the chosen threshold.
plagiarized_pairs = identify_plagiarized_code(preprocessed_files, threshold)

In [12]:
# Report each flagged pair together with its similarity score.
for filename1, filename2, similarity in plagiarized_pairs:
    print(f"Plagiarized code pair: {filename1} and {filename2}, Similarity: {similarity}")

Plagiarized code pair: test1.cpp and test6.cpp, Similarity: 0.5254237288135594
Plagiarized code pair: test1.cpp and test2.cpp, Similarity: 1.0
Plagiarized code pair: test2.cpp and test6.cpp, Similarity: 0.5254237288135594
