What the test data looks like:

test1.cpp - original submission from a student

test2.cpp - exact copy of test1

test3.cpp - original submission from a student

test4.cpp - original submission from a student

test5.cpp - original submission from a student

test6.cpp - changed muliple variable and function names from test1


In [1]:
import re
import os

def remove_comments_and_headers(code):
    code = re.sub(r'//.*', '', code)
    code = re.sub(r'/\*[\s\S]*?\*/', '', code)
    code = re.sub(r'#include\s*<.*>', '', code)
    code = re.sub(r'#include\s*".*"', '', code)
    code = re.sub(r'using\s+namespace\s+std;', '', code)
    code = re.sub(r';', '', code)
    return code

def normalize_whitespace(code):
    code = code.strip()
    code = re.sub(r'\s+', ' ', code)
    code = re.sub(r'\n+', '\n', code)
    return code


In [2]:
def preprocess_code(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        code = file.read()
    code = remove_comments_and_headers(code)
    code = normalize_whitespace(code)
    return code

In [3]:
def preprocess_files(directory):
    preprocessed_files = {}
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        preprocessed_files[filename] = preprocess_code(file_path)
    return preprocessed_files

In [4]:
directory = '/content/data/tests'
preprocessed_files = preprocess_files(directory)

In [5]:
for filename, preprocessed_code in preprocessed_files.items():
    print(f"--- Preprocessed code for {filename} ---")
    print(preprocessed_code)
    print("------------------------------------------")

--- Preprocessed code for test4.cpp ---
int solve(vector<int> ar, int n, int i, vector<int> dp) { if(i == n) return 1 if(dp[i] != -1) return dp[i] int include = 0, exclude = 0 if(ar[i] > ar[i-1]) include = 1 + solve(ar, n, i+1, dp) exclude = 0 + solve(ar, n, i+1, dp) return dp[i] = max(include, exclude) } int main() { vector<int> ar1 = {10,20,30,35,40,45} int n1 = ar1.size() vector<int> dp1(n1, -1) cout << "Test case 1 : " << solve(ar1, n1, 1, dp1) vector<int> ar2 = {5,4,3,2,1} int n2 = ar2.size() vector<int> dp2(n2, -1) cout << "\nTest case 2 : " << solve(ar2, n2, 1, dp2) return 0 }
------------------------------------------
--- Preprocessed code for test6.cpp ---
int longestInc(vector<int>& arr,int curr,int prev,vector<vector<int>>dp) { if(curr==arr.size()) return 0 if(dp[curr][prev+1]!=-1) return dp[curr][prev+1] int exclude = 0 + longest_inc(arr, curr+1, prev, dp) int include=0 if(prev==-1||arr[curr]>arr[prev]){ include=1+longest_inc(arr,curr+1,curr,dp) } dp[curr][prev+1] = max(inc

In [6]:
from gensim.models import Word2Vec

In [7]:
all_tokens = [token.split() for code in preprocessed_files.values() for token in code.split()]

word2vec_model = Word2Vec(sentences=all_tokens, vector_size=100, window=5, min_count=1, workers=4)

word2vec_model.save("word2vec_model.bin")

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
def generate_embeddings(model, code):
    embeddings = []
    for token in code.split():
        if token in model.wv:
            embeddings.append(model.wv[token])
    return embeddings

In [10]:
def calculate_similarity(embeddings1, embeddings2):
    if not embeddings1 or not embeddings2:
        return 0.0
    similarity_matrix = cosine_similarity(embeddings1, embeddings2) * 10
    return similarity_matrix.mean()

In [16]:
for filename1, code1 in preprocessed_files.items():
    for filename2, code2 in preprocessed_files.items():
        if filename1 < filename2:
            embeddings1 = generate_embeddings(word2vec_model, code1)
            embeddings2 = generate_embeddings(word2vec_model, code2)
            similarity = calculate_similarity(embeddings1, embeddings2)
            print(f"Similarity between {filename1} and {filename2}: {similarity}")


Similarity between test4.cpp and test6.cpp: 0.17415085434913635
Similarity between test4.cpp and test5.cpp: 0.22890661656856537
Similarity between test5.cpp and test6.cpp: 0.17923347651958466
Similarity between test3.cpp and test4.cpp: 0.19106075167655945
Similarity between test3.cpp and test6.cpp: 0.13860298693180084
Similarity between test3.cpp and test5.cpp: 0.18163923919200897
Similarity between test1.cpp and test4.cpp: 0.1795368492603302
Similarity between test1.cpp and test6.cpp: 0.2615128755569458
Similarity between test1.cpp and test5.cpp: 0.1804451197385788
Similarity between test1.cpp and test3.cpp: 0.14582641422748566
Similarity between test1.cpp and test2.cpp: 0.2974817156791687
Similarity between test2.cpp and test4.cpp: 0.1795368492603302
Similarity between test2.cpp and test6.cpp: 0.2615128755569458
Similarity between test2.cpp and test5.cpp: 0.1804451197385788
Similarity between test2.cpp and test3.cpp: 0.14582641422748566


In [17]:
threshold = 0.25

In [21]:
def identify_plagiarized_code(preprocessed_files, word2vec_model, threshold):
    plagiarized_pairs = []
    for filename1, code1 in preprocessed_files.items():
        for filename2, code2 in preprocessed_files.items():
            if filename1 < filename2:
                embeddings1 = generate_embeddings(word2vec_model, code1)
                embeddings2 = generate_embeddings(word2vec_model, code2)
                similarity = calculate_similarity(embeddings1, embeddings2)
                if similarity > threshold:
                    plagiarized_pairs.append((filename1, filename2, similarity))
    return plagiarized_pairs


In [22]:
plagiarized_pairs = identify_plagiarized_code(preprocessed_files, word2vec_model, threshold)

In [23]:
for pair in plagiarized_pairs:
    filename1, filename2, similarity = pair
    print(f"Plagiarized code pair: {filename1} and {filename2}, Similarity: {similarity}")


Plagiarized code pair: test1.cpp and test6.cpp, Similarity: 0.2615128755569458
Plagiarized code pair: test1.cpp and test2.cpp, Similarity: 0.2974817156791687
Plagiarized code pair: test2.cpp and test6.cpp, Similarity: 0.2615128755569458


In [24]:
import re
import os

def remove_comments_and_headers(code):
    code = re.sub(r'//.*', '', code)  # Single-line comments
    code = re.sub(r'/\*[\s\S]*?\*/', '', code)  # Multi-line comments
    code = re.sub(r'#include\s*<.*>', '', code)  # Remove system headers
    code = re.sub(r'#include\s*".*"', '', code)  # Remove local headers
    code = re.sub(r'using\s+namespace\s+std\s*;', '', code)  # Remove using namespace std
    return code

def normalize_whitespace(code):
    code = re.sub(r'\s+', ' ', code)  # Normalize all whitespace to a single space
    code = re.sub(r'\n+', '\n', code)  # Normalize multiple newlines to a single newline
    return code.strip()

def preprocess_code(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            code = file.read()
        code = remove_comments_and_headers(code)
        code = normalize_whitespace(code)
        return code
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return ""


In [25]:
def preprocess_files(directory):
    preprocessed_files = {}
    for filename in os.listdir(directory):
        if filename.endswith(".cpp") or filename.endswith(".h"):
            file_path = os.path.join(directory, filename)
            preprocessed_files[filename] = preprocess_code(file_path)
    return preprocessed_files


In [26]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def generate_embeddings(model, code):
    embeddings = [model.wv[token] for token in code.split() if token in model.wv]
    return np.array(embeddings)

def calculate_similarity(embeddings1, embeddings2):
    if embeddings1.size == 0 or embeddings2.size == 0:
        return 0.0
    similarity_matrix = cosine_similarity(embeddings1, embeddings2)
    return similarity_matrix.mean()


In [27]:
def identify_plagiarized_code(preprocessed_files, word2vec_model, threshold):
    plagiarized_pairs = []
    files = list(preprocessed_files.items())
    for i in range(len(files)):
        filename1, code1 = files[i]
        embeddings1 = generate_embeddings(word2vec_model, code1)
        for j in range(i + 1, len(files)):
            filename2, code2 = files[j]
            embeddings2 = generate_embeddings(word2vec_model, code2)
            similarity = calculate_similarity(embeddings1, embeddings2)
            if similarity > threshold:
                plagiarized_pairs.append((filename1, filename2, similarity))
    return plagiarized_pairs


In [28]:
directory = '/content/data/tests'
threshold = 0
model_path = "word2vec_model.bin"
preprocessed_files = preprocess_files(directory)

all_tokens = [token.split() for code in preprocessed_files.values() for token in code.split()]
word2vec_model = Word2Vec(sentences=all_tokens, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.save(model_path)

plagiarized_pairs = identify_plagiarized_code(preprocessed_files, word2vec_model, threshold)
for filename1, filename2, similarity in plagiarized_pairs:
    print(f"Plagiarized code pair: {filename1} and {filename2}, Similarity: {similarity}")



Plagiarized code pair: test4.cpp and test6.cpp, Similarity: 0.018834834918379784
Plagiarized code pair: test4.cpp and test5.cpp, Similarity: 0.022038480266928673
Plagiarized code pair: test4.cpp and test3.cpp, Similarity: 0.018189571797847748
Plagiarized code pair: test4.cpp and test1.cpp, Similarity: 0.014911038801074028
Plagiarized code pair: test4.cpp and test2.cpp, Similarity: 0.014911038801074028
Plagiarized code pair: test6.cpp and test5.cpp, Similarity: 0.017952190712094307
Plagiarized code pair: test6.cpp and test3.cpp, Similarity: 0.016722209751605988
Plagiarized code pair: test6.cpp and test1.cpp, Similarity: 0.022227376699447632
Plagiarized code pair: test6.cpp and test2.cpp, Similarity: 0.022227376699447632
Plagiarized code pair: test5.cpp and test3.cpp, Similarity: 0.018162373453378677
Plagiarized code pair: test5.cpp and test1.cpp, Similarity: 0.015161887742578983
Plagiarized code pair: test5.cpp and test2.cpp, Similarity: 0.015161887742578983
Plagiarized code pair: test3

In [30]:
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def remove_comments_and_headers(code):
    code = re.sub(r'//.*', '', code)
    code = re.sub(r'/\*[\s\S]*?\*/', '', code)
    code = re.sub(r'#include\s*<.*>', '', code)
    code = re.sub(r'#include\s*".*"', '', code)
    code = re.sub(r'using\s+namespace\s+std;', '', code)
    return code

def normalize_whitespace(code):
    code = code.strip()
    code = re.sub(r'\s+', ' ', code)
    code = re.sub(r'\n+', '\n', code)
    return code

def preprocess_code(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        code = file.read()
    code = remove_comments_and_headers(code)
    code = normalize_whitespace(code)
    return code

def preprocess_files(directory):
    preprocessed_files = {}
    for filename in os.listdir(directory):
        if filename.endswith('.cpp'):
            file_path = os.path.join(directory, filename)
            preprocessed_files[filename] = preprocess_code(file_path)
    return preprocessed_files

directory = '/content/data/tests'
preprocessed_files = preprocess_files(directory)

# Vectorize the code using TF-IDF
def vectorize_code(preprocessed_files):
    vectorizer = TfidfVectorizer(binary=True)
    filenames = list(preprocessed_files.keys())
    codes = list(preprocessed_files.values())
    tfidf_matrix = vectorizer.fit_transform(codes)
    return filenames, tfidf_matrix

filenames, tfidf_matrix = vectorize_code(preprocessed_files)

# Binarize the TF-IDF matrix
binary_matrix = (tfidf_matrix > 0).astype(int)

# Calculate Jaccard similarity between the binary vectors
def calculate_jaccard_similarity(binary_matrix):
    similarities = np.zeros((binary_matrix.shape[0], binary_matrix.shape[0]))
    for i in range(binary_matrix.shape[0]):
        for j in range(i + 1, binary_matrix.shape[0]):
            intersection = np.sum(np.logical_and(binary_matrix[i].toarray(), binary_matrix[j].toarray()))
            union = np.sum(np.logical_or(binary_matrix[i].toarray(), binary_matrix[j].toarray()))
            jaccard_similarity = intersection / union if union != 0 else 0.0
            similarities[i, j] = jaccard_similarity
            similarities[j, i] = jaccard_similarity
    return similarities

similarity_matrix = calculate_jaccard_similarity(binary_matrix)

threshold = 0.5
def identify_plagiarized_code(filenames, similarity_matrix, threshold):
    plagiarized_pairs = []
    for i in range(len(filenames)):
        for j in range(i + 1, len(filenames)):
            if similarity_matrix[i, j] > threshold:
                plagiarized_pairs.append((filenames[i], filenames[j], similarity_matrix[i, j]))
    return plagiarized_pairs

plagiarized_pairs = identify_plagiarized_code(filenames, similarity_matrix, threshold)
for pair in plagiarized_pairs:
    filename1, filename2, similarity = pair
    print(f"Plagiarized code pair: {filename1} and {filename2}, Similarity: {similarity}")


Plagiarized code pair: test6.cpp and test5.cpp, Similarity: 0.6285714285714286
Plagiarized code pair: test6.cpp and test1.cpp, Similarity: 0.6944444444444444
Plagiarized code pair: test6.cpp and test2.cpp, Similarity: 0.6944444444444444
Plagiarized code pair: test5.cpp and test1.cpp, Similarity: 0.5263157894736842
Plagiarized code pair: test5.cpp and test2.cpp, Similarity: 0.5263157894736842
Plagiarized code pair: test1.cpp and test2.cpp, Similarity: 1.0
