In [2]:
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from imblearn.over_sampling import SMOTE

In [39]:
#labels (0: not plagiarized, 1: plagiarized)

file_pairs = [
    ('/content/data/file1.cpp', '/content/data/file2.cpp', 0),
    ('/content/data/file3.cpp', '/content/data/file4.cpp', 0),
    ('/content/data/file5.cpp', '/content/data/file6.cpp', 1),
    ('/content/data/file7.cpp', '/content/data/file8.cpp', 1),
    ('/content/data/file9.cpp', '/content/data/file10.cpp', 1),
    ('/content/data/file11.cpp', '/content/data/file12.cpp', 1),
    ('/content/data/file13.cpp', '/content/data/file14.cpp', 0),
    ('/content/data/file15.cpp', '/content/data/file16.cpp', 0),
    ('/content/data/file17.cpp', '/content/data/file18.cpp', 0),
    ('/content/data/file19.cpp', '/content/data/file20.cpp', 0),
    ('/content/data/file21.cpp', '/content/data/file22.cpp', 0),
    ('/content/data/file23.cpp', '/content/data/file24.cpp', 0),
    ('/content/data/file25.cpp', '/content/data/file26.cpp', 0),
    ('/content/data/file27.cpp', '/content/data/file28.cpp', 1),
    ('/content/data/file29.cpp', '/content/data/file30.cpp', 1),
    ('/content/data/file31.cpp', '/content/data/file32.cpp', 0),
    ('/content/data/file33.cpp', '/content/data/file34.cpp', 0),
    ('/content/data/file35.cpp', '/content/data/file36.cpp', 0),
    ('/content/data/file37.cpp', '/content/data/file38.cpp', 1),
    ('/content/data/file39.cpp', '/content/data/file40.cpp', 0),
    ('/content/data/file1.cpp', '/content/data/file6.cpp', 0),
    ('/content/data/file5.cpp', '/content/data/file8.cpp', 0),
    ('/content/data/file9.cpp', '/content/data/file5.cpp', 0),

]

In [40]:
def load_and_preprocess(file1, file2):
    with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
        code1 = f1.read()
        code2 = f2.read()
    code1 = preprocess_code(code1)
    code2 = preprocess_code(code2)
    return code1, code2

def preprocess_code(code):
    code = re.sub(r'//.*', '', code)
    code = re.sub(r'/\*[\s\S]*?\*/', '', code)
    code = re.sub(r'#include\s*<.*>', '', code)
    code = re.sub(r'#include\s*".*"', '', code)
    code = re.sub(r'using\s+namespace\s+std;', '', code)
    code = re.sub(r';', '', code)
    code = code.strip()
    code = re.sub(r'\s+', ' ', code)
    code = re.sub(r'\n+', '\n', code)
    return code

In [41]:
def augment_code_sample(code_sample):
    lines = code_sample.split(' ')
    random.shuffle(lines)
    return ' '.join(lines)

In [42]:
code_samples = []
labels = []

for file1, file2, label in file_pairs:
    code1, code2 = load_and_preprocess(file1, file2)
    code_samples.append(code1)
    code_samples.append(code2)
    labels.extend([label, label])

    for _ in range(40):
        augmented_code1 = augment_code_sample(code1)
        augmented_code2 = augment_code_sample(code2)
        code_samples.append(augmented_code1)
        code_samples.append(augmented_code2)
        labels.extend([label, label])


In [43]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(code_samples)

In [44]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, labels)

In [45]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [46]:
clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_resampled, y_resampled)

print(f"Best parameters: {grid_search.best_params_}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


In [47]:
cv_scores = cross_val_score(grid_search.best_estimator_, X_resampled, y_resampled, cv=3)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

Cross-validation scores: [0.85942857 0.95314286 0.67162471]
Mean cross-validation score: 0.828065380843413


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)

best_clf = grid_search.best_estimator_
best_clf.fit(X_train, y_train)

y_pred = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.83      0.91       315
           1       0.87      1.00      0.93       341

    accuracy                           0.92       656
   macro avg       0.93      0.92      0.92       656
weighted avg       0.93      0.92      0.92       656



In [54]:
import os

In [55]:
def predict_plagiarism(new_files_dir, model, vectorizer):
    files = [os.path.join(new_files_dir, f) for f in os.listdir(new_files_dir) if f.endswith('.cpp')]
    file_pairs = [(files[i], files[j]) for i in range(len(files)) for j in range(i + 1, len(files))]

    predictions = []
    plagiarism_counts = {}

    for file1, file2 in file_pairs:
        code1, code2 = load_and_preprocess(file1, file2)
        code_combined = code1 + " " + code2
        code_transformed = vectorizer.transform([code_combined])
        prediction = model.predict(code_transformed)[0]
        predictions.append((file1, file2, prediction))

        if prediction == 1:
            if file1 not in plagiarism_counts:
                plagiarism_counts[file1] = 0
            if file2 not in plagiarism_counts:
                plagiarism_counts[file2] = 0
            plagiarism_counts[file1] += 1
            plagiarism_counts[file2] += 1

    return predictions, plagiarism_counts


What the test data looks like:

test1.cpp - original submission from a student

test2.cpp - exact copy of test1

test3.cpp - original submission from a student

test4.cpp - original submission from a student

test5.cpp - original submission from a student

test6.cpp - changed muliple variable and function names from test1



In [66]:
new_files_dir = '/content/data/tests'
predictions, plagiarism_counts = predict_plagiarism(new_files_dir, best_clf, vectorizer)

print("Plagiarized file pairs:")
for file1, file2, pred in predictions:
    if pred == 1:
        print(f"Files: {os.path.basename(file1)} and {os.path.basename(file2)} -> Plagiarized: {pred}")


Plagiarized file pairs:
Files: test6.cpp and test1.cpp -> Plagiarized: 1
Files: test6.cpp and test2.cpp -> Plagiarized: 1
Files: test1.cpp and test2.cpp -> Plagiarized: 1


In [64]:
num_files = len(os.listdir(new_files_dir))
threshold = num_files // 3

plagiarized_files = [file for file, count in plagiarism_counts.items() if count > threshold]

print("Files consistently marked as plagiarized:")
for file in plagiarized_files:
    print(os.path.basename(file))

Files consistently marked as plagiarized:
