In [1]:
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from imblearn.under_sampling import RandomUnderSampler

#labels (0: not plagiarized, 1: plagiarized)
file_pairs = [
    ('/content/data/tests/test1.cpp', '/content/data/tests/test6.cpp', 1),
    ('/content/data/tests/test1.cpp', '/content/data/tests/test2.cpp', 1),
    ('/content/data/tests/test2.cpp', '/content/data/tests/test4.cpp', 0),
    ('/content/data/tests/test5.cpp', '/content/data/tests/test1.cpp', 0),
    ('/content/data/tests/test3.cpp', '/content/data/tests/test6.cpp', 0),

]

def load_and_preprocess(file1, file2):
    with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
        code1 = f1.read()
        code2 = f2.read()
    code1 = preprocess_code(code1)
    code2 = preprocess_code(code2)
    return code1, code2

def preprocess_code(code):
    code = re.sub(r'//.*', '', code)
    code = re.sub(r'/\*[\s\S]*?\*/', '', code)
    code = re.sub(r'#include\s*<.*>', '', code)
    code = re.sub(r'#include\s*".*"', '', code)
    code = re.sub(r'using\s+namespace\s+std;', '', code)
    code = re.sub(r';', '', code)
    code = code.strip()
    code = re.sub(r'\s+', ' ', code)
    code = re.sub(r'\n+', '\n', code)
    return code

def augment_code_sample(code_sample):
    lines = code_sample.split(' ')
    random.shuffle(lines)
    return ' '.join(lines)

code_samples = []
labels = []

for file1, file2, label in file_pairs:
    code1, code2 = load_and_preprocess(file1, file2)
    code_samples.append(code1)
    code_samples.append(code2)
    labels.extend([label, label])

    for _ in range(100):
        augmented_code1 = augment_code_sample(code1)
        augmented_code2 = augment_code_sample(code2)
        code_samples.append(augmented_code1)
        code_samples.append(augmented_code2)
        labels.extend([label, label])

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(code_samples)

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, labels)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_resampled, y_resampled)

print(f"Best parameters: {grid_search.best_params_}")

cv_scores = cross_val_score(grid_search.best_estimator_, X_resampled, y_resampled, cv=3)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)

best_clf = grid_search.best_estimator_
best_clf.fit(X_train, y_train)

y_pred = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Cross-validation scores: [0.56666667 0.75836431 0.73605948]
Mean cross-validation score: 0.6870301528294093
              precision    recall  f1-score   support

           0       1.00      0.50      0.66       105
           1       0.65      1.00      0.79        97

    accuracy                           0.74       202
   macro avg       0.82      0.75      0.72       202
weighted avg       0.83      0.74      0.72       202

