<a href="https://colab.research.google.com/github/Shubh220904/plaglyzer/blob/main/Code_plag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

from transformers import RobertaTokenizer, RobertaModel
import torch


# --- Load the two clone-pair datasets and stack them into one frame ---------
df1 = pd.read_csv("Type-1 and Type-2 for EclipseAnt from GCCD.csv")
df2 = pd.read_csv("Type-3 and Type-4 for EclipseAnt from GCCD.csv")

# Every df1 row gets a constant 0.0 in "Euclidean Distance" before the frames
# are concatenated.
# NOTE(review): presumably the Type-1/Type-2 CSV ships without this column and
# df2 already has it - confirm. A constant here also means the feature alone
# identifies which file a row came from.
df1["Euclidean Distance"] = 0.0
df = pd.concat([df1, df2], ignore_index=True)

# Normalise the text columns to plain strings. A bare astype(str) would turn a
# missing cell (NaN) into the literal text "nan", which would then be embedded
# as if it were source code; missing cells are mapped to "" instead.
TEXT_COLUMNS = ["Header 1", "Header 2", "Body 1", "Body 2"]
for col in TEXT_COLUMNS:
    df[col] = df[col].astype(str).where(df[col].notna(), "")

# --- Load CodeBERT and move it to the best available device ------------------
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.eval()  # evaluation mode: the model is only used for inference here

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_embedding(text):
    """Embed ``text`` with the module-level CodeBERT ``model``.

    The text is tokenized (truncated to at most 128 tokens), run through the
    model without gradient tracking, and the last-layer hidden state at
    sequence position 0 is returned.

    Parameters
    ----------
    text : str
        The snippet to embed.

    Returns
    -------
    numpy.ndarray
        The position-0 hidden state, moved to CPU and flattened to 1-D.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # The tensors must live on the same device as the model.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state

    first_token_state = hidden_states[:, 0, :]
    return first_token_state.cpu().numpy().flatten()

def compute_cosine_sim(row):
    """Compute header and body cosine similarities for one clone pair.

    Parameters
    ----------
    row : mapping
        Must provide the string fields 'Header 1', 'Header 2', 'Body 1' and
        'Body 2' (e.g. a DataFrame row passed by ``apply(axis=1)``).

    Returns
    -------
    pandas.Series
        Two values, in order: header similarity, body similarity.
    """
    column_pairs = (('Header 1', 'Header 2'), ('Body 1', 'Body 2'))

    scores = []
    for left_col, right_col in column_pairs:
        left_vec = get_embedding(row[left_col])
        right_vec = get_embedding(row[right_col])
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
        scores.append(cosine_similarity([left_vec], [right_vec])[0][0])

    return pd.Series(scores)

# --- Feature extraction: one header and one body similarity per pair --------
tqdm.pandas(desc="Computing CodeBERT similarities")
df[['Header Similarity', 'Body Similarity']] = df.progress_apply(compute_cosine_sim, axis=1)

X = df[['Header Similarity', 'Body Similarity', 'Euclidean Distance']]
y = df['Type']

# --- Split first, then scale -------------------------------------------------
# The scaler must be fitted on the training rows only; fitting it on the full
# matrix before splitting lets test-set statistics leak into preprocessing.
# The same random_state and row count yield the same row partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later cell still expecting the fully scaled matrix works.
X_scaled = scaler.transform(X)

# --- Hyper-parameter search over a small random-forest grid -----------------
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# --- Evaluate the best estimator on the held-out split ----------------------
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Best Hyperparameters:", grid_search.best_params_)
print("Test Accuracy:", accuracy)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]


Computing CodeBERT similarities:   0%|          | 0/2843 [00:00<?, ?it/s][A
Computing CodeBERT similarities:   0%|          | 2/2843 [00:01<31:03,  1.52it/s][A
Computing CodeBERT similarities:   0%|          | 4/2843 [00:01<15:09,  3.12it/s][A
Computing CodeBERT similarities:   0%|          | 5/2843 [00:01<12:07,  3.90it/s][A
Computing CodeBERT similarities:   0%|          | 7/2843 [00:01<08:32,  5.54it/s][A
Computing CodeBERT similarities:   0%|          | 10/2843 [00:01<05:13,  9.03it/s][A
Computing CodeBERT similarities:   0%|          | 13/2843 [00:01<03:43, 12.68it/s][A
Computing CodeBERT similarities:   1%|          | 16/2843 [00:02<02:57, 15.95it/s][A
Computing CodeBERT similarities:   1%|          | 19/2843 [00:02<02:30, 18.77it/s][A
Computing CodeBERT similarities:   1%|          | 22/2843 [00:02<02:16, 20.66it/s][A
Computing CodeBERT similarities:   1%|          | 25/2843 [00:02<02:06, 22.29it/s][A
Computing CodeBERT similarities:   1%|          | 28/2843 [00:02<0

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test Accuracy: 0.9332161687170475


In [None]:
def predict_plagiarism(header1, body1, header2, body2):
    """Classify one pair of code snippets with the trained model and print a report.

    The pair is embedded with ``get_embedding``, turned into the three features
    used for training (header similarity, body similarity, Euclidean distance),
    scaled with the module-level ``scaler`` and passed to ``best_model``.

    Parameters
    ----------
    header1, body1 : object
        Signature and body of the first snippet (converted with ``str``).
    header2, body2 : object
        Signature and body of the second snippet (converted with ``str``).

    Returns
    -------
    The label predicted by ``best_model`` for the pair. Label ``1`` is
    reported as "Plagiarized"; any other label as "Not Plagiarized".
    """
    header1, body1, header2, body2 = map(str, [header1, body1, header2, body2])

    emb_h1 = get_embedding(header1)
    emb_h2 = get_embedding(header2)
    emb_b1 = get_embedding(body1)
    emb_b2 = get_embedding(body2)

    header_sim = cosine_similarity([emb_h1], [emb_h2])[0][0]
    body_sim = cosine_similarity([emb_b1], [emb_b2])[0][0]
    # NOTE(review): during training the "Euclidean Distance" column is taken
    # from the dataset (and is a constant 0.0 for the df1 rows), whereas here it
    # is the distance between the two body embeddings - confirm that both are
    # the same quantity, otherwise this feature is out of distribution.
    euclidean_dist = np.linalg.norm(emb_b1 - emb_b2)

    # Use the training column names so the scaler, which was fitted on a named
    # DataFrame, does not warn about missing feature names.
    features = pd.DataFrame(
        [[header_sim, body_sim, euclidean_dist]],
        columns=['Header Similarity', 'Body Similarity', 'Euclidean Distance'],
    )
    features_scaled = scaler.transform(features)

    prediction = best_model.predict(features_scaled)[0]
    prob = best_model.predict_proba(features_scaled)[0]

    # predict_proba columns follow best_model.classes_, so look the label up
    # instead of assuming that column 1 is label 1. "Not plagiarized" is every
    # label other than 1, matching the verdict printed below.
    class_labels = list(best_model.classes_)
    plagiarized_prob = float(prob[class_labels.index(1)]) if 1 in class_labels else 0.0
    not_plagiarized_prob = 1.0 - plagiarized_prob

    print(f"\nPrediction: {'Plagiarized' if prediction == 1 else 'Not Plagiarized'}")
    print(f"Confidence: Plagiarized = {plagiarized_prob:.4f}, Not Plagiarized = {not_plagiarized_prob:.4f}")
    print(f"Features -> Header Sim: {header_sim:.4f}, Body Sim: {body_sim:.4f}, Euclidean Dist: {euclidean_dist:.4f}")
    return prediction

# Smoke test: compare a bubble-sort method against an exact copy of itself.
demo_header = "public void sortArray(int[] arr)"
demo_body = "for(int i=0;i<arr.length;i++){ for(int j=0;j<arr.length-1;j++){ if(arr[j]>arr[j+1]){ int temp=arr[j]; arr[j]=arr[j+1]; arr[j+1]=temp; }}}"

predict_plagiarism(header1=demo_header, body1=demo_body, header2=demo_header, body2=demo_body)



🔎 Prediction: Plagiarized
📊 Confidence: Plagiarized = 1.0000, Not Plagiarized = 0.0000
📌 Features -> Header Sim: 1.0000, Body Sim: 1.0000, Euclidean Dist: 0.0000




1