In [1]:
!pip install torch transformers scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [16]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [17]:
data = pd.read_csv("train_snli.txt", sep="\t", header=None, names=["sentence1", "sentence2", "label"])
data

Unnamed: 0,sentence1,sentence2,label
0,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
1,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
2,Children smiling and waving at camera,There are children present,1
3,Children smiling and waving at camera,The kids are frowning,0
4,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0
...,...,...,...
367368,A dog with a blue collar plays ball outside.,a dog is outside,1
367369,Four dirty and barefooted children.,four children have dirty feet.,1
367370,Four dirty and barefooted children.,four kids won awards for 'cleanest feet',0
367371,A man is surfing in a bodysuit in beautiful bl...,A man in a business suit is heading to a board...,0


In [18]:
print(data.isnull().sum())
data.dropna(inplace=True)

sentence1    0
sentence2    4
label        0
dtype: int64


In [19]:
# 2. Train-test split
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# 3. Convert train data to InputExamples
train_examples = [
    InputExample(texts=[row['sentence1'], row['sentence2']], label=float(row['label']))
    for _, row in train_df.iterrows()
]

In [23]:
# 4. Load SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [21]:
# 5. DataLoader and Loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model=model)

In [22]:
# 6. Save the model
model.save("output/plagiarism-checker-model")

In [25]:
# 7. Compute cosine similarity for test samples
y_true = []
y_pred = []
threshold = 0.5
i=0
for _, row in test_df.iterrows():
    i+=1
    if(i>400):
      break
    sent1 = row['sentence1']
    sent2 = row['sentence2']
    true_label = int(row['label'])

    embedding1 = model.encode(sent1)
    embedding2 = model.encode(sent2)

    score = util.cos_sim(embedding1, embedding2)[0][0].item()
    pred_label = 1 if score >= threshold else 0

    y_true.append(true_label)
    y_pred.append(pred_label)

In [26]:
print("\n📊 Evaluation Results on Test Set:")
print(f"Accuracy  : {accuracy_score(y_true, y_pred):.2f}")
print(f"Precision : {precision_score(y_true, y_pred):.2f}")
print(f"Recall    : {recall_score(y_true, y_pred):.2f}")
print(f"F1 Score  : {f1_score(y_true, y_pred):.2f}")



📊 Evaluation Results on Test Set:
Accuracy  : 0.81
Precision : 0.78
Recall    : 0.87
F1 Score  : 0.82


In [27]:
# 9. Custom input for prediction
print("\n📝 Test the model with your own input sentences.")
while True:
    input1 = input("\nEnter the first sentence (or type 'exit' to quit): ")
    if input1.lower() == 'exit':
        break
    input2 = input("Enter the second sentence: ")

    embedding1 = model.encode(input1)
    embedding2 = model.encode(input2)
    score = util.cos_sim(embedding1, embedding2)[0][0].item()
    pred_label = 1 if score >= threshold else 0

    print(f"\n🔍 Cosine Similarity Score: {score:.4f} & Prediction = {pred_label}")


📝 Test the model with your own input sentences.

Enter the first sentence (or type 'exit' to quit): Children smiling and waving at camera	
Enter the second sentence: There are children present

🔍 Cosine Similarity Score: 0.5892 & Prediction = 1

Enter the first sentence (or type 'exit' to quit): A man dribbles a ball during a basketball game
Enter the second sentence:  A man is a player in a basketball game.

🔍 Cosine Similarity Score: 0.7436 & Prediction = 1

Enter the first sentence (or type 'exit' to quit): A man is a player in a basketball game.
Enter the second sentence:  A basketball game is being played.

🔍 Cosine Similarity Score: 0.7984 & Prediction = 1

Enter the first sentence (or type 'exit' to quit): Children smiling and waving at camera
Enter the second sentence: The kids are frowning

🔍 Cosine Similarity Score: 0.2373 & Prediction = 0

Enter the first sentence (or type 'exit' to quit): exit


In [28]:
!zip -r /content/plagiarism-checker-model.zip /content/output/plagiarism-checker-model


  adding: content/output/plagiarism-checker-model/ (stored 0%)
  adding: content/output/plagiarism-checker-model/tokenizer.json (deflated 71%)
  adding: content/output/plagiarism-checker-model/tokenizer_config.json (deflated 73%)
  adding: content/output/plagiarism-checker-model/model.safetensors (deflated 9%)
  adding: content/output/plagiarism-checker-model/special_tokens_map.json (deflated 80%)
  adding: content/output/plagiarism-checker-model/1_Pooling/ (stored 0%)
  adding: content/output/plagiarism-checker-model/1_Pooling/config.json (deflated 57%)
  adding: content/output/plagiarism-checker-model/config_sentence_transformers.json (deflated 34%)
  adding: content/output/plagiarism-checker-model/config.json (deflated 48%)
  adding: content/output/plagiarism-checker-model/vocab.txt (deflated 53%)
  adding: content/output/plagiarism-checker-model/2_Normalize/ (stored 0%)
  adding: content/output/plagiarism-checker-model/modules.json (deflated 62%)
  adding: content/output/plagiarism

In [29]:
from google.colab import files
files.download('/content/plagiarism-checker-model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>