In [None]:
pip install datasets sentence-transformers nltk scikit-learn



In [None]:
# =========================================================
# TwinText – Plagiarism Detection Model
# Sentence-level SBERT similarity with a regex sentence splitter (no NLTK needed)
# =========================================================

import re
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# ---------------- LOAD MODEL ----------------
# Instantiate the sentence-embedding model once at module level.
# get_embeddings() below reads this global `model`, so this must run first.
print("Loading SBERT model...")
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded successfully!")

# ---------------- SIMPLE SENTENCE SPLITTER ----------------
def split_sentences(text):
    """Split ``text`` into lowercase, whitespace-normalised sentences.

    The text is lowercased, runs of whitespace are collapsed to a single
    space, and the result is split on sentence-ending punctuation.

    A period that sits directly between two digits (``3.14``, ``v1.2.3``)
    is treated as part of a number rather than a sentence boundary, so
    numeric values are not torn into separate "sentences".

    Args:
        text: The input document. Any non-string value is treated as empty.

    Returns:
        A list of non-empty, stripped sentence strings (possibly empty).
    """
    if not isinstance(text, str):
        return []

    normalized = re.sub(r"\s+", " ", text.lower())

    # Split on '!' and '?' always; split on '.' unless it has a digit on
    # BOTH sides (i.e. it is a decimal point / version separator).
    pieces = re.split(r"[!?]|(?<!\d)\.|\.(?!\d)", normalized)

    return [piece.strip() for piece in pieces if piece.strip()]

# ---------------- EMBEDDINGS ----------------
def get_embeddings(sentences):
    """Encode ``sentences`` with the module-level ``model``.

    The encoder is asked for NumPy output (``convert_to_numpy=True``) and
    its result is returned unchanged.
    """
    vectors = model.encode(sentences, convert_to_numpy=True)
    return vectors

# ---------------- PLAGIARISM PREDICTION ----------------
def predict_plagiarism(text1, text2, sim_threshold=0.8, match_ratio=0.3):
    """Predict whether ``text1`` is plagiarised from ``text2``.

    Both texts are split into sentences and embedded. For every sentence of
    ``text1`` the best cosine similarity against any sentence of ``text2`` is
    taken; the pair is flagged when a large enough share of ``text1``'s
    sentences have a best match at or above ``sim_threshold``.

    Note that the check is directional: only sentences of ``text1`` are
    scored against ``text2``, not the other way round.

    Args:
        text1: The document being checked.
        text2: The candidate source document.
        sim_threshold: Minimum cosine similarity for a sentence of ``text1``
            to count as matched.
        match_ratio: Minimum fraction of ``text1`` sentences that must be
            matched for a positive prediction. Defaults to 0.3, the value
            that was previously hard-coded.

    Returns:
        1 if the pair is predicted plagiarised, otherwise 0. Returns 0 when
        either text yields no sentences.
    """
    s1 = split_sentences(text1)
    s2 = split_sentences(text2)

    # Nothing to compare if either side has no sentences.
    if not s1 or not s2:
        return 0

    # Rows index sentences of text1, columns index sentences of text2.
    sim_matrix = cosine_similarity(get_embeddings(s1), get_embeddings(s2))
    best_match = sim_matrix.max(axis=1)

    matched_fraction = np.mean(best_match >= sim_threshold)
    return 1 if matched_fraction >= match_ratio else 0

# ---------------- DATASET CREATION ----------------
def create_dataset():
    """Build the small built-in evaluation set of labelled sentence pairs.

    Returns:
        A DataFrame with columns ``text1``, ``text2`` and ``label``. The four
        paraphrased pairs (label 1) come first, followed by the four
        unrelated pairs (label 0).
    """
    paraphrased_pairs = [
        ("Machine learning is a subset of artificial intelligence.",
         "Artificial intelligence includes machine learning as a subfield."),
        ("Python is widely used for data science applications.",
         "Data science frequently uses Python programming."),
        ("Plagiarism involves copying someone else's work.",
         "Using another person's work without credit is plagiarism."),
        ("Neural networks learn patterns from data.",
         "Deep learning models identify patterns using neural networks."),
    ]

    unrelated_pairs = [
        ("The cat is sleeping on the sofa.",
         "Quantum physics deals with subatomic particles."),
        ("The stock market fluctuates daily.",
         "Mount Everest is the tallest mountain."),
        ("Football is a popular sport.",
         "Photosynthesis occurs in plants."),
        ("Databases store structured information.",
         "Rainfall affects agricultural productivity."),
    ]

    # Attach the class label to each pair, keeping positives before negatives.
    records = [(first, second, 1) for first, second in paraphrased_pairs]
    records += [(first, second, 0) for first, second in unrelated_pairs]

    return pd.DataFrame(records, columns=["text1", "text2", "label"])

# ---------------- EVALUATION ----------------
def evaluate_model():
    """Score ``predict_plagiarism`` on the built-in dataset and print metrics.

    Runs the predictor (with its default thresholds) on every pair from
    ``create_dataset()`` and prints accuracy, precision, recall, F1 (each
    rounded to 4 decimals) and the confusion matrix. Returns nothing.
    """
    pairs = create_dataset()

    print("\nEvaluating plagiarism detection...\n")

    y_true = pairs["label"].tolist()
    y_pred = [
        predict_plagiarism(first, second)
        for first, second in zip(pairs["text1"], pairs["text2"])
    ]

    print("===== MODEL METRICS =====")
    metric_rows = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1 Score :", f1_score),
    )
    for caption, scorer in metric_rows:
        print(caption, round(scorer(y_true, y_pred), 4))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

# ---------------- RUN ----------------
# Entry point: print a banner, then evaluate the predictor on the built-in
# sample pairs returned by create_dataset().
if __name__ == "__main__":
    print("\n=== TwinText Plagiarism Checker ===")
    evaluate_model()


Loading SBERT model...
Model loaded successfully!

=== TwinText Plagiarism Checker ===

Evaluating plagiarism detection...

===== MODEL METRICS =====
Accuracy : 0.875
Precision: 1.0
Recall   : 0.75
F1 Score : 0.8571

Confusion Matrix:
[[4 0]
 [1 3]]


In [None]:
!pip install sentence-transformers



In [None]:

# Plagiarism Detection using
# BERT + Cosine Similarity

import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Load Dataset
# Read the labelled text pairs from CSV. The file must provide the columns
# "text1", "text2" and "label", which are pulled out as plain Python lists below.
# NOTE(review): the absolute "/content/..." path looks environment-specific
# (presumably Colab) - confirm before running elsewhere.

dataset_path = "/content/ml.csv"
df = pd.read_csv(dataset_path)

print("Dataset Shape:", df.shape)
print(df.head())

texts1 = df["text1"].tolist()
texts2 = df["text2"].tolist()
labels = df["label"].tolist()


# 2. Load BERT Model
# Instantiate the sentence encoder once; it is reused for the dataset
# embeddings and by check_plagiarism() further down.

print("\nLoading BERT model...")
bert_model = SentenceTransformer("all-MiniLM-L6-v2")


# 3. Generate Embeddings
# Encode each text column separately so that embeddings1[i] and embeddings2[i]
# correspond to the two texts of row i.

print("Generating embeddings...")
embeddings1 = bert_model.encode(texts1, show_progress_bar=True)
embeddings2 = bert_model.encode(texts2, show_progress_bar=True)


# 4. Cosine Similarity
# Compute the cosine similarity between embeddings1[i] and embeddings2[i] for
# every row in a single vectorised pass, instead of one cosine_similarity()
# call per row.

print("Calculating cosine similarity...")
_emb1 = np.asarray(embeddings1, dtype=np.float64)
_emb2 = np.asarray(embeddings2, dtype=np.float64)

# Row-wise dot product: sum_j emb1[i, j] * emb2[i, j].
_dots = np.einsum("ij,ij->i", _emb1, _emb2)
_norms = np.linalg.norm(_emb1, axis=1) * np.linalg.norm(_emb2, axis=1)

# A zero-length embedding has no direction; score such pairs as 0 rather
# than dividing by zero.
similarity_scores = np.divide(
    _dots, _norms, out=np.zeros_like(_dots), where=_norms > 0
)

# Shape (n_samples, 1): a single-feature matrix for the classifier.
similarity_scores = similarity_scores.reshape(-1, 1)


# 5. Train-Test Split
# Hold out 20% of the pairs for evaluation. The split is stratified on the
# labels so train and test keep the same class proportions, and seeded so it
# is reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    similarity_scores,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)


# 6. Train ML Classifier
# Fit a logistic regression on the single similarity-score feature, using
# the library's default settings.

print("Training classifier...")
classifier = LogisticRegression()
classifier.fit(X_train, y_train)


# 7. Evaluation
# Predict on the held-out split and report accuracy (as a percentage),
# the confusion matrix and the per-class classification report.

y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy_percent = accuracy * 100
cm = confusion_matrix(y_test, y_pred)

print(f"\nAccuracy: {accuracy_percent:.2f}%")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


# 8. Prediction Function


def check_plagiarism(text1, text2, threshold=0.75):
    """Compare two texts by the cosine similarity of their embeddings.

    Both texts are encoded together with the module-level ``bert_model``.
    The verdict comes from comparing the similarity against the fixed
    ``threshold``; this function does not consult the trained ``classifier``.

    Args:
        text1: First text.
        text2: Second text.
        threshold: Similarity at or above which the pair is reported as
            plagiarised.

    Returns:
        A ``(similarity, verdict)`` tuple, where ``verdict`` is either
        ``"Plagiarized"`` or ``"Not Plagiarized"``.
    """
    first_vec, second_vec = bert_model.encode([text1, text2])
    sim = cosine_similarity([first_vec], [second_vec])[0][0]

    if sim >= threshold:
        verdict = "Plagiarized"
    else:
        verdict = "Not Plagiarized"
    return sim, verdict

# 9. Example Test
# Smoke-test check_plagiarism() on one hand-written pair, using its default threshold.
t1 = "Machine learning is a field of artificial intelligence."
t2 = "Machine learning is a branch of AI that uses statistical methods."

score, result = check_plagiarism(t1, t2)

print("\nExample Test")
print(f"Similarity Score: {score:.4f}")
print("Result:", result)


Dataset Shape: (10000, 3)
                                               text1  \
0  Natural language processing enables computers ...   
1  Natural language processing enables computers ...   
2  Cyber security protects systems, networks, and...   
3  Data science involves extracting knowledge fro...   
4  Cyber security protects systems, networks, and...   

                                               text2  label  
0  This sentence talks about a completely differe...      0  
1  This sentence talks about a completely differe...      0  
2  This sentence talks about a completely differe...      0  
3  This sentence talks about a completely differe...      0  
4  This sentence talks about a completely differe...      0  

Loading BERT model...
Generating embeddings...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Calculating cosine similarity...
Training classifier...

Accuracy: 100.00%

Confusion Matrix:
 [[ 999    0]
 [   0 1001]]

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       999
           1       1.00      1.00      1.00      1001

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Example Test
Similarity Score: 0.7989
Result: Plagiarized


Dataset Shape: (10000, 3)
                                               text1  \
0  Natural language processing enables computers ...   
1  Natural language processing enables computers ...   
2  Cyber security protects systems, networks, and...   
3  Data science involves extracting knowledge fro...   
4  Cyber security protects systems, networks, and...   

                                               text2  label  
0  This sentence talks about a completely differe...      0  
1  This sentence talks about a completely differe...      0  
2  This sentence talks about a completely differe...      0  
3  This sentence talks about a completely differe...      0  
4  This sentence talks about a completely differe...      0  

Loading BERT model...
Generating embeddings...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Extracting similarity features...
Training classifier...

Accuracy: 100.00%

Confusion Matrix:
 [[1001    0]
 [   0  999]]

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1001
           1       1.00      1.00      1.00       999

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Example Test
Similarity Score: 0.7989
Result: Plagiarized
