In [9]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

In [3]:
# Load dataset
# Read the labelled sentence pairs from the CSV file into a DataFrame.
DATASET_PATH = "/content/dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [4]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1


In [28]:
# Preview the last five rows of the dataset.
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
365,397,Playing musical instruments enhances creativity.,Creativity is enhanced by playing musical inst...,0
366,398,Studying history helps in understanding the pr...,Understanding the present is aided by studying...,0
367,399,Listening to classical music can improve focus.,Focus is improved by listening to classical mu...,0
368,400,Practicing yoga enhances physical flexibility.,Physical flexibility is enhanced by practicing...,0
369,401,Volunteering fosters community spirit.,Community spirit is fostered by volunteering.,0


In [29]:
# Count missing values in every column.
df.isna().sum()

Unnamed: 0.1,0
Unnamed: 0,0
source_text,0
plagiarized_text,0
label,0


In [30]:
# Count repeated sentence pairs. The comparison is limited to the text columns:
# the "Unnamed: 0.1" / "Unnamed: 0" columns look like leftover row indices that
# differ on every row, so comparing whole rows could never report a duplicate.
df.duplicated(subset=["source_text", "plagiarized_text"]).sum()

np.int64(0)

In [7]:
# Download NLTK resources
# The stop-word corpus must be available before stopwords.words("english") is called.
NLTK_RESOURCE = "stopwords"
nltk.download(NLTK_RESOURCE)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Custom Preprocessor
class TextCleaner(BaseEstimator, TransformerMixin):
    """Text normaliser intended as the first step of a scikit-learn Pipeline.

    Every document has the characters in ``string.punctuation`` removed, is
    lower-cased, and has English stop words dropped. The transformer learns
    nothing from the data, so ``fit`` is a no-op.
    """

    # Built once for the class: maps every ``string.punctuation`` character to
    # "deleted". Previously this table was rebuilt on every clean_text call.
    _PUNCT_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self):
        # Stored on the instance so it is pickled together with the pipeline.
        self.stop_words = set(stopwords.words("english"))

    def clean_text(self, text):
        """Return ``text`` without punctuation, lower-cased, minus stop words.

        Parameters
        ----------
        text : str
            A single raw document.

        Returns
        -------
        str
            The surviving tokens joined by single spaces ("" if none survive).
        """
        kept = []
        for token in text.split():
            bare = token.translate(self._PUNCT_TABLE).lower()
            if not bare:
                # Token consisted solely of punctuation.
                continue
            if bare in self.stop_words:
                continue
            # Stop-word entries that contain an apostrophe (e.g. "don't") can
            # only match while the apostrophe is still present; once all
            # punctuation is stripped the token reads "dont" and would slip
            # through. Check the token with only its edge punctuation removed.
            if token.lower().strip(string.punctuation) in self.stop_words:
                continue
            kept.append(bare)
        return " ".join(kept)

    def fit(self, X, y=None):
        """No-op; present so the class satisfies the transformer interface."""
        return self

    def transform(self, X):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : pandas.Series or iterable of str
            Raw documents.

        Returns
        -------
        Same type as the input when it provides ``.apply`` (e.g. a pandas
        Series keeps its index); otherwise a list of cleaned strings.

        Raises
        ------
        ValueError
            If a bare string is passed; iterating it would silently treat
            each character as a separate document.
        """
        if isinstance(X, str):
            raise ValueError(
                "Iterable over raw text documents expected, string object received."
            )
        if hasattr(X, "apply"):
            # Unchanged path for pandas objects.
            return X.apply(self.clean_text)
        # Lists, tuples, numpy arrays, generators, ...
        return [self.clean_text(document) for document in X]

In [6]:
# Features: each source sentence joined to its candidate rewrite by one space.
source_column = df["source_text"]
candidate_column = df["plagiarized_text"]
X_raw = source_column + " " + candidate_column
# Target: the label column.
y = df["label"]

In [8]:
# Train-test split
# Hold out a fifth of the rows for evaluation; the fixed seed makes the split repeatable.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_raw,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [10]:
def build_pipeline(classifier):
    """Return a clean -> TF-IDF -> classify pipeline wrapped around ``classifier``."""
    return Pipeline([
        ("cleaner", TextCleaner()),
        ("tfidf", TfidfVectorizer()),
        ("classifier", classifier),
    ])


# Candidate models, keyed by display name. Each pipeline gets its own cleaner
# and vectorizer instance; only the final estimator differs.
pipelines = {
    "SVM": build_pipeline(SVC(kernel='linear', random_state=42)),
    "RandomForest": build_pipeline(
        RandomForestClassifier(n_estimators=100, random_state=42)
    ),
    "NaiveBayes": build_pipeline(MultinomialNB()),
}

In [11]:
# Train and evaluate each pipeline
# Every pipeline is fitted on the training split, then scored on the held-out split.
for name, model_pipeline in pipelines.items():
    print(f"\n--- Training {name} ---")
    # Pipeline.fit returns the fitted pipeline, so predict can be chained.
    y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {name}: {accuracy:.4f}")

    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", matrix)


--- Training SVM ---
Accuracy for SVM: 0.8649
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.87      0.87      0.87        39

    accuracy                           0.86        74
   macro avg       0.86      0.86      0.86        74
weighted avg       0.86      0.86      0.86        74

Confusion Matrix:
 [[30  5]
 [ 5 34]]

--- Training RandomForest ---
Accuracy for RandomForest: 0.8378
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.97      0.85        35
           1       0.97      0.72      0.82        39

    accuracy                           0.84        74
   macro avg       0.86      0.84      0.84        74
weighted avg       0.87      0.84      0.84        74

Confusion Matrix:
 [[34  1]
 [11 28]]

--- Training NaiveBayes ---
Accuracy for NaiveBayes: 0.8919
Classification Report:
               precision  

In [12]:
# Save the best-performing pipeline
# Rank the fitted pipelines by accuracy on the held-out split instead of
# hard-coding a model name, so the saved artefact stays correct when the data
# or the candidate models change. Ties resolve to the first pipeline listed.
# NOTE: the winner is chosen on the same split used for reporting, so its
# reported accuracy is an optimistic estimate.
test_accuracy = {
    candidate: accuracy_score(y_test, fitted.predict(X_test))
    for candidate, fitted in pipelines.items()
}
best_name = max(test_accuracy, key=test_accuracy.get)
print(f"Saving {best_name} (test accuracy {test_accuracy[best_name]:.4f})")

# pickle stores the custom TextCleaner step by reference to its class, so that
# class has to be defined wherever this file is loaded again.
with open("best_model_pipeline.pkl", "wb") as f:
    pickle.dump(pipelines[best_name], f)

In [17]:
# Reload the persisted pipeline from disk as `model`.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open("best_model_pipeline.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [26]:
def detect_plagiarism(input_text, source_text=None, pipeline=None):
    """Classify a text with the trained pipeline and return a verdict string.

    Parameters
    ----------
    input_text : str
        The text to check.
    source_text : str, optional
        Reference text the input is suspected of copying. When given, the
        classifier receives ``source_text + " " + input_text``, which is the
        same layout the training features were built with. When omitted, only
        ``input_text`` is classified (the original behaviour).
    pipeline : object with ``predict``, optional
        Fitted pipeline to use. Defaults to the notebook-level ``model``.

    Returns
    -------
    str
        "Plagiarism Detected" if the predicted label is 1, else "No Plagiarism".

    Raises
    ------
    TypeError
        If ``input_text`` or a supplied ``source_text`` is not a string.
    """
    if not isinstance(input_text, str):
        raise TypeError(
            f"input_text must be a str, got {type(input_text).__name__}"
        )
    if source_text is not None and not isinstance(source_text, str):
        raise TypeError(
            f"source_text must be a str or None, got {type(source_text).__name__}"
        )

    # Resolve the global lazily so an explicit pipeline works without `model`.
    classifier = model if pipeline is None else pipeline

    document = input_text if source_text is None else f"{source_text} {input_text}"
    # The pipeline's cleaning step expects a pandas Series, so wrap the single
    # document in one.
    input_series = pd.Series([document])
    prediction = classifier.predict(input_series)
    return "Plagiarism Detected" if prediction[0] == 1 else "No Plagiarism"


In [27]:
# Spot check with a sentence that appears to be the source_text of the first
# dataset row (label 1). Only this one sentence is passed to the model.
input_text = (
    'Researchers have discovered a new species of butterfly in the Amazon rainforest.'
)
detect_plagiarism(input_text)

'Plagiarism Detected'

In [31]:
# Spot check with a sentence that matches the source_text shown for a label-0
# row in the dataset preview. Only this one sentence is passed to the model.
input_text = (
    'Playing musical instruments enhances creativity.'
)
detect_plagiarism(input_text)

'No Plagiarism'

In [32]:
# Spot check with a long encyclopedia-style passage (it carries bracketed
# citation markers).
# NOTE(review): the recorded result is 'Plagiarism Detected' even though no
# reference document is supplied; the model judges the text on its own, so
# this looks like a limitation worth investigating rather than a true match.
input_text = 'Muhammad Iqbal (9 November 1877 – 21 April 1938) was an Islamic philosopher and poet.[1][2][3][4] His poetry is considered to be among the greatest of the 20th century,[5][6][7][8] and his vision of a cultural and political ideal for the Muslims of British-ruled India[9] is widely regarded as having animated the impulse for the Pakistan Movement.[1][10] He is commonly referred to by the honorific Allama (Persian: علامه, transl. "learned")[11][12] and widely considered one of the most important and influential Muslim thinkers and Western religious philosophers of the 20th century'
detect_plagiarism(input_text)

'Plagiarism Detected'