Install & Import Required Libraries

In [1]:
# Install NLTK resources
import nltk
nltk.download('popular')

# Importing core libraries
import pandas as pd
import numpy as np
import string
import pickle
from nltk.corpus import stopwords

# Sklearn tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

 Load the Dataset

In [2]:
# Upload dataset first in Colab: Go to the left sidebar > Files > Upload
df = pd.read_csv("customer_data.csv")  # Rename if needed
df.head()


Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1


Check Class Distribution

In [3]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,187
1,183


### 🧼 Step 3: Preprocessing Text

We clean both `source_text` and `plagiarized_text` by:
- Removing punctuation
- Lowercasing
- Removing stop words


In [4]:
def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    stop_words = set(stopwords.words("english"))
    return " ".join(word for word in text.split() if word not in stop_words)

df["source_text"] = df["source_text"].apply(preprocess_text)
df["plagiarized_text"] = df["plagiarized_text"].apply(preprocess_text)


### 🔠 Step 4: TF-IDF Vectorization

We combine the two text columns into one and apply TF-IDF vectorization to convert text into numerical form.


In [5]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df["source_text"] + " " + df["plagiarized_text"])
y = df["label"]


### 🧪 Step 5: Split Dataset

Split the dataset into training and testing sets (80% / 20%).


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### 🤖 Step 6A: Logistic Regression


In [7]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8243243243243243
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        35
           1       0.86      0.79      0.83        39

    accuracy                           0.82        74
   macro avg       0.83      0.83      0.82        74
weighted avg       0.83      0.82      0.82        74

Confusion Matrix:
 [[30  5]
 [ 8 31]]


###  Step 6B: Random Forest Classifier


In [8]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.7972972972972973
              precision    recall  f1-score   support

           0       0.71      0.97      0.82        35
           1       0.96      0.64      0.77        39

    accuracy                           0.80        74
   macro avg       0.83      0.81      0.79        74
weighted avg       0.84      0.80      0.79        74

Confusion Matrix:
 [[34  1]
 [14 25]]


###  Step 6C: Naive Bayes


In [9]:
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8648648648648649
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.87      0.87      0.87        39

    accuracy                           0.86        74
   macro avg       0.86      0.86      0.86        74
weighted avg       0.86      0.86      0.86        74

Confusion Matrix:
 [[30  5]
 [ 5 34]]


### Step 6D: SVM (Best Performer)


In [10]:
model = SVC(kernel='linear', random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8783783783783784
              precision    recall  f1-score   support

           0       0.86      0.89      0.87        35
           1       0.89      0.87      0.88        39

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74

Confusion Matrix:
 [[31  4]
 [ 5 34]]


###  Step 7: Save SVM Model & Vectorizer


In [11]:
pickle.dump(model, open("model.pkl", "wb"))
pickle.dump(tfidf_vectorizer, open("tfidf_vectorizer.pkl", "wb"))


###  Step 8: Detect Plagiarism from New Text


In [12]:
def detect(input_text):
    vectorized_text = tfidf_vectorizer.transform([input_text])
    result = model.predict(vectorized_text)
    return "Plagiarism Detected" if result[0] == 1 else "No Plagiarism"


### ✅ Step 9: Example Predictions

Below are a few examples where the system detects plagiarism and also confirms when there's no plagiarism. Try your own sentences too!


In [14]:
# Plagiarized Example (intentionally similar to training data)
text_plagiarized = "Scientists have found a previously unknown butterfly in the Amazon jungle."
print("Input:", text_plagiarized)
print("Prediction:", detect(text_plagiarized))  # Likely to be detected

# Non-Plagiarized Example
text_clean = "Yoga improves mental health and boosts concentration over time."
print("\nInput:", text_clean)
print("Prediction:", detect(text_clean))  # Likely to be clean

# Edge Case - General Knowledge
text_general = "The sun rises in the east and sets in the west."
print("\nInput:", text_general)
print("Prediction:", detect(text_general))


Input: Scientists have found a previously unknown butterfly in the Amazon jungle.
Prediction: No Plagiarism

Input: Yoga improves mental health and boosts concentration over time.
Prediction: No Plagiarism

Input: The sun rises in the east and sets in the west.
Prediction: No Plagiarism
