# Plagiarism Checker

In [1]:
import nltk
nltk.download("popular")
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import string
from nltk.corpus import stopwords
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/pavands/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/pavands/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/pavands/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/pavands/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/pavands/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/pavands/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading packag

##### Importing the Dataset

In [2]:
data = pd.read_csv("dataset.csv")
data
data= data.dropna(axis=1,how='all')
data= data.dropna(axis=0,how='all')

data

Unnamed: 0,source_text,plagiarized_text,label
0,Researcher have discovered a new species of bu...,Scientist have found a previously unknown butt...,1
1,The moon orbits the earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,Water is composed of two hydrogen atoms and on...,H20 consists of 2 hydrosen atom and 1 oxygen a...,1
3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,Pluto was once considered the ninth planet in ...,In the past Pluto was classified as the ninth ...,1
...,...,...,...
69,Spiders are insects,Insects and spiders belong to the same taxonom...,0
70,Fish can survive out of water for an indefinit...,Fish cannot survive outside of water for an e...,0
71,Goldfish have a three-second memory span,Goldfish possess a much longer memory span tha...,0
72,You can't fold a piece of paper more than seve...,It's impossible to fold a piece of paper more ...,0


In [3]:
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text
data["source_text"] = data["source_text"].apply(preprocess_text)
data["plagiarized_text"] = data["plagiarized_text"].apply(preprocess_text)


In [4]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data["source_text"] + " " + data["plagiarized_text"])

In [5]:
y = data["label"]

In [6]:
model = LogisticRegression()
model.fit(X, y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [9]:
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.7333333333333333
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.20      0.33         5
           1       0.71      1.00      0.83        10

    accuracy                           0.73        15
   macro avg       0.86      0.60      0.58        15
weighted avg       0.81      0.73      0.67        15



In [10]:
joblib.dump(model, 'plagiarism_model.pkl')

['plagiarism_model.pkl']

### Load the saved model

In [11]:
# Load the saved model
loaded_model = joblib.load('plagiarism_model.pkl')


In [12]:
# New text for plagiarism detection
#new_text = "A new group of researchers found out a new class of butterfly in the Amazon forest of rain."
new_text="Rome dates back to 753 BC, so it has a long history."


In [13]:
# Preprocess the new text (e.g., apply the same preprocessing steps as during training)
new_text = preprocess_text(new_text)


In [14]:
# Convert the preprocessed text into TF-IDF vectors (assuming you have the vectorizer)
new_text_vector = tfidf_vectorizer.transform([new_text])

In [15]:

# Make predictions using the loaded model
prediction = loaded_model.predict(new_text_vector)

# Calculate cosine similarity between new text and training data
cosine_similarity_score = cosine_similarity(new_text_vector, X_train).max()


In [16]:

# Interpret the prediction and similarity score
if prediction[0] == 0:
    print("The text is not plagiarized.")
else:
    print(f"The text is plagiarized with a similarity score of {cosine_similarity_score*100:.2f}%.")


The text is plagiarized with a similarity score of 94.57%.
