# Import Necessary modules

In [27]:
import numpy as np
import pandas as pd
import nltk
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\vijju\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\vijju\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\vijju\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\vijju\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\vijju\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

True

In [28]:
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [29]:
data = pd.read_csv('dataset.csv')
data

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1
...,...,...,...,...
365,397,Playing musical instruments enhances creativity.,Creativity is enhanced by playing musical inst...,0
366,398,Studying history helps in understanding the pr...,Understanding the present is aided by studying...,0
367,399,Listening to classical music can improve focus.,Focus is improved by listening to classical mu...,0
368,400,Practicing yoga enhances physical flexibility.,Physical flexibility is enhanced by practicing...,0


In [30]:
print(data.shape)
data.label.value_counts()

(370, 4)


label
0    187
1    183
Name: count, dtype: int64

# Clean the text

In [31]:
def preprocess_text(text):
    # Remove the punctuation from the text
    text = text.translate(str.maketrans("","",string.punctuation))
    # Lower the text
    text = text.lower()
    # Remove stop-words
    stop_words = set(stopwords.words('english'))
    text = " ".join((word for word in text.split() if word not in stop_words))

    return text

preprocess_text("This is my @$!&#$ text used for dummy test")

'text used dummy test'

In [32]:
# Apply the custom pre-processing function on the dataset
data.source_text = data.source_text.apply(preprocess_text)
data.plagiarized_text = data.plagiarized_text.apply(preprocess_text)

In [33]:
data

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,researchers discovered new species butterfly a...,scientists found previously unknown butterfly ...,1
1,1,moon orbits earth approximately 273 days,natural satellite takes around 273 days comple...,1
2,2,water composed two hydrogen atoms one oxygen atom,h2o consists 2 hydrogen atoms 1 oxygen atom,1
3,3,history rome dates back 753 bc,rome long history traced back 753 bc,1
4,4,pluto considered ninth planet solar system,past pluto classified ninth planet suns planet...,1
...,...,...,...,...
365,397,playing musical instruments enhances creativity,creativity enhanced playing musical instruments,0
366,398,studying history helps understanding present,understanding present aided studying history,0
367,399,listening classical music improve focus,focus improved listening classical music,0
368,400,practicing yoga enhances physical flexibility,physical flexibility enhanced practicing yoga,0


# Vectorization

In [34]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data.source_text + " " + data.plagiarized_text)
y = data.label

# Splitting the data

In [35]:
(X_train,X_test,y_train,y_test) = train_test_split(X,y,test_size=0.2,random_state=42)

# Logistic Regression

In [36]:
model = LogisticRegression()
model.fit(X_train,y_train)

In [37]:
# Prediction
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0])

In [38]:
# evaluation
print("accuracy_score: ",accuracy_score(y_pred,y_test))
print("classification_report: ",classification_report(y_pred,y_test))
print("Confusion Matrix ",confusion_matrix(y_pred,y_test))

accuracy_score:  0.8243243243243243
classification_report:                precision    recall  f1-score   support

           0       0.86      0.79      0.82        38
           1       0.79      0.86      0.83        36

    accuracy                           0.82        74
   macro avg       0.83      0.83      0.82        74
weighted avg       0.83      0.82      0.82        74

Confusion Matrix  [[30  8]
 [ 5 31]]


# Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train,y_train)

In [40]:
# Prediction
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0])

In [41]:
# evaluation
print("accuracy_score: ",accuracy_score(y_pred,y_test))
print("classification_report: ",classification_report(y_pred,y_test))
print("Confusion Matrix ",confusion_matrix(y_pred,y_test))

accuracy_score:  0.8108108108108109
classification_report:                precision    recall  f1-score   support

           0       0.97      0.72      0.83        47
           1       0.67      0.96      0.79        27

    accuracy                           0.81        74
   macro avg       0.82      0.84      0.81        74
weighted avg       0.86      0.81      0.81        74

Confusion Matrix  [[34 13]
 [ 1 26]]


In [42]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train,y_train)

# Prediction
y_pred = model.predict(X_test)
y_pred

array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0])

In [43]:
# evaluation
print("accuracy_score: ",accuracy_score(y_pred,y_test))
print("classification_report: ",classification_report(y_pred,y_test))
print("Confusion Matrix ",confusion_matrix(y_pred,y_test))

accuracy_score:  0.8648648648648649
classification_report:                precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.87      0.87      0.87        39

    accuracy                           0.86        74
   macro avg       0.86      0.86      0.86        74
weighted avg       0.86      0.86      0.86        74

Confusion Matrix  [[30  5]
 [ 5 34]]


In [44]:
import pickle
pickle.dump(model,open("model.pkl","wb"))
pickle.dump(tfidf_vectorizer,open("tfidf_vectorizer.pkl","wb"))

In [45]:
from sklearn.svm import SVC

model = SVC(kernel='linear',probability=True)
model.fit(X_train,y_train)

# Prediction
y_pred = model.predict(X_test)
y_pred

array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0])

In [46]:
# evaluation
print("accuracy_score: ",accuracy_score(y_pred,y_test))
print("classification_report: ",classification_report(y_pred,y_test))
print("Confusion Matrix ",confusion_matrix(y_pred,y_test))
type(model)

accuracy_score:  0.8783783783783784
classification_report:                precision    recall  f1-score   support

           0       0.89      0.86      0.87        36
           1       0.87      0.89      0.88        38

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74

Confusion Matrix  [[31  5]
 [ 4 34]]


sklearn.svm._classes.SVC

Load the model and vectorizer

In [47]:
model = pickle.load(open('model.pkl','rb'))
tfidf_vectorizer = pickle.load(open('tfidf_vectorizer.pkl','rb'))
print(type(model))

<class 'sklearn.naive_bayes.MultinomialNB'>


In [48]:
def detect(text):
    text = preprocess_text(text)
    vectorized_text = tfidf_vectorizer.transform([text])
    result = model.predict(vectorized_text)
    return "Plagiarism Detected" if result[0] == 1 else "No plagiarism Detected"

In [49]:
input_text = 'The Earth is round.'
print(detect(input_text))

No plagiarism Detected
