## Import Modules

In [10]:
import nltk
nltk.download("popular")
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/princekay/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/princekay/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/princekay/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/princekay/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/princekay/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/princekay/nltk_data...
[nltk_data]    |   Package movie_

## Load Dataset

In [11]:
data=pd.read_csv("dataset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1


In [14]:
data['label'].value_counts()

label
0    187
1    183
Name: count, dtype: int64

In [15]:
data.shape

(370, 4)

## Clean text

In [17]:
def preprocess_text(text):
    #remove punctuation
    text = text.translate(str.maketrans("","",string.punctuation))
    #convert to lowercase
    text=text.lower()
    #remove stopwords
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text
    
# preprocess_text("This, is, my $#$%!?/ text to use for dummy text")

'text use dummy text'

In [18]:
data['source_text'] = data['source_text'].apply(preprocess_text)
data['plagiarized_text'] = data['plagiarized_text'].apply(preprocess_text)


In [19]:
data

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,researchers discovered new species butterfly a...,scientists found previously unknown butterfly ...,1
1,1,moon orbits earth approximately 273 days,natural satellite takes around 273 days comple...,1
2,2,water composed two hydrogen atoms one oxygen atom,h2o consists 2 hydrogen atoms 1 oxygen atom,1
3,3,history rome dates back 753 bc,rome long history traced back 753 bc,1
4,4,pluto considered ninth planet solar system,past pluto classified ninth planet suns planet...,1
...,...,...,...,...
365,397,playing musical instruments enhances creativity,creativity enhanced playing musical instruments,0
366,398,studying history helps understanding present,understanding present aided studying history,0
367,399,listening classical music improve focus,focus improved listening classical music,0
368,400,practicing yoga enhances physical flexibility,physical flexibility enhanced practicing yoga,0


## Vectorization

In [21]:
tfidf_vectorizer =  TfidfVectorizer()

X = tfidf_vectorizer.fit_transform(data['source_text'] + " " + data['plagiarized_text'])

In [22]:
y = data['label']

## Train Test Split

In [23]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

## Applying logistic regression

In [24]:
model = LogisticRegression()

model.fit(X_train, y_train)
y_pred=model.predict(X_test)

print("accuracy ", accuracy_score(y_test, y_pred))
print("Classification ", classification_report(y_test, y_pred))
print("Confusion ", confusion_matrix(y_test, y_pred))

accuracy  0.8243243243243243
Classification                precision    recall  f1-score   support

           0       0.79      0.86      0.82        35
           1       0.86      0.79      0.83        39

    accuracy                           0.82        74
   macro avg       0.83      0.83      0.82        74
weighted avg       0.83      0.82      0.82        74

Confusion  [[30  5]
 [ 8 31]]


## Random Forest Model

In [25]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)

model.fit(X_train, y_train)
y_pred=model.predict(X_test)

print("accuracy ", accuracy_score(y_test, y_pred))
print("Classification ", classification_report(y_test, y_pred))
print("Confusion ", confusion_matrix(y_test, y_pred))

accuracy  0.7972972972972973
Classification                precision    recall  f1-score   support

           0       0.71      0.97      0.82        35
           1       0.96      0.64      0.77        39

    accuracy                           0.80        74
   macro avg       0.83      0.81      0.79        74
weighted avg       0.84      0.80      0.79        74

Confusion  [[34  1]
 [14 25]]


## Naiv Bays Model

In [26]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

model.fit(X_train, y_train)
y_pred=model.predict(X_test)

print("accuracy ", accuracy_score(y_test, y_pred))
print("Classification ", classification_report(y_test, y_pred))
print("Confusion ", confusion_matrix(y_test, y_pred))

accuracy  0.8648648648648649
Classification                precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.87      0.87      0.87        39

    accuracy                           0.86        74
   macro avg       0.86      0.86      0.86        74
weighted avg       0.86      0.86      0.86        74

Confusion  [[30  5]
 [ 5 34]]


## SVM

In [27]:
from sklearn.svm import SVC

model = SVC(kernel='linear', random_state=42)

model.fit(X_train, y_train)
y_pred=model.predict(X_test)

print("accuracy ", accuracy_score(y_test, y_pred))
print("Classification ", classification_report(y_test, y_pred))
print("Confusion ", confusion_matrix(y_test, y_pred))

accuracy  0.8783783783783784
Classification                precision    recall  f1-score   support

           0       0.86      0.89      0.87        35
           1       0.89      0.87      0.88        39

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74

Confusion  [[31  4]
 [ 5 34]]


## Save SVM and Vectorizer

In [30]:
import pickle
pickle.dump(model, open('model.pkl', 'wb'))
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl', 'wb'))

## Load Model and Vectorizer

In [31]:
import pickle
model=pickle.load(open('model.pkl', 'rb'))
tfidf_vectorizer = pickle.load(open('tfidf_vectorizer.pkl', 'rb'))

## Detection System

In [34]:
def detect(input_text):
    #vectorise the text
    vectorized_text = tfidf_vectorizer.transform ([input_text])
    #then will do the prediction by model
    result = model.predict(vectorized_text)
    return "Plagiarism Detected" if result[0] == 1 else "No plagiarism"

In [35]:
#example (it is a plagarized text)
input_text='Researchers have discovered a new species of butterfly in the Amazon rainforest'
detect(input_text)

'Plagiarism Detected'

In [36]:
#example (it has no plagiarism)
input_text = 'Playing musical instruments enhances creativity.'
detect(input_text)

'No plagiarism'