Importing Libraries

In [3]:
import nltk
nltk.download('popular')
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

Importing the dataset

In [4]:
# Labelled text pairs used for training and evaluation.
# NOTE(review): this looks like a Colab-specific absolute path — confirm before running elsewhere.
DATASET_PATH = '/content/dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,label
count,370.0,370.0
mean,206.591892,0.494595
std,117.77457,0.500648
min,0.0,0.0
25%,112.5,0.0
50%,211.5,0.0
75%,308.75,1.0
max,401.0,1.0


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1


In [7]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,187
1,183


In [8]:
df.shape

(370, 4)

Cleaning Text

In [9]:
 def preprocess_text(text, stop_words=None):
     """Lower-case a text, drop stop words and strip ASCII punctuation.

     Each whitespace-separated token is checked against the stop-word list
     twice: once with only its surrounding punctuation trimmed (so that
     contractions such as "don't" are recognised) and once with all
     punctuation removed (the form that is finally kept).

     Args:
         text: The raw text. ``None``/NaN (pandas missing values) yield an
             empty string; any other non-string value is converted with
             ``str()``.
         stop_words: Optional container of lower-case words to drop. When
             omitted, NLTK's English stop-word list is used; it is loaded once
             and cached on the function object for subsequent calls.

     Returns:
         The remaining words joined by single spaces.
     """
     if not isinstance(text, str):
         # Missing cells arrive as None or float NaN (NaN is the only value != itself).
         if text is None or text != text:
             return ""
         text = str(text)

     if stop_words is None:
         stop_words = getattr(preprocess_text, "_english_stop_words", None)
         if stop_words is None:
             # Reading the corpus is comparatively slow, so do it only once
             # instead of once per row.
             stop_words = frozenset(stopwords.words('english'))
             preprocess_text._english_stop_words = stop_words

     strip_punctuation = str.maketrans('', '', string.punctuation)
     kept_words = []
     for token in text.lower().split():
         # Trim only the edges first so "don't," still matches the stop word "don't".
         if token.strip(string.punctuation) in stop_words:
             continue
         word = token.translate(strip_punctuation)
         # Skip tokens that were pure punctuation or reduce to a stop word.
         if word and word not in stop_words:
             kept_words.append(word)
     return ' '.join(kept_words)
preprocess_text("This is demo $%^&*! text...")

'demo text'

In [10]:
# Clean both text columns with the same preprocessing routine (overwrites them in place).
for text_column in ('source_text', 'plagiarized_text'):
    df[text_column] = df[text_column].apply(preprocess_text)

In [11]:
df

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,researchers discovered new species butterfly a...,scientists found previously unknown butterfly ...,1
1,1,moon orbits earth approximately 273 days,natural satellite takes around 273 days comple...,1
2,2,water composed two hydrogen atoms one oxygen atom,h2o consists 2 hydrogen atoms 1 oxygen atom,1
3,3,history rome dates back 753 bc,rome long history traced back 753 bc,1
4,4,pluto considered ninth planet solar system,past pluto classified ninth planet suns planet...,1
...,...,...,...,...
365,397,playing musical instruments enhances creativity,creativity enhanced playing musical instruments,0
366,398,studying history helps understanding present,understanding present aided studying history,0
367,399,listening classical music improve focus,focus improved listening classical music,0
368,400,practicing yoga enhances physical flexibility,physical flexibility enhanced practicing yoga,0


Vectorization

In [12]:
# Each sample is the cleaned source text and the cleaned candidate text joined by a space.
combined_text = df['source_text'] + " " + df['plagiarized_text']

# NOTE(review): the vectorizer is fitted on every row before the train/test split,
# so vocabulary/IDF statistics from the test rows leak into training — consider
# fitting on the training rows only.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(combined_text)

In [13]:
# Target vector; detect() reports a predicted 1 as "Plagiarism Detected" and anything else as not detected.
y = df['label']

Training & Testing data

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Logistic Regression

In [15]:
# Fit a logistic-regression classifier on the training split and score it on the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

for heading, metric_fn in (
    ("accuracy ", accuracy_score),
    ("confusion matrix ", confusion_matrix),
    ("classification report ", classification_report),
):
    print(heading, metric_fn(y_test, y_pred))

accuracy  0.8243243243243243
confusion matrix  [[30  5]
 [ 8 31]]
classification report                precision    recall  f1-score   support

           0       0.79      0.86      0.82        35
           1       0.86      0.79      0.83        39

    accuracy                           0.82        74
   macro avg       0.83      0.83      0.82        74
weighted avg       0.83      0.82      0.82        74



Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest (seeded for reproducibility) and score it on the held-out rows.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

for heading, metric_fn in (
    ("accuracy ", accuracy_score),
    ("confusion matrix ", confusion_matrix),
    ("classification report ", classification_report),
):
    print(heading, metric_fn(y_test, y_pred))

accuracy  0.7972972972972973
confusion matrix  [[34  1]
 [14 25]]
classification report                precision    recall  f1-score   support

           0       0.71      0.97      0.82        35
           1       0.96      0.64      0.77        39

    accuracy                           0.80        74
   macro avg       0.83      0.81      0.79        74
weighted avg       0.84      0.80      0.79        74



Naive Bayes Classifier

In [17]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the TF-IDF features and score it on the held-out rows.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

for heading, metric_fn in (
    ("accuracy ", accuracy_score),
    ("confusion matrix ", confusion_matrix),
    ("classification report ", classification_report),
):
    print(heading, metric_fn(y_test, y_pred))

accuracy  0.8648648648648649
confusion matrix  [[30  5]
 [ 5 34]]
classification report                precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.87      0.87      0.87        39

    accuracy                           0.86        74
   macro avg       0.86      0.86      0.86        74
weighted avg       0.86      0.86      0.86        74



SVM

In [18]:
from sklearn.svm import SVC

# Fit a linear-kernel SVM and score it on the held-out rows.
# `model` is left bound to this classifier for the cells that follow.
model = SVC(kernel='linear', random_state = 42).fit(X_train, y_train)
y_pred = model.predict(X_test)

for heading, metric_fn in (
    ("accuracy ", accuracy_score),
    ("confusion matrix ", confusion_matrix),
    ("classification report ", classification_report),
):
    print(heading, metric_fn(y_test, y_pred))

accuracy  0.8783783783783784
confusion matrix  [[31  4]
 [ 5 34]]
classification report                precision    recall  f1-score   support

           0       0.86      0.89      0.87        35
           1       0.89      0.87      0.88        39

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74



The SVM performs best (accuracy ≈ 0.88), so save it together with the fitted vectorizer

In [26]:
import pickle

# Persist the classifier currently bound to `model` (the SVC when the notebook is
# run top to bottom) and the fitted TF-IDF vectorizer.
# The `with` blocks close each file handle, so the data is flushed to disk
# before the files are read back.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

Load Model & Vectorizer

In [27]:
# Reload the persisted classifier and vectorizer.
# WARNING: pickle.load can execute arbitrary code from the file — only load
# pickles that were produced by a trusted source.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

Detection System

In [33]:
def detect(input_text):
    """Classify a text with the loaded model and return a human-readable verdict.

    The text is cleaned with ``preprocess_text`` — the same routine applied to
    the training columns — before it is vectorized, so inference sees tokens in
    the same form as training did (for example "27.3" becomes "273").

    NOTE(review): the training samples are a source text and a candidate text
    joined together, whereas this function scores a single text — confirm that
    this is the intended usage.

    Args:
        input_text: The raw text to check.

    Returns:
        "Plagiarism Detected" if the model predicts label 1, otherwise
        "Plagiarism Not Detected".
    """
    cleaned_text = preprocess_text(input_text)
    # transform() expects an iterable of documents, hence the one-element list.
    vectorized_text = tfidf_vectorizer.transform([cleaned_text])
    result = model.predict(vectorized_text)
    return "Plagiarism Detected" if result[0] == 1 else "Plagiarism Not Detected"

In [34]:
input_text = "I am Varshil Patel & I am an ML Developer"
detect(input_text)

'Plagiarism Not Detected'