# Import modules

In [None]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
df = pd.read_csv('/content/dataset.csv')

In [None]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1


In [None]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
source_text,0
plagiarized_text,0
label,0


In [None]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,187
1,183


In [None]:
df.shape

(370, 4)

# Clean text

In [None]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import nltk
from nltk.corpus import stopwords
import string

# Download the stopwords corpus
nltk.download('stopwords')

# Define the preprocessing function
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text

# Apply preprocessing
df["source_text"] = df["source_text"].apply(preprocess_text)
df["plagiarized_text"] = df["plagiarized_text"].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Vectorization

In [None]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df["source_text"])

# Train test split

In [None]:
y = df["label"]

In [None]:
#split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix")
print(cm)

Accuracy: 0.8378378378378378
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        35
           1       0.86      0.82      0.84        39

    accuracy                           0.84        74
   macro avg       0.84      0.84      0.84        74
weighted avg       0.84      0.84      0.84        74

Confusion Matrix
[[30  5]
 [ 7 32]]


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

#Instantiate the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

#fit the model
model.fit(X_train, y_train)

#predict the model
y_pred = model.predict(X_test)

#evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

#generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix")
print(cm)

Accuracy: 0.7972972972972973
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.94      0.81        35
           1       0.93      0.67      0.78        39

    accuracy                           0.80        74
   macro avg       0.82      0.80      0.80        74
weighted avg       0.83      0.80      0.79        74

Confusion Matrix
[[33  2]
 [13 26]]


# Naive Bayes Model


In [None]:
from sklearn.naive_bayes import MultinomialNB
# Instantiate the model
model = MultinomialNB()
# Fit the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Generate classification report
classification_rep = classification_report(y_test, y_pred)
# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Print results
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(cm)

Accuracy: 0.8648648648648649
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.87      0.87      0.87        39

    accuracy                           0.86        74
   macro avg       0.86      0.86      0.86        74
weighted avg       0.86      0.86      0.86        74

Confusion Matrix:
[[30  5]
 [ 5 34]]


# SVM

In [None]:
from sklearn.svm import SVC

# Instantiate the model
model = SVC(kernel='linear', random_state=42)
# Fit the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Generate classification report
classification_rep = classification_report(y_test, y_pred)
# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Print results
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(cm)

Accuracy: 0.8513513513513513
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84        35
           1       0.85      0.87      0.86        39

    accuracy                           0.85        74
   macro avg       0.85      0.85      0.85        74
weighted avg       0.85      0.85      0.85        74

Confusion Matrix:
[[29  6]
 [ 5 34]]


# Save naive bayes and vectorizer

In [None]:
import pickle

pickle.dump(model,open("model.pkl",'wb'))
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl','wb'))

# Load model and vectorizer

In [None]:
model = pickle.load(open('model.pkl','rb'))
tfidf_vectorizer = pickle.load(open('tfidf_vectorizer.pkl','rb'))

# Detection system

In [None]:
def detect(input_text):
  vectorized_text = tfidf_vectorizer.transform([input_text])
  result = model.predict(vectorized_text)
  return "Plagiarim Detected" if result[0] == 1 else "No Plagiarism"

In [None]:
# example ( it is a plagarized text)
input_text = "Astronomers have detected a mysterious radio signal coming from a distant galaxy."
detect(input_text)

'Plagiarim Detected'

In [None]:
# example ( it has no plagiarism)
input_text = 'Morning walk is really a good thing for health'
detect(input_text)

'No Plagiarism'