In [None]:
import nltk
nltk.download('stopwords')
import string
import pickle
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained model and vectorizer
model = pickle.load(open('model.pkl', 'rb'))
tfidf_vectorizer = pickle.load(open('tfidf_vectorizer.pkl', 'rb'))

# Preprocess text (cleaning function)
def preprocess_text(text):
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    stop_words = set(stopwords.words("english"))  # Load stopwords
    text = " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

# Function to split large text into smaller chunks
def split_text(text, chunk_size=50):
    words = text.split()
    chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# Function to detect plagiarism for long texts
def detect_large_text(input_text, similarity_threshold=0.7):
    chunks = split_text(input_text)  # Split large text into smaller chunks
    processed_chunks = [preprocess_text(chunk) for chunk in chunks]  # Preprocess chunks
    vectorized_chunks = tfidf_vectorizer.transform(processed_chunks)  # Convert to TF-IDF

    predictions = model.predict(vectorized_chunks)  # Get model predictions

    # Cosine similarity check
    similarity_scores = cosine_similarity(vectorized_chunks, vectorized_chunks)
    max_similarity = np.max(similarity_scores)

    # If any chunk is plagiarized OR similarity is high, mark as plagiarism
    if 1 in predictions or max_similarity > similarity_threshold:
        return "Plagiarism Detected"
    return "No Plagiarism"

# Example 1: Large paragraph with potential plagiarism
large_text = """Albert Einstein developed the theory of relativity, which changed
the way we understand space and time. His work had a profound impact on physics
and reshaped scientific thought forever. His famous equation, E=mc^2, revolutionized energy calculations."""

print(detect_large_text(large_text))

# Example 2: Unique content (should return "No Plagiarism")
unique_text = """The development of artificial intelligence has led to breakthroughs
in various fields such as healthcare, finance, and automation. Researchers continue
to explore AI's potential in solving real-world problems."""

print(detect_large_text(unique_text))


Plagiarism Detected
Plagiarism Detected


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
large_text = """A self-introduction is a brief speech or written statement where you introduce yourself to others.
"""
print(detect_large_text(large_text))

Plagiarism Detected
