In [None]:
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fetch the NLTK data that preprocess() relies on: tokenizer models and stop-word lists.
# NLTK >= 3.8.2 loads word_tokenize's Punkt data from 'punkt_tab' instead of the
# pickled 'punkt' package, so both are requested to work on old and new releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the English stop-word list as a frozenset, loaded once and cached."""
    return frozenset(stopwords.words('english'))


def preprocess(text):
    """Normalize raw text into a space-separated string of content words.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphanumeric (punctuation, contractions such as "n't", hyphenated words)
    are discarded, the rest are lower-cased, and English stop words are removed.

    Args:
        text: The raw input string.

    Returns:
        str: The surviving lower-cased tokens joined by single spaces
        (an empty string when nothing survives).
    """
    # Loading the corpus on every call is wasteful, so the set is cached.
    stop_words = _english_stop_words()
    lowered = (token.lower() for token in word_tokenize(text) if token.isalnum())
    return ' '.join(word for word in lowered if word not in stop_words)

In [None]:
def calculate_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    The two texts form a two-document corpus that is vectorized with
    ``TfidfVectorizer`` (using ``preprocess`` as the preprocessor); the
    similarity is the off-diagonal entry of the resulting cosine matrix.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        float: Cosine similarity between the two TF-IDF vectors, or ``0.0``
        when neither text contributes any token to the vocabulary.

    Raises:
        ValueError: Propagated from the vectorizer for any failure other than
            an empty vocabulary.
    """
    corpus = [text1, text2]
    vectorizer = TfidfVectorizer(preprocessor=preprocess)
    try:
        tfidf_matrix = vectorizer.fit_transform(corpus)
    except ValueError as exc:
        # scikit-learn raises "empty vocabulary" when both documents are empty
        # or contain only stop words; there is nothing to compare in that case.
        if "empty vocabulary" not in str(exc):
            raise
        return 0.0
    similarity_matrix = cosine_similarity(tfidf_matrix)
    # Entry [0, 1] is text1 vs. text2; convert the NumPy scalar to a plain float.
    return float(similarity_matrix[0, 1])

In [None]:
def check_plagiarism(text1, text2, threshold=0.9):
    """Decide whether two texts are similar enough to count as plagiarism.

    Args:
        text1: First document as a string.
        text2: Second document as a string.
        threshold: Minimum similarity score (inclusive) that is flagged.

    Returns:
        bool: True when the score from ``calculate_similarity`` is at least
        ``threshold``, otherwise False.
    """
    score = calculate_similarity(text1, text2)
    # bool() keeps the result a plain Python bool even for NumPy scalar scores.
    return bool(score >= threshold)

In [None]:
if __name__ == "__main__":
    # Demo: compare a reference sentence with a copy that has extra tokens inserted.
    original_text = "Original content goes here."
    suspicious_text = "Original kgvhutrcfvhghj m content goes here."

    is_plagiarized = check_plagiarism(original_text, suspicious_text)
    # Report the verdict using the default threshold of check_plagiarism.
    print("Plagiarism detected!" if is_plagiarized else "No plagiarism detected.")


No plagiarism detected.
