In [1]:
import functools
import os

import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Fetch the NLTK data that preprocess() relies on: tokenizer models for
# nltk.word_tokenize and the stop-word lists.
# Recent NLTK releases (3.9 and later) load the tokenizer models from the
# 'punkt_tab' package instead of the legacy pickled 'punkt' package, so both
# are requested to keep the notebook runnable on a fresh kernel either way.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loaded once."""
    return frozenset(nltk.corpus.stopwords.words('english'))


def preprocess(text):
    """Normalize raw text into a space-separated string of content words.

    The text is lowercased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphanumeric (e.g. punctuation) and tokens found in
    the English stop-word list are discarded.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving tokens joined by single spaces, in their original order.
    """
    # The stop-word set is identical for every call, so it is loaded through a
    # cached helper instead of being re-read from the corpus per document.
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(text.lower())
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return ' '.join(kept)

In [5]:
def check_plagiarism(base_doc, docs):
    """Score how similar each document in ``docs`` is to ``base_doc``.

    Every text is passed through ``preprocess``, the whole collection (base
    document first) is turned into TF-IDF vectors by a single
    ``TfidfVectorizer`` fit, and the base vector is compared against each
    candidate vector with cosine similarity.

    Args:
        base_doc: The reference document as a string.
        docs: An iterable of candidate documents (strings) to compare
            against ``base_doc``.

    Returns:
        A 1-D NumPy array holding one similarity score per entry of ``docs``,
        in the same order. An empty array is returned when ``docs`` is empty.
    """
    # Materialize once so generators work and emptiness can be checked.
    docs = list(docs)
    if not docs:
        # With nothing to compare against, tfidf_matrix[1:] would have zero
        # rows and cosine_similarity raises ValueError on an empty sample set.
        return np.empty(0, dtype=float)

    # Preprocess all documents; the base document occupies row 0.
    processed_docs = [preprocess(base_doc)] + [preprocess(doc) for doc in docs]

    # Convert documents to TF-IDF features
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_docs)

    # Compare row 0 (base) against every remaining row (candidates).
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

    # cosine_similarity returns a 1 x len(docs) matrix; flatten to a vector.
    return similarity_scores.flatten()

In [7]:
# Similarity cut-offs used to label each score. A score must be strictly
# greater than a threshold to earn the corresponding label.
HIGH_SIMILARITY_THRESHOLD = 0.8
SOME_SIMILARITY_THRESHOLD = 0.5


def similarity_verdict(score):
    """Return the message line that describes a similarity score.

    Args:
        score: A numeric similarity score.

    Returns:
        One of the three fixed verdict strings, chosen by comparing ``score``
        against ``HIGH_SIMILARITY_THRESHOLD`` and ``SOME_SIMILARITY_THRESHOLD``.
    """
    if score > HIGH_SIMILARITY_THRESHOLD:
        return " - Highly similar (potential plagiarism)."
    if score > SOME_SIMILARITY_THRESHOLD:
        return " - Some similarities detected."
    return " - Not similar."


if __name__ == "__main__":
    # Example base document
    base_document = """
    Plagiarism is the representation of another author's language, thoughts, ideas, or expressions as one's own original work.
    """

    # Example document repository
    documents = [
        """
        Plagiarism involves taking someone else's work or ideas and passing them off as your own.
        """,
        """
        Representing someone else's writing or ideas as your own is considered plagiarism and is unethical.
        """,
        """
        This document talks about something entirely unrelated to plagiarism.
        """,
        """
        Plagiarism is the representation of another author's language, thoughts, ideas, or expressions as one's own original work.
        """
    ]

    # Check for plagiarism
    scores = check_plagiarism(base_document, documents)

    # Display results: one score line and one verdict line per document,
    # numbered from 1 in repository order.
    for doc_number, score in enumerate(scores, start=1):
        print(f"Document {doc_number} Similarity: {score:.2f}")
        print(similarity_verdict(score))


Document 1 Similarity: 0.17
 - Not similar.
Document 2 Similarity: 0.09
 - Not similar.
Document 3 Similarity: 0.04
 - Not similar.
Document 4 Similarity: 1.00
 - Highly similar (potential plagiarism).
