<h1>Plagarism Checker</h1>

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

# Preprocessing function

In [4]:
def preprocess_text(docs):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Apply preprocessing to each doc individually
    processed_docs = []
    for doc in docs:
        doc = doc.lower()
        doc = doc.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(doc)
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words ]
        processed_docs.append(" ".join(tokens))
        
    return processed_docs

#str.maketrans(from_chars, to_chars, delete_chars)

In [5]:
# Creating a translation table to replace 'a' with 'x' and 'b' with 'y' 


In [7]:
# List of docs 
docs = [
    "I love programming in Python.",
    "Python is a great language for programming.",
    "I enjoy coding and learning new languages."
]

# Preprocess the each of docs individually 
processed_text = preprocess_text(docs)

# Print the processed text
for text in processed_text:
    print(text)

love programming python
python great language programming
enjoy coding learning new language


# Function to calcuate Similarity between text

In [12]:
def calculate_similarity(docs):
    vectorizer = TfidfVectorizer()

    # the resultin Tfidf_matrix is a sparse matrix of shape(no. of docs * no. of docs)
    tfidf_matrix = vectorizer.fit_transform(docs)
    return cosine_similarity(tfidf_matrix)

similarity_matrix = calculate_similarity(processed_text)
print(similarity_matrix)

[[1.         0.47627592 0.        ]
 [0.47627592 1.         0.16344687]
 [0.         0.16344687 1.        ]]


# Plagarism Checker function

In [22]:
# Local - doc you want to check

def check_plagarism(local_docs, sample_docs, threshold=0.3):
    # Combine all docs & compare together in one list
    all_docs = local_docs + sample_docs

    # List Comprehension - Call preprocess_text function
    preprocessed_docs = [' '.join(preprocess_text(doc)) for doc in all_docs]

    # Call calculate_similarity fun 
    similarity_matrix = calculate_similarity(preprocessed_docs)
    print(similarity_matrix)
    # Output Plagarism Results
    print("Plagarism results: ")
    for i in range(len(local_docs)):
        for j in range(len(local_docs), len(all_docs)):
            similarity_score = similarity_matrix[i][j]
            if similarity_score > threshold:
                print(f"Local Document {i+1} is plagarized with Sample doc {j- len(local_docs)+1} with Similarity score: {similarity_score:.2f}")

In [23]:
# Main Workflow
if __name__ == "__main__":
    local_docs = [
        "The only way to do great work is to love what you do.",
        "Success is not the key to happiness. Happiness is the key to success.",
        "In the middle of every difficulty lies opportunity."
    ]

    sample_docs = [
        "The best way to achieve great thinngs is to love what you do.",
        "Happiness is the true key to success, not the other way around.",
        "Every difficulty holds an opportunity within."
    ]

    # step1: preprocess the sample document
    print("\nPreprocessed Sample Documnets: ")
    # for i, doc in enumerate(sample_docs, 1):
    #     print(f"Document {i}: {preprocess_text(doc)}")
    for doc in sample_docs:
        print(preprocess_text(doc))

    # step 2: ckeck plagarism
    print("\nChecking for plagarism...")
    check_plagarism(local_docs, sample_docs)


Preprocessed Sample Documnets: 
['', 'h', 'e', '', 'b', 'e', '', '', '', 'w', '', '', '', '', '', '', '', 'c', 'h', '', 'e', 'v', 'e', '', 'g', 'r', 'e', '', '', '', '', 'h', '', 'n', 'n', 'g', '', '', '', '', '', '', '', '', 'l', '', 'v', 'e', '', 'w', 'h', '', '', '', '', '', 'u', '', '', '', '']
['h', '', 'p', 'p', '', 'n', 'e', '', '', '', '', '', '', '', 'h', 'e', '', '', 'r', 'u', 'e', '', 'k', 'e', '', '', '', '', '', '', 'u', 'c', 'c', 'e', '', '', '', '', 'n', '', '', '', '', 'h', 'e', '', '', '', 'h', 'e', 'r', '', 'w', '', '', '', '', 'r', '', 'u', 'n', '', '']
['e', 'v', 'e', 'r', '', '', '', '', 'f', 'f', '', 'c', 'u', 'l', '', '', '', 'h', '', 'l', '', '', '', '', 'n', '', '', 'p', 'p', '', 'r', '', 'u', 'n', '', '', '', '', 'w', '', '', 'h', '', 'n', '']

Checking for plagarism...


ValueError: empty vocabulary; perhaps the documents only contain stop words