In [1]:
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of word stems.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic (punctuation, numbers, mixed tokens) or that
    appear in NLTK's English stopword list are dropped, and each remaining
    token is reduced to its stem with ``PorterStemmer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving stems joined by single spaces, in their original order.
        The result is an empty string when no token survives the filtering.
    """
    lowered_tokens = word_tokenize(text.lower())

    # A set gives constant-time membership checks while filtering.
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = []
    for token in lowered_tokens:
        # Skip anything that is not a plain alphabetic word, and skip stopwords.
        if not token.isalpha() or token in english_stopwords:
            continue
        stems.append(porter.stem(token))

    return ' '.join(stems)

def calculate_similarity(text1, text2):
    """Return the cosine similarity of two texts' term-frequency vectors.

    Both texts are first normalized with ``preprocess_text``; the resulting
    stems are counted, and the cosine of the angle between the two count
    vectors is returned.

    Args:
        text1: First raw text.
        text2: Second raw text.

    Returns:
        A float between 0.0 and 1.0 (up to floating-point rounding). Returns
        0.0 when either text has no tokens left after preprocessing, because
        the cosine is undefined for a zero-length vector and the plain
        formula would divide by zero.
    """
    counts1 = Counter(preprocess_text(text1).split())
    counts2 = Counter(preprocess_text(text2).split())

    # Euclidean norms of the two term-frequency vectors.
    magnitude1 = sum(count * count for count in counts1.values()) ** 0.5
    magnitude2 = sum(count * count for count in counts2.values()) ** 0.5

    # An empty text (or one made only of stopwords/punctuation) has a zero
    # vector; report "no similarity" instead of raising ZeroDivisionError.
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    # Only stems present in both texts contribute to the dot product; a
    # Counter returns 0 for missing keys without inserting them.
    dot_product = sum(count * counts2[stem] for stem, count in counts1.items())

    return dot_product / (magnitude1 * magnitude2)


In [3]:
import nltk

# Fetch the NLTK resources that preprocess_text relies on.
nltk.download('punkt')      # tokenizer models used by word_tokenize
# Recent NLTK releases (3.9 and later) load word_tokenize's models from
# 'punkt_tab' instead of 'punkt'; without it the tokenizer raises LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')  # word lists behind stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
# Sample inputs: document2 repeats one sentence of document1 verbatim
# ("We can make new friends ...") and closely paraphrases its opening sentence.
document1 = """
It is a platform which connects all peoples together.
We can made our friend circle on the social media.
We can connect to our old friends too.
We can make new friends on social media from foreign countries also.
At a time we can stay connected with more peoples.
"""
document2 = """
It is a platform which connects all people.
We can make new friends on social media from foreign countries also.
"""

# Cosine similarity of the two documents' stemmed term-frequency vectors.
similarity_score = calculate_similarity(document1, document2)
print(f"Similarity score: {similarity_score}")

Similarity score: 0.8374357893586237


In [5]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [17]:
from fuzzywuzzy import fuzz

def check_plagiarism(original_text, suspicious_text, threshold=70):
    """Print a fuzzy similarity ratio for two texts and a plagiarism verdict.

    The ratio is computed with ``fuzz.token_set_ratio`` and printed as a
    percentage. The text is reported as likely plagiarized when the ratio is
    greater than or equal to ``threshold``, and as original otherwise.

    Args:
        original_text: The reference text.
        suspicious_text: The text being checked against the reference.
        threshold: Minimum ratio at which the content is flagged. Defaults
            to 70, the cutoff this function previously hard-coded.

    Returns:
        None. The ratio and the verdict are written to standard output.
    """
    similarity_ratio = fuzz.token_set_ratio(original_text, suspicious_text)

    print(f"Similarity ratio: {similarity_ratio}%")

    # The comparison is inclusive: a ratio exactly equal to the threshold
    # is flagged.
    if similarity_ratio >= threshold:
        print("The content is likely plagiarized.")
    else:
        print("The content is original.")

# Usage example: the second sentence of suspicious_text is copied verbatim
# from original_text, while its first sentence is a loose paraphrase.
original_text = """
Health is the biggest wealth for a human being in his/her entire lifetime.
One can survive without excess money but can't survive without good health.
Health is something that we can't buy with money but we can take care of it and we can cure it when needed with the help of the money.
"""

suspicious_text = """
It is said in this proverb that health of a man is as much important as the wealth to live a healthy, peaceful and prosperous life.
One can survive without excess money but can't survive without good health.
"""

# Prints the ratio and the verdict. In the recorded run the ratio was 68%,
# just under the cutoff of 70, so the text was reported as original.
check_plagiarism(original_text, suspicious_text)


Similarity ratio: 68%
The content is original.
