In [None]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Function to preprocess text
def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, in order: tokenize with NLTK's ``word_tokenize``, lowercase each
    token, drop tokens made up solely of punctuation characters, drop
    English stopwords, and stem what is left with the Porter stemmer.

    Args:
        text: The raw text to normalize.

    Returns:
        The surviving stemmed tokens joined by single spaces; an empty
        string if no token survives the filters.
    """
    punctuation_chars = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stemmed_tokens = []
    for token in word_tokenize(text):
        token = token.lower()

        # A token is punctuation when *every* character is a punctuation
        # mark. Testing ``token not in string.punctuation`` is a substring
        # check against one long string, so multi-character tokens such as
        # "''", "``" or "..." would slip through it.
        if all(char in punctuation_chars for char in token):
            continue

        if token in stop_words:
            continue

        stemmed_tokens.append(stemmer.stem(token))

    return ' '.join(stemmed_tokens)

# Function to scrape text from a website
def scrape_website(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 10. Without an explicit timeout the request could block
            indefinitely on an unresponsive host.

    Returns:
        The text of every ``<p>`` element joined by single spaces; an empty
        string when the page has no ``<p>`` elements.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of scoring an error page's text.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # Only paragraph text is collected; adapt the selector to the structure
    # of the target website if its content lives in other elements.
    paragraphs = [p.get_text() for p in soup.find_all('p')]
    return ' '.join(paragraphs)

# Function to calculate plagiarism score
def calculate_plagiarism_score(written_text, internet_text):
    """Score how similar two texts are using TF-IDF cosine similarity.

    Both texts are normalized with ``preprocess_text``, vectorized together
    by a ``TfidfVectorizer`` fitted on just these two documents, and
    compared with cosine similarity.

    Args:
        written_text: The text being checked.
        internet_text: The reference text to compare against.

    Returns:
        The cosine similarity between the two TF-IDF vectors as a float
        (higher means more similar). Returns 0.0 when either text is empty
        after preprocessing, since there is nothing to compare.
    """
    written_text_processed = preprocess_text(written_text)
    internet_text_processed = preprocess_text(internet_text)

    # With no surviving tokens on either side there is no overlap to measure.
    # Returning early also avoids the ValueError TfidfVectorizer raises when
    # it ends up with an empty vocabulary (e.g. both texts empty).
    if not written_text_processed.strip() or not internet_text_processed.strip():
        return 0.0

    # Feature extraction over the two documents only
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(
        [written_text_processed, internet_text_processed]
    )

    # Pairwise similarities of the two rows; entry [0][1] is the similarity
    # between the written text (row 0) and the internet text (row 1).
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return float(similarity_matrix[0][1])

# Text to check, typed or pasted by the user
written_text = input("Enter the written text: ")

# Page to compare against. Surrounding whitespace (common when copy-pasting)
# is stripped so it is not passed along as part of the URL.
website_url = input("Enter the URL of the website: ").strip()

# Scraping text from the website
internet_text = scrape_website(website_url)

# Calculate plagiarism score
plagiarism_score = calculate_plagiarism_score(written_text, internet_text)

print("Plagiarism Score:", plagiarism_score)

# As a rule of thumb, a score above 0.5 is considered plagiarism, whereas a score below 0.5 is not.
# In the example below, we took the text from a different website and checked it for plagiarism. As expected, the score was below 0.5.

Enter the written text: import requests from bs4 import BeautifulSoup from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from nltk.tokenize import word_tokenize from nltk.corpus import stopwords from nltk.stem import PorterStemmer import string  # Function to preprocess text def preprocess_text(text):     # Tokenize the text     tokens = word_tokenize(text)      # Convert to lowercase     tokens = [token.lower() for token in tokens]      # Remove punctuation     tokens = [token for token in tokens if token not in string.punctuation]      # Remove stopwords     stop_words = set(stopwords.words('english'))     tokens = [token for token in tokens if token not in stop_words]      # Stemming     stemmer = PorterStemmer()     tokens = [stemmer.stem(token) for token in tokens]      return ' '.join(tokens)  # Function to scrape text from a website def scrape_website(url):     response = requests.get(url)     soup = BeautifulSoup(r

In [None]:

pip install nltk



In [None]:
import nltk
# Fetch the NLTK stopword corpus. preprocess_text reads it through
# stopwords.words('english'), so run this cell before calling that function.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import nltk
# Fetch the 'punkt' tokenizer data — presumably what word_tokenize in
# preprocess_text relies on, so run this cell before calling that function.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True