In [1]:
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import nltk
from nltk.corpus import stopwords
from transformers import BertTokenizer, BertModel
import torch

# Make sure the NLTK stopword corpus is available locally; preprocess_text()
# reads it through stopwords.words('english'). This runs as a side effect
# every time the module/cell is executed.
nltk.download('stopwords')

# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced anywhere in the code visible here —
# confirm whether another cell uses it before keeping this (it costs load time).
nlp = spacy.load("en_core_web_sm")

# Function to preprocess the text
def preprocess_text(text):
    """Normalize *text* for bag-of-words comparison.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted, and whitespace-separated tokens that appear in NLTK's English
    stopword list are dropped.

    Args:
        text: The raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Punctuation is deleted rather than replaced by a space, so hyphenated
    # words such as "decision-making" collapse into one token.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in cleaned.split():
        if token in english_stopwords:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Function to calculate cosine similarity between two texts using TF-IDF
def calculate_cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    Both texts are normalized with ``preprocess_text`` and then vectorized
    together by a fresh ``TfidfVectorizer`` fitted on just this pair, so the
    IDF weights reflect only these two documents.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        The similarity as a plain Python float, nominally between 0 and 1.
        Returns 0.0 when neither text contains a usable token (for example
        empty, punctuation-only or stopword-only input), because there is
        nothing to compare.
    """
    cleaned_pair = [preprocess_text(text1), preprocess_text(text2)]

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(cleaned_pair)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no document
        # yields a token. Treat that as "no measurable overlap" instead of
        # crashing the caller.
        return 0.0

    # Row 0 is text1 and row 1 is text2; slicing keeps each row 2-D as
    # cosine_similarity expects.
    similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    # Unwrap the 1x1 result and convert the NumPy scalar to a built-in float.
    return float(similarity_matrix[0][0])

# Loaded (tokenizer, model) pairs keyed by checkpoint name. Instantiating the
# BERT model is expensive, so it is done once per process and reused.
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return the cached (tokenizer, model) pair for *model_name*, loading it on first use."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


# Function to calculate similarity using BERT embeddings
def calculate_bert_similarity(text1, text2):
    """Return the cosine similarity of mean-pooled BERT embeddings of two texts.

    Each text is tokenized on its own (truncated to at most 512 tokens) and
    passed through ``bert-base-uncased``. The last hidden states are averaged
    over the token dimension to get one vector per text, and the cosine
    similarity of the two vectors is returned.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        The cosine similarity as a Python float.
    """
    tokenizer, model = _load_bert('bert-base-uncased')

    # Tokenize the raw texts; no preprocess_text() step is applied here.
    inputs1 = tokenizer(text1, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs2 = tokenizer(text2, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output1 = model(**inputs1)
        output2 = model(**inputs2)

    # Mean-pool over the token dimension (dim=1). This averages every token's
    # hidden state; it is not the [CLS] vector alone.
    emb1 = output1.last_hidden_state.mean(dim=1)
    emb2 = output2.last_hidden_state.mean(dim=1)
    cos_sim = torch.nn.functional.cosine_similarity(emb1, emb2)

    return cos_sim.item()

# Main function to check plagiarism
def check_plagiarism(doc1, doc2, use_bert=False, threshold=0.8):
    """Score two documents for similarity and print a plagiarism verdict.

    Args:
        doc1: First document as a string.
        doc2: Second document as a string.
        use_bert: When true, score with ``calculate_bert_similarity``;
            otherwise score with ``calculate_cosine_similarity`` (TF-IDF).
        threshold: Scores greater than or equal to this value are reported
            as plagiarism.

    Returns:
        None. The score and the verdict are written to stdout.
    """
    # Pick the scoring backend, then apply it to the pair.
    scorer = calculate_bert_similarity if use_bert else calculate_cosine_similarity
    similarity_score = scorer(doc1, doc2)

    print(f"Similarity Score: {similarity_score}")

    # The threshold itself counts as a match (>=).
    verdict = (
        "Plagiarism Detected!"
        if similarity_score >= threshold
        else "No Plagiarism Detected."
    )
    print(verdict)

# Example documents: two differently worded descriptions of AI. The
# triple-quoted literals keep the line breaks and the continuation-line
# indentation as part of the strings.
doc1 = """Artificial intelligence (AI) refers to the simulation of human intelligence
           in machines. These systems are designed to perform tasks such as visual
           perception, speech recognition, decision-making, and language translation."""
doc2 = """AI is a field of computer science that aims to create machines capable of
          performing tasks that typically require human intelligence, such as recognizing 
          speech, making decisions, or translating languages."""

# Run plagiarism check using Cosine Similarity (TF-IDF)
check_plagiarism(doc1, doc2, use_bert=False, threshold=0.8)

# Run plagiarism check using BERT embeddings
# NOTE(review): the same 0.8 threshold is used for both backends; the recorded
# run scored this pair ~0.21 with TF-IDF but ~0.94 with BERT, so the two scales
# are probably not comparable — confirm whether BERT needs its own threshold.
check_plagiarism(doc1, doc2, use_bert=True, threshold=0.8)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similarity Score: 0.2109964527251086
No Plagiarism Detected.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Similarity Score: 0.9445720314979553
Plagiarism Detected!
