In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from transformers import XLNetTokenizer, XLNetModel
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch

## **Function to load and preprocess data**


In [4]:
def load_and_preprocess_data(filepath):
    """Load the ``abstract`` column of a CSV file and drop missing entries.

    Parameters
    ----------
    filepath : str or path-like
        Location of a CSV file that contains an ``abstract`` column.

    Returns
    -------
    pandas.Series
        The non-null abstracts, re-indexed 0..n-1 so that a positional index
        (for example the result of ``np.argmax`` over a similarity row) can be
        used directly as a label.

    Raises
    ------
    KeyError
        If the CSV has no ``abstract`` column.
    """
    data = pd.read_csv(filepath)
    if "abstract" not in data.columns:
        raise KeyError(f"'abstract' column not found in {filepath!r}")
    # dropna() keeps the original row labels, which leaves gaps in the index;
    # resetting it keeps label-based and position-based lookups in agreement.
    return data["abstract"].dropna().reset_index(drop=True)

In [6]:
# Load the source corpus: the non-null values of the CSV's 'abstract' column.
# Later cells embed these abstracts and compare submitted documents against them.
source_data = load_and_preprocess_data(filepath = "../../dataset/data.csv")

  data = pd.read_csv(filepath)


## **Function to generate document vector**


In [7]:
def generate_document_vector(text, model, tokenizer, max_chunk_length=512, overlap=50):
    """Embed a document as a single L2-normalised vector.

    The text is tokenized and split into overlapping windows of at most
    ``max_chunk_length`` tokens. Each window is passed through ``model`` and the
    per-token hidden states are averaged over the whole document.

    Parameters
    ----------
    text : str
        Document to embed.
    model
        Callable taking a tensor of token ids and returning an object with a
        ``last_hidden_state`` attribute; must expose ``config.hidden_size``.
    tokenizer
        Object providing ``encode``, ``decode``, ``tokenize`` and
        ``convert_tokens_to_ids``.
    max_chunk_length : int, default 512
        Maximum number of tokens per window.
    overlap : int, default 50
        Number of tokens shared between consecutive windows.

    Returns
    -------
    numpy.ndarray
        Vector of length ``model.config.hidden_size``. It has unit L2 norm
        unless the averaged embedding is all zeros, in which case the zero
        vector is returned unchanged.

    Raises
    ------
    ValueError
        If ``max_chunk_length`` is not positive or ``overlap`` is outside
        ``[0, max_chunk_length)``.
    """
    if max_chunk_length <= 0 or overlap < 0 or overlap >= max_chunk_length:
        raise ValueError(
            "overlap must satisfy 0 <= overlap < max_chunk_length "
            f"(got max_chunk_length={max_chunk_length}, overlap={overlap})"
        )
    stride = max_chunk_length - overlap

    # Encode with special tokens, then decode and re-tokenize so the special
    # tokens added by the tokenizer are part of the token list being windowed.
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text, add_special_tokens=True)))
    hidden_size = model.config.hidden_size
    if not tokens:
        # Averaging over zero tokens would yield NaN; return a zero vector instead.
        return np.zeros(hidden_size, dtype=np.float32)

    # One row per document token; each window fills in its own slice.
    embeddings = torch.zeros((1, len(tokens), hidden_size))

    for start in range(0, len(tokens), stride):
        chunk = tokens[start:start + max_chunk_length]
        input_ids = tokenizer.convert_tokens_to_ids(chunk)
        with torch.no_grad():
            outputs = model(torch.tensor([input_ids]))
        # Write at the window's token offset (not the window counter) so every
        # token row is populated; overlapping rows keep the later window's value.
        embeddings[:, start:start + len(chunk), :] = outputs.last_hidden_state

    # Average the embeddings over all tokens
    document_vector = embeddings.mean(dim=1).squeeze(0).detach().numpy()

    # Normalize the document vector, skipping the division for a zero vector
    norm = np.linalg.norm(document_vector)
    if norm > 0:
        document_vector = document_vector / norm
    return document_vector

## **Function to create vectors from XLNet**

In [8]:
def create_bert_vectors(data, model, tokenizer, max_documents=100):
    """Embed the leading documents of ``data`` into a matrix of unit vectors.

    Despite the name, any model/tokenizer pair accepted by
    ``generate_document_vector`` can be used.

    Parameters
    ----------
    data : sliceable sequence of str
        Source documents; only ``data[:max_documents]`` is embedded.
    model, tokenizer
        Passed through to ``generate_document_vector``.
    max_documents : int or None, default 100
        Number of leading documents to embed. ``None`` embeds everything.

    Returns
    -------
    numpy.ndarray
        Array with one L2-normalised row per embedded document. When no
        documents are embedded the result has shape
        ``(0, model.config.hidden_size)``.
    """
    vectors = [
        generate_document_vector(text, model, tokenizer)
        for text in tqdm(data[:max_documents])
    ]
    if not vectors:
        # np.linalg.norm(..., axis=1) fails on an empty 1-D array.
        return np.empty((0, model.config.hidden_size))

    # Normalize all vectors after creation
    vectors = np.array(vectors)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Leave all-zero rows untouched rather than dividing by zero.
    norms[norms == 0] = 1.0
    return vectors / norms

## **Function to perform plagiarism analysis**

In [9]:
def plagiarism_analysis(document_vector, vector_index, source_data, threshold):
    """Compare a document vector against indexed source vectors.

    Parameters
    ----------
    document_vector : array-like
        Embedding of the document under test.
    vector_index : array-like
        One embedding per source document; row ``k`` corresponds to the
        ``k``-th entry of ``source_data`` by position.
    source_data : sequence or pandas.Series
        Source documents aligned positionally with ``vector_index``.
    threshold : float
        Cosine similarity above which the document is flagged.

    Returns
    -------
    list
        ``[is_plagiarism, best_score, most_similar_article]``. The article is
        ``None`` when no score exceeds ``threshold``. For an empty
        ``vector_index`` the result is ``[False, 0.0, None]``.
    """
    if len(vector_index) == 0:
        # Nothing to compare against.
        return [False, 0.0, None]

    similarity_scores = cosine_similarity([document_vector], vector_index)[0]
    most_similar_index = int(np.argmax(similarity_scores))
    best_score = similarity_scores[most_similar_index]
    is_plagiarism = bool(best_score > threshold)

    if is_plagiarism:
        # argmax is a position. A pandas Series whose index has gaps (e.g.
        # after dropna) would treat it as a label under plain [] indexing.
        if hasattr(source_data, "iloc"):
            most_similar_article = source_data.iloc[most_similar_index]
        else:
            most_similar_article = source_data[most_similar_index]
    else:
        most_similar_article = None

    return [is_plagiarism, best_score, most_similar_article]

## **Function to report results**

In [10]:
def report_results(is_plagiarism, similarity_scores, document, most_similar_article):
    """Print a human-readable summary of a plagiarism check.

    Parameters
    ----------
    is_plagiarism : bool
        Whether the document was flagged.
    similarity_scores
        Value printed after "Similarity Score:".
    document
        The submitted text, echoed back in the report.
    most_similar_article
        Printed only when ``is_plagiarism`` is truthy.
    """
    verdict = "Plagiarism Detected" if is_plagiarism else "No Plagiarism Detected"

    print("Plagiarism Analysis Results:\n")
    print(f"Similarity Score: {similarity_scores}")
    print(f"Plagiarism Decision: {verdict}")
    print(f"Article submitted: \n{document}")

    # Guard clause: with nothing flagged there is no matching article to show.
    if not is_plagiarism:
        print("\nNo evidence of plagiarism.")
        return

    print(f"Most Similar Article:\n{most_similar_article}")

## **Main function to run the plagiarism checker**

In [11]:
# Load the pretrained XLNet model and its tokenizer ('xlnet-base-cased')
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')
# Generate vectors for source data (by default only the first 100 entries of
# source_data are embedded; see create_bert_vectors)
source_vectors = create_bert_vectors(source_data, model, tokenizer)

def run_plagiarism_checker(document, threshold=0.8):
    """Embed ``document``, compare it with the source vectors, and print a report.

    Uses the module-level ``model``, ``tokenizer``, ``source_vectors`` and
    ``source_data``. Nothing is returned; the outcome is printed by
    ``report_results``.

    Parameters
    ----------
    document : str
        Text to check.
    threshold : float, default 0.8
        Cosine similarity above which the document is flagged.
    """
    # Embed the submission and score it against the indexed sources in one go.
    is_plagiarism, top_score, closest_article = plagiarism_analysis(
        generate_document_vector(document, model, tokenizer),
        source_vectors,
        source_data,
        threshold,
    )

    # Print the outcome for the reader.
    report_results(is_plagiarism, top_score, document, closest_article)

# Sample submission passed to run_plagiarism_checker in the next cell.
document_to_check = "The Bhagavad Gita, a revered Hindu scripture, unfolds as a dialogue between Lord Krishna and the warrior prince Arjuna on the battlefield of Kurukshetra. Spanning 700 verses, it encapsulates profound teachings on duty (dharma), righteousness, and the path to spiritual realization. Krishna imparts wisdom on fulfilling one's responsibilities without attachment to the results, emphasizing the pursuit of selflessness and inner harmony. Themes of devotion, discipline, and the nature of existence resonate throughout, offering guidance on navigating life's moral dilemmas and achieving spiritual enlightenment. The Gita's timeless wisdom continues to inspire seekers on the quest for deeper understanding and purpose."

100%|██████████| 100/100 [01:14<00:00,  1.35it/s]


## **Run the checker**

In [12]:
# Check the sample document against the embedded sources using the default
# threshold; the report is printed rather than returned.
run_plagiarism_checker(document_to_check)

Plagiarism Analysis Results:

Similarity Score: 0.9776790142059326
Plagiarism Decision: Plagiarism Detected
Article submitted: 
The Bhagavad Gita, a revered Hindu scripture, unfolds as a dialogue between Lord Krishna and the warrior prince Arjuna on the battlefield of Kurukshetra. Spanning 700 verses, it encapsulates profound teachings on duty (dharma), righteousness, and the path to spiritual realization. Krishna imparts wisdom on fulfilling one's responsibilities without attachment to the results, emphasizing the pursuit of selflessness and inner harmony. Themes of devotion, discipline, and the nature of existence resonate throughout, offering guidance on navigating life's moral dilemmas and achieving spiritual enlightenment. The Gita's timeless wisdom continues to inspire seekers on the quest for deeper understanding and purpose.
Most Similar Article:
BACKGROUND AND OBJECTIVES: The daily incidence and deaths of coronavirus disease 2019 (COVID-19) in the USA are poorly understood. In