In [35]:
from sklearn.metrics import f1_score
from langdetect import detect
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle
import torch
import faiss
from transformers import XLNetTokenizer, XLNetModel, MarianMTModel, MarianTokenizer

In [36]:
# Load BERT model and tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')

In [37]:
def load_and_preprocess_data(filepath):
    data = pd.read_csv(filepath, low_memory=False)
    data = data["abstract"].dropna()
    return data

In [38]:
source_data = load_and_preprocess_data(filepath="../../dataset/data.csv")

In [39]:
def generate_document_vector(text, model, tokenizer, max_chunk_length=512, overlap=50):
    # Tokenize the text into chunks with specified overlap
    tokens = tokenizer.tokenize(
        tokenizer.decode(tokenizer.encode(text, add_special_tokens=True))
    )
    chunks = [
        tokens[i : i + max_chunk_length]
        for i in range(0, len(tokens), max_chunk_length - overlap)
    ]

    # Initialize an empty tensor to store the embeddings
    embeddings = torch.zeros((1, len(tokens), model.config.hidden_size))

    # Iterate through chunks and generate embeddings
    for i, chunk in enumerate(chunks):
        input_ids = tokenizer.convert_tokens_to_ids(chunk)
        with torch.no_grad():
            outputs = model(torch.tensor([input_ids]))
            last_hidden_states = outputs.last_hidden_state
            embeddings[:, i : i + len(chunk), :] = last_hidden_states

    # Average the embeddings over all tokens
    document_vector = embeddings.mean(dim=1).squeeze().detach().numpy()
    # Normalize the document vector
    document_vector = document_vector / np.linalg.norm(document_vector)
    return document_vector

In [40]:
def build_faiss_index(source_vectors):
    """
    This function builds a FAISS index for efficient similarity search.

    Args:
        source_vectors (np.ndarray): A numpy array containing source document vectors.

    Returns:
        faiss.Index: The built FAISS index object.
    """

    # Choose an appropriate FAISS index type based on your data and needs.
    # This example uses IndexFlatL2 for simplicity. You might explore other options
    # from the FAISS library.
    index = faiss.IndexFlatL2(source_vectors.shape[1])

    # Train the FAISS index on the source vectors
    index.add(source_vectors)

    return index

In [41]:
def analysis(document_vector, source_vectors, threshold):
    # Check if FAISS index exists
    try:
        # Assuming FAISS index is stored in a pickle file
        with open("faiss_index.pkl", "rb") as f:
            index = pickle.load(f)
    except FileNotFoundError:
        # If index doesn't exist, build it from source vectors
        print("FAISS index not found. Building a new index...")
        index = build_faiss_index(source_vectors)
        # Save the built index for future use
        with open("faiss_index.pkl", "wb") as f:
            pickle.dump(index, f)

    # Search for nearest neighbors using FAISS
    D, I = index.search(document_vector.reshape(1, -1), 1)  # Search for 1 neighbor

    most_similar_index = I.flatten()[0]
    most_similar_score = D.flatten()[0]
    most_similar_article = source_data[most_similar_index]

    is_matched = most_similar_score > threshold

    return [is_matched, most_similar_score, most_similar_article]

In [42]:
def report_results(is_matched, similarity_scores, document, most_similar_article):
    print("Analysis Results:\n")
    print(f"Similarity Score: {similarity_scores}")
    print(f"Decision: {'Match Detected' if is_matched else 'No match Detected'}")
    print(f"Article submitted: \n{document}")
    if is_matched:
        print(f"Most Similar Article:\n{most_similar_article}")
    else:
        print("\nNo evidence of similar document.")

In [43]:
# with open("abstracts_embeddings_XLNet.pkl", "rb") as f:
#     source_vectors = pickle.load(f)

In [44]:
def create_bert_vectors(data, model, tokenizer):
    vectors = []
    for text in tqdm(data[:100]):
        vector = generate_document_vector(text, model, tokenizer)
        vectors.append(vector)
        # Normalize all vectors after creation
    vectors = np.array(vectors)
    vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors

In [45]:
source_vectors = create_bert_vectors(source_data, model, tokenizer)

100%|██████████| 100/100 [01:11<00:00,  1.39it/s]


In [46]:
source_vectors = pd.DataFrame(source_vectors)

In [47]:
def run_plagiarism_checker(document, threshold=0.8):

    # Generate vector for the document
    document_vector = generate_document_vector(document, model, tokenizer)
    # Perform plagiarism analysis
    response = analysis(document_vector, source_vectors, threshold)

    # Report results and get the plagiarism decision dictionary
    report_results(response[0], response[1], document, response[2])

document_to_check = "The Bhagavad Gita, a revered Hindu scripture, unfolds as a dialogue between Lord Krishna and the warrior prince Arjuna on the battlefield of Kurukshetra. Spanning 700 verses, it encapsulates profound teachings on duty (dharma), righteousness, and the path to spiritual realization. Krishna imparts wisdom on fulfilling one's responsibilities without attachment to the results, emphasizing the pursuit of selflessness and inner harmony. Themes of devotion, discipline, and the nature of existence resonate throughout, offering guidance on navigating life's moral dilemmas and achieving spiritual enlightenment. The Gita's timeless wisdom continues to inspire seekers on the quest for deeper understanding and purpose."

run_plagiarism_checker(document_to_check, 800)

Analysis Results:

Similarity Score: 2148.67578125
Decision: Match Detected
Article submitted: 
The Bhagavad Gita, a revered Hindu scripture, unfolds as a dialogue between Lord Krishna and the warrior prince Arjuna on the battlefield of Kurukshetra. Spanning 700 verses, it encapsulates profound teachings on duty (dharma), righteousness, and the path to spiritual realization. Krishna imparts wisdom on fulfilling one's responsibilities without attachment to the results, emphasizing the pursuit of selflessness and inner harmony. Themes of devotion, discipline, and the nature of existence resonate throughout, offering guidance on navigating life's moral dilemmas and achieving spiritual enlightenment. The Gita's timeless wisdom continues to inspire seekers on the quest for deeper understanding and purpose.
Most Similar Article:
Epidemics such as viral haemorrhagic fevers, severe acute respiratory syndrome, Middle East respiratory syndrome coronavirus or yet unknown ones have few chances of 