In [240]:
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    BertTokenizer,
    BertModel,
    AutoTokenizer,
    AutoModel,
    RobertaTokenizer,
    RobertaModel,
    XLNetTokenizer,
    XLNetModel,
    MarianMTModel,
    MarianTokenizer,
)
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import f1_score
from langdetect import detect
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch

## **Function to load and preprocess data**


In [250]:
def load_and_preprocess_data(filepath):
    """Load the abstracts column from a CSV file.

    Parameters
    ----------
    filepath : str or path-like
        Location of a CSV file that contains an ``abstract`` column.

    Returns
    -------
    pandas.Series
        The non-null abstracts, re-indexed ``0..n-1``.

    Raises
    ------
    KeyError
        If the CSV has no ``abstract`` column.
    """
    data = pd.read_csv(filepath)
    # dropna() keeps the original row labels, leaving gaps wherever an abstract
    # was missing. Downstream code pairs abstracts with embedding vectors by
    # position, so give the series a contiguous index where label == position.
    abstracts = data["abstract"].dropna().reset_index(drop=True)
    return abstracts

In [251]:
# Load the abstracts that the checker compares submitted documents against.
# The path is relative to the notebook's working directory.
source_data = load_and_preprocess_data(filepath="../dataset/data.csv")

  data = pd.read_csv(filepath)


In [310]:
# Spot-check a single abstract from the loaded data.
source_data[10]

'Background. The epidemiology of pediatric febrile illness is shifting in sub-Saharan Africa, but malaria remains a major cause of childhood morbidity and mortality. The present study describes causes of febrile illness in hospitalized children in Ghana and aims to determine the burden of malaria coinfections and their association with parasite densities.\nMethods. In a prospective study, children (aged ≥30 days and ≤15 years) with fever ≥38.0°C were recruited after admission to the pediatric ward of a primary hospital in Ghana. Malaria parasitemia was determined and blood, stool, urine, respiratory, and cerebrospinal fluid specimens were screened for parasitic, bacterial, and viral pathogens. Associations of Plasmodium densities with other pathogens were calculated.\nResults. From November 2013 to April 2015, 1238 children were enrolled from 4169 admissions. A clinical/microbiological diagnosis could be made in 1109/1238 (90%) patients, with Plasmodium parasitemia (n = 728/1238 [59%])

## **Function to generate document vector**


In [303]:
def generate_document_vector(text, model, tokenizer, max_chunk_length=512, overlap=50):
    """Embed a document of arbitrary length as one mean-pooled vector.

    The text is tokenized once (special tokens included) and the token ids are
    split into windows of at most ``max_chunk_length`` ids, consecutive windows
    sharing ``overlap`` ids. Each window is run through ``model``; a token that
    falls into two windows gets the average of its two hidden states. The
    document vector is the mean over all token embeddings.

    Parameters
    ----------
    text : str
        Document to embed.
    model : callable
        Encoder whose output exposes ``last_hidden_state`` with shape
        ``(1, chunk_len, hidden_size)``.
    tokenizer : object
        Tokenizer providing ``encode(text, add_special_tokens=True)``.
    max_chunk_length : int, default 512
        Maximum number of token ids fed to the model at once.
    overlap : int, default 50
        Number of token ids shared by consecutive windows.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``hidden_size``.

    Raises
    ------
    ValueError
        If ``max_chunk_length`` is not positive or ``overlap`` is not smaller
        than ``max_chunk_length`` (the window would never advance).
    """
    step = max_chunk_length - overlap
    if max_chunk_length <= 0 or step <= 0:
        raise ValueError(
            "max_chunk_length must be positive and overlap must be smaller "
            "than max_chunk_length"
        )

    # Chunk the ids directly; re-tokenizing decoded text is lossy and redundant.
    token_ids = tokenizer.encode(text, add_special_tokens=True)
    num_tokens = len(token_ids)
    if num_tokens == 0:
        # Nothing to embed: return a zero vector instead of a NaN mean.
        return torch.zeros(model.config.hidden_size).numpy()

    # Feed inputs on whatever device the model currently lives on (another
    # cell may have moved it to the GPU); fall back to CPU if unknown.
    device = getattr(model, "device", None)
    if device is None:
        device = torch.device("cpu")

    token_sums = None  # allocated once the hidden size is known
    token_counts = torch.zeros((num_tokens, 1))

    for start in range(0, num_tokens, step):
        chunk_ids = token_ids[start : start + max_chunk_length]
        end = start + len(chunk_ids)

        with torch.no_grad():
            outputs = model(torch.tensor([chunk_ids], device=device))
        chunk_states = outputs.last_hidden_state[0].detach().cpu().float()

        if token_sums is None:
            token_sums = torch.zeros((num_tokens, chunk_states.shape[-1]))

        # Accumulate at the chunk's true token offset (not the chunk index) so
        # that every token position receives its own embedding.
        token_sums[start:end] += chunk_states
        token_counts[start:end] += 1

        if end >= num_tokens:
            # The remaining windows would lie entirely inside this one.
            break

    # Average overlapping contributions per token, then pool over tokens.
    token_embeddings = token_sums / token_counts
    document_vector = token_embeddings.mean(dim=0).numpy()
    return document_vector

In [304]:
# def generate_document_vector(text, model, tokenizer, max_length=512, device=None):
#     if device is None:
#         device = "cuda" if torch.cuda.is_available() else "cpu"

#     # Move the model to the specified device
#     model.to(device)

#     # Set the model to evaluation mode
#     model.eval()

#     with torch.no_grad():
#         # Tokenize the text
#         input_ids = tokenizer.encode(
#             text, add_special_tokens=True, truncation=True
#         )

#         # Truncate or pad to the specified max length
#         input_ids = input_ids[:max_length]
#         input_ids += [tokenizer.pad_token_id] * (max_length - len(input_ids))

#         # Move the input to the specified device
#         input_ids = torch.tensor([input_ids]).to(device)

#         # Generate document vector
#         document_vector = model(input_ids)[0].mean(dim=1).squeeze().detach().cpu().numpy()

#     return document_vector


## **Function to create vectors using BERT**


In [253]:
# def create_bert_vectors(data, model, tokenizer):
#     vectors = []
#     for text in tqdm(data[:100]):
#         vector = generate_document_vector(text, model, tokenizer)
#         vectors.append(vector)
#     return vectors

In [287]:
def create_source_vectors(
    data, model, tokenizer, max_length=512, device=None, limit=5000
):
    """Embed each source text as a mean-pooled vector of the model's output.

    Parameters
    ----------
    data : sequence of str
        Source texts; anything that supports slicing and iteration.
    model : torch.nn.Module
        Encoder whose first output is the per-token hidden states.
    tokenizer : object
        Tokenizer providing ``encode`` with ``max_length`` / ``truncation``.
    max_length : int, default 512
        Texts are truncated to this many token ids before encoding.
    device : str or torch.device, optional
        Where to run the model. Defaults to ``"cuda"`` when available,
        otherwise ``"cpu"``.
    limit : int or None, default 5000
        Only the first ``limit`` texts are embedded; ``None`` embeds all.

    Returns
    -------
    list of numpy.ndarray
        One 1-D vector per embedded text, in input order.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Move the model to the target device and disable dropout for inference.
    model.to(device)
    model.eval()

    texts = data if limit is None else data[:limit]
    vectors = []

    with torch.no_grad():
        for text in tqdm(texts):
            # Anything beyond max_length token ids is dropped.
            input_ids = tokenizer.encode(
                text, add_special_tokens=True, max_length=max_length, truncation=True
            )
            input_ids = torch.tensor([input_ids]).to(device)

            # Mean-pool the token states; squeeze only the batch axis.
            hidden_states = model(input_ids)[0]
            vector = hidden_states.mean(dim=1).squeeze(0).detach().cpu().numpy()
            vectors.append(vector)

    return vectors

## **Function to evaluate**


## Cosine Similarity


In [255]:
def analysis(document_vector, source_vectors, source_data, threshold):
    """Find the source text whose vector is most similar to the document's.

    Parameters
    ----------
    document_vector : array-like, shape (dim,)
        Embedding of the document being checked.
    source_vectors : sequence of array-like, shape (n, dim)
        Embeddings of the source texts.
    source_data : pandas.Series or sequence of str
        Source texts, in the same order as ``source_vectors``.
    threshold : float
        A best score strictly above this value counts as a match.

    Returns
    -------
    list
        ``[is_matched, most_similar_score, most_similar_article]``.

    Raises
    ------
    ValueError
        If ``source_vectors`` is empty.
    """
    if len(source_vectors) == 0:
        raise ValueError("source_vectors is empty; nothing to compare against")

    similarity_scores = cosine_similarity([document_vector], source_vectors)

    most_similar_index = int(np.argmax(similarity_scores[0]))
    most_similar_score = similarity_scores[0][most_similar_index]

    # argmax yields a position. A pandas Series indexed with [] looks up by
    # label, which differs from position once rows have been dropped, so use
    # positional access whenever the container offers it.
    if hasattr(source_data, "iloc"):
        most_similar_article = source_data.iloc[most_similar_index]
    else:
        most_similar_article = source_data[most_similar_index]

    is_matched = bool(most_similar_score > threshold)

    return [is_matched, most_similar_score, most_similar_article]

## BLEU Score


In [256]:
# def analysis(document, source_data, threshold):
#     highest_bleu_score = 0
#     most_similar_article = None
#     is_plagiarism = False

#     # Tokenize the candidate text as it's required for BLEU score calculation
#     candidate_tokens = document.split()

#     for source_text in source_data:
#         # Tokenize the source text
#         source_tokens = [source_text.split()]  # BLEU expects a list of tokenized reference texts

#         # Calculate the BLEU score between candidate text and this source text
#         bleu_score = sentence_bleu(source_tokens, candidate_tokens)

#         # Update if this is the highest BLEU score so far and check against threshold
#         if bleu_score > highest_bleu_score:
#             highest_bleu_score = bleu_score
#             most_similar_article = source_text
#             is_plagiarism = bleu_score > threshold

#     return [is_plagiarism, highest_bleu_score, most_similar_article]

## **Function to report results**


In [257]:
def report_results(is_matched, similarity_scores, document, most_similar_article):
    """Print a plain-text summary of one similarity check.

    The report shows the score, the match decision and the submitted text.
    The most similar article is shown only when a match was detected;
    otherwise a "no evidence" line is printed instead.
    """
    if is_matched:
        decision = "Match Detected"
        closing = f"Most Similar Article:\n{most_similar_article}"
    else:
        decision = "No match Detected"
        closing = "\nNo evidence of similar document."

    report_lines = [
        "Analysis Results:\n",
        f"Similarity Score: {similarity_scores}",
        f"Decision: {decision}",
        f"Article submitted: \n{document}",
        closing,
    ]
    print("\n".join(report_lines))

## **Different Models**


In [258]:
# Load BERT model and tokenizer
# model = BertModel.from_pretrained("bert-base-uncased")
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Active configuration: load the SciBERT tokenizer and encoder from the
# `allenai/scibert_scivocab_uncased` checkpoint. `run_checker` reads these
# module-level `tokenizer` and `model` names, so switching to one of the
# commented-out alternatives in this cell changes which encoder it uses.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")


# Load Roberta model and tokenizer
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# model = RobertaModel.from_pretrained('roberta-base')


# Load XLNet model and tokenizer
# tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# model = XLNetModel.from_pretrained('xlnet-base-cased')

## **Source vector generation**


In [259]:
# Generate vectors for source data.
# `create_bert_vectors` only exists as commented-out code in this notebook, so
# calling it raises NameError on a fresh kernel; `create_source_vectors` is the
# live implementation with the same (data, model, tokenizer) call shape.
source_vectors = create_source_vectors(source_data, model, tokenizer)

100%|██████████| 5000/5000 [42:02<00:00,  1.98it/s]


## **Save Embeddings**


In [260]:
import pickle

# Write the source vectors to disk as a pickle file.
with open("embeddings_sciBERT.pkl", "wb") as embeddings_file:
    pickle.dump(source_vectors, embeddings_file)

## **Language Translation**


In [261]:
# Loaded (model, tokenizer) pairs keyed by checkpoint name, so that repeated
# translations for the same language pair do not reload the weights.
_TRANSLATION_MODELS = {}


def translate_text(input_text, source_lang, target_lang):
    """Translate text with a Helsinki-NLP OPUS-MT (Marian) checkpoint.

    Parameters
    ----------
    input_text : str
        Text to translate.
    source_lang : str
        Source language code used in the checkpoint name (e.g. ``"fr"``).
    target_lang : str
        Target language code used in the checkpoint name (e.g. ``"en"``).

    Returns
    -------
    str
        The translated text with special tokens removed.

    Notes
    -----
    The input is truncated to the tokenizer's maximum length, so the tail of
    a very long document is not translated. Loading fails if no checkpoint
    named ``Helsinki-NLP/opus-mt-{source_lang}-{target_lang}`` exists.
    """
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

    # Load the pre-trained translation model and tokenizer once per checkpoint.
    if model_name not in _TRANSLATION_MODELS:
        _TRANSLATION_MODELS[model_name] = (
            MarianMTModel.from_pretrained(model_name),
            MarianTokenizer.from_pretrained(model_name),
        )
    model, tokenizer = _TRANSLATION_MODELS[model_name]

    # Tokenize (truncating over-long inputs the model cannot accept) and translate.
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    translation_ids = model.generate(input_ids)

    # Decode the translated text.
    translated_text = tokenizer.decode(translation_ids[0], skip_special_tokens=True)

    return translated_text

## **Check Language**


In [262]:
def check_language(text):
    """Return the detected language code of ``text``, or ``"unknown"``.

    Detection is best-effort: any ordinary error raised by the detector
    yields ``"unknown"``. Interpreter-level signals such as
    ``KeyboardInterrupt`` are not swallowed.
    """
    try:
        return detect(text)
    except Exception:
        return "unknown"

## **Main function to run the checker**


In [305]:
def run_checker(document, threshold=0.9):
    """Check one document against the source corpus and print a report.

    Uses the module-level ``model``, ``tokenizer``, ``source_vectors`` and
    ``source_data`` defined in earlier cells.

    Parameters
    ----------
    document : str
        Text to check.
    threshold : float, default 0.9
        Similarity score above which the document counts as a match.

    Returns
    -------
    None
        The outcome is printed by ``report_results``.
    """
    # Check language of the document
    document_language = check_language(document)

    # Translate non-English documents to English for consistency. When
    # detection failed ("unknown") there is no translation checkpoint to load,
    # so the text is analysed as-is instead of crashing.
    if document_language not in ("en", "unknown"):
        document = translate_text(document, document_language, "en")

    # Generate vector for the document
    document_vector = generate_document_vector(document, model, tokenizer)

    # Perform analysis
    response = analysis(document_vector, source_vectors, source_data, threshold)
    # response = analysis(document, source_data, threshold)

    # Report results
    is_matched, similarity_score, most_similar_article = response
    report_results(is_matched, similarity_score, document, most_similar_article)

## **Run the checker**


In [306]:
# Example run: check a single passage against the source vectors using the
# default similarity threshold of run_checker.
document_to_check = "The Bhagavad Gita, a revered Hindu scripture, unfolds as a dialogue between Lord Krishna and the warrior prince Arjuna on the battlefield of Kurukshetra. Spanning 700 verses, it encapsulates profound teachings on duty (dharma), righteousness, and the path to spiritual realization. Krishna imparts wisdom on fulfilling one's responsibilities without attachment to the results, emphasizing the pursuit of selflessness and inner harmony. Themes of devotion, discipline, and the nature of existence resonate throughout, offering guidance on navigating life's moral dilemmas and achieving spiritual enlightenment. The Gita's timeless wisdom continues to inspire seekers on the quest for deeper understanding and purpose."
run_checker(document_to_check)

Analysis Results:

Similarity Score: 0.9515116214752197
Decision: Match Detected
Article submitted: 
House flies, Musca domestica L. (Diptera: Muscidae), were examined for their ability to harbor and transmit Newcastle disease virus (family Paramyxoviridae, genus Avulavirus, NDV) by using a mesogenic NDV strain. Laboratory-reared flies were experimentally exposed to NDV (Roakin strain) by allowing flies to imbibe an inoculum consisting of chicken embryo-propagated virus. NDV was detected in dissected crops and intestinal tissues from exposed flies for up to 96 and 24 h postexposure, respectively; no virus was detected in crops
Most Similar Article:
House flies, Musca domestica L. (Diptera: Muscidae), were examined for their ability to harbor and transmit Newcastle disease virus (family Paramyxoviridae, genus Avulavirus, NDV) by using a mesogenic NDV strain. Laboratory-reared flies were experimentally exposed to NDV (Roakin strain) by allowing flies to imbibe an inoculum consisting of c