In [1]:
!pip install nltk
!pip install rouge_score
!pip install bert-score transformers
!pip install jovian --upgrade --quiet
!pip install datasets

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=08b8935b97d9aa7cebfee5a65ebb44417f1396e9bf2e7b949ffd755ecc7998f7
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_runtime_cu12-

##Metrics

Bleu{1,2,3,4}

In [2]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

nltk.download('punkt')

def calculate_bleu(candidate, references):
    candidate_tokens = nltk.word_tokenize(candidate)
    reference_tokens = [nltk.word_tokenize(ref) for ref in references]

    smoothing = SmoothingFunction().method1
    bleu_1 = sentence_bleu(reference_tokens, candidate_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothing)
    bleu_2 = sentence_bleu(reference_tokens, candidate_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)
    bleu_3 = sentence_bleu(reference_tokens, candidate_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothing)
    bleu_4 = sentence_bleu(reference_tokens, candidate_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)

    return {
        'BLEU-1': bleu_1,
        'BLEU-2': bleu_2,
        'BLEU-3': bleu_3,
        'BLEU-4': bleu_4
    }

candidate = "The study investigates the effect of ..."
references = ["This research explores the impact of ..."]
bleu_scores = calculate_bleu(candidate, references)
print(bleu_scores)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


{'BLEU-1': 0.42857142857142855, 'BLEU-2': 0.26726124191242434, 'BLEU-3': 0.11511121735118796, 'BLEU-4': 0.07730551756939454}


In [3]:
from rouge_score import rouge_scorer

def calculate_rouge(candidate, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    scores = scorer.score(reference, candidate)

    return {
        'ROUGE-1': scores['rouge1'].fmeasure,
        'ROUGE-2': scores['rouge2'].fmeasure,
        'ROUGE-L': scores['rougeL'].fmeasure
    }

candidate = "The study investigates the effect of ..."
reference = "This research explores the impact of ..."
rouge_scores = calculate_rouge(candidate, reference)
print(rouge_scores)

{'ROUGE-1': 0.3333333333333333, 'ROUGE-2': 0.0, 'ROUGE-L': 0.3333333333333333}


In [4]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
def compute_bertscore(candidates, references, model, tokenizer, lang='fa'):
    candidate_tokens = [tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True) for text in candidates]
    reference_tokens = [tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True) for text in references]

    candidate_embeddings = [model(tokens)[0].squeeze(0) for tokens in candidate_tokens]
    reference_embeddings = [model(tokens)[0].squeeze(0) for tokens in reference_tokens]

    P, R, F1 = bert_score.score(candidates, references, model_type="HooshvareLab/bert-fa-base-uncased", lang='fa', rescale_with_baseline=False)

    return P, R, F1

In [7]:
def get_embeddings(texts, tokenizer, model):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state
    attention_mask = inputs.attention_mask
    return embeddings, attention_mask

In [8]:
def compute_bert_score(candidate_embeddings, reference_embeddings, candidate_mask, reference_mask):
    candidate_embeddings = candidate_embeddings.cpu().numpy()
    reference_embeddings = reference_embeddings.cpu().numpy()

    candidate_mask = candidate_mask.cpu().numpy()
    reference_mask = reference_mask.cpu().numpy()

    similarities = cosine_similarity(candidate_embeddings.reshape(-1, candidate_embeddings.shape[-1]),
                                     reference_embeddings.reshape(-1, reference_embeddings.shape[-1]))

    similarities = similarities.reshape(candidate_embeddings.shape[1], reference_embeddings.shape[1])

    candidate_mask = candidate_mask[0]
    reference_mask = reference_mask[0]

    precision_scores = []
    recall_scores = []

    for i in range(candidate_embeddings.shape[1]):
        if candidate_mask[i] == 0:
            continue
        candidate_sim = similarities[i, :reference_mask.sum()]
        precision = candidate_sim.max()
        precision_scores.append(precision)

    for j in range(reference_embeddings.shape[1]):
        if reference_mask[j] == 0:
            continue
        reference_sim = similarities[:candidate_mask.sum(), j]
        recall = reference_sim.max()
        recall_scores.append(recall)

    precision = np.mean(precision_scores)
    recall = np.mean(recall_scores)
    f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1

In [9]:
references = ["this is really bad."]
candidates = ["yes this university is really big and good."]

In [10]:
candidate_embeddings, candidate_mask = get_embeddings(candidates, tokenizer, model)
reference_embeddings, reference_mask = get_embeddings(references, tokenizer, model)

In [11]:
# P, R, F1 = bert_score.score(
#     cands=candidates,
#     refs=references,
#     model_type=None,
#     num_layers=None,
#     verbose=True,
#     idf=False,
#     device='cuda' if torch.cuda.is_available() else 'cpu',
#     batch_size=64,
#     lang=None,
#     return_hash=False,
#     rescale_with_baseline=False,
#     baseline_path=None,
#     use_fast_tokenizer=True,
#     custom_model=model,
#     custom_tokenizer=tokenizer
# )

In [12]:
precision, recall, f1 = compute_bert_score(candidate_embeddings, reference_embeddings, candidate_mask, reference_mask)

In [13]:
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.5892
Recall: 0.6949
F1 Score: 0.6377


##Prepare Datasets

In [14]:
import jovian

Please download the token of your kaggle account which is a file named kaggle.json and upload it to the colab's root. Then run the cell below.

In [15]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download mahbodissaiy/pubmed-publication-type

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/mahbodissaiy/pubmed-publication-type
License(s): Apache 2.0
Downloading pubmed-publication-type.zip to /content
 93% 57.0M/61.5M [00:01<00:00, 57.8MB/s]
100% 61.5M/61.5M [00:01<00:00, 54.7MB/s]


In [16]:
from zipfile import ZipFile

file = "pubmed-publication-type.zip"
with ZipFile(file, 'r') as zip:
    zip.printdir()
    print('extraction...')
    zip.extractall()
    print('Done!')

File Name                                             Modified             Size
dataset_cleaned.csv                            2023-11-27 08:44:48    347216761
extraction...
Done!


In [17]:
import pandas as pd

df = pd.read_csv('dataset_cleaned.csv')
first_row = df.head(1)
print(first_row)

   pubmed_id                                              title  \
0   37994819  Survival and mechanical complications of singl...   

                                            abstract          type  label  \
0  PURPOSE: To evaluate the survival of and incid...  case-control      0   

                                                text  
0  Title:\nSurvival and mechanical complications ...  


In [18]:
import pandas as pd

original_df = pd.read_csv('dataset_cleaned.csv')
new_df = original_df[['pubmed_id', 'abstract']]

new_df.to_csv('pmid_abstracts.csv', index=False)
print("New CSV file 'pmid_abstracts.csv' created successfully!")

New CSV file 'pmid_abstracts.csv' created successfully!


QA dataset

we chose PubMedQA dataset from this link for the QA part: https://huggingface.co/datasets/qiaojin/PubMedQA

In [19]:
from datasets import load_dataset

ds = load_dataset("qiaojin/PubMedQA", "pqa_artificial")

Downloading readme:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/233M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211269 [00:00<?, ? examples/s]

In [20]:
# df = pd.read_csv('dataset_cleaned.csv')
# from datasets import Dataset, DatasetDict

# # Create a Dataset from the DataFrame
# dataset = Dataset.from_pandas(df)

In [21]:
#see all the column titles of the dataset
for split in ds.keys():
    print(f"Columns in {split} split:")
    print(ds[split].column_names)

Columns in train split:
['pubid', 'question', 'context', 'long_answer', 'final_decision']


we would like to remove the 'pubid' column for the dataset that we work on, cause it's not going to be of any use for our purpose. and keep the other columns. and also rename the 'long_answer' column to just 'answer' column

In [22]:
ds = ds.remove_columns("pubid")
ds = ds.rename_column("long_answer", "answer")

for split in ds.keys():
    print(f"Columns in {split} split:")
    print(ds[split].column_names)
    print(ds[split][0])

Columns in train split:
['question', 'context', 'answer', 'final_decision']
{'question': 'Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?', 'context': {'contexts': ['Chronic rhinosinusitis (CRS) is a heterogeneous disease with an uncertain pathogenesis. Group 2 innate lymphoid cells (ILC2s) represent a recently discovered cell population which has been implicated in driving Th2 inflammation in CRS; however, their relationship with clinical disease characteristics has yet to be investigated.', 'The aim of this study was to identify ILC2s in sinus mucosa in patients with CRS and controls and compare ILC2s across characteristics of disease.', 'A cross-sectional study of patients with CRS undergoing endoscopic sinus surgery was conducted. Sinus mucosal biopsies were obtained during surgery and control tissue from patients undergoing pituitary tumour resection through transphenoidal approach. ILC2s were identified as CD45(+)

In [23]:
print(ds['train'][0]['question'])
print(ds['train'][0]['answer'])
print(ds.keys())

ds = ds['train']

Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?
As ILC2s are elevated in patients with CRSwNP, they may drive nasal polyp formation in CRS. ILC2s are also linked with high tissue and blood eosinophilia and have a potential role in the activation and survival of eosinophils during the Th2 immune response. The association of innate lymphoid cells in CRS provides insights into its pathogenesis.
dict_keys(['train'])


In [24]:
print(ds[0])
print(ds[0]['question'])

{'question': 'Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?', 'context': {'contexts': ['Chronic rhinosinusitis (CRS) is a heterogeneous disease with an uncertain pathogenesis. Group 2 innate lymphoid cells (ILC2s) represent a recently discovered cell population which has been implicated in driving Th2 inflammation in CRS; however, their relationship with clinical disease characteristics has yet to be investigated.', 'The aim of this study was to identify ILC2s in sinus mucosa in patients with CRS and controls and compare ILC2s across characteristics of disease.', 'A cross-sectional study of patients with CRS undergoing endoscopic sinus surgery was conducted. Sinus mucosal biopsies were obtained during surgery and control tissue from patients undergoing pituitary tumour resection through transphenoidal approach. ILC2s were identified as CD45(+) Lin(-) CD127(+) CD4(-) CD8(-) CRTH2(CD294)(+) CD161(+) cells in single cell

so we got rid of the extra notation of ['train'] and dataset is accessible using ds variable right now.

Later on based on the split you prefer, you can split it into 3 datasets like ds_train, ds_val and ds_test

##Retrieval Model with TF-IDF

In [25]:
import re
import nltk
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

In [26]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [27]:
all_abstracts = new_df["abstract"].tolist()

In [28]:
contexts = all_abstracts
questions = [item["question"] for item in ds]
answers = [item["answer"] for item in ds]

In [29]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    tokens = nltk.word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [30]:
contexts_preprocessed = []
for context in tqdm(contexts, desc="Processing contexts"):
    contexts_preprocessed.append(preprocess(context))

Processing contexts: 100%|██████████| 100093/100093 [02:11<00:00, 760.75it/s] 


In [31]:
# questions_preprocessed = []
# for question in tqdm(questions, desc="Processing questions"):
#     questions_preprocessed.append(preprocess(question))

In [32]:
vectorizer = TfidfVectorizer()
X_contexts = vectorizer.fit_transform(tqdm(contexts_preprocessed, desc="Vectorizing..."))

Vectorizing...: 100%|██████████| 100093/100093 [00:12<00:00, 8077.19it/s]


In [33]:
def retrieve_relevant_texts(question, top_n=5):
    question_preprocessed = preprocess(question)
    question_vector = vectorizer.transform([question_preprocessed])
    similarity_scores = cosine_similarity(question_vector, X_contexts).flatten()
    relevant_indices = similarity_scores.argsort()[-top_n:][::-1]
    relevant_texts = [contexts[i] for i in relevant_indices]
    return relevant_texts

In [34]:
question = "What is the role of ILC2s in chronic rhinosinusitis?"
relevant_texts = retrieve_relevant_texts(question)
for idx, text in enumerate(relevant_texts):
    print(f"Relevant Text {idx + 1}:\n{text}\n")

Relevant Text 1:
Atopic dermatitis (AD) is a common allergic inflammatory skin condition mainly caused by gene variants, immune disorders, and environmental risk factors. The T helper (Th) 2 immune response mediated by interleukin (IL)-4/13 is generally believed to be central in the pathogenesis of AD. It has been shown that innate lymphoid cells (ILCs) play a major effector cell role in the immune response in tissue homeostasis and inflammation and fascinating details about the interaction between innate and adaptive immunity. Changes in ILCs may contribute to the onset and progression of AD, and ILC2s especially have gained much attention. However, the role of ILCs in AD still needs to be further elucidated. This review summarizes the role of ILCs in skin homeostasis and highlights the signaling pathways in which ILCs may be involved in AD, thus providing valuable insights into the behavior of ILCs in skin homeostasis and inflammation, as well as new approaches to treating AD.

Relev

## Retrieval Model with BERT

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [None]:
context_embeddings = [get_bert_embedding(context) for context in tqdm(contexts, desc="Get embeddings of contexts...", leave=True)]

In [None]:
def retrieve_relevant_texts(question, top_n=5):
    question_embedding = get_bert_embedding(question)
    similarity_scores = cosine_similarity([question_embedding], context_embeddings).flatten()
    relevant_indices = similarity_scores.argsort()[-top_n:][::-1]
    relevant_texts = [contexts[i] for i in relevant_indices]
    return relevant_texts

In [None]:
question = "What is the role of ILC2s in chronic rhinosinusitis?"
relevant_texts = retrieve_relevant_texts(question)
for idx, text in enumerate(relevant_texts):
    print(f"Relevant Text {idx + 1}:\n{text}\n")

## Retrieval Model with USE

In [None]:
import numpy as np
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
def get_use_embedding(texts, batch_size=128):
    embeddings = []
    num_batches = len(texts) // batch_size + int(len(texts) % batch_size != 0)
    with tqdm(total=num_batches, desc="Get embeddings of contexts...", leave=True) as pbar:
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            batch_embeddings = use_model(batch_texts).numpy()
            embeddings.append(batch_embeddings)
            pbar.update(1)
    return np.vstack(embeddings)

In [None]:
context_embeddings = get_use_embedding(contexts)

In [None]:
def retrieve_relevant_texts(question, top_n=5):
    question_embedding = get_use_embedding([question])[0]
    similarity_scores = cosine_similarity([question_embedding], context_embeddings).flatten()
    relevant_indices = similarity_scores.argsort()[-top_n:][::-1]
    relevant_texts = [contexts[i] for i in relevant_indices]
    return relevant_texts

In [None]:
question = "What is the role of ILC2s in chronic rhinosinusitis?"
relevant_texts = retrieve_relevant_texts(question)
for idx, text in enumerate(relevant_texts):
    print(f"Relevant Text {idx + 1}:\n{text}\n")



Get embeddings of contexts...: 100%|██████████| 1/1 [00:00<00:00, 38.92it/s]


Relevant Text 1:
What role do female sex hormones play in the antisperm immune response?

Relevant Text 2:
Chronic wounds cause significant morbidity and mortality and cost our health care system millions of dollars each year. A major impediment to wound healing is the formation of bacterial biofilms. Biofilms are communities of bacteria associated with chronic infections. This article reviews the literature on chronic wounds and biofilms. The role of biofilms in chronic wounds is not widely known. The purpose is to increase awareness of their role and to discuss research into novel therapeutic options. PubMed searches were performed to identify publications on chronic wounds and biofilms. Biofilms contribute to chronic wound nonhealing. There is an abundance of research into novel antibiofilm strategies for chronic wounds.

Relevant Text 3:
To review current knowledge of neurobiologic mechanisms that generate and maintain chronic pain and to explain how they might be applied in target

# **Use T5**

In [35]:
number_of_questions = 1000

In [36]:
import random
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

In [37]:
indices = random.sample(range(len(questions)), number_of_questions)

question_and_contexts = list()
answers_for_questions = list()

for i in tqdm(indices):
    question_and_contexts_sample = list()
    question_and_contexts_sample.append(questions[i])
    relevant_texts = retrieve_relevant_texts(questions[i], 3)
    question_and_contexts_sample += relevant_texts
    question_and_contexts.append(question_and_contexts_sample)

    answers_for_questions.append(answers[i])

100%|██████████| 1000/1000 [05:18<00:00,  3.14it/s]


In [38]:
model_name = "t5-small"
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [39]:
device = "cuda" if torch.cuda.is_available() else "cpu"
t5_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

# Type 1

In [40]:
def generate_answer(question, passages):
    context = " ".join(passages)

    input_text = f" question: {question} \n
    context: {context} \n
    Extract the specific information from the documents to answer the question."
    input_ids = t5_tokenizer.encode(input_text, return_tensors='pt')

    output_ids = t5_model.generate(input_ids.to(device))
    answer = t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return answer

In [41]:
predicted = list()
labels = list()

for i, question_and_context in tqdm(enumerate(question_and_contexts)):
    question = question_and_context[0]
    context = question_and_context[1:]
    answer = generate_answer(question, context)
    true_answer = answers_for_questions[i]

    predicted.append(answer)
    labels.append(true_answer)

0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (755 > 512). Running this sequence through the model will result in indexing errors
1000it [03:48,  4.38it/s]


In [42]:
bleu1_list = list()
bleu2_list = list()
bleu3_list = list()
bleu4_list = list()
rouge1_list = list()
rouge2_list = list()
rougel_list = list()
bert_score_p = list()
bert_score_r = list()
bert_score_f = list()

for i in tqdm(range(len(predicted))):
    current_predict = predicted[i]
    current_answer = labels[i]

    # bleu
    bleu_scores = calculate_bleu(current_predict, [current_answer])
    bleu1_list.append(bleu_scores['BLEU-1'])
    bleu2_list.append(bleu_scores['BLEU-2'])
    bleu3_list.append(bleu_scores['BLEU-3'])
    bleu4_list.append(bleu_scores['BLEU-4'])

    # rouge
    rouge_scores = calculate_rouge(current_predict, current_answer)
    rouge1_list.append(rouge_scores['ROUGE-1'])
    rouge2_list.append(rouge_scores['ROUGE-2'])
    rougel_list.append(rouge_scores['ROUGE-L'])

    # bert score
    candidate_embeddings, candidate_mask = get_embeddings([current_predict], tokenizer, model)
    reference_embeddings, reference_mask = get_embeddings([current_answer], tokenizer, model)
    precision, recall, f1 = compute_bert_score(candidate_embeddings, reference_embeddings, candidate_mask, reference_mask)
    bert_score_p.append(precision)
    bert_score_r.append(recall)
    bert_score_f.append(f1)

100%|██████████| 1000/1000 [06:19<00:00,  2.64it/s]


In [44]:
print("BLEU1:", sum(bleu1_list) / len(bleu1_list))
print("BLEU2:", sum(bleu2_list) / len(bleu2_list))
print("BLEU3:", sum(bleu3_list) / len(bleu3_list))
print("BLEU4:", sum(bleu4_list) / len(bleu4_list))
print("ROUGE1:", sum(rouge1_list)/ len(rouge1_list))
print("ROUGE2:", sum(rouge2_list)/ len(rouge2_list))
print("ROUGEL:", sum(rougel_list)/ len(rougel_list))
print("Precision:", sum(bert_score_p)/ len(bert_score_p))
print("Recall:", sum(bert_score_r)/ len(bert_score_r))
print("F1:", sum(bert_score_f)/ len(bert_score_f))

BLEU1: 0.026920952792401857
BLEU2: 0.008312231074948091
BLEU3: 0.004804650617853295
BLEU4: 0.003384005643098116
ROUGE1: 0.09283019591564814
ROUGE2: 0.009782179275831327
ROUGEL: 0.07576765940690952
Precision: 0.5821063544154167
Recall: 0.47323058877885343
F1: 0.5170889470785149


# Type 2

In [45]:
def generate_answer(question, passages):
    context = " ".join(passages)

    input_text = f"question: {question} \ncontext: {context} \nInfer the relationships between the provided context to answer the question."
    input_ids = t5_tokenizer.encode(input_text, return_tensors='pt')

    output_ids = t5_model.generate(input_ids.to(device))
    answer = t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return answer

In [46]:
predicted = list()
labels = list()

for i, question_and_context in tqdm(enumerate(question_and_contexts)):
    question = question_and_context[0]
    context = question_and_context[1:]
    answer = generate_answer(question, context)
    true_answer = answers_for_questions[i]

    predicted.append(answer)
    labels.append(true_answer)

1000it [03:50,  4.33it/s]


In [47]:
bleu1_list = list()
bleu2_list = list()
bleu3_list = list()
bleu4_list = list()
rouge1_list = list()
rouge2_list = list()
rougel_list = list()
bert_score_p = list()
bert_score_r = list()
bert_score_f = list()

for i in tqdm(range(len(predicted))):
    current_predict = predicted[i]
    current_answer = labels[i]

    # bleu
    bleu_scores = calculate_bleu(current_predict, [current_answer])
    bleu1_list.append(bleu_scores['BLEU-1'])
    bleu2_list.append(bleu_scores['BLEU-2'])
    bleu3_list.append(bleu_scores['BLEU-3'])
    bleu4_list.append(bleu_scores['BLEU-4'])

    # rouge
    rouge_scores = calculate_rouge(current_predict, current_answer)
    rouge1_list.append(rouge_scores['ROUGE-1'])
    rouge2_list.append(rouge_scores['ROUGE-2'])
    rougel_list.append(rouge_scores['ROUGE-L'])

    # bert score
    candidate_embeddings, candidate_mask = get_embeddings([current_predict], tokenizer, model)
    reference_embeddings, reference_mask = get_embeddings([current_answer], tokenizer, model)
    precision, recall, f1 = compute_bert_score(candidate_embeddings, reference_embeddings, candidate_mask, reference_mask)
    bert_score_p.append(precision)
    bert_score_r.append(recall)
    bert_score_f.append(f1)

100%|██████████| 1000/1000 [06:17<00:00,  2.65it/s]


In [48]:
print("BLEU1:", sum(bleu1_list) / len(bleu1_list))
print("BLEU2:", sum(bleu2_list) / len(bleu2_list))
print("BLEU3:", sum(bleu3_list) / len(bleu3_list))
print("BLEU4:", sum(bleu4_list) / len(bleu4_list))
print("ROUGE1:", sum(rouge1_list) / len(rouge1_list))
print("ROUGE2:", sum(rouge2_list) / len(rouge2_list))
print("ROUGEL:", sum(rougel_list) / len(rougel_list))
print("Precision:", sum(bert_score_p)/ len(bert_score_p))
print("Recall:", sum(bert_score_r)/ len(bert_score_r))
print("F1:", sum(bert_score_f)/ len(bert_score_f))

BLEU1: 0.028548312094317366
BLEU2: 0.008812903034173127
BLEU3: 0.005093416510420871
BLEU4: 0.0035937901140651513
ROUGE1: 0.09472447872765954
ROUGE2: 0.010510662648064373
ROUGEL: 0.07736508984524958
Precision: 0.5819518983364105
Recall: 0.4762447394281626
F1: 0.5187487831532032


# Type 3

In [49]:
def generate_answer(question, passages):
    context = " ".join(passages)

    input_text = f"question: {question} \ncontext: {context} \nSummarize the information relevant to the question from the provided context."
    input_ids = t5_tokenizer.encode(input_text, return_tensors='pt')

    output_ids = t5_model.generate(input_ids.to(device))
    answer = t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return answer

In [50]:
predicted = list()
labels = list()

for i, question_and_context in tqdm(enumerate(question_and_contexts)):
    question = question_and_context[0]
    context = question_and_context[1:]
    answer = generate_answer(question, context)
    true_answer = answers_for_questions[i]

    predicted.append(answer)
    labels.append(true_answer)

1000it [03:41,  4.51it/s]


In [51]:
bleu1_list = list()
bleu2_list = list()
bleu3_list = list()
bleu4_list = list()
rouge1_list = list()
rouge2_list = list()
rougel_list = list()
bert_score_p = list()
bert_score_r = list()
bert_score_f = list()

for i in tqdm(range(len(predicted))):
    current_predict = predicted[i]
    current_answer = labels[i]

    # bleu
    bleu_scores = calculate_bleu(current_predict, [current_answer])
    bleu1_list.append(bleu_scores['BLEU-1'])
    bleu2_list.append(bleu_scores['BLEU-2'])
    bleu3_list.append(bleu_scores['BLEU-3'])
    bleu4_list.append(bleu_scores['BLEU-4'])

    # rouge
    rouge_scores = calculate_rouge(current_predict, current_answer)
    rouge1_list.append(rouge_scores['ROUGE-1'])
    rouge2_list.append(rouge_scores['ROUGE-2'])
    rougel_list.append(rouge_scores['ROUGE-L'])

    # bert score
    candidate_embeddings, candidate_mask = get_embeddings([current_predict], tokenizer, model)
    reference_embeddings, reference_mask = get_embeddings([current_answer], tokenizer, model)
    precision, recall, f1 = compute_bert_score(candidate_embeddings, reference_embeddings, candidate_mask, reference_mask)
    bert_score_p.append(precision)
    bert_score_r.append(recall)
    bert_score_f.append(f1)

100%|██████████| 1000/1000 [06:32<00:00,  2.55it/s]


In [53]:
print("BLEU1:", sum(bleu1_list) / len(bleu1_list))
print("BLEU2:", sum(bleu2_list) / len(bleu2_list))
print("BLEU3:", sum(bleu3_list) / len(bleu3_list))
print("BLEU4:", sum(bleu4_list) / len(bleu4_list))
print("ROUGE1:", sum(rouge1_list) / len(rouge1_list))
print("ROUGE2:", sum(rouge2_list) / len(rouge2_list))
print("ROUGEL:", sum(rougel_list)/ len(rougel_list))
print("Precision:", sum(bert_score_p)/ len(bert_score_p))
print("Recall:", sum(bert_score_r)/ len(bert_score_r))
print("F1:", sum(bert_score_f)/ len(bert_score_f))

BLEU1: 0.028367366585201656
BLEU2: 0.008709118286199705
BLEU3: 0.005044054411642594
BLEU4: 0.0035602085034886943
ROUGE1: 0.09495104983441416
ROUGE2: 0.01057050021726608
ROUGEL: 0.07793305259097985
Precision: 0.582078006118536
Recall: 0.47585565365850924
F1: 0.5186290635793053
