In [None]:
!pip install nltk
!pip install rouge_score
!pip install bert-score transformers
!pip install jovian --upgrade --quiet
!pip install datasets
!pip install evaluate
!pip install rouge

##Metrics

Bleu{1,2,3,4}

In [2]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

nltk.download('punkt')

def calculate_bleu(candidate, references):
    candidate_tokens = nltk.word_tokenize(candidate)
    reference_tokens = [nltk.word_tokenize(ref) for ref in references]

    smoothing = SmoothingFunction().method1
    bleu_1 = sentence_bleu(reference_tokens, candidate_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothing)
    bleu_2 = sentence_bleu(reference_tokens, candidate_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)
    bleu_3 = sentence_bleu(reference_tokens, candidate_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothing)
    bleu_4 = sentence_bleu(reference_tokens, candidate_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)

    return {
        'BLEU-1': bleu_1,
        'BLEU-2': bleu_2,
        'BLEU-3': bleu_3,
        'BLEU-4': bleu_4
    }

candidate = "The study investigates the effect of ..."
references = ["This research explores the impact of ..."]
bleu_scores = calculate_bleu(candidate, references)
print(bleu_scores)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


{'BLEU-1': 0.42857142857142855, 'BLEU-2': 0.26726124191242434, 'BLEU-3': 0.11511121735118796, 'BLEU-4': 0.07730551756939454}


In [3]:
from rouge_score import rouge_scorer

def calculate_rouge(candidate, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    scores = scorer.score(reference, candidate)

    return {
        'ROUGE-1': scores['rouge1'].fmeasure,
        'ROUGE-2': scores['rouge2'].fmeasure,
        'ROUGE-L': scores['rougeL'].fmeasure
    }

candidate = "The study investigates the effect of ..."
reference = "This research explores the impact of ..."
rouge_scores = calculate_rouge(candidate, reference)
print(rouge_scores)

{'ROUGE-1': 0.3333333333333333, 'ROUGE-2': 0.0, 'ROUGE-L': 0.3333333333333333}


In [4]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [6]:
def compute_bertscore(candidates, references, model, tokenizer, lang='fa'):
    candidate_tokens = [tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True) for text in candidates]
    reference_tokens = [tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True) for text in references]

    candidate_embeddings = [model(tokens)[0].squeeze(0) for tokens in candidate_tokens]
    reference_embeddings = [model(tokens)[0].squeeze(0) for tokens in reference_tokens]

    P, R, F1 = bert_score.score(candidates, references, model_type="HooshvareLab/bert-fa-base-uncased", lang='fa', rescale_with_baseline=False)

    return P, R, F1

In [7]:
def get_embeddings(texts, tokenizer, model):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state
    attention_mask = inputs.attention_mask
    return embeddings, attention_mask

In [8]:
def compute_bert_score(candidate_embeddings, reference_embeddings, candidate_mask, reference_mask):
    candidate_embeddings = candidate_embeddings.cpu().numpy()
    reference_embeddings = reference_embeddings.cpu().numpy()

    candidate_mask = candidate_mask.cpu().numpy()
    reference_mask = reference_mask.cpu().numpy()

    similarities = cosine_similarity(candidate_embeddings.reshape(-1, candidate_embeddings.shape[-1]),
                                     reference_embeddings.reshape(-1, reference_embeddings.shape[-1]))

    similarities = similarities.reshape(candidate_embeddings.shape[1], reference_embeddings.shape[1])

    candidate_mask = candidate_mask[0]
    reference_mask = reference_mask[0]

    precision_scores = []
    recall_scores = []

    for i in range(candidate_embeddings.shape[1]):
        if candidate_mask[i] == 0:
            continue
        candidate_sim = similarities[i, :reference_mask.sum()]
        precision = candidate_sim.max()
        precision_scores.append(precision)

    for j in range(reference_embeddings.shape[1]):
        if reference_mask[j] == 0:
            continue
        reference_sim = similarities[:candidate_mask.sum(), j]
        recall = reference_sim.max()
        recall_scores.append(recall)

    precision = np.mean(precision_scores)
    recall = np.mean(recall_scores)
    f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1

In [9]:
references = ["I am a student in Sharif university."]
candidates = ["I am a professor in Teharan university."]

In [10]:
candidate_embeddings, candidate_mask = get_embeddings(candidates, tokenizer, model)
reference_embeddings, reference_mask = get_embeddings(references, tokenizer, model)

In [11]:
# P, R, F1 = bert_score.score(
#     cands=candidates,
#     refs=references,
#     model_type=None,
#     num_layers=None,
#     verbose=True,
#     idf=False,
#     device='cuda' if torch.cuda.is_available() else 'cpu',
#     batch_size=64,
#     lang=None,
#     return_hash=False,
#     rescale_with_baseline=False,
#     baseline_path=None,
#     use_fast_tokenizer=True,
#     custom_model=model,
#     custom_tokenizer=tokenizer
# )

In [12]:
precision, recall, f1 = compute_bert_score(candidate_embeddings, reference_embeddings, candidate_mask, reference_mask)

In [13]:
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.7919
Recall: 0.8684
F1 Score: 0.8284


##Prepare Datasets

In [14]:
import jovian

Please download the token of your kaggle account which is a file named kaggle.json and upload it to the colab's root. Then run the cell below.

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download mahbodissaiy/pubmed-publication-type

In [None]:
from zipfile import ZipFile

file = "pubmed-publication-type.zip"
with ZipFile(file, 'r') as zip:
    zip.printdir()
    print('extraction...')
    zip.extractall()
    print('Done!')

In [17]:
import pandas as pd

df = pd.read_csv('dataset_cleaned.csv')
first_row = df.head(1)
print(first_row)

   pubmed_id                                              title  \
0   37994819  Survival and mechanical complications of singl...   

                                            abstract          type  label  \
0  PURPOSE: To evaluate the survival of and incid...  case-control      0   

                                                text  
0  Title:\nSurvival and mechanical complications ...  


In [18]:
import pandas as pd

original_df = pd.read_csv('dataset_cleaned.csv')
new_df = original_df[['pubmed_id', 'abstract']]

new_df.to_csv('pmid_abstracts.csv', index=False)
print("New CSV file 'pmid_abstracts.csv' created successfully!")

New CSV file 'pmid_abstracts.csv' created successfully!


QA dataset

we chose PubMedQA dataset from this link for the QA part: https://huggingface.co/datasets/qiaojin/PubMedQA

In [None]:
from datasets import load_dataset

ds = load_dataset("qiaojin/PubMedQA", "pqa_artificial")

In [20]:
# df = pd.read_csv('dataset_cleaned.csv')
# from datasets import Dataset, DatasetDict

# # Create a Dataset from the DataFrame
# dataset = Dataset.from_pandas(df)

In [21]:
#see all the column titles of the dataset
for split in ds.keys():
    print(f"Columns in {split} split:")
    print(ds[split].column_names)

Columns in train split:
['pubid', 'question', 'context', 'long_answer', 'final_decision']


we would like to remove the 'pubid' column for the dataset that we work on, cause it's not going to be of any use for our purpose. and keep the other columns. and also rename the 'long_answer' column to just 'answer' column

In [22]:
ds = ds.remove_columns("pubid")
ds = ds.rename_column("long_answer", "answer")

for split in ds.keys():
    print(f"Columns in {split} split:")
    print(ds[split].column_names)
    print(ds[split][0])

Columns in train split:
['question', 'context', 'answer', 'final_decision']
{'question': 'Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?', 'context': {'contexts': ['Chronic rhinosinusitis (CRS) is a heterogeneous disease with an uncertain pathogenesis. Group 2 innate lymphoid cells (ILC2s) represent a recently discovered cell population which has been implicated in driving Th2 inflammation in CRS; however, their relationship with clinical disease characteristics has yet to be investigated.', 'The aim of this study was to identify ILC2s in sinus mucosa in patients with CRS and controls and compare ILC2s across characteristics of disease.', 'A cross-sectional study of patients with CRS undergoing endoscopic sinus surgery was conducted. Sinus mucosal biopsies were obtained during surgery and control tissue from patients undergoing pituitary tumour resection through transphenoidal approach. ILC2s were identified as CD45(+)

In [23]:
print(ds['train'][0]['question'])
print(ds['train'][0]['answer'])
print(ds.keys())

ds = ds['train']

Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?
As ILC2s are elevated in patients with CRSwNP, they may drive nasal polyp formation in CRS. ILC2s are also linked with high tissue and blood eosinophilia and have a potential role in the activation and survival of eosinophils during the Th2 immune response. The association of innate lymphoid cells in CRS provides insights into its pathogenesis.
dict_keys(['train'])


In [24]:
print(ds[0])
print(ds[0]['question'])

{'question': 'Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?', 'context': {'contexts': ['Chronic rhinosinusitis (CRS) is a heterogeneous disease with an uncertain pathogenesis. Group 2 innate lymphoid cells (ILC2s) represent a recently discovered cell population which has been implicated in driving Th2 inflammation in CRS; however, their relationship with clinical disease characteristics has yet to be investigated.', 'The aim of this study was to identify ILC2s in sinus mucosa in patients with CRS and controls and compare ILC2s across characteristics of disease.', 'A cross-sectional study of patients with CRS undergoing endoscopic sinus surgery was conducted. Sinus mucosal biopsies were obtained during surgery and control tissue from patients undergoing pituitary tumour resection through transphenoidal approach. ILC2s were identified as CD45(+) Lin(-) CD127(+) CD4(-) CD8(-) CRTH2(CD294)(+) CD161(+) cells in single cell

so we got rid of the extra notation of ['train'] and dataset is accessible using ds variable right now.

Later on based on the split you prefer, you can split it into 3 datasets like ds_train, ds_val and ds_test

##Retrieval Model with TF-IDF

In [25]:
import re
import nltk
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')

In [27]:
all_abstracts = new_df["abstract"].tolist()

In [28]:
contexts = all_abstracts
questions = [item["question"] for item in ds]
answers = [item["answer"] for item in ds]

In [29]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    tokens = nltk.word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [30]:
contexts_preprocessed = []
for context in tqdm(contexts, desc="Processing contexts"):
    contexts_preprocessed.append(preprocess(context))

Processing contexts: 100%|██████████| 100093/100093 [02:02<00:00, 818.63it/s]


In [31]:
# questions_preprocessed = []
# for question in tqdm(questions, desc="Processing questions"):
#     questions_preprocessed.append(preprocess(question))

In [32]:
vectorizer = TfidfVectorizer()
X_contexts = vectorizer.fit_transform(tqdm(contexts_preprocessed, desc="Vectorizing..."))

Vectorizing...: 100%|██████████| 100093/100093 [00:11<00:00, 8429.41it/s]


In [33]:
def retrieve_relevant_texts(question, top_n=5):
    question_preprocessed = preprocess(question)
    question_vector = vectorizer.transform([question_preprocessed])
    similarity_scores = cosine_similarity(question_vector, X_contexts).flatten()
    relevant_indices = similarity_scores.argsort()[-top_n:][::-1]
    relevant_texts = [contexts[i] for i in relevant_indices]
    return relevant_texts

In [34]:
question = "What is the role of ILC2s in chronic rhinosinusitis?"
relevant_texts = retrieve_relevant_texts(question)
for idx, text in enumerate(relevant_texts):
    print(f"Relevant Text {idx + 1}:\n{text}\n")

Relevant Text 1:
Atopic dermatitis (AD) is a common allergic inflammatory skin condition mainly caused by gene variants, immune disorders, and environmental risk factors. The T helper (Th) 2 immune response mediated by interleukin (IL)-4/13 is generally believed to be central in the pathogenesis of AD. It has been shown that innate lymphoid cells (ILCs) play a major effector cell role in the immune response in tissue homeostasis and inflammation and fascinating details about the interaction between innate and adaptive immunity. Changes in ILCs may contribute to the onset and progression of AD, and ILC2s especially have gained much attention. However, the role of ILCs in AD still needs to be further elucidated. This review summarizes the role of ILCs in skin homeostasis and highlights the signaling pathways in which ILCs may be involved in AD, thus providing valuable insights into the behavior of ILCs in skin homeostasis and inflammation, as well as new approaches to treating AD.

Relev

## Retrieval Model with BERT

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [None]:
context_embeddings = [get_bert_embedding(context) for context in tqdm(contexts, desc="Get embeddings of contexts...", leave=True)]

In [None]:
def retrieve_relevant_texts(question, top_n=5):
    question_embedding = get_bert_embedding(question)
    similarity_scores = cosine_similarity([question_embedding], context_embeddings).flatten()
    relevant_indices = similarity_scores.argsort()[-top_n:][::-1]
    relevant_texts = [contexts[i] for i in relevant_indices]
    return relevant_texts

In [None]:
question = "What is the role of ILC2s in chronic rhinosinusitis?"
relevant_texts = retrieve_relevant_texts(question)
for idx, text in enumerate(relevant_texts):
    print(f"Relevant Text {idx + 1}:\n{text}\n")

## Retrieval Model with USE

In [None]:
import numpy as np
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
def get_use_embedding(texts, batch_size=128):
    embeddings = []
    num_batches = len(texts) // batch_size + int(len(texts) % batch_size != 0)
    with tqdm(total=num_batches, desc="Get embeddings of contexts...", leave=True) as pbar:
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            batch_embeddings = use_model(batch_texts).numpy()
            embeddings.append(batch_embeddings)
            pbar.update(1)
    return np.vstack(embeddings)

In [None]:
context_embeddings = get_use_embedding(contexts)

In [None]:
def retrieve_relevant_texts(question, top_n=5):
    question_embedding = get_use_embedding([question])[0]
    similarity_scores = cosine_similarity([question_embedding], context_embeddings).flatten()
    relevant_indices = similarity_scores.argsort()[-top_n:][::-1]
    relevant_texts = [contexts[i] for i in relevant_indices]
    return relevant_texts

In [None]:
question = "What is the role of ILC2s in chronic rhinosinusitis?"
relevant_texts = retrieve_relevant_texts(question)
for idx, text in enumerate(relevant_texts):
    print(f"Relevant Text {idx + 1}:\n{text}\n")



Get embeddings of contexts...: 100%|██████████| 1/1 [00:00<00:00, 38.92it/s]


Relevant Text 1:
What role do female sex hormones play in the antisperm immune response?

Relevant Text 2:
Chronic wounds cause significant morbidity and mortality and cost our health care system millions of dollars each year. A major impediment to wound healing is the formation of bacterial biofilms. Biofilms are communities of bacteria associated with chronic infections. This article reviews the literature on chronic wounds and biofilms. The role of biofilms in chronic wounds is not widely known. The purpose is to increase awareness of their role and to discuss research into novel therapeutic options. PubMed searches were performed to identify publications on chronic wounds and biofilms. Biofilms contribute to chronic wound nonhealing. There is an abundance of research into novel antibiofilm strategies for chronic wounds.

Relevant Text 3:
To review current knowledge of neurobiologic mechanisms that generate and maintain chronic pain and to explain how they might be applied in target

# **Finetune T5**

In [35]:
import torch
import json
import random
from tqdm import tqdm
from torch.optim import Adam
import evaluate
import requests
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

In [None]:
TOKENIZER = T5TokenizerFast.from_pretrained("t5-small")
MODEL = T5ForConditionalGeneration.from_pretrained("t5-small", return_dict=True)
OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
Q_LEN = 256
T_LEN = 32
BATCH_SIZE = 4
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
MODEL.to(DEVICE)

In [39]:
number_of_questions_to_finetune_with = 5000
number_of_epochs_to_finetune = 5

In [40]:
indices = random.sample(range(len(questions)), number_of_questions_to_finetune_with)

finetune_questions = list()
finetune_contexts = list()
finetune_answers = list()

for i in tqdm(indices):
    finetune_questions.append(questions[i])
    relevant_texts = retrieve_relevant_texts(questions[i], 1)
    finetune_contexts.append(relevant_texts[0])
    finetune_answers.append(answers[i])

100%|██████████| 5000/5000 [25:05<00:00,  3.32it/s]


In [41]:
d = {'context': finetune_contexts, 'question':finetune_questions, 'answer':finetune_answers}
data = pd.DataFrame(d)
data.head()

Unnamed: 0,context,question,answer
0,Zinc is essential for multiple aspects of meta...,Is subtype-specific accumulation of intracellu...,This comprehensive analysis of the Zn transpor...
1,Rickettsial diseases are a group of vector-bor...,Does control of neglected tropical diseases ne...,Success in controlling these neglected tropica...
2,OBJECTIVE: To evaluate the association of mate...,Does a single preoperative dose of gabapentin ...,A single preoperative dose of gabapentin (800 ...
3,"Integrins, which consist of two non-covalently...",Does mMP-2 regulate human platelet activation ...,This work clearly demonstrates that platelet a...
4,Regulatory decisions regarding attention defic...,Do comorbid attention deficit hyperactivity di...,The findings provide evidence that there is in...


In [42]:
class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data['answer']

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        context = self.context[idx]
        answer = self.answer[idx]

        question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                                    truncation=True, pad_to_max_length=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                          truncation=True, pad_to_max_length=True, add_special_tokens=True)

        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        labels[labels == 0] = -100

        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }

In [43]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_sampler = RandomSampler(train_data.index)
val_sampler = RandomSampler(val_data.index)

qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)

train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [44]:
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0

for epoch in range(number_of_epochs_to_finetune):
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1

    #Evaluation
    MODEL.eval()
    for batch in tqdm(val_loader, desc="Validation batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        val_loss += outputs.loss.item()
        val_batch_count += 1

    print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")

Training batches: 100%|██████████| 1000/1000 [01:41<00:00,  9.84it/s]
Validation batches: 100%|██████████| 250/250 [00:23<00:00, 10.76it/s]


1/2 -> Train loss: 3.7924359402656553	Validation loss: 3.2919391651153562


Training batches: 100%|██████████| 1000/1000 [01:38<00:00, 10.11it/s]
Validation batches: 100%|██████████| 250/250 [00:24<00:00, 10.27it/s]


2/2 -> Train loss: 3.6588868844509124	Validation loss: 3.2249317502975465


Training batches: 100%|██████████| 1000/1000 [01:38<00:00, 10.11it/s]
Validation batches: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]


3/2 -> Train loss: 3.586332416534424	Validation loss: 3.1743464663823446


Training batches: 100%|██████████| 1000/1000 [01:38<00:00, 10.12it/s]
Validation batches: 100%|██████████| 250/250 [00:23<00:00, 10.52it/s]


4/2 -> Train loss: 3.53571589794755	Validation loss: 3.1314919880628587


Training batches: 100%|██████████| 1000/1000 [01:39<00:00, 10.10it/s]
Validation batches: 100%|██████████| 250/250 [00:23<00:00, 10.56it/s]

5/2 -> Train loss: 3.4968789376020433	Validation loss: 3.093811737060547





In [45]:
MODEL.save_pretrained("qa_model")
TOKENIZER.save_pretrained("qa_tokenizer")

('qa_tokenizer/tokenizer_config.json',
 'qa_tokenizer/special_tokens_map.json',
 'qa_tokenizer/spiece.model',
 'qa_tokenizer/added_tokens.json',
 'qa_tokenizer/tokenizer.json')

In [46]:
def predict_answer(context, question):
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)

    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)

    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)

    return predicted_answer

# **Use T5**

In [47]:
number_of_questions = 5000

In [48]:
indices = random.sample(range(len(questions)), number_of_questions_to_finetune_with)

questions_inference = list()
contexts_inference = list()
answers_inference = list()

for i in tqdm(indices):
    questions_inference.append(questions[i])
    relevant_texts = retrieve_relevant_texts(questions[i], 1)
    contexts_inference.append(relevant_texts[0])
    answers_inference.append(answers[i])

100%|██████████| 5000/5000 [25:14<00:00,  3.30it/s]


In [49]:
predicted = list()
labels = list()

for i, question in tqdm(enumerate(questions_inference)):
    context = contexts_inference[i]
    answer = predict_answer(context, question)
    true_answer = answers_inference[i]

    predicted.append(answer)
    labels.append(true_answer)

5000it [17:25,  4.78it/s]


In [50]:
bleu1_list = list()
bleu2_list = list()
bleu3_list = list()
bleu4_list = list()
rouge1_list = list()
rouge2_list = list()
rougel_list = list()
bert_score_p = list()
bert_score_r = list()
bert_score_f = list()

for i in tqdm(range(len(predicted))):
    current_predict = predicted[i]
    current_answer = labels[i]

    # bleu
    bleu_scores = calculate_bleu(current_predict, [current_answer])
    bleu1_list.append(bleu_scores['BLEU-1'])
    bleu2_list.append(bleu_scores['BLEU-2'])
    bleu3_list.append(bleu_scores['BLEU-3'])
    bleu4_list.append(bleu_scores['BLEU-4'])

    # rouge
    rouge_scores = calculate_rouge(current_predict, current_answer)
    rouge1_list.append(rouge_scores['ROUGE-1'])
    rouge2_list.append(rouge_scores['ROUGE-2'])
    rougel_list.append(rouge_scores['ROUGE-L'])

    # bert score
    candidate_embeddings, candidate_mask = get_embeddings([current_predict], tokenizer, model)
    reference_embeddings, reference_mask = get_embeddings([current_answer], tokenizer, model)
    precision, recall, f1 = compute_bert_score(candidate_embeddings, reference_embeddings, candidate_mask, reference_mask)
    bert_score_p.append(precision)
    bert_score_r.append(recall)
    bert_score_f.append(f1)

100%|██████████| 5000/5000 [31:48<00:00,  2.62it/s]


In [51]:
print("BLEU1:", sum(bleu1_list) / len(bleu1_list))
print("BLEU2:", sum(bleu2_list) / len(bleu2_list))
print("BLEU3:", sum(bleu3_list) / len(bleu3_list))
print("BLEU4:", sum(bleu4_list) / len(bleu4_list))
print("ROUGE1:", sum(rouge1_list) / len(rouge1_list))
print("ROUGE2:", sum(rouge2_list) / len(rouge2_list))
print("ROUGEL:", sum(rougel_list) / len(rougel_list))
print("Precision:", sum(bert_score_p) / len(bert_score_p))
print("Recall:", sum(bert_score_r) / len(bert_score_r))
print("F1:", sum(bert_score_f) / len(bert_score_f))

BLEU1: 0.06393183533876493
BLEU2: 0.03678886293706074
BLEU3: 0.024369488156062352
BLEU4: 0.016968770701503183
ROUGE1: 0.23805217618320254
ROUGE2: 0.08428510719129664
ROUGEL: 0.19363312134877123
Precision: 0.6943102435648442
Recall: 0.5923364686310292
F1: 0.6378548998159842
