---
## Dependencies

In [1]:
!pip install -q pinecone

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/427.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.3/427.3 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.7/87.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import json

import pandas as pd
import torch
from tqdm.auto import tqdm
from pinecone import Pinecone, ServerlessSpec

---
## Load and Prepare Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
with open('/content/drive/MyDrive/ir_project_dataset/training13b.json', 'rb') as f:
  data_dict = json.load(f)

In [5]:
data_dict['questions'][0]['snippets'][0]

{'offsetInBeginSection': 131,
 'offsetInEndSection': 358,
 'text': 'Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes',
 'beginSection': 'abstract',
 'document': 'http://www.ncbi.nlm.nih.gov/pubmed/15829955',
 'endSection': 'abstract'}

In [6]:
contexts = []

for q in data_dict['questions']:
  for context in q['snippets']:
    contexts.append(dict(context_text=context['text']))

In [7]:
contexts[0]

{'context_text': 'Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes'}

In [8]:
contexts_df = pd.DataFrame(contexts)
contexts_df.head(10)

Unnamed: 0,context_text
0,Hirschsprung disease (HSCR) is a multifactoria...
1,"In this study, we review the identification of..."
2,"Coding sequence mutations in e.g. RET, GDNF, E..."
3,For almost all of the identified HSCR genes in...
4,Hirschsprung disease (HSCR) is a multifactori...
5,The inheritance of Hirschsprung disease is ge...
6,Hirschsprung disease (HSCR) is a multifactoria...
7,"Differential contributions of rare and common,..."
8,BACKGROUND: RET is the major gene associated t...
9,In the etiology of Hirschsprung disease variou...


In [9]:
contexts_df.shape

(68860, 1)

---
## Retriever Model

Sentence Embeddings: https://huggingface.co/flax-sentence-embeddings/all_datasets_v3_mpnet-base

In [10]:
from sentence_transformers import SentenceTransformer

# set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# load the retriever model from huggingface model hub
retriever = SentenceTransformer("flax-sentence-embeddings/all_datasets_v3_mpnet-base", device=device)
retriever

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.85k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

---
## Initialize Pinecone

In [11]:
with open('/content/drive/MyDrive/pinecone/pinecone_api_key.txt', 'r') as f:
  API_KEY = f.read().strip()

In [12]:
pc = Pinecone(api_key=API_KEY)

In [13]:
index_name = "bioasq-question-answering"

In [14]:
import time

# check if index already exists
if index_name not in pc.list_indexes().names():
  # if does not exist, create index
  pc.create_index(name=index_name,
                  dimension=768,
                  metric='cosine',
                  spec=ServerlessSpec(cloud='aws',
                                      region='us-east-1'))
  # wait for index to be initialized
  while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 68860}},
 'total_vector_count': 68860}

---
## Create Embeddings and Upsert/Insert to Pinecone

In [15]:
# we will use batches of 64
batch_size = 64

for i in tqdm(range(0, len(contexts_df), batch_size)):
  # find end of batch
  i_end = min(i+batch_size, len(contexts_df))
  # extract batch
  batch = contexts_df.iloc[i:i_end]
  # generate embeddings for batch
  emb = retriever.encode(batch["context_text"].tolist()).tolist()
  # get metadata
  meta = batch.to_dict(orient="records")
  # create unique IDs
  ids = [f"{idx}" for idx in range(i, i_end)]
  # add all to upsert list
  to_upsert = list(zip(ids, emb, meta))
  # upsert/insert these records to pinecone
  _ = index.upsert(vectors=to_upsert)

# check that we have all vectors in index
index.describe_index_stats()

  0%|          | 0/1076 [00:00<?, ?it/s]

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 68860}},
 'total_vector_count': 68860}

In [16]:
def query_pinecone(query, top_k):
  # generate embeddings for the query
  xq = retriever.encode([query]).tolist()
  # search pinecone index for context passage with the answer
  xc = index.query(vector=xq, top_k=top_k, include_metadata=True)
  return xc

In [17]:
query = "Is RANKL secreted from the cells?"
result = query_pinecone(query, top_k=1)
result

{'matches': [{'id': '50',
              'metadata': {'context_text': 'Activated human T cells express '
                                           'alternative mRNA transcripts '
                                           'encoding a secreted form of '
                                           'RANKL.'},
              'score': 0.553318918,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

---
## Generator Model

bart-lfqa model: https://huggingface.co/vblagoje/bart_lfqa

In [18]:
from transformers import BartTokenizer, BartForConditionalGeneration

# load bart tokenizer and model from huggingface
tokenizer = BartTokenizer.from_pretrained('vblagoje/bart_lfqa')
generator = BartForConditionalGeneration.from_pretrained('vblagoje/bart_lfqa').to(device)

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [19]:
def format_query(query, context):
  # extract context_text from Pinecone search result and add the  tag
  # inputs to the model should be formated -> question: question context: <P> context <P> context...
  context = [f"<P> {m['metadata']['context_text']}" for m in context]
  # concatinate all context passages
  context = " ".join(context)
  # contcatinate the query and context passages
  query = f"question: {query} context: {context}"
  return query

In [20]:
query = format_query(query, result["matches"])
print(query)

question: Is RANKL secreted from the cells? context: <P> Activated human T cells express alternative mRNA transcripts encoding a secreted form of RANKL.


In [21]:
def generate_answer(query):
  # tokenize the query to get input_ids
  inputs = tokenizer([query], max_length=1024, truncation=True, return_tensors="pt").to(device)
  # use generator to predict output ids
  ids = generator.generate(inputs["input_ids"], num_beams=2, min_length=20, max_length=40)
  # use tokenizer to decode the output ids
  answer = tokenizer.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
  return answer

In [22]:
query = "Is the protein Papilin secreted?"
context = query_pinecone(query, top_k=5)
query = format_query(query, context["matches"])
print(generate_answer(query))
# print(model_pipeline(query))

Papilin is a glycoprotein, meaning it is an extracellular matrix glycoprotein. It is secreted into the medium by hemocytes.


In [23]:
for doc in context["matches"]:
  print(doc["metadata"]["context_text"], end='\n---\n')

Papilins are extracellular matrix proteins 
---
Papilin is an extracellular matrix glycoprotein 
---
Ghrelin is a 28-amino acid peptide secreted mainly by the stomach.
---
 Collagen IV, laminin, glutactin, papilin, and other extracellular matrix proteins were made primarily by hemocytes and were secreted into the medium. 
---
Ghrelin is a gut peptide composed of 28 amino acids mostly secreted in the gastric fundus mucosa.
---


In [24]:
query = "Are long non coding RNAs spliced?"
context = query_pinecone(query, top_k=5)
query = format_query(query, context["matches"])
print(generate_answer(query))

Yes, long non coding RNAs (LNCRNAs) can be spliced, but they are not normally spliced. The reason for this is that LNCRNAs are not


In [25]:
for doc in context["matches"]:
  print(doc["metadata"]["context_text"], end='\n---\n')

For alternative exons and long noncoding RNAs, splicing tends to occur later, and the latter might remain unspliced in some cases.
---
 Long noncoding RNAs (lncRNAs) are more than 200 nucleotides in length and lack transcriptional ability.
---
Natural SINEUPs suggest that embedded Transposable Elements may represent functional domains in long non-coding RNAs
---


---
## Evaluation: Compute Lexical Similarity with ROUGE Metric

ROUGE metric: https://medium.com/@eren9677/text-summarization-387836c9e178

In [26]:
questions_and_answers = []

for q in data_dict['questions']:
  questions_and_answers.append(dict(question=q['body'],
                                    answer=q['ideal_answer'][0]))

qna_df = pd.DataFrame(questions_and_answers)
qna_df.head(10)

Unnamed: 0,question,answer
0,Is Hirschsprung disease a mendelian or a multi...,"Coding sequence mutations in RET, GDNF, EDNRB,..."
1,List signaling molecules (ligands) that intera...,The 7 known EGFR ligands are: epidermal growt...
2,Is the protein Papilin secreted?,"Yes, papilin is a secreted protein"
3,Are long non coding RNAs spliced?,Long non coding RNAs appear to be spliced thro...
4,Is RANKL secreted from the cells?,Receptor activator of nuclear factor κB ligand...
5,Does metformin interfere thyroxine absorption?,No. There are not reported data indicating tha...
6,Which miRNAs could be used as potential biomar...,"miR-200a, miR-100, miR-141, miR-200b, miR-200c..."
7,Which acetylcholinesterase inhibitors are used...,Pyridostigmine and neostygmine are acetylcholi...
8,Has Denosumab (Prolia) been approved by FDA?,"Yes, Denosumab was approved by the FDA in 2010."
9,List the human genes encoding for the dishevel...,DVL-1\nDVL-2\nDVL-3


In [27]:
valid_df = qna_df.sample(frac=0.1, random_state=42)
valid_df.shape

(539, 2)

In [28]:
valid_questions = valid_df['question'].tolist()

valid_questions[:5]

['What is small-activating RNA?',
 'Can the CEP290 gene mutations be targeted by AAV-mediated gene therapy?',
 'Is there an approved vaccine against Helicobacter pylori?',
 'Which company produces Glybera?',
 'Cerliponase alfa is apprived for treatment of which disease?']

In [29]:
valid_df['answer'].tolist()[:5]

['small activating RNAs are double stranded RNAs (dsRNAs) that target gene promoters and trigger gene activation.',
 'The large size of the CEP290 gene prevents its use in adeno-associated virus (AAV)-mediated gene augmentation therapy.',
 'No, there is no approved vaccine against Helicobacter pylori.',
 'Glybera is a product of Chiesi Pharma.',
 'Cerliponase alfa is a recombinant human tripeptidyl peptidase-1 (TPP1) approved for use in patients with neuronal ceroid lipofuscinosis type 2 (CLN2), a paediatric neurodegenerative disease caused by a deficiency in TPP1.']

In [30]:
predicted_answers = []
for question in tqdm(valid_questions, leave=True):
  context = query_pinecone(question, top_k=5)
  query = format_query(question, context["matches"])
  answer = generate_answer(query)
  predicted_answers.append(answer)

  0%|          | 0/539 [00:00<?, ?it/s]

In [31]:
predicted_answers[:5]

['Small-activating RNA is a type of RNA that can bind to mRNA and inhibit its translation. It is a type of RNA that can bind to mRNA and inhibit its translation. It is',
 'The CEP290 gene is a very large gene, so it would be very difficult to target it with a gene therapy. The CEP290 gene is also very large, so it would',
 'There is an approved vaccine against Helicobacter pylori, but it is not a vaccine against Helicobacter pylori itself. It is a vaccine against the bacterium that causes',
 'Glybera is produced by Bayer, a subsidiary of Bayer. Bayer is a subsidiary of Bayer. Bayer is a subsidiary of Bayer. Bayer is a subsidiary of Bayer. Bayer is a',
 'Cerliponase Alfa is a recombinant human tripeptidyl peptidase-1 (TPP1) being developed by BioMarin Pharmaceutical Inc. for use in']

In [32]:
valid_df['predicted_answer'] = predicted_answers
valid_df.head(10)

Unnamed: 0,question,answer,predicted_answer
2810,What is small-activating RNA?,small activating RNAs are double stranded RNAs...,Small-activating RNA is a type of RNA that can...
2316,Can the CEP290 gene mutations be targeted by A...,The large size of the CEP290 gene prevents its...,"The CEP290 gene is a very large gene, so it wo..."
5085,Is there an approved vaccine against Helicobac...,"No, there is no approved vaccine against Helic...",There is an approved vaccine against Helicobac...
3084,Which company produces Glybera?,Glybera is a product of Chiesi Pharma.,"Glybera is produced by Bayer, a subsidiary of ..."
3039,Cerliponase alfa is apprived for treatment of ...,Cerliponase alfa is a recombinant human tripep...,Cerliponase Alfa is a recombinant human tripep...
3282,What is the SLC25A20 protein transporting?,The carnitine/acylcarnitine transporter (CACT;...,It's a protein that is involved in the transpo...
964,RTS S AS01 vaccine was developed to prevent wh...,"RTS,S/AS01 vaccine was developed for preventio...","The RTS,S/AS01 vaccine was developed to preven..."
3159,What is the price of KYMRIAH treatment in 2019?,"Kymriah, produced by Novartis has a price tag ...",Kymriah is currently being developed by Novart...
491,Which are the characteristics of Andersen syn...,the characteristics of Andersen syndrome are a...,I'm not sure if this is the right subreddit to...
4563,What are the currently FDA approved monoclonal...,The US Food and Drug Administration approved M...,I'm not sure if this is what you're looking fo...


In [33]:
import string, re

def normalize_text(text):
  regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) # remove articles -> a|an|the
  text = " ".join(re.sub(regex, " ", text.lower().strip()))
  text = "".join(ch for ch in text if ch not in set(string.punctuation)) # remove punctuations
  return text

In [34]:
ideal_answers = [normalize_text(text) for text in valid_df['answer'].tolist()]
predicted_answers = [normalize_text(text) for text in valid_df['predicted_answer'].tolist()]

### Use ROUGE metric from `evaluate` library

In [40]:
!pip install -q evaluate
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=bfc394c7bfa5f2e8d175031ea3655ce153f5d708ad4d9df2e9eec16bf56b9e92
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [41]:
import evaluate

In [42]:
rouge = evaluate.load('rouge')
# bleu = evaluate.load('bleu')

In [43]:
rouge_scores = []
# bleu_scores = []

for pred_answer, ideal_answer in zip(predicted_answers, ideal_answers):
  rouge_scores.append(rouge.compute(predictions=[pred_answer], references=[ideal_answer]))
  # bleu_scores.append(bleu.compute(predictions=[pred_answer], references=[ideal_answer]))

In [44]:
rouge_scores[0]

{'rouge1': 0.669683257918552,
 'rouge2': 0.36529680365296796,
 'rougeL': 0.4977375565610861,
 'rougeLsum': 0.4977375565610861}

In [45]:
rouge1_scores_eval = [score['rouge1'] for score in rouge_scores]
avg_rouge1_score_eval = sum(rouge1_scores_eval) / len(rouge1_scores_eval)
avg_rouge1_score_eval

0.6267222988867304