# Question 3

## Imports and functions

In [15]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, list_datasets
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
import pandas as pd
warnings.filterwarnings("ignore")

In [3]:
# Pipeline functions (based on tutorial 3)
def load_and_embedd_dataset(
        dataset_name: str = 'cnn_dailymail',
        split: str = 'train',
        model: "SentenceTransformer | None" = None,
        text_field: str = 'highlights',
        rec_num: int = 400,
        shuffle: bool = False,
        shuffle_seed: int = 3435
) -> tuple:
    """
    Load a dataset and embed its text field using a sentence-transformer model.

    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding. When omitted, 'all-MiniLM-L6-v2'
            is loaded on first use (not at function-definition time).
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to embed (the first `rec_num` rows,
            taken after the optional shuffle)
        shuffle: Whether to shuffle the dataset before selecting the rows
        shuffle_seed: The seed passed to the shuffle, for reproducibility
    Returns:
        tuple: (dataset, embeddings) - the whole (optionally shuffled) dataset
        and the embeddings of its first `rec_num` rows
    """
    from datasets import load_dataset

    if model is None:
        # Built lazily: a model instance used as a default argument would be
        # loaded when the function is defined, even if every caller passes its own.
        model = SentenceTransformer('all-MiniLM-L6-v2')

    print("Loading and embedding the dataset")

    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)
    if shuffle:
        dataset = dataset.shuffle(seed=shuffle_seed)

    # Slice the rows first so that only `rec_num` texts are materialised,
    # instead of reading the whole column and discarding most of it.
    texts = dataset[:rec_num][text_field]
    embeddings = model.encode(texts)

    print("Done!")
    return dataset, embeddings


def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        api_key: "str | None" = None,
        cloud: str = "aws",
        region: str = "us-east-1",
):
    """
    Create a pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
        api_key: The Pinecone API key. When omitted, the notebook-level
            `PINECONE_API_KEY` variable is used.
        cloud: The cloud provider for the serverless index
        region: The cloud region for the serverless index
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs

    Note:
        An index that already exists is reused as is; its dimension and metric
        are not compared with the arguments.
    """
    from pinecone import Pinecone, ServerlessSpec

    if api_key is None:
        # Backward-compatible fallback to the key loaded in the notebook.
        api_key = PINECONE_API_KEY

    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=api_key)
    existing_indexes = {index_info["name"] for index_info in pc.list_indexes()}
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )
    print("Done!")
    return pc


def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'highlights',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object (e.g. the result of `pc.Index(name)`)
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata; row i of `embeddings`
            is paired with `dataset[text_field][i]`
        text_field: The dataset field holding the text; also used as the
            metadata key under which the text is stored
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    Raises:
        ValueError: If the dataset has fewer texts than there are embeddings
    """
    print("Upserting the embeddings to the Pinecone index...")
    n_vectors = embeddings.shape[0]

    # Only the texts that have an embedding are needed; the dataset may be
    # (much) longer than the embedded prefix.
    texts = dataset[text_field][:n_vectors]
    if len(texts) < n_vectors:
        # zip() would otherwise drop the unmatched vectors without any warning.
        raise ValueError(
            f"Got {n_vectors} embeddings but only {len(texts)} '{text_field}' entries"
        )

    # create list of (id, vector, metadata) tuples to be upserted
    to_upsert = [
        (str(i), vector, {text_field: text})
        for i, (vector, text) in enumerate(zip(embeddings, texts))
    ]

    for start in tqdm(range(0, n_vectors, batch_size)):
        # Slicing past the end is safe, so the last batch may simply be shorter.
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index


def augment_prompt(
        query: str,
        model: "SentenceTransformer | None" = None,
        index=None,
        text_field: str = 'text',
        top_k: int = 3,
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The embedding model used to encode the query; it should be the
            model the index was built with. When omitted, 'all-MiniLM-L6-v2'
            is loaded on first use (not at function-definition time).
        index: The vectorstore object (required)
        text_field: The metadata key under which the document text was stored
            (the `text_field` that was passed to `upsert_vectors`)
        top_k: The number of nearest documents to retrieve
    Returns:
        tuple: (augmented_prompt, source_knowledge) - the prompt to send to the
        LLM and the retrieved contexts joined by blank lines
    Raises:
        ValueError: If no index is given
    """
    if index is None:
        raise ValueError("augment_prompt requires a vector index (got index=None)")
    if model is None:
        # Built lazily: a model instance used as a default argument would be
        # loaded when the function is defined, even if every caller passes its own.
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # The query vector is converted to plain Python floats before being sent.
    query_vector = [float(val) for val in model.encode(query)]

    # get the top results from the knowledge base; only the metadata is used,
    # so the stored vector values are not requested
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata'][text_field] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer as short as you can (if possible, in one word) to the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - mention it in the answer, don't use the contexts and answer on basis of your knowledge.
    Query: {query}"""
    return augmented_prompt, source_knowledge

## The RAG pipeline
Based on tutorial 3

### Loading, embedding and indexing

In [4]:
# Read the service credentials from local text files instead of hardcoding them in the notebook.
# Surrounding whitespace (e.g. a trailing newline) is stripped from each key.
with open("cohere_key.txt") as cohere_key_file:
    COHERE_API_KEY = cohere_key_file.read().strip()
with open("pinecone_key.txt") as pinecone_key_file:
    PINECONE_API_KEY = pinecone_key_file.read().strip()

In [5]:
# Embedding model: the best model of up to ~0.5 GB according to the Hugging Face leaderboard
# (ranked 31st among all models).
# NOTE(review): trust_remote_code=True presumably lets the model repository supply its own
# Python code when loading - confirm against the sentence-transformers docs and consider
# pinning a model revision.
EMBEDDING_MODEL = 'Alibaba-NLP/gte-base-en-v1.5'
model = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)

In [6]:
# Document reading and embedding (no need for chunking - see Appendix).
# The dataset is shuffled with the helper's default seed and the first 1000 shuffled
# records are embedded; `dataset` is the dataset object returned by the helper.
# NOTE(review): the records previewed further below start with "<answer> ... <context> ...",
# so the gold answer appears to be part of every embedded and indexed text - confirm this is
# intended, since it makes the retrieval-augmented answers easier than in a real setting.
DATASET_NAME = 'kroshan/BioASQ'

dataset, embeddings = load_and_embedd_dataset(
    dataset_name=DATASET_NAME,
    rec_num=1000,
    model=model,
    text_field='text',
    shuffle=True
)

Loading and embedding the dataset
Done!


In [7]:
# Sanity check: the shape is (number of embedded records, embedding dimension).
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (1000, 768)


In [8]:
# Create the vector database (only if an index with this name does not exist yet).
# The index dimension is taken from the embeddings, so it matches the embedding model's output size.
# `pc` is the Pinecone client object returned by the helper, not the index itself.
INDEX_NAME = 'bioask'
pc = create_pinecone_index(INDEX_NAME, embeddings.shape[1])

Creating a Pinecone index...
Done!


In [9]:
# Upsert the embeddings to the Pinecone index.
# text_field='text' stores every document under the 'text' metadata key, which is the key
# augment_prompt reads back at query time.
# upsert_vectors returns the index object it was given, so `index_upserted` is `index`.
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset, text_field='text')

Upserting the embeddings to the Pinecone index...


100%|█████████████████████████████████████████████| 8/8 [00:06<00:00,  1.27it/s]


In [10]:
# Inspect the index statistics to confirm how many vectors it now holds.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

### Retrieval and augmentation

In [11]:
# Convert the (shuffled) dataset to pandas for inspection; `pd_dataset` is reused in the Appendix.
pd_dataset = dataset.to_pandas()

# Number of records to preview.
PREVIEW_ROWS = 100

# The "show everything" display options are scoped to this preview only, so they do not
# silently change how every later DataFrame in the notebook is rendered.
with pd.option_context(
        'display.max_colwidth', None,
        'display.width', None,
        'display.max_rows', None,
):
    # A bare expression inside a `with` block is not rendered, hence the explicit display().
    display(pd_dataset.head(PREVIEW_ROWS))  # show some random records (the dataset was shuffled)

Unnamed: 0,question,text
0,Which molecule is targeted by the drug Gevokizumab?,"<answer> IL-1β <context> Effects of gevokizumab on glycemia and inflammatory markers in type 2 diabetes. OBJECTIVE: Metabolic activation of the innate immune system governed by interleukin (IL)-1β contributes to β-cell failure in type 2 diabetes. Gevokizumab is a novel, human-engineered monoclonal anti-IL-1β antibody. We evaluated the safety and biological activity of gevokizumab in patients with type 2 diabetes. RESEARCH DESIGN AND METHODS: In a placebo-controlled, dose-escalation study, a total of 98 patients were randomly assigned to placebo (17 subjects) or gevokizumab (81 subjects) at increasing doses and dosing schedules. The primary objective of the study was to evaluate the safety profile of gevokizumab in type 2 diabetes. The secondary objectives were to assess pharmacokinetics for different dose levels, routes of administration, and regimens and to assess biological activity. RESULTS: The study drug was well tolerated with no serious adverse events. There was one hypoglycemic event whereupon concomitant insulin treatment had to be reduced. Clearance of gevokizumab was consistent with that for a human IgG(2), with a half-life of 22 days. In the combined intermediate-dose group (single doses of 0.03 and 0.1 mg/kg), the mean placebo-corrected decrease in glycated hemoglobin was 0.11, 0.44, and 0.85% after 1, 2 (P = 0.017), and 3 (P = 0.049) months, respectively, along with enhanced C-peptide secretion, increased insulin sensitivity, and a reduction in C-reactive protein and spontaneous and inducible cytokines. CONCLUSIONS: This novel IL-1β-neutralizing antibody improved glycemia, possibly via restored insulin production and action, and reduced inflammation in patients with type 2 diabetes. This therapeutic agent may be able to be used on a once-every-month or longer schedule."
1,Which clotting factor is inhibited by betrixaban?,"<answer> xa <context> Oral and parenteral anticoagulants: new kids on the block. Well-documented drawbacks of traditional anticoagulants have lead to the quest for an ideal anticoagulant resulting in a surge of novel anticoagulant molecules. These newer agents directly target specific steps in coagulation cascade and include newer low molecular weight heparins (adomiparin), ultra low molecular weight heparins (semuloparin, RO-14), inhibitors of activated factor II (dabigatran, AZD0837), X (rivaroxaban, apixaban, edoxaban, betrixaban), IX (REG1,2), XI (antisense oligonucleotides, BMS 262084, clavatadine A), VII/tissue factor (tifacogin, PCI 274836, and BMS 593214), V (recomodulin, solulin), VIII (TB402), dual thrombin/factor X inhibitors (EP21709, tanogitran), and newer vitamin K antagonists (tecarfarin). Direct thrombin inhibitors and Factor X inhibitors are the most clinically advanced. This article discusses the recent advances in the development of novel targets of anticoagulants. Medline, EMBASE, cochrane database, medscape, SCOPUS, and clinicaltrials.gov were searched using terms ""anticoagulants"", ""blood coagulation inhibitors"", ""anticoagulants and venous thromboembolism"", ""anticoagulants and atrial fibrillation"", and ""'antithrombins."" Journal articles published from 2007 to 2012 discussing pharmacology and/or clinical trials were screened."
2,Against which protein is the antibody used for immonostaining of Lewy bodies raised?,"<answer> alpha-synuclein <context> Nigral and cortical Lewy bodies and dystrophic nigral neurites in Parkinson's disease and cortical Lewy body disease contain alpha-synuclein immunoreactivity. A mutation in the alpha-synuclein gene has recently been linked to some cases of familial Parkinson's disease (PD). We characterized the expression of this presynaptic protein in the midbrain, striatum, and temporal cortex of control, PD, and dementia with Lewy bodies (DLB) brain. Control brain showed punctate pericellular immunostaining. PD brain demonstrated alpha-synuclein immunoreactivity in nigral Lewy bodies, pale bodies and abnormal neurites. Rare neuronal soma in PD brain were immunoreactive for alpha-synuclein. DLB cases demonstrated these findings as well as alpha-synuclein immunoreactivity in cortical Lewy bodies and CA2-3 neurites. These results suggest that, even in sporadic cases, there is an early and direct role for alpha-synuclein in the pathogenesis of PD and the neuropathologically related disorder DLB."
3,"What tyrosine kinase, involved in a Philadelphia- chromosome positive chronic myelogenous leukemia, is the target of Imatinib (Gleevec)?","<answer> Bcr-Abl <context> Drug responses of imatinib mesylate-resistant cells: synergism of imatinib with other chemotherapeutic drugs. Imatinib mesylate (STI571, Glivec, Gleevec) is a powerful inhibitor of the tyrosine kinase activity of Bcr-Abl, the oncoprotein responsible for chronic myeloid leukemia (CML). The drug shows great efficacy in chronic phase, but is less effective in maintaining hematologic remissions in blast crisis patients. Our group has previously described several cell lines made resistant to imatinib. We now examine the question of cross-resistance to other chemotherapeutic drugs used in CML. Four paired imatinib-sensitive/resistant CML cell lines were assessed by caspase-3 and MTS assays for their proliferative response to cytosine arabinoside (Ara-C), daunorubicin (DNR), homoharringtonine (HHT) and hydroxyurea (HU), either alone or in combination with imatinib. Primary blasts from advanced-stage CML patients refractory to imatinib therapy were studied by semi-solid media clonogenic assays. We found that these drugs are generally capable of major inhibition of proliferation of the CML cell lines, although differential responses to DNR and HHT were noted between some sensitive and resistant cell line pairs, implying that resistance to imatinib may confer a growth advantage under such conditions. The four drugs were also effective in preventing the formation of progenitor cell colonies from CML patients both before treatment with imatinib, and after relapse on the drug. Isobolographic analysis implied that these drugs will generally combine well with imatinib, and in some cases will be synergistic. We conclude that Ara-C, DNR or HHT, either alone or in combination with imatinib, are likely to be the best therapeutic alternatives in the management of patients who become resistant to imatinib monotherapy."
4,The drug JTV519 is derivative of which group of chemical compounds?,"<answer> benzothiazepine <context> JTV-519, a novel cardioprotective agent, improves the contractile recovery after ischaemia-reperfusion in coronary perfused guinea-pig ventricular muscles. A newly synthesized benzothiazepine derivative, JTV-519 (JT) has been reported to be cardioprotective. However, the precise mechanism underlying the cardioprotective effect of this drug is unknown. Coronary-perfused guinea-pig ventricular muscles were subjected to 20-min no-flow ischaemia followed by 60-min reperfusion (I/R). I/R significantly decreased the contraction in untreated preparations (control group, 34+/-4% of baseline value, n=6). Brief administration of JT (1.0 microM) prior to ischaemia significantly improved the postischaemic contractile recovery (63+/-5% of baseline value, n=4), as compared to the control group. JT (1.0 microM) slightly prolonged action potential duration before ischaemia and induced conduction disturbance (2 : 1 block) after the initiation of ischaemia. The cardioprotective effect of JT was antagonized by chelerythrine (CH, 5.0 microM), an inhibitor of protein kinase C (PKC) or by 5-hydroxydecanoic acid (5-HD, 400 microM), an inhibitor of mitochondrial ATP-sensitive K(+) (K(ATP)) channels. These results suggest that the protective effect of JT is due to the opening of mitochondrial K(ATP) channels, which, in turn, is linked to PKC activation."
5,Mutation of which gene is associated with McLeod syndrome?,"<answer> XK <context> McLeod syndrome: life-long neuropsychiatric disorder due to a novel mutation of the XK gene. A 50-year-old man presented with worsening, virtually lifelong, chorea and progressive behavioural disturbance, involving disinhibition and hoarding, over 10 years. Clinical assessment revealed chorea, dysarthria, areflexia, an inappropriately jovial, impulsive manner and neuropsychological evidence of frontosubcortical dysfunction. Investigation results included an elevated creatine kinase, caudate atrophy and hypoperfusion, acanthocytes in the peripheral blood and the McLeod phenotype. DNA studies demonstrated a single-base deletion at position 172 in exon 1 of the XK gene, giving rise to a premature stop codon at position 129 in exon 2."
6,What is the synonym of the lubag disease?,"<answer> x-linked dystonia-parkinsonism <context> Rare causes of dystonia parkinsonism. The list of genetic causes of syndromes of dystonia parkinsonism grows constantly. As a consequence, the diagnosis becomes more and more challenging for the clinician. Here, we summarize the important causes of dystonia parkinsonism including autosomal-dominant, recessive, and x-linked forms. We cover dopa-responsive dystonia, Wilson's disease, Parkin-, PINK1-, and DJ-1-associated parkinsonism (PARK2, 6, and 7), x-linked dystonia-parkinsonism/Lubag (DYT3), rapid-onset dystonia-parkinsonism (DYT12) and DYT16 dystonia, the syndromes of Neurodegeneration with Brain Iron Accumulation (NBIA) including pantothenate kinase (PANK2)- and PLA2G6 (PARK14)-associated neurodegeneration, neuroferritinopathy, Kufor-Rakeb disease (PARK9) and the recently described SENDA syndrome; FBXO7-associated neurodegeneration (PARK15), autosomal-recessive spastic paraplegia with a thin corpus callosum (SPG11), and dystonia parkinsonism due to mutations in the SLC6A3 gene encoding the dopamine transporter. They have in common that in all these syndromes there may be a combination of dystonic and parkinsonian features, which may be complicated by pyramidal tract involvement. The aim of this review is to familiarize the clinician with the phenotypes of these disorders."
7,Which is the transcript responsible for X-chromosome inactivation?,"<answer> Xist <context> X-inactivation: quantitative predictions of protein interactions in the Xist network. The transcriptional silencing of one of the female X-chromosomes is a finely regulated process that requires accumulation in cis of the long non-coding RNA X-inactive-specific transcript (Xist) followed by a series of epigenetic modifications. Little is known about the molecular machinery regulating initiation and maintenance of chromosomal silencing. Here, we introduce a new version of our algorithm catRAPID to investigate Xist associations with a number of proteins involved in epigenetic regulation, nuclear scaffolding, transcription and splicing processes. Our method correctly identifies binding regions and affinities of protein interactions, providing a powerful theoretical framework for the study of X-chromosome inactivation and other events mediated by ribonucleoprotein associations."
8,What is the name of Bruton's tyrosine kinase inhibitor that can be used for treatment of chronic lymphocytic leukemia?,"<answer> ibrutinib <context> Targeting the microenvironment in chronic lymphocytic leukemia is changing the therapeutic landscape. PURPOSE OF REVIEW: Despite ongoing efforts to decipher the cancer genome, discoveries of new targetable genetic lesions within cancer cells are rare. Therefore, alternative approaches are needed. Signals from the microenvironment are increasingly recognized as drivers of disease progression in hematologic and solid cancers. Consequently, there is growing interest in targeting the tumor-microenvironment cross-talk. This review highlights recent therapeutic advances in targeting the microenvironment in chronic lymphocytic leukemia (CLL). RECENT FINDINGS: CLL is the poster child for microenvironment-dependent malignancies, because the clonal CLL B cells are highly dependent on external signals for maintenance and expansion. These pathways recapitulate those responsible for normal B-cell expansion in germinal centers. The most prominent, conserved mechanism is B-cell receptor (BCR) signaling, which promotes CLL cell survival and expansion in lymphatic tissue areas designated proliferation centers. BCR signaling now can be targeted by new targeted kinase inhibitors. SUMMARY: Small molecule inhibitors of BCR signaling kinases, Bruton's tyrosine kinase (Btk) inhibitor ibrutinib and the phosphoinositide 3'-kinase delta (PI3Kδ) inhibitor GS-1101, are currently transforming the landscape of CLL therapy. This development exemplifies that the microenvironment has become a lively successful area of translational research."
9,Which is the process that Conserved noncoding elements mostly regulate?,"<answer> development <context> Disruption of long-distance highly conserved noncoding elements in neurocristopathies. One of the key discoveries of vertebrate genome sequencing projects has been the identification of highly conserved noncoding elements (CNEs). Some characteristics of CNEs include their high frequency in mammalian genomes, their potential regulatory role in gene expression, and their enrichment in gene deserts nearby master developmental genes. The abnormal development of neural crest cells (NCCs) leads to a broad spectrum of congenital malformation(s), termed neurocristopathies, and/or tumor predisposition. Here we review recent findings that disruptions of CNEs, within or at long distance from the coding sequences of key genes involved in NCC development, result in neurocristopathies via the alteration of tissue- or stage-specific long-distance regulation of gene expression. While most studies on human genetic disorders have focused on protein-coding sequences, these examples suggest that investigation of genomic alterations of CNEs will provide a broader understanding of the molecular etiology of both rare and common human congenital malformations."


In [12]:
# Anecdotal examples: the same queries are sent to the bare LLM (no retrieval), and each
# response is printed next to the expected answer for comparison.
queries = [
    "The drug JTV519 is derivative of which group of chemical compounds?",
    "What is the synonym of the lubag disease?",
    "Which medication should be administered when managing patients with suspected acute opioid overdose?",
]
answers = [
    "benzothiazepine",
    "x-linked dystonia-parkinsonism",
    "naloxone",
]

# Cohere client, reused by the retrieval-augmented cell below.
co = cohere.Client(api_key=COHERE_API_KEY)

for i, (query, answer) in enumerate(zip(queries, answers)):
    response = co.chat(model='command-r-plus', message=query)
    print(f"{i+1}.")
    print(response.text)
    print(f"\nCorrect answer: {answer}\n")

1.
The drug JTV519 is a derivative of the chemical compound group called benzopyran. Specifically, it is a member of the chroman family, which is a type of benzopyran. Benzopyrans are characterized by a fused ring structure consisting of a six-membered benzene ring and a pyran ring, which is a five-membered ring containing four carbon atoms and one oxygen atom.

JTV519, also known as khellin, is naturally occurring and can be found in a plant called Ammi visnaga, which is native to the Mediterranean region. It has been studied for its potential therapeutic effects, particularly in the treatment of cardiovascular diseases and certain types of cancer.

The chemical structure of JTV519 includes a chroman ring system, which is a type of benzopyran. Modifications to the benzopyran structure through substitution of various functional groups at specific positions on the rings lead to the creation of derivatives with potentially altered biological activities.

In summary, JTV519 is a derivativ

In [13]:
# The same anecdotal queries, now answered with retrieval-augmented prompts.
# `queries`, `answers` and the Cohere client `co` are reused from the baseline cell above
# rather than copy-pasted, so both runs are guaranteed to use identical inputs.
for i, (query, answer) in enumerate(zip(queries, answers)):
    # Retrieve the closest documents for the query and wrap them into the prompt.
    augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
    response = co.chat(
            model='command-r-plus',
            message=augmented_prompt,
        )
    print(f"{i+1}.")
    print(response.text)
    print(f"\nCorrect answer: {answer}\n")

1.
Benzothiazepine.

Correct answer: benzothiazepine

2.
X-linked dystonia-parkinsonism.

Correct answer: x-linked dystonia-parkinsonism

3.
Naloxone.

Correct answer: naloxone



## Appendix

In [14]:
# Print the first 50 documents in full, to judge their length and structure by eye.
for row_label in range(50):
    document_text = pd_dataset.loc[row_label, 'text']
    print(document_text, '\n')

<answer> IL-1β <context> Effects of gevokizumab on glycemia and inflammatory markers in type 2 diabetes. OBJECTIVE: Metabolic activation of the innate immune system governed by interleukin (IL)-1β contributes to β-cell failure in type 2 diabetes. Gevokizumab is a novel, human-engineered monoclonal anti-IL-1β antibody. We evaluated the safety and biological activity of gevokizumab in patients with type 2 diabetes. RESEARCH DESIGN AND METHODS: In a placebo-controlled, dose-escalation study, a total of 98 patients were randomly assigned to placebo (17 subjects) or gevokizumab (81 subjects) at increasing doses and dosing schedules. The primary objective of the study was to evaluate the safety profile of gevokizumab in type 2 diabetes. The secondary objectives were to assess pharmacokinetics for different dose levels, routes of administration, and regimens and to assess biological activity. RESULTS: The study drug was well tolerated with no serious adverse events. There was one hypoglycemic

As can be seen, the documents are short paragraphs of semantically connected sentences. Thus, chunking may be redundant or even harmful to the model's performance. Semantic chunking was considered, but after manually reading several documents we decided to leave them as they are. Lastly, the embedding model outputs vectors of a relatively large dimension (768), so in theory it can encode enough information from those documents.