# Question 3

## Imports and functions

In [1]:
from sentence_transformers import SentenceTransformer
import datasets
from datasets import load_dataset, list_datasets
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
import pandas as pd
warnings.filterwarnings("ignore")

2024-06-30 18:43:09.669070: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-30 18:43:10.609766: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/intel/compilers_and_libraries_2018.1.163/linux/tbb/lib/intel64_lin/gcc4.7:/opt/intel/compilers_and_libraries_2018.1.163/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64_lin::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64/
2024-06-30 18:43:10.609914: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not

In [14]:
# Pipeline functions (based on tutorial 3)

def load_and_embedd_dataset(
        dataset_name: str = 'cnn_dailymail',
        split: str = 'train',
        model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2'),
        text_field: str = 'highlights',
        rec_num: int = 400,
        shuffle: bool = False,
        shuffle_seed: int = 3435,
        semantic_chunk: bool = False,
        chunk_window: int = 2,
        chunk_overlap: int = 1,
        chunk_percentile_threshold: int = 90,
        preprocess_func = None,
) -> tuple:
    """
    Load a dataset and embedd the text field using a sentence-transformer model
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to load and embedd
        shuffle: Wheter to shuffle the dataset
        shuffle_seed: the seed for random shuffling (duh)
        semantic_chunk: wheter to semantically chunk the documents before embedding
    Returns:
        tuple: A tuple containing the dataset and the embeddings
    """
    from datasets import load_dataset
    
    print("Loading and embedding the dataset")
    
    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)
    # Preprocess (if needed)
    if preprocess_func is not None:
        dataset = dataset.map(preprocess_func)
    # Shuffle (if needed)
    if shuffle:
        dataset = dataset.shuffle(seed=shuffle_seed)
    # Semantic chunking (if needed)
    if semantic_chunk:
        dataset, rec_num = chunk_dataset(dataset, window=chunk_window, overlap=chunk_overlap, 
                                percentile_threshold=chunk_percentile_threshold, rec_num=rec_num)
    
    # Embed the first `rec_num` rows of the dataset  
    embeddings = model.encode(dataset[text_field][:rec_num])
    
    print("Done!")
    return dataset, embeddings


def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
) -> Pinecone:
    """
    Create a pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    """
    from pinecone import Pinecone, ServerlessSpec
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
    print("Done!")
    return pc


def upsert_vectors(
        index: Pinecone,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'highlights',
        batch_size: int = 128,
) -> Pinecone:
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object
        embeddings: The embeddings to upsert
        dataset: The dataset containing the metadata
        batch_size: The batch size to use for upserting
        text_field: The field to embed
    Returns:
        An updated pinecone index
    """       
    print("Upserting the embeddings to the Pinecone index...")
    shape = embeddings.shape
    
    ids = [str(i) for i in range(shape[0])]
    meta = [{text_field: text} for text in dataset[text_field]]
    
    # create list of (id, vector, metadata) tuples to be upserted
    to_upsert = list(zip(ids, embeddings, meta))

    for i in tqdm(range(0, shape[0], batch_size)):
        i_end = min(i + batch_size, shape[0])
        index.upsert(vectors=to_upsert[i:i_end])
    return index


def augment_prompt(
        query: str,
        index: Pinecone,
        model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2'),
        top_k: int = 3,
) -> str:
    """
    Augment the prompt with the top 3 results from the knowledge base
    Args:
        query: The query to augment
        index: The vectorstore object
        model: The sentence embedding model
        top_k: Number of top results to retrieve from the knowlege base
    Returns:
        str: The augmented prompt
    """
    results = [float(val) for val in list(model.encode(query))]
    
    # get top k results from knowledge base
    query_results = index.query(
        vector=results,
        top_k=top_k,
        include_values=True,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata']['text'] for match in query_results]
    
    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)
    
    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer as short as you can to the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - mention it in the answer, don't use the contexts and answer on basis of your knowledge.
    Query: {query}"""
    return augmented_prompt, source_knowledge


# Additional function for preprocessing and chunking

def bioasq_preprocess(dataset, text_field="text"):
    """Remove the answer at the beginning of the document (specific for the BioASQ dataset"""
    import re
    dataset[text_field] = re.sub(r'<answer>.*?<context> ', '', dataset[text_field])
    return dataset


def chunk_text(
        text: str, 
        window: int = 2, 
        overlap: int = 0,
) -> list:
    """
    Split text into chunks of window size of sentences, with possible overlaping.
    Args:
        text: input text to split
        window: number of consequent sentences in a chunk
        overlap: number of overlaping sentences between chunks (in case of window > 1)
        preprocess_func: function for text preprocessing
    Returns:
        list: a list of chunks
    """
    import re
    
    overlap = overlap if window > overlap else window - 1
    sentences = re.split(r'(?<=[.?!])\s+', text)

    chunks = []
    for i in range(0, len(sentences), window - overlap):
        chunks.append(" ".join(sentences[i: i + window]))

    return chunks


def get_semantic_chunks(
        chunks: list, 
        model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2'),
        percentile_threshold: int = 90,
) -> list:
    """
    Semantic chunking, given a list of sentences or sentences groups (each group will be embedded as a single sentence).
    Args:
        chunks: list of sentences or sentences groups/chunks
        model: the sentence embedding model
        percentile_threshold: all sentences distances that above the percentile will be considered as breakpoints to chunk between
    Returns:
        list: list of semantic chunks
    """
    import numpy as np
    import re
    from sklearn.metrics.pairwise import cosine_similarity

    # embed and calculate cosine distances (1 - cos_sim) between adjacent chunks
    embeddings = model.encode(chunks)
    distances = np.array([1 - cosine_similarity([embeddings[i]],[embeddings[i + 1]]) for i in range(len(embeddings) - 1)]).reshape(len(embeddings) - 1,)
    # find breakpoints (aka divide sentences into groups), this done by percentile value of distances - all above the percentile will be chunked
    try:
        distance_threshold = np.percentile(distances, percentile_threshold)
        breakpoint_indices = [i for i, x in enumerate(distances) if x > distance_threshold]

        start_idx = 0
        semantic_chunks = []
    
        for break_idx in breakpoint_indices:
            # in case of several sentences in the chunk, there may be overlaping, so need to remove duplicates before chunking
            sentences = sum([re.split(r'(?<=[.?!])\s+', chunk) for chunk in chunks[start_idx: break_idx + 1]], [])
            chunk = []
            for s in sentences:
                if s not in chunk:
                    chunk.append(s)
            chunk = " ".join(chunk)
            semantic_chunks.append(chunk)
            start_idx = break_idx + 1


    except Exception as e:
        print(e)
        print(chunks)
        print(distances)
    
    return semantic_chunks


def chunk_row(
            row, 
            model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2'),
            text_field='text', 
            window=2, 
            overlap=1,
            percentile_threshold: int = 90,
    ) -> list:
    """
    Semantic chunk of text field in a single hugging face dataset row.
    Args:
        row: a Dataset row (hugging face dataset)
        text_field: a text column to chunk
        window: number of consequent sentences in a chunk
        overlap: number of overlaping sentences between chunks (in case of window > 1)
    """
    chunks = get_semantic_chunks(chunk_text(row[text_field], window=window, overlap=overlap), 
                                 model, percentile_threshold=percentile_threshold)
    result = []
    for chunk in chunks:
        new_row = row.copy()
        new_row[text_field] = chunk
        result.append(new_row)
    return result


def chunk_dataset(
        dataset, 
        model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2'),
        text_field: str = 'text',
        rec_num: int = 400,
        window: int = 2,
        overlap: int = 1,
        percentile_threshold: int = 90,
    ):
    """
    Semantic chunk of text field in the dataset and combining into new dataset, where each chunk will be in a new row, with duplicated data in other cells.
    Args:
        dataset: the hugging face dataset
        model: model for sentence embedding
        text_field: the field with text to chunk
        window: number of consequent sentences in a chunk
        overlap: number of overlaping sentences between chunks (in case of window > 1)
    Returns:
        chunked_dataset
        new_rec_num: updated records number, according to the number of chunks created from the first rec_num rows
    """
    new_rows = []
    new_rec_num = 0
    for i, row in enumerate(dataset):
        chunks = chunk_row(row, model, window=window, overlap=overlap, percentile_threshold=percentile_threshold)
        if i < rec_num:
            new_rec_num += len(chunks)
        new_rows.extend(chunks)

    chunked_dataset = datasets.Dataset.from_pandas(pd.DataFrame(new_rows))
    return chunked_dataset, new_rec_num


## The RAG pipeline
Based on tutorial 3

### Loading, embedding and indexing

In [3]:
with open("cohere_key.txt") as f:
    COHERE_API_KEY = f.read().strip()
with open("pinecone_key.txt") as f:
    PINECONE_API_KEY = f.read().strip()

In [4]:
EMBEDDING_MODEL = 'Alibaba-NLP/gte-base-en-v1.5' # best up-to ~0.5gb size model according to huggingface leaderboard (31 place of all models)
model = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)

In [13]:
# Document reading, processing, semantic chinking and embedding
DATASET_NAME = 'kroshan/BioASQ'

dataset, embeddings = load_and_embedd_dataset(
    dataset_name=DATASET_NAME,
    rec_num=1000,
    model=model,
    text_field='text',
    shuffle=True,
    semantic_chunk=True,
    chunk_percentile_threshold=95,
    preprocess_func=bioasq_preprocess,
)

Loading and embedding the dataset
Done!


In [6]:
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (1007, 768)


In [7]:
# Create the vector database. We are passing the index_name and the size of our embeddings
INDEX_NAME = 'bioasq'
pc = create_pinecone_index(INDEX_NAME, embeddings.shape[1])

Creating a Pinecone index...
Done!


In [8]:
# Upsert the embeddings to the Pinecone index
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset, text_field='text')

Upserting the embeddings to the Pinecone index...


100%|█████████████████████████████████████████████| 8/8 [00:06<00:00,  1.22it/s]


In [9]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1007}},
 'total_vector_count': 1007}

### Retrieval and augmentation

In [10]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', None)

pd_dataset = dataset.to_pandas()
pd_dataset.head(10)  # show some random records (the dataset was shuffled)

Unnamed: 0,question,text
0,Which molecule is targeted by the drug Gevokizumab?,"Effects of gevokizumab on glycemia and inflammatory markers in type 2 diabetes. OBJECTIVE: Metabolic activation of the innate immune system governed by interleukin (IL)-1β contributes to β-cell failure in type 2 diabetes. Gevokizumab is a novel, human-engineered monoclonal anti-IL-1β antibody. We evaluated the safety and biological activity of gevokizumab in patients with type 2 diabetes. RESEARCH DESIGN AND METHODS: In a placebo-controlled, dose-escalation study, a total of 98 patients were randomly assigned to placebo (17 subjects) or gevokizumab (81 subjects) at increasing doses and dosing schedules. The primary objective of the study was to evaluate the safety profile of gevokizumab in type 2 diabetes. The secondary objectives were to assess pharmacokinetics for different dose levels, routes of administration, and regimens and to assess biological activity. RESULTS: The study drug was well tolerated with no serious adverse events. There was one hypoglycemic event whereupon concomitant insulin treatment had to be reduced. Clearance of gevokizumab was consistent with that for a human IgG(2), with a half-life of 22 days. In the combined intermediate-dose group (single doses of 0.03 and 0.1 mg/kg), the mean placebo-corrected decrease in glycated hemoglobin was 0.11, 0.44, and 0.85% after 1, 2 (P = 0.017), and 3 (P = 0.049) months, respectively, along with enhanced C-peptide secretion, increased insulin sensitivity, and a reduction in C-reactive protein and spontaneous and inducible cytokines. CONCLUSIONS: This novel IL-1β-neutralizing antibody improved glycemia, possibly via restored insulin production and action, and reduced inflammation in patients with type 2 diabetes. This therapeutic agent may be able to be used on a once-every-month or longer schedule."
1,Which clotting factor is inhibited by betrixaban?,Oral and parenteral anticoagulants: new kids on the block. Well-documented drawbacks of traditional anticoagulants have lead to the quest for an ideal anticoagulant resulting in a surge of novel anticoagulant molecules.
2,Against which protein is the antibody used for immonostaining of Lewy bodies raised?,"Nigral and cortical Lewy bodies and dystrophic nigral neurites in Parkinson's disease and cortical Lewy body disease contain alpha-synuclein immunoreactivity. A mutation in the alpha-synuclein gene has recently been linked to some cases of familial Parkinson's disease (PD). We characterized the expression of this presynaptic protein in the midbrain, striatum, and temporal cortex of control, PD, and dementia with Lewy bodies (DLB) brain. Control brain showed punctate pericellular immunostaining."
3,"What tyrosine kinase, involved in a Philadelphia- chromosome positive chronic myelogenous leukemia, is the target of Imatinib (Gleevec)?","Drug responses of imatinib mesylate-resistant cells: synergism of imatinib with other chemotherapeutic drugs. Imatinib mesylate (STI571, Glivec, Gleevec) is a powerful inhibitor of the tyrosine kinase activity of Bcr-Abl, the oncoprotein responsible for chronic myeloid leukemia (CML). The drug shows great efficacy in chronic phase, but is less effective in maintaining hematologic remissions in blast crisis patients. Our group has previously described several cell lines made resistant to imatinib. We now examine the question of cross-resistance to other chemotherapeutic drugs used in CML. Four paired imatinib-sensitive/resistant CML cell lines were assessed by caspase-3 and MTS assays for their proliferative response to cytosine arabinoside (Ara-C), daunorubicin (DNR), homoharringtonine (HHT) and hydroxyurea (HU), either alone or in combination with imatinib. Primary blasts from advanced-stage CML patients refractory to imatinib therapy were studied by semi-solid media clonogenic assays. We found that these drugs are generally capable of major inhibition of proliferation of the CML cell lines, although differential responses to DNR and HHT were noted between some sensitive and resistant cell line pairs, implying that resistance to imatinib may confer a growth advantage under such conditions. The four drugs were also effective in preventing the formation of progenitor cell colonies from CML patients both before treatment with imatinib, and after relapse on the drug. Isobolographic analysis implied that these drugs will generally combine well with imatinib, and in some cases will be synergistic."
4,The drug JTV519 is derivative of which group of chemical compounds?,"JTV-519, a novel cardioprotective agent, improves the contractile recovery after ischaemia-reperfusion in coronary perfused guinea-pig ventricular muscles. A newly synthesized benzothiazepine derivative, JTV-519 (JT) has been reported to be cardioprotective. However, the precise mechanism underlying the cardioprotective effect of this drug is unknown."
5,Mutation of which gene is associated with McLeod syndrome?,"McLeod syndrome: life-long neuropsychiatric disorder due to a novel mutation of the XK gene. A 50-year-old man presented with worsening, virtually lifelong, chorea and progressive behavioural disturbance, involving disinhibition and hoarding, over 10 years. Clinical assessment revealed chorea, dysarthria, areflexia, an inappropriately jovial, impulsive manner and neuropsychological evidence of frontosubcortical dysfunction. Investigation results included an elevated creatine kinase, caudate atrophy and hypoperfusion, acanthocytes in the peripheral blood and the McLeod phenotype."
6,What is the synonym of the lubag disease?,"Rare causes of dystonia parkinsonism. The list of genetic causes of syndromes of dystonia parkinsonism grows constantly. As a consequence, the diagnosis becomes more and more challenging for the clinician. Here, we summarize the important causes of dystonia parkinsonism including autosomal-dominant, recessive, and x-linked forms. We cover dopa-responsive dystonia, Wilson's disease, Parkin-, PINK1-, and DJ-1-associated parkinsonism (PARK2, 6, and 7), x-linked dystonia-parkinsonism/Lubag (DYT3), rapid-onset dystonia-parkinsonism (DYT12) and DYT16 dystonia, the syndromes of Neurodegeneration with Brain Iron Accumulation (NBIA) including pantothenate kinase (PANK2)- and PLA2G6 (PARK14)-associated neurodegeneration, neuroferritinopathy, Kufor-Rakeb disease (PARK9) and the recently described SENDA syndrome; FBXO7-associated neurodegeneration (PARK15), autosomal-recessive spastic paraplegia with a thin corpus callosum (SPG11), and dystonia parkinsonism due to mutations in the SLC6A3 gene encoding the dopamine transporter. They have in common that in all these syndromes there may be a combination of dystonic and parkinsonian features, which may be complicated by pyramidal tract involvement. The aim of this review is to familiarize the clinician with the phenotypes of these disorders."
7,Which is the transcript responsible for X-chromosome inactivation?,X-inactivation: quantitative predictions of protein interactions in the Xist network. The transcriptional silencing of one of the female X-chromosomes is a finely regulated process that requires accumulation in cis of the long non-coding RNA X-inactive-specific transcript (Xist) followed by a series of epigenetic modifications. Little is known about the molecular machinery regulating initiation and maintenance of chromosomal silencing.
8,What is the name of Bruton's tyrosine kinase inhibitor that can be used for treatment of chronic lymphocytic leukemia?,"Targeting the microenvironment in chronic lymphocytic leukemia is changing the therapeutic landscape. PURPOSE OF REVIEW: Despite ongoing efforts to decipher the cancer genome, discoveries of new targetable genetic lesions within cancer cells are rare. Therefore, alternative approaches are needed. Signals from the microenvironment are increasingly recognized as drivers of disease progression in hematologic and solid cancers. Consequently, there is growing interest in targeting the tumor-microenvironment cross-talk. This review highlights recent therapeutic advances in targeting the microenvironment in chronic lymphocytic leukemia (CLL). RECENT FINDINGS: CLL is the poster child for microenvironment-dependent malignancies, because the clonal CLL B cells are highly dependent on external signals for maintenance and expansion. These pathways recapitulate those responsible for normal B-cell expansion in germinal centers. The most prominent, conserved mechanism is B-cell receptor (BCR) signaling, which promotes CLL cell survival and expansion in lymphatic tissue areas designated proliferation centers. BCR signaling now can be targeted by new targeted kinase inhibitors. SUMMARY: Small molecule inhibitors of BCR signaling kinases, Bruton's tyrosine kinase (Btk) inhibitor ibrutinib and the phosphoinositide 3'-kinase delta (PI3Kδ) inhibitor GS-1101, are currently transforming the landscape of CLL therapy. This development exemplifies that the microenvironment has become a lively successful area of translational research."
9,Which is the process that Conserved noncoding elements mostly regulate?,"Disruption of long-distance highly conserved noncoding elements in neurocristopathies. One of the key discoveries of vertebrate genome sequencing projects has been the identification of highly conserved noncoding elements (CNEs). Some characteristics of CNEs include their high frequency in mammalian genomes, their potential regulatory role in gene expression, and their enrichment in gene deserts nearby master developmental genes."


In [17]:
# Anecdote examples of queries with wrong answers
queries = ["The drug JTV519 is derivative of which group of chemical compounds?",
           "What is the synonym of the lubag disease?",
           "Which medication should be administered when managing patients with suspected acute opioid overdose?"
          ]
answers = ["benzothiazepine",
           "x-linked dystonia-parkinsonism",
           "naloxone"
          ]

co = cohere.Client(api_key=COHERE_API_KEY)

for i in range(len(queries)):
    response = co.chat(
            model='command-r-plus',
            message=queries[i],
        )
    print(f"{i+1}. {queries[i]}")
    print(f"Model's answer: {response.text}")
    print(f"\nCorrect answer: {answers[i]}\n")

1. The drug JTV519 is derivative of which group of chemical compounds?
Model's answer: The drug JTV519 is a derivative of the chemical compound group called benzopyran. Specifically, it is a member of the chroman family, which is a type of benzopyran. Benzopyrans are characterized by a fused ring structure consisting of a six-membered benzene ring and a pyran ring, which is a five-membered ring containing four carbon atoms and one oxygen atom.

In the case of JTV519, the benzopyran structure is modified to include specific substituents that impart its pharmacological properties. These modifications are designed to target specific biological mechanisms and achieve the desired therapeutic effects.

JTV519, also known as k252a, is a potent and selective inhibitor of certain types of calcium channels, particularly the L-type calcium channels. It has been investigated for its potential therapeutic benefits in various cardiovascular and neurological disorders. The development and study of JT

In [19]:
# Anecdote examples of queries with wrong answers, now with augmented prompts
queries = ["The drug JTV519 is derivative of which group of chemical compounds?",
           "What is the synonym of the lubag disease?",
           "Which medication should be administered when managing patients with suspected acute opioid overdose?"
          ]
answers = ["benzothiazepine",
           "x-linked dystonia-parkinsonism",
           "naloxone"
          ]
for i in range(len(queries)):
    augmented_prompt, source_knowledge = augment_prompt(queries[i], model=model, index=index)
    response = co.chat(
            model='command-r-plus',
            message=augmented_prompt,
        )
    print(f"{i+1}. {queries[i]}")
    print(f"Model's answer: {response.text}")
    print(f"\nCorrect answer: {answers[i]}\n")

1. The drug JTV519 is derivative of which group of chemical compounds?
Model's answer: JTV519 is a derivative of 1,4-benzothiazepine.

Correct answer: benzothiazepine

2. What is the synonym of the lubag disease?
Model's answer: X-linked dystonia-parkinsonism.

Correct answer: x-linked dystonia-parkinsonism

3. Which medication should be administered when managing patients with suspected acute opioid overdose?
Model's answer: Naloxone.

Correct answer: naloxone

