### Import libraries, custom classes and functions

In [1]:
from pathlib import Path
from pprint import pprint
import sys
import os
import random

from llama_index.core import ServiceContext, set_global_service_context, set_global_handler
from llama_index.core.node_parser import SentenceSplitter

from task_dataset import PubMedQATaskDataset

sys.path.append("..")
from utils.hosting_utils import RAGLLM
from utils.rag_utils import (
    DocumentReader, RAGEmbedding, RAGQueryEngine, RagasEval, 
    extract_yes_no, evaluate, validate_rag_cfg
    )
from utils.storage_utils import RAGIndex

The chromadb package is not available on this system, skipping


In [2]:
# Suppress every warning issued through the `warnings` module from here on,
# which keeps the cell outputs below compact.
import warnings
warnings.filterwarnings('ignore')

### Set RAG configuration

In [3]:
# Central configuration for the notebook. Later cells read their settings from
# this dict, and the search-mode sections further down overwrite individual
# entries ("hybrid_search_alpha", "use_reranker") in place.
rag_cfg = {
    # Node parser config (passed to SentenceSplitter)
    "chunk_size": 256,
    "chunk_overlap": 0,

    # Embedding model config (passed to RAGEmbedding)
    "embed_model_type": "hf",
    "embed_model_name": "BAAI/bge-base-en-v1.5",

    # LLM config (the whole dict is also forwarded to RAGLLM.load_model as keyword arguments)
    "llm_type": "local",
    "llm_name": "Llama-2-7b-chat-hf",
    "max_new_tokens": 256,
    "temperature": 0.0,
    "top_p": 1.0,
    "top_k": 50,
    "do_sample": False,

    # Vector DB config (passed to RAGIndex)
    "vector_db_type": "weaviate", # "weaviate"
    "vector_db_name": "Pubmed_QA",
    # MODIFY THIS: presumably the URL of your own Weaviate cluster - confirm before running
    "weaviate_url": "https://rag-bootcamp-pubmed-qa-lsqv7od4.weaviate.network",

    # Retriever and query config (consumed by set_query_engine_args)
    "retriever_type": "vector_index", # "vector_index"; set_query_engine_args also has a "bm25" branch
    "retriever_similarity_top_k": 5,
    "query_mode": "hybrid", # "default", "hybrid"
    "hybrid_search_alpha": 0.0, # float from 0.0 (sparse search - bm25) to 1.0 (vector search)
    "response_mode": "compact",
    "use_reranker": False,
    "rerank_top_k": 3, # only forwarded to the query engine when "use_reranker" is truthy

    # Evaluation config (used by the optional Ragas evaluation cells)
    "eval_llm_type": "openai",
    "eval_llm_name": "gpt-3.5-turbo",
}

### Read secrets

#### Weaviate Key

In [4]:
# Availability check only: the key file is opened and immediately closed, its
# contents are not read in this cell. A failure is reported but does not stop
# the notebook.
try:
    # The context manager closes the handle on every path and leaves no stray
    # file object bound in the notebook namespace.
    with open(Path.home() / ".weaviate_api_key", "r"):
        pass
except Exception as err:
    print(f"Could not read your Weaviate key. Please make sure this is available in plain text under your home directory in ~/.weaviate_api_key: {err}")

#### Cohere API Key

In [5]:
# Load the Cohere API key from ~/.cohere_api_key into the COHERE_API_KEY
# environment variable. A failure is reported but does not stop the notebook.
try:
    # The context manager closes the file even if read() raises part-way.
    with open(Path.home() / ".cohere_api_key", "r") as key_file:
        # strip() instead of rstrip("\n"): a Windows "\r\n" line ending or stray
        # spaces around the key must not end up in the environment variable.
        os.environ["COHERE_API_KEY"] = key_file.read().strip()
except Exception as err:
    print(f"Could not read your Cohere API key. Please make sure this is available in plain text under your home directory in ~/.cohere_api_key: {err}")

#### OpenAI API Key [Optional]

In [6]:
# Load the OpenAI API key from ~/.openai_api_key into the OPENAI_API_KEY
# environment variable. The key is optional (needed only for RAG evaluation),
# so a failure is reported but does not stop the notebook.
try:
    # The context manager closes the file even if read() raises part-way.
    with open(Path.home() / ".openai_api_key", "r") as key_file:
        # strip() instead of rstrip("\n"): a Windows "\r\n" line ending or stray
        # spaces around the key must not end up in the environment variable.
        os.environ["OPENAI_API_KEY"] = key_file.read().strip()
except Exception as err:
    print(f"Could not read your OpenAI API key. If you wish to run RAG evaluation, please make sure this is available in plain text under your home directory in ~/.openai_api_key: {err}")

## STAGE 0 - Preliminary config checks

In [7]:
# Run the project-level config check (utils.rag_utils) before any model is
# loaded, then echo the effective settings. What exactly is validated is
# defined in validate_rag_cfg and not visible here.
validate_rag_cfg(rag_cfg)
pprint(rag_cfg)

{'chunk_overlap': 0,
 'chunk_size': 256,
 'do_sample': False,
 'embed_model_name': 'BAAI/bge-base-en-v1.5',
 'embed_model_type': 'hf',
 'eval_llm_name': 'gpt-3.5-turbo',
 'eval_llm_type': 'openai',
 'hybrid_search_alpha': 0.0,
 'llm_name': 'Llama-2-7b-chat-hf',
 'llm_type': 'local',
 'max_new_tokens': 256,
 'query_mode': 'hybrid',
 'rerank_top_k': 3,
 'response_mode': 'compact',
 'retriever_similarity_top_k': 5,
 'retriever_type': 'vector_index',
 'temperature': 0.0,
 'top_k': 50,
 'top_p': 1.0,
 'use_reranker': False,
 'vector_db_name': 'Pubmed_QA',
 'vector_db_type': 'weaviate',
 'weaviate_url': 'https://rag-bootcamp-pubmed-qa-lsqv7od4.weaviate.network'}


## STAGE 1 - Load dataset and documents

#### 1. Load PubMed QA dataset
PubMedQA ([GitHub](https://github.com/pubmedqa/pubmedqa)) is a biomedical question-answering dataset. Each instance consists of a question, a context (extracted from PubMed abstracts), a long answer and a yes/no/maybe answer. This notebook uses the test split of the [bigbio/pubmed_qa](https://huggingface.co/datasets/bigbio/pubmed_qa) dataset on Hugging Face.

**The context for each instance is stored as a text file** (referred to as documents), to align the task as a standard RAG use-case.

In [8]:
print('Loading PubMed QA data ...')
# PubMedQATaskDataset is defined in the local task_dataset module.
pubmed_data = PubMedQATaskDataset('bigbio/pubmed_qa')
print(f"Loaded data size: {len(pubmed_data)}")
# Disabled one-time setup step: presumably writes each sample's context to its
# own file under ./data for the document reader - TODO confirm before enabling.
# pubmed_data.mock_knowledge_base(output_dir='./data', one_file_per_sample=True)

Loading PubMed QA data ...


Preparing data: 100%|██████████| 500/500 [00:00<00:00, 1506.54it/s]

Loaded data size: 500





#### 2. Load documents

In [9]:
print('Loading documents ...')
# Read the documents stored under ./data/pubmed_doc. `docs` is reused later for
# index creation and when building the query-engine arguments.
reader = DocumentReader(input_dir="./data/pubmed_doc")
docs = reader.load_data()
print(f'No. of documents loaded: {len(docs)}')

Loading documents ...
No. of documents loaded: 500


## STAGE 2 - Load node parser, embedding, LLM and set service context

#### 1. Load node parser to split documents into smaller chunks

In [10]:
print('Loading node parser ...')
# Chunk size and overlap come from rag_cfg.
node_parser = SentenceSplitter(chunk_size=rag_cfg['chunk_size'], chunk_overlap=rag_cfg['chunk_overlap'])
# Not applied manually here: the parser is handed to the service context below.
# nodes = node_parser.get_nodes_from_documents(docs)

Loading node parser ...


#### 2. Load embedding model

In [11]:
# Build the embedding model selected by "embed_model_type" / "embed_model_name" in rag_cfg.
embed_model = RAGEmbedding(model_type=rag_cfg['embed_model_type'], model_name=rag_cfg['embed_model_name']).load_model()

Loading hf embedding model ...


config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

#### 3. Load LLM for generation

In [12]:
# The whole rag_cfg dict is forwarded as keyword arguments; which entries
# load_model actually consumes is decided in utils.hosting_utils (not visible here).
llm = RAGLLM(rag_cfg['llm_type'], rag_cfg['llm_name']).load_model(**rag_cfg)

Loading local LLM model ...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### 4. Use service context to set the node parser, embedding model, LLM, etc.

In [13]:
# Bundle the node parser, embedding model and LLM created above.
# `service_context` is also read directly by set_query_engine_args (bm25 branch).
# NOTE(review): ServiceContext is reportedly deprecated in newer llama-index
# releases in favour of `Settings` - confirm against the installed version.
service_context = ServiceContext.from_defaults(
    node_parser=node_parser,
    embed_model=embed_model,
    llm=llm,
)
# Register it globally so it does not have to be passed to every class;
# this also applies to the code in rag_utils.py.
set_global_service_context(service_context)

## STAGE 3 - Create index using the appropriate vector store

In [14]:
# Create the index over `docs` in the configured vector DB. The recorded output
# shows an existing index being loaded from ./.weaviate_index_store/, so
# create_index may reuse persisted data instead of re-indexing - TODO confirm.
index = RAGIndex(db_type=rag_cfg['vector_db_type'], db_name=rag_cfg['vector_db_name'])\
    .create_index(docs, weaviate_url=rag_cfg["weaviate_url"])

Loading index from ./.weaviate_index_store/ ...


## STAGE 4 - Build query engine

Now build a query engine using *retriever* and *response_synthesizer*  
[Weaviate hybrid search](https://weaviate.io/blog/hybrid-search-explained)

In [15]:
def set_query_engine_args(rag_cfg, docs):
    """Assemble the keyword arguments later passed to ``RAGQueryEngine(...).create``.

    Args:
        rag_cfg: Mapping with the RAG settings. Always read:
            ``retriever_similarity_top_k``, ``response_mode``,
            ``retriever_type`` and ``use_reranker``. Read conditionally:
            ``vector_db_type``, ``query_mode``, ``hybrid_search_alpha``
            and ``rerank_top_k``.
        docs: Loaded documents; only used when ``retriever_type`` is ``"bm25"``.

    Returns:
        dict: Always contains ``similarity_top_k``, ``response_mode`` and
        ``use_reranker``. A Weaviate vector-index retriever adds
        ``query_mode`` and ``hybrid_search_alpha``; a bm25 retriever adds
        ``nodes`` and ``tokenizer``; an enabled re-ranker adds ``rerank_top_k``.

    Raises:
        KeyError: If a required entry is missing from ``rag_cfg``.
    """
    engine_kwargs = dict(
        similarity_top_k=rag_cfg['retriever_similarity_top_k'],
        response_mode=rag_cfg['response_mode'],
        use_reranker=False,
    )

    retriever = rag_cfg["retriever_type"]
    if retriever == "vector_index" and rag_cfg["vector_db_type"] == "weaviate":
        # Hybrid-search options are only forwarded for a Weaviate-backed vector index.
        engine_kwargs["query_mode"] = rag_cfg["query_mode"]
        engine_kwargs["hybrid_search_alpha"] = rag_cfg["hybrid_search_alpha"]
    elif retriever == "bm25":
        # Relies on the notebook-level `service_context` for the node parser
        # and the embedding model's (private) tokenizer.
        engine_kwargs["nodes"] = service_context.node_parser.get_nodes_from_documents(docs)
        engine_kwargs["tokenizer"] = service_context.embed_model._tokenizer

    if rag_cfg["use_reranker"]:
        engine_kwargs["use_reranker"] = True
        engine_kwargs["rerank_top_k"] = rag_cfg["rerank_top_k"]

    return engine_kwargs

In [16]:
# Derive the query-engine keyword arguments from the current rag_cfg and show them.
query_engine_args = set_query_engine_args(rag_cfg, docs)
pprint(query_engine_args)

{'hybrid_search_alpha': 0.0,
 'query_mode': 'hybrid',
 'response_mode': 'compact',
 'similarity_top_k': 5,
 'use_reranker': False}


In [17]:
# Build the query engine on top of the index, using the arguments assembled above.
query_engine = RAGQueryEngine(
    retriever_type=rag_cfg['retriever_type'], vector_index=index, llm_model_name=rag_cfg['llm_name']).create(**query_engine_args)

## STAGE 5 - Finally query the model!
**Note:** The first query below uses keyword-based (sparse, BM25) search, because *hybrid_search_alpha* is set to 0.0 in the configuration above.

#### TODO - Change seed to experiment with a different sample

In [18]:
# Fix the seed so the sample drawn in the next cell is reproducible.
# Re-run this cell before re-drawing, otherwise a different sample is picked.
random.seed(237)

In [19]:
# random.randint includes both end points, hence len(pubmed_data) - 1 as the upper bound.
sample_idx = random.randint(0, len(pubmed_data)-1)
sample_elm = pubmed_data[sample_idx]
pprint(sample_elm)

{'answer': ['no'],
 'context': 'Human immunodeficiency virus (HIV)-infected patients have '
            'generally been excluded from transplantation. Recent advances in '
            'the management and prognosis of these patients suggest that this '
            'policy should be reevaluated. To explore the current views of '
            'U.S. transplant centers toward transplanting asymptomatic '
            'HIV-infected patients with end-stage renal disease, a written '
            'survey was mailed to the directors of transplantation at all 248 '
            'renal transplant centers in the United States. All 148 responding '
            'centers said they require HIV testing of prospective kidney '
            'recipients, and 84% of these centers would not transplant an '
            'individual who refuses HIV testing. The vast majority of '
            'responding centers would not transplant a kidney from a cadaveric '
            '(88%) or a living donor (91%) into an asymp

In [20]:
# `query` is reused by every search-mode section below.
query = sample_elm['question']

response = query_engine.query(query)

# Show the generated answer next to the ground truth of the sample.
print(f'QUERY: {query}')
print(f'RESPONSE: {response}')
# extract_yes_no works on the raw response text.
print(f'YES/NO: {extract_yes_no(response.response)}')
print(f'GT ANSWER: {sample_elm["answer"]}')
print(f'GT LONG ANSWER: {sample_elm["long_answer"]}')

QUERY: Should all human immunodeficiency virus-infected patients with end-stage renal disease be excluded from transplantation?
RESPONSE:  Based on the context information provided, I would say no to the query. The survey conducted among U.S. transplant centers revealed that while the majority of centers require HIV testing of prospective kidney recipients and would not transplant an individual who refuses HIV testing, there are some centers that would consider transplanting an HIV-infected patient. While the vast majority of responding centers would not transplant a kidney from a cadaveric or living donor into an asymptomatic HIV-infected patient who is otherwise a good candidate for transplantation, some centers are willing to consider transplantation in the face of HIV infection.
It is important to note that the survey was conducted in 1997, and since then, there have been significant advances in the management and prognosis of HIV-infected patients, including the development of hig

#### [OPTIONAL] Ragas evaluation

In [21]:
# Run retrieval on its own so the text of the retrieved nodes can be passed to
# the evaluator as contexts.
retrieved_nodes = query_engine.retriever.retrieve(query)

In [22]:
# Single-sample evaluation record; every value is a one-element list.
# NOTE(review): the recorded output shows a deprecation warning for the
# 'ground_truths' column, recommending 'ground_truth' with a plain string per
# sample. Confirm that RagasEval accepts the new key before switching.
eval_data = {
    "question": [query],
    "answer": [response.response],
    "contexts": [[node.text for node in retrieved_nodes]],
    "ground_truths": [[sample_elm['long_answer']]],
    }
pprint(eval_data)

{'answer': [' Based on the context information provided, I would say no to the '
            'query. The survey conducted among U.S. transplant centers '
            'revealed that while the majority of centers require HIV testing '
            'of prospective kidney recipients and would not transplant an '
            'individual who refuses HIV testing, there are some centers that '
            'would consider transplanting an HIV-infected patient. While the '
            'vast majority of responding centers would not transplant a kidney '
            'from a cadaveric or living donor into an asymptomatic '
            'HIV-infected patient who is otherwise a good candidate for '
            'transplantation, some centers are willing to consider '
            'transplantation in the face of HIV infection.\n'
            'It is important to note that the survey was conducted in 1997, '
            'and since then, there have been significant advances in the '
            'management a

In [23]:
# Create the evaluator once; the later evaluation cells reuse `eval_obj`, so
# this cell has to run before them. The printed result reports the metrics as
# faithfulness, answer_relevancy and context_precision.
eval_obj = RagasEval(
            metrics=["faithfulness", "relevancy", "precision"], 
            eval_llm_type=rag_cfg["eval_llm_type"], eval_llm_name=rag_cfg["eval_llm_name"]
            )
eval_result = eval_obj.evaluate(eval_data)
print(eval_result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

{'faithfulness': 0.2500, 'answer_relevancy': 0.9437, 'context_precision': 1.0000}


### 5.1 - Dense Search
Set *hybrid_search_alpha* to 1.0 for dense vector search.

In [24]:
# Overwrites the shared config in place; the next cell rebuilds the query
# engine from the updated value (1.0 = vector search).
rag_cfg["hybrid_search_alpha"] = 1.0

In [25]:
# Recreate the query engine so it picks up the updated rag_cfg
query_engine_args = set_query_engine_args(rag_cfg, docs)
pprint(query_engine_args)
query_engine = RAGQueryEngine(
    retriever_type=rag_cfg['retriever_type'], vector_index=index, llm_model_name=rag_cfg['llm_name']).create(**query_engine_args)

# Get the response for the same query as before
response = query_engine.query(query)

# Print the response next to the ground truth of the sample
print(f'QUERY: {query}')
print(f'RESPONSE: {response}')
print(f'YES/NO: {extract_yes_no(response.response)}')
print(f'GT ANSWER: {sample_elm["answer"]}')
print(f'GT LONG ANSWER: {sample_elm["long_answer"]}')

{'hybrid_search_alpha': 1.0,
 'query_mode': 'hybrid',
 'response_mode': 'compact',
 'similarity_top_k': 5,
 'use_reranker': False}
QUERY: Should all human immunodeficiency virus-infected patients with end-stage renal disease be excluded from transplantation?
RESPONSE:  Based on the context information provided, I would say no to the query. The majority of transplant centers in the United States do not currently transplant kidneys from HIV-infected donors into HIV-infected recipients, even if the recipient is otherwise a good candidate for transplantation. However, there are some centers that would consider transplanting an HIV-infected patient, but none of them had performed such a transplant in the year prior to the survey.
It is important to note that the management and prognosis of HIV-infected patients have improved significantly in recent years, which may warrant a reevaluation of the current policy excluding them from transplantation. However, the majority of centers still fear t

#### [OPTIONAL] Ragas evaluation

In [27]:
# Retrieve separately so the node texts can be supplied as evaluation contexts.
retrieved_nodes = query_engine.retriever.retrieve(query)

# NOTE(review): the recorded output shows a deprecation warning for the
# 'ground_truths' column, recommending 'ground_truth' with a plain string per
# sample. Confirm that RagasEval accepts the new key before switching.
eval_data = {
    "question": [query],
    "answer": [response.response],
    "contexts": [[node.text for node in retrieved_nodes]],
    "ground_truths": [[sample_elm['long_answer']]],
    }

# Reuses `eval_obj` from the first (optional) evaluation cell, which must have been run.
eval_result = eval_obj.evaluate(eval_data)
print(eval_result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Invalid JSON response. Expected dictionary with key 'question'


{'faithfulness': 0.6000, 'answer_relevancy': nan, 'context_precision': 1.0000}


### 5.2 - Hybrid Search
Set *hybrid_search_alpha* to 0.5 for hybrid search with equal weighting of dense and sparse (keyword-based) search.

In [28]:
# Overwrites the shared config in place; the next cell rebuilds the query
# engine from the updated value.
rag_cfg["hybrid_search_alpha"] = 0.5

In [29]:
# Recreate the query engine so it picks up the updated rag_cfg
query_engine_args = set_query_engine_args(rag_cfg, docs)
pprint(query_engine_args)
query_engine = RAGQueryEngine(
    retriever_type=rag_cfg['retriever_type'], vector_index=index, llm_model_name=rag_cfg['llm_name']).create(**query_engine_args)

# Get the response for the same query as before
response = query_engine.query(query)

# Print the response next to the ground truth of the sample
print(f'QUERY: {query}')
print(f'RESPONSE: {response}')
print(f'YES/NO: {extract_yes_no(response.response)}')
print(f'GT ANSWER: {sample_elm["answer"]}')
print(f'GT LONG ANSWER: {sample_elm["long_answer"]}')

{'hybrid_search_alpha': 0.5,
 'query_mode': 'hybrid',
 'response_mode': 'compact',
 'similarity_top_k': 5,
 'use_reranker': False}
QUERY: Should all human immunodeficiency virus-infected patients with end-stage renal disease be excluded from transplantation?
RESPONSE:  Based on the context information provided, I would say no to the query. The majority of the transplant centers surveyed in the United States do not exclude HIV-infected patients with end-stage renal disease from transplantation, and some centers have even performed transplants on such patients in the past. While there are concerns about the safety of transplanting organs from HIV-infected donors, the advances in the management and prognosis of HIV infection suggest that this policy should be reevaluated. Additionally, the study on HIV-infected patients with no clinical symptoms of coronary artery disease found that myocardial SPECT should be used for screening these patients, which further supports the idea that HIV-infe

#### [OPTIONAL] Ragas evaluation

In [30]:
# Retrieve separately so the node texts can be supplied as evaluation contexts.
retrieved_nodes = query_engine.retriever.retrieve(query)

# NOTE(review): the recorded output shows a deprecation warning for the
# 'ground_truths' column, recommending 'ground_truth' with a plain string per
# sample. Confirm that RagasEval accepts the new key before switching.
eval_data = {
    "question": [query],
    "answer": [response.response],
    "contexts": [[node.text for node in retrieved_nodes]],
    "ground_truths": [[sample_elm['long_answer']]],
    }

# Reuses `eval_obj` from the first (optional) evaluation cell, which must have been run.
eval_result = eval_obj.evaluate(eval_data)
print(eval_result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

{'faithfulness': 0.5000, 'answer_relevancy': 0.9745, 'context_precision': 1.0000}


### 5.3 - Using Re-ranker
Set *use_reranker* to *True* to re-rank the context after retrieving it from the vector database.

In [31]:
# Both entries are overwritten in place; the next cell rebuilds the query
# engine, which then also receives "rerank_top_k" from rag_cfg.
rag_cfg["use_reranker"] = True
rag_cfg["hybrid_search_alpha"] = 1.0 # Using dense search

In [32]:
# Recreate the query engine so it picks up the updated rag_cfg
query_engine_args = set_query_engine_args(rag_cfg, docs)
pprint(query_engine_args)
query_engine = RAGQueryEngine(
    retriever_type=rag_cfg['retriever_type'], vector_index=index, llm_model_name=rag_cfg['llm_name']).create(**query_engine_args)

# Get the response for the same query as before
response = query_engine.query(query)

# Print the response next to the ground truth of the sample
print(f'QUERY: {query}')
print(f'RESPONSE: {response}')
print(f'YES/NO: {extract_yes_no(response.response)}')
print(f'GT ANSWER: {sample_elm["answer"]}')
print(f'GT LONG ANSWER: {sample_elm["long_answer"]}')

{'hybrid_search_alpha': 1.0,
 'query_mode': 'hybrid',
 'rerank_top_k': 3,
 'response_mode': 'compact',
 'similarity_top_k': 5,
 'use_reranker': True}
QUERY: Should all human immunodeficiency virus-infected patients with end-stage renal disease be excluded from transplantation?
RESPONSE:  Based on the context information provided, I would say no to the query. The majority of the responding transplant centers in the United States do not support excluding all HIV-infected patients with end-stage renal disease from transplantation, as they recognize that recent advances in the management and prognosis of HIV infection suggest that this policy should be reevaluated. While the majority of centers are hesitant to transplant HIV-infected patients, some centers are willing to consider transplantation in certain cases. Therefore, it is not a blanket exclusion.
YES/NO: no
GT ANSWER: ['no']
GT LONG ANSWER: The great majority of U.S. renal transplant centers will not transplant kidneys to HIV-infec

#### [OPTIONAL] Ragas evaluation

In [33]:
# Retrieve separately so the node texts can be supplied as evaluation contexts.
# NOTE(review): whether query_engine.retriever output reflects the re-ranking
# step cannot be determined from this notebook - confirm in utils.rag_utils.
retrieved_nodes = query_engine.retriever.retrieve(query)

# NOTE(review): the recorded output shows a deprecation warning for the
# 'ground_truths' column, recommending 'ground_truth' with a plain string per
# sample. Confirm that RagasEval accepts the new key before switching.
eval_data = {
    "question": [query],
    "answer": [response.response],
    "contexts": [[node.text for node in retrieved_nodes]],
    "ground_truths": [[sample_elm['long_answer']]],
    }

# Reuses `eval_obj` from the first (optional) evaluation cell, which must have been run.
eval_result = eval_obj.evaluate(eval_data)
print(eval_result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Invalid JSON response. Expected dictionary with key 'question'


{'faithfulness': 0.2500, 'answer_relevancy': nan, 'context_precision': 1.0000}
