# **Question Answering System**

In [1]:
# Install the notebook's third-party dependencies into the active kernel environment.
# python-dotenv: provides load_dotenv(), used to read settings from a .env file.
%pip install python-dotenv
# elasticsearch: Python client imported in the imports cell.
%pip install elasticsearch
# langchain: text splitters, retrievers and chains used throughout the notebook.
%pip install langchain
# datasets: Hugging Face datasets library (load_dataset is imported below).
%pip install datasets
# rank_bm25: presumably the backend required by BM25Retriever — TODO confirm.
%pip install rank_bm25
# sentence-transformers: SentenceTransformer is imported below; -U upgrades an existing install.
%pip install -U sentence-transformers
# accelerate: presumably needed for device_map='auto' when loading the LLM — TODO confirm.
%pip install accelerate



In [2]:
import os
from tqdm import tqdm
from datasets import load_dataset
from elasticsearch import Elasticsearch
from langchain_community.vectorstores import ElasticsearchStore
from langchain_community.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
# from opensearch_utils import *
# from huggingface_utils import *


# from langchain_community.vectorstores import OpenSearchVectorSearch
# from langchain.embeddings.openai import OpenAIEmbeddings

# Switch for the indexing workflow: cells guarded by `if IR_indexing:` only run
# their body when this flag is True.
IR_indexing = False

In [3]:
from dotenv import load_dotenv

# Merge the variables declared in a local .env file into the process environment.
load_dotenv()

# Hugging Face settings read from the environment (each is None when unset).
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
HUGGINGFACE_USERNAME = os.environ.get('HUGGINGFACE_USERNAME')
HUGGINGFACE_DATASET_NAME = os.environ.get('HUGGINGFACE_DATASET_NAME')

# Elastic Cloud connection settings. Where to find them:
#   cloud id: https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id
#   API key:  https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key
ELASTIC_CLOUD_ID = os.environ.get('ELASTIC_CLOUD_ID')
ELASTIC_API_KEY = os.environ.get('ELASTIC_API_KEY')

## **Information Retrieval**

In [5]:
from torch import cuda
from langchain_community.embeddings import HuggingFaceEmbeddings

## https://huggingface.co/NeuML/pubmedbert-base-embeddings
model_name = "NeuML/pubmedbert-base-embeddings"
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so the
# notebook still runs on machines without a GPU (a hard-coded 'cuda:0' fails there).
device = 'cuda:0' if cuda.is_available() else 'cpu'

# Embedding model shared by the Elasticsearch vector store and the text splitter;
# the same device is used for loading the model and for encoding.
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs={'device': device}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Name of the Elasticsearch index backing the vector store
# (previously index="test_pubmed_split").
index_name = 'qa_project_pubmedbert'

# Vector store bound to the Elastic Cloud deployment, using `embeddings` as its embedding model.
elastic_vector_search = ElasticsearchStore(
    index_name=index_name,
    embedding=embeddings,
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
)

# Print the cluster info as a quick connectivity check.
print(elastic_vector_search.client.info())
# To drop a leftover index:
# elastic_vector_search.client.indices.delete(index="test_index_sadasd", ignore=400)

{'name': 'instance-0000000001', 'cluster_name': 'c43649c2d54c4738aa0d6ec1a406da5e', 'cluster_uuid': '-Q4oeHW9TM-Ri5weeMvz5Q', 'version': {'number': '8.12.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '1665f706fd9354802c02146c1e6b5c0fbcddfbc9', 'build_date': '2024-01-11T10:05:27.953830042Z', 'build_snapshot': False, 'lucene_version': '9.9.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [None]:
from langchain_community.document_loaders import HuggingFaceDatasetLoader

# Indexing only: load the Hugging Face dataset, taking the text of each document
# from its 'Abstract' column.
if IR_indexing:
    loader = HuggingFaceDatasetLoader(
        HUGGINGFACE_DATASET_NAME,
        page_content_column='Abstract',
        use_auth_token=HUGGINGFACE_TOKEN,
    )
    data = loader.load()



Downloading readme:   0%|          | 0.00/95.0 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/75.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Additional LangChain add-on packages, installed with reduced output.
# NOTE(review): no visible cell imports langchain_experimental or langchain_openai — confirm this install is still needed.
%pip install --quiet langchain_experimental langchain_openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.8/166.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
tensorflow-probability 0.22.0 requi

In [None]:
## TODO: experiment with different chunking mechanisms

# Indexing only: cut the loaded documents into token-based chunks (400 tokens per
# chunk, 10 tokens of overlap), configured with the same model name as the embeddings.
if IR_indexing:
    text_splitter = SentenceTransformersTokenTextSplitter(
        model_name=model_name,
        tokens_per_chunk=400,
        chunk_overlap=10,
    )
    split_data = text_splitter.split_documents(data)
    print(len(split_data))

33732

Perform hybrid search by combining a keyword-based BM25 retriever with the Elasticsearch vector retriever in an ensemble retriever.


https://python.langchain.com/docs/integrations/vectorstores/elasticsearch

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Indexing only: build the BM25 retriever from the chunked documents.
if IR_indexing:
    bm25_retriever = BM25Retriever.from_documents(split_data)

# Manual spot check (disabled):
# result = bm25_retriever.get_relevant_documents("achondroplasia")
# result

In [None]:

## TODO: define custom index with vector field for generated questions
'''
When hybrid is enabled, the query performed will be a combination of approximate semantic search and keyword based search.

It will use rrf (Reciprocal Rank Fusion) to balance the two scores from different retrieval methods.

Every iteration of this will add data to ElasticSearch, must remove the index before
'''

## delete index first
# elastic_vector_search.client.indices.delete(index=index_name, ignore=400)

# Indexing only: embed the chunks, write them to the Elasticsearch index with the
# hybrid approximate-retrieval strategy, then refresh the index.
if IR_indexing:
    db = ElasticsearchStore.from_documents(
        split_data,
        embeddings,
        index_name=index_name,
        es_cloud_id=ELASTIC_CLOUD_ID,
        es_api_key=ELASTIC_API_KEY,
        strategy=ElasticsearchStore.ApproxRetrievalStrategy(hybrid=True),
        distance_strategy="COSINE",
    )

    db.client.indices.refresh(index=index_name)

ObjectApiResponse({'_shards': {'total': 2, 'successful': 2, 'failed': 0}})

In [None]:
# Indexing only: combine the BM25 retriever with the Elasticsearch retriever and
# print the documents returned for a sample query.
if IR_indexing:
    neuro_retriever = db.as_retriever()

    # Weight given to BM25; the Elasticsearch retriever receives the remainder.
    bm25_retriever_weight = 0.5

    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, neuro_retriever],
        weights=[bm25_retriever_weight, 1 - bm25_retriever_weight],
    )

    results = ensemble_retriever.invoke("achondroplasia")
    for result in results:
        print(result)

page_content='legs with radiographic evaluation and referral to an orthopedist if necessary in adults clinical history and neurologic examination to screen for spinal stenosis with development of any new signs or symptoms or at least every three to five years discuss social adjustment at each visit with primary care provider agents circumstances to avoid rear facing car seats should be used as long as possible to avoid injury from motor vehicle accident avoid soft back infant seats and front carriers without a firm back avoid activities in which there is risk of injury to the craniocervical junction such as collision sports use of a trampoline diving from diving boards vaulting in gymnastics and hanging upside down from the knees or feet on playground equipment due to risk of falling onto the head or neck pregnancy management pregnant women with achondroplasia must undergo cesarean section delivery because of small pelvic size genetic counseling achondroplasia is inherited in an autoso

In [None]:
# Indexing only: run a similarity search against the newly built index, then print
# the first hit and the 'Published_date' metadata of every hit.
# (The guard keyword must be lowercase `if`; `IF` is a SyntaxError.)
if IR_indexing:

    query = "What is the recommended mode of delivery for pregnant women with achondroplasia?"
    results = db.similarity_search(query)
    print(results[0])
    print([x.metadata['Published_date'] for x in results])

page_content='legs with radiographic evaluation and referral to an orthopedist if necessary in adults clinical history and neurologic examination to screen for spinal stenosis with development of any new signs or symptoms or at least every three to five years discuss social adjustment at each visit with primary care provider agents circumstances to avoid rear facing car seats should be used as long as possible to avoid injury from motor vehicle accident avoid soft back infant seats and front carriers without a firm back avoid activities in which there is risk of injury to the craniocervical junction such as collision sports use of a trampoline diving from diving boards vaulting in gymnastics and hanging upside down from the knees or feet on playground equipment due to risk of falling onto the head or neck pregnancy management pregnant women with achondroplasia must undergo cesarean section delivery because of small pelvic size genetic counseling achondroplasia is inherited in an autoso

In [None]:
# Indexing only: ask the same question, restricted to documents dated 2000-01-01 or later.
if IR_indexing:
    docs = db.similarity_search(
        "What is the recommended mode of delivery for pregnant women with achondroplasia?",
        # Filter on the metadata key the documents expose ('Published_date', the key
        # read from result metadata elsewhere in this notebook). The previous
        # 'metadata.date' field produced an empty result list.
        # NOTE(review): assumes Published_date is indexed in a form that supports
        # this range comparison — confirm against the index mapping.
        filter=[{"range": {"metadata.Published_date": {"gte": "2000-01-01"}}}],
    )
    print(docs)

[]


## **Text Generation Pipeline**

In [7]:
from torch import cuda, bfloat16
import transformers

# Hugging Face Hub id of the chat model used by the following cells.
model_id = 'meta-llama/Llama-2-7b-chat-hf'

In [8]:
# Fetch the model configuration for `model_id`. Authentication uses `token`, which
# replaces the deprecated `use_auth_token` argument and matches the model-loading cell.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=HUGGINGFACE_TOKEN
)



In [9]:
# Load the causal language model for `model_id` with the configuration fetched above.
# `trust_remote_code` is deliberately left at its default: enabling it allows Python
# code shipped in the model repository to execute, and Llama 2 is supported natively
# by transformers, so it is not needed.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    device_map='auto',
    token=HUGGINGFACE_TOKEN,
)
model.eval()  # we only use the model for inference
print("Model loaded ")

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



Model loaded 


In [10]:
from transformers import LlamaTokenizer

# Load the tokenizer from the same checkpoint as the model (`model_id`) instead of a
# separately hard-coded 13b repository, so the two cannot drift apart. Authentication
# uses `token`, the replacement for the deprecated `use_auth_token` argument.
tokenizer = LlamaTokenizer.from_pretrained(model_id, token=HUGGINGFACE_TOKEN)



In [11]:
# Text-generation pipeline built from the loaded model and tokenizer.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    max_new_tokens=512,  # upper bound on the number of tokens generated per call
    temperature=0.01,
    repetition_penalty=1.1,  # without this output begins repeating
)

In [12]:
# Imported from langchain_community, like the notebook's other LangChain integrations;
# the `langchain.llms` path is a deprecated alias.
from langchain_community.llms import HuggingFacePipeline

# Wrap the transformers pipeline so it can be passed to LangChain chains as the LLM.
llm = HuggingFacePipeline(pipeline=generate_text)

In [13]:
from langchain.chains import RetrievalQA

# Retrieval QA chain of type "stuff": the Elasticsearch retriever is asked for
# k=3 documents per query and `llm` produces the answer. Verbose output is enabled
# for the chain itself and through chain_type_kwargs.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=elastic_vector_search.as_retriever(search_kwargs={"k": 3}),
    chain_type="stuff",
    chain_type_kwargs={"verbose": True},
    verbose=True,
)

In [None]:
# Ask the RAG chain a sample question; the returned value is shown as the cell output.
query = "What is the recommended mode of delivery for pregnant women with achondroplasia?"
# Use .invoke() with the chain's "query" input key: calling the chain object
# directly is deprecated in LangChain.
rag_pipeline.invoke({"query": query})