### Install Hugging Face transformers

In [1]:
!pip install transformers torch



### Install FAISS for similarity search


In [1]:
!pip install faiss-gpu-cu12

Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy<2 (from faiss-gpu-cu12)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Downloading faiss_gpu_cu12-1.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.0/48.0 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m106.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, faiss-gpu-cu12
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy

In [1]:

from getpass import getpass

# Prompt for the GitHub token without echoing it, so it is not typed into the cell source.
token = getpass("Enter your GitHub token: ")
# IPython substitutes {token} into the shell command before running it.
# NOTE(review): the token becomes part of the clone URL; git presumably keeps that URL as the
# remote in the cloned repo's config -- confirm and scrub/rotate the token before sharing the runtime.
!git clone https://{token}@github.com/SRINIRAGZ/Medical_Asst_Bot.git

Enter your GitHub token: ··········
Cloning into 'Medical_Asst_Bot'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 18 (delta 3), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (18/18), 4.78 MiB | 18.62 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [2]:
from typing import List, Dict, Any, Optional, Tuple
import os, json, math, uuid
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import nltk
from sklearn.model_selection import train_test_split
import time
import copy

In [3]:
#Required with NLTK package
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Configurations

In [4]:
#1. Configs
class Config:
  """Central, tunable settings for the notebook (paths, model names, retrieval sizes)."""

  # --- data -----------------------------------------------------------------
  data_path: str = "/content/Medical_Asst_Bot/mle_screening_dataset.csv"
  # Word budget above which an answer is split into smaller chunks.
  max_tokens: int = 500

  # --- retrieval models -------------------------------------------------------
  # Bi-encoder used to embed chunks and queries (fast, 384-dim).
  embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
  # Optional cross-encoder used for reranking; alternative: "cross-encoder/ms-marco-MiniLM-L-6-v2".
  cross_encoder_model_name: Optional[str] = "cross-encoder/stsb-roberta-base"
  # Number of nearest neighbours requested from the index per query.
  top_k: int = 6
  # Character budget for the packed context handed to the generator.
  max_context_chars: int = 6000

  # --- persistence ------------------------------------------------------------
  faiss_index_path: str = "./qa_index.faiss"  # serialized FAISS index
  store_path: str = "./qa_store.jsonl"        # per-row metadata, one JSON object per line

  # --- generation -------------------------------------------------------------
  # Seq2seq model used to write the answers; alternative: "facebook/bart-large".
  gen_model: str = "google/flan-t5-base"


# Shared configuration instance used by the rest of the notebook.
CFG = Config()

In [5]:
#logger
def logging(text, time):
  """Build one fixed-width progress line.

  Args:
    text: message to show; only its first 100 characters are kept.
    time: elapsed seconds, rendered with two decimals and centred in 10 columns.

  Returns:
    The formatted line (the caller is responsible for printing it).
  """
  elapsed_col = format(time, "^10.2f")
  message_col = text[:100].ljust(100)
  return elapsed_col + "| " + message_col

### Read and Preprocess Data

In [7]:
#2. Read Data
class QASession:
  """Loads the Q&A CSV, cleans it, splits it and chunks the answers.

  Attributes populated by the methods below:
    df      -- the cleaned frame; after ``preprocess_data`` it holds one row per answer chunk.
    x_train -- list of row dicts (one per answer chunk) from the training split.
    x_val   -- list of ``[question, answer]`` pairs from the validation split.
    x_test  -- list of question strings from the test split.
  """

  def __init__(self):
    self.df = pd.DataFrame()

  def read_data(self, file_path: str):
    """Read the CSV at ``file_path`` into ``self.df``."""
    self.df = pd.read_csv(file_path)

  def preprocess_data(self):
    """Clean the data, assign ``qa_id``, build the splits and chunk the answers."""
    # Chained (non in-place) cleaning: rows with missing values and exact duplicates are dropped.
    cleaned = self.df.dropna().drop_duplicates()

    # The surviving original row index becomes the stable id of each Q&A pair.
    self.df = cleaned.reset_index().rename(columns={'index': 'qa_id'})

    # The splits are taken from the un-chunked frame so a Q&A pair never straddles two splits.
    self.train_val_test_splitting()

    # Keep a chunked copy of the full data on the session for exploration.
    self.df = self.chunk_pipeline(self.df).copy(deep=True)

  def chunk_text(self, text: str, max_tokens: int = CFG.max_tokens) -> List[str]:
    """Split ``text`` into chunks, using whitespace-separated words as the size measure.

    * Text with at most ``max_tokens`` words is returned as a single stripped chunk.
    * Longer text is split on blank lines; a paragraph within the budget becomes one chunk,
      a paragraph above it is broken into one chunk per sentence (``nltk.sent_tokenize``).

    Empty pieces (e.g. produced by runs of blank lines) are dropped so that no empty
    chunk is ever embedded or indexed.
    """
    if len(text.split()) <= max_tokens:
      return [text.strip()]

    chunks: List[str] = []
    for paragraph in text.split("\n\n"):
      if len(paragraph.split()) <= max_tokens:
        pieces = [paragraph.strip()]
      else:
        pieces = nltk.sent_tokenize(paragraph.strip())
      # Filter out blanks: "\n\n\n\n" yields empty paragraphs that carry no information.
      chunks.extend(piece for piece in pieces if piece)
    return chunks

  def chunk_pipeline(self, data: pd.DataFrame):
    """Return a copy of ``data`` with one row per answer chunk.

    Adds ``ans_chunk`` (the chunk text), ``chunk_idx`` (position of the chunk within its
    ``qa_id``) and ``text`` (question and chunk combined, the string that gets embedded).
    Expects ``question``, ``answer`` and ``qa_id`` columns.
    """
    dt = data.copy()
    dt['ans_chunk'] = dt['answer'].apply(self.chunk_text)
    dt = dt.explode('ans_chunk').reset_index(drop=True)
    dt['chunk_idx'] = dt.groupby('qa_id').cumcount()
    # Vectorized concatenation: same string as the per-row f-string, but it also works on an
    # empty frame (a row-wise apply returns a DataFrame there and the assignment fails).
    dt['text'] = 'Question: ' + dt['question'].astype(str) + ' \n Answer: ' + dt['ans_chunk'].astype(str)
    return dt

  def train_val_test_splitting(self, val=0.01, test=0.01):
    """Create ``x_train``, ``x_val`` and ``x_test`` from ``self.df``.

    ``test`` is the fraction of all rows held out for the final run; ``val`` is the fraction
    of the *remaining* rows held out for retriever evaluation. Both splits use a fixed seed.
    """
    remainder, test_part = train_test_split(self.df, test_size=test, random_state=42)
    train_part, val_part = train_test_split(remainder, test_size=val, random_state=42)

    self.x_test = test_part.question.tolist()
    self.x_val = val_part[['question', 'answer']].values.tolist()

    # Only the training answers are chunked; they are what gets embedded and indexed.
    train_chunks = self.chunk_pipeline(train_part).copy(deep=True)
    self.x_train = train_chunks.to_dict('records')




## Creating Vector Indices and vector search

In [8]:

''' 3. Create Vector indices and storing it in vector store
  it also provides functionality for FAISS similarity search'''
class Vectordb:
    """FAISS-backed vector store with optional cross-encoder reranking.

    Rows are plain dicts; each row added to the store receives a ``row_id`` equal to its
    position in the FAISS index, and ``id_to_row`` maps that id back to the row.
    """

    def __init__(self, index_path: str, store_path: str, embd_model: str, crossEnc_model: str=None):
        """Load the embedding model (and the cross-encoder, if named) and start an empty index.

        Args:
            index_path: file the FAISS index is written to by ``save``.
            store_path: JSONL file the row metadata is written to by ``save``.
            embd_model: sentence-transformers model name used for embeddings.
            crossEnc_model: optional cross-encoder model name used for reranking.
        """
        self.embd_model = SentenceTransformer(embd_model)
        self.crossEnc_model = CrossEncoder(crossEnc_model) if crossEnc_model else None
        self.dim = self.embd_model.get_sentence_embedding_dimension()
        self.index_path = index_path
        self.store_path = store_path
        self.id_to_row: Dict[int, Dict[str, Any]] = {}
        self._load()

    def encode(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` as L2-normalized vectors (so inner product equals cosine similarity)."""
        return np.asarray(self.embd_model.encode(texts, normalize_embeddings=True, show_progress_bar=False))

    def _new_index(self):
        """Create an empty inner-product index of the embedding dimension."""
        return faiss.IndexFlatIP(self.dim)

    def _load(self):
        """Start from a clean slate: delete any previously saved files and create a new index."""
        for path in (self.index_path, self.store_path):
            if os.path.exists(path):
                os.remove(path)
        self.index = self._new_index()

    def save(self):
        """Write the index to ``index_path`` and the rows to ``store_path`` (one JSON per line)."""
        faiss.write_index(self.index, self.index_path)
        with open(self.store_path, "w", encoding="utf-8") as f:
            for row in self.id_to_row.values():
                f.write(json.dumps(row, ensure_ascii=False) + "\n")

    def add(self, vecs: np.ndarray, rows: List[Dict[str, Any]]):
        """Add ``vecs`` to the index and register the matching ``rows``.

        Each row dict is mutated in place: it gains a ``row_id`` key.

        Raises:
            ValueError: if the number of vectors and rows differ, which would silently
                misalign index positions and row ids.
        """
        if len(vecs) != len(rows):
            raise ValueError(f"got {len(vecs)} vectors for {len(rows)} rows")
        start = len(self.id_to_row)
        self.index.add(vecs.astype(np.float32))
        for offset, row in enumerate(rows):
            rid = start + offset
            row["row_id"] = rid
            self.id_to_row[rid] = row

    def search(self, query: List, top_k: int, max_candidates: int = 40) -> List:
        """Retrieve candidate rows for every query.

        For each query the ``top_k`` nearest chunks are looked up, the ``qa_id`` values they
        belong to are collected, and *all* chunks of those Q&A pairs become candidates. When a
        cross-encoder is configured the candidates are reranked. At most ``max_candidates``
        rows are kept per query.

        Args:
            query: list of query strings (a single string is accepted as well).
            top_k: number of nearest neighbours requested from the index.
            max_candidates: cap on the rows returned per query.

        Returns:
            One ``(query, rows)`` tuple per input query, in input order. A query without any
            hit still gets an entry (with an empty list) so results stay aligned with inputs.
        """
        if isinstance(query, str):
            query = [query]
        if len(query) == 0:
            return []

        # Use the same normalized encoding as the indexed vectors.
        query_vec = self.encode(query).astype(np.float32)
        _, neighbour_ids = self.index.search(query_vec, top_k)

        results = []
        for q, hit_ids in zip(query, neighbour_ids):
            # Ids that are not registered rows (e.g. padding labels) are ignored.
            qaids = {self.id_to_row[int(c)]['qa_id'] for c in hit_ids if int(c) in self.id_to_row}
            # Expand to every chunk of the matched Q&A pairs, in row-id order.
            rows = [r for r in self.id_to_row.values() if r['qa_id'] in qaids]
            if self.crossEnc_model and rows:
                _, rows = self.rerank(q, rows)
            # Slice the row list itself; slicing the (query, rows) tuple would truncate nothing.
            results.append((q, rows[:max_candidates]))
        return results

    def rerank(self, query: str, candidates: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
        """Order ``candidates`` by cross-encoder score for ``query``, best first.

        Ties are broken by ``qa_id`` and then ``chunk_idx`` (both descending).

        Returns:
            ``(query, ranked_rows)``.
        """
        if not candidates:
            return (query, [])
        pairs = [(query, c['text'].strip()) for c in candidates]
        scores = self.crossEnc_model.predict(pairs)
        ranked = sorted(
            zip(scores, candidates),
            key=lambda sc: (sc[0], sc[1]['qa_id'], sc[1]['chunk_idx']),
            reverse=True,
        )
        return (query, [c for _, c in ranked])



## Generator model for generating answers from context

In [9]:
'''Using RAG pipeling to feed the prompt created from question and context to feed to generator to generate the answers to the queries'''

# Prompt skeleton for the generator. It has two str.format placeholders, {QUESTION} and
# {CONTEXT}, and ends with "ANSWER:" so the model's completion is the answer itself.
PROMPT_TEMPLATE = (
    "Being a medical assistant, use the CONTEXT to answer the QUESTION concisely and accurately in fewer than 200 words.\n"
    "If the answer is not in the context, say you don't know.\n\n"
    "QUESTION:\n{QUESTION}\n\n"
    "CONTEXT:\n{CONTEXT}\n\n"
    "ANSWER:"
)

#Pack the contextswith right formatting
def pack_context(queries: List[Tuple[str, List[Dict[str, Any]]]],
                 max_context_chars: Optional[int] = None) -> List[Tuple[str, str]]:
  """Turn retrieved rows into one context string per query.

  Chunks are concatenated in the given order, each preceded by a single space; a blank line
  is inserted whenever the ``qa_id`` changes from one chunk to the next.

  Args:
    queries: ``(question, rows)`` pairs; every row needs ``qa_id`` and ``ans_chunk``.
    max_context_chars: optional cap; when given, each context is cut to that many
      characters. ``None`` (the default) leaves the context untouched.

  Returns:
    A list of ``(question, context)`` tuples, one per input pair, in input order.
  """
  contexts = []
  for q, cand in queries:
    parts = []
    prev_qaid = None
    for pos, c in enumerate(cand):
      qa_id = c['qa_id']
      # Position-based check instead of a -1 sentinel, so a real qa_id of -1 is handled too.
      if pos > 0 and qa_id != prev_qaid:
        parts.append("\n\n")
      parts.append(' ' + c["ans_chunk"])
      prev_qaid = qa_id
    context = "".join(parts)
    if max_context_chars is not None:
      context = context[:max_context_chars]
    contexts.append((q, context))
  return contexts


#Create a generator model
class HuggingFace_Generator():
  """Thin wrapper around a Hugging Face text2text-generation pipeline."""

  # Used when no model name is passed; alternative: "facebook/bart-large".
  DEFAULT_MODEL = "google/flan-t5-base"

  def __init__(self, model: str=None):
    """Load the model, its tokenizer and the generation pipeline once.

    Args:
      model: Hugging Face model name; falls back to ``DEFAULT_MODEL`` when omitted.
    """
    # Honour the requested model instead of always loading the hard-coded default.
    self.model_name = model or self.DEFAULT_MODEL
    self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
    # Built here rather than inside generate(), so they are not re-created on every call.
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    self._generator = pipeline("text2text-generation", model=self.model, tokenizer=self.tokenizer)

  def generate(self, prompt: str) -> str:
    """Generate an answer for ``prompt`` (beam search, up to 300 new tokens).

    Returns:
      The generated text after the last "ANSWER:" marker (or all of it when the marker
      is absent), stripped of surrounding whitespace.
    """
    # NOTE(review): the prompt is passed through untruncated -- confirm it fits the
    # model's maximum input length, or shorten the context before calling.
    output = self._generator(prompt, max_new_tokens=300, num_beams=3)
    final = output[0]['generated_text']
    return final.split("ANSWER:")[-1].strip()


#Rag Pipeline
class RAGPipeline:
  """Retrieval-augmented generation: packs retrieved rows into a prompt and generates answers."""

  def __init__(self, generator: HuggingFace_Generator):
    self.generator = generator

  def answer(self, test_candidates: List[Tuple]) -> List[Dict[str, Any]]:
    """Generate one answer per ``(question, rows)`` pair.

    Args:
      test_candidates: ``(question, rows)`` tuples; each row needs ``qa_id``,
        ``chunk_idx`` and ``ans_chunk``.

    Returns:
      One dict per input with ``question``, ``answer``, ``sources`` (the ``qa_id`` /
      ``chunk_index`` of every row that went into the context) and ``raw_context``.
    """
    # Time against a local start instead of a global defined in some other cell.
    started = time.time()
    contexts = pack_context(test_candidates)
    ret = []
    for i, (question, context) in enumerate(contexts):
      # logging() only formats the line; it has to be printed to show up.
      print(logging(f'Generating results for Question idx: {i}', time.time() - started))
      prompt = PROMPT_TEMPLATE.format(QUESTION=question, CONTEXT=context)
      answer = self.generator.generate(prompt)
      sources = [{"qa_id": h["qa_id"], "chunk_index": h["chunk_idx"]} for h in test_candidates[i][1]]
      ret.append({"question": question, "answer": answer,
                  "sources": sources,
                  "raw_context": context,})
    return ret

## Model Evaluation Technique

In [30]:
'''Using precision @k and recall @k to evaluate the model from validation data. more than exact ranking of the candidates, relevant candidates appearing in top k matters the most
'''
#Retriever Evaluation
#Generates similarity scores between answers for the matches instead of exact matches
class Retriever_Eval():
  """Precision@k / recall@k for the retriever, using embedding similarity as relevance.

  A retrieved chunk counts as relevant when its cosine similarity to the ground-truth
  answer is at least ``threshold``.

  ``validation_data`` rows are lists ``[question, ground_truth_answer, retrieved_chunks]``;
  ``similarity_score`` appends a fourth element, the list of similarity scores.
  """

  def __init__(self, train_answers, validation_data, eval_model: str, threshold=0.5, k=10):
    """Copy the inputs, load the similarity model and pre-compute all scores.

    Args:
      train_answers: the answers in the indexed corpus (used for the recall denominator).
      validation_data: rows as described on the class.
      eval_model: sentence-transformers model name used to measure similarity.
      threshold: minimum cosine similarity for an item to count as relevant.
      k: cut-off rank for both metrics.
    """
    self.data = copy.deepcopy(validation_data)
    self.sen_tfr_model = SentenceTransformer(eval_model)
    self.threshold = threshold
    self.train_answers = copy.deepcopy(train_answers)
    self.k = k
    self.relevance_cnt = self.for_recall()
    self.similarity_score()

  def _embed(self, texts) -> np.ndarray:
    """Encode ``texts`` in one batch as L2-normalized float32 rows (dot product == cosine)."""
    vecs = self.sen_tfr_model.encode(list(texts), normalize_embeddings=True, show_progress_bar=False)
    return np.asarray(vecs, dtype=np.float32)

  def similarity_score(self):
    """Append to every row the similarity of each retrieved chunk to the ground truth."""
    print(f"computing similarity score")
    if len(self.data) == 0:
      return
    # Ground truths are embedded once, not once per candidate.
    truth_vecs = self._embed([row[1] for row in self.data])
    for row, truth_vec in zip(self.data, truth_vecs):
      candidates = row[2]
      if len(candidates) > 0:
        scores = (self._embed(candidates) @ truth_vec).tolist()
      else:
        scores = []
      # Plain floats, so later threshold comparisons yield ordinary booleans.
      row.append(scores)

  def for_recall(self):
    """Count, per query, the corpus answers relevant to its ground truth.

    Stores the per-query counts in ``self.relevance_per_query`` and returns their total.
    """
    print(f"computing similarity score from the entire repo of answers")
    self.relevance_per_query = [0] * len(self.data)
    if len(self.data) == 0 or len(self.train_answers) == 0:
      return 0
    # Two batched encodes and one matrix product replace an encode call per (query, answer) pair.
    truth_vecs = self._embed([row[1] for row in self.data])
    train_vecs = self._embed(self.train_answers)
    relevant = (truth_vecs @ train_vecs.T) >= self.threshold
    self.relevance_per_query = [int(n) for n in relevant.sum(axis=1)]
    return int(sum(self.relevance_per_query))

  def precision_at_k(self):
    """Mean over queries of (relevant items among the first k retrieved) / k."""
    print(f"Evaluating precision @ {self.k}")
    p_k = []
    for row in self.data:
      top_scores = row[3][:self.k]
      hits = sum(1 for s in top_scores if s >= self.threshold)
      p_k.append(hits / self.k)
    return sum(p_k) / len(p_k) if len(p_k) >= 1 else 0

  def recall_at_k(self):
    """Mean over queries of (relevant items among the first k retrieved) / (relevant items for that query).

    Uses the same ``threshold`` as the relevance counts. A query with no relevant corpus
    answer contributes 0. Retrieved items are chunks while the denominator counts whole
    answers, so the ratio is capped at 1.
    """
    r_k = []
    for row, n_relevant in zip(self.data, self.relevance_per_query):
      if n_relevant == 0:
        r_k.append(0.0)
        continue
      top_scores = row[3][:self.k]
      hits = sum(1 for s in top_scores if s >= self.threshold)
      r_k.append(min(1.0, hits / n_relevant))
    return sum(r_k) / len(r_k) if len(r_k) >= 1 else 0

## Main

### 1. Data input object

In [11]:

#Main
#1. Read Data
# `t` is the wall-clock reference for every log line printed in this cell.
t = time.time()
print(logging("Reading Data",time.time()-t))
qa_session = QASession()
# Load the CSV configured in CFG, then clean, split and chunk it.
qa_session.read_data(CFG.data_path)
qa_session.preprocess_data()
print(logging("Data Read Complete",time.time()-t))

   0.00   | Reading Data                                                                                        
   3.96   | Data Read Complete                                                                                  


### 2. Creating vector Database

In [12]:
# 2. Build Vector Database
# Restart the timer for this cell's log lines.
t = time.time()
print(logging("Creating Vector DB",time.time()-t))
# Constructing the store loads the embedding model and the cross-encoder named in CFG,
# removes any index/store files already present at the configured paths, and starts an empty index.
vdb = Vectordb(index_path=CFG.faiss_index_path, store_path=CFG.store_path, embd_model=CFG.embedding_model_name, crossEnc_model=CFG.cross_encoder_model_name)
print(logging("Created Vector DB",time.time()-t))

   0.00   | Creating Vector DB                                                                                  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

  16.27   | Created Vector DB                                                                                   


### 3. vector indexing the q&a sessions from training data

In [13]:
#3. Encode and Add q&a vectors to vector database
# Restart the timer for this cell's log lines.
t = time.time()
print(logging("Embedding Q&A session",time.time()-t))
# The comprehension's own `t` is scoped to the comprehension and does not overwrite the timer above.
embd_txt = [ t['text'] for t in qa_session.x_train]
vecs = vdb.encode(embd_txt)
print(logging("Adding Q&A session Vectors to Vector DB",time.time()-t))
# add() also writes a `row_id` key into each dict of qa_session.x_train.
vdb.add(vecs, qa_session.x_train)
print(logging("Saving to Vector DB",time.time()-t))
vdb.save()

   0.00   | Embedding Q&A session                                                                               
  22.54   | Adding Q&A session Vectors to Vector DB                                                             
  22.64   | Saving to Vector DB                                                                                 


### 4. Prepping the Validation data and doing similarity search  

In [20]:
#Validation Data Model Evaluation
# Restart the timer for this cell's log lines.
t = time.time()
print(logging("Validatin Query Search",time.time()-t))
# x_val holds [question, answer] pairs; split them into parallel lists.
val_queries = [ q for q,_ in qa_session.x_val]
val_ground_truth = [ a for _,a in qa_session.x_val]
val_candidates = vdb.search(val_queries, top_k=CFG.top_k)
# print(val_candidates[0])
# Reshape each (query, rows) result into [query, ground-truth answer, [retrieved chunk texts]].
# The ground truth is matched by position, so results must be in the same order as val_queries.
val_candidates = [[c[0],val_ground_truth[i],[ cd['ans_chunk'] for cd in c[1]]] for i,c in enumerate(val_candidates)]
# Collect the distinct full answers of the training split.

anslst = set()
for qa in qa_session.x_train:
  anslst.add(qa['answer'])
anslst = list(anslst)




   0.00   | Validatin Query Search                                                                              


### 5. Model Validation

In [29]:
#validation evaluation
# NOTE(review): this cell does not reset `t`, so the elapsed times are measured from
# whichever earlier cell last assigned it.
print(logging("Validatin Query Evaluation",time.time()-t))
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the session.
# Retriever_Eval is built with its default threshold and k.
eval = Retriever_Eval(anslst, val_candidates, eval_model=CFG.embedding_model_name)

# The literal 10 in the label is written by hand; the metric itself uses the evaluator's k.
print(logging(f"Precision@{10} score: {eval.precision_at_k()}",time.time()-t))
# print(logging(f"Recall@{10} score: {eval.recall_at_k()}",time.time()-t))

  760.13  | Validatin Query Evaluation                                                                          
computing similarity score
Evaluating precision @ 10
  894.62  | Precision@10 score: 0.4185185185185185                                                              


### 6. Searching the vector db on test data for similar candidates

In [16]:
#4. Similarity search on Vectordb on the test queries
# NOTE(review): `t` is not reset here, so the elapsed time is measured from an earlier cell.
print(logging("Test Query Search",time.time()-t))
# x_test is the list of held-out question strings.
test_queries = qa_session.x_test
test_candidates = vdb.search(test_queries, top_k=CFG.top_k)

  216.41  | Test Query Search                                                                                   


### 7. Generating answers using RAG pipeline for the test queries


In [17]:
#5. RAG Pipeline and test candidates for generation
# NOTE(review): `t` is not reset here, so the elapsed times are measured from an earlier cell.
print(logging("Creating RAG Pipeline",time.time()-t))
generator = HuggingFace_Generator(model=CFG.gen_model)  # swap to OpenAIGenerator() or your preferred provider
rag = RAGPipeline(generator)

print(logging("Generating Medical BOT Responses",time.time()-t))
# Generate answers for the first 20 test queries only.
result = rag.answer(test_candidates[:20])

print(f'{len(result)} queries answered')
# Print each question with its generated answer, numbered from 1.
for i,r in enumerate(result):
  print(f"{i+1}). {r['question']}\nAnswer: {r['answer']}")
  # Optional debugging output: the source chunks and the raw context for each answer.
  # print("Sources:")
  # for s in r['sources']:
  #   print(f"\t- qa_id: {s['qa_id']}, chunk_index: {s['chunk_index']}")
  # print(f"\nRaw Context:\n{r['raw_context']}")

  246.18  | Creating RAG Pipeline                                                                               


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  251.54  | Generating Medical BOT Responses                                                                    


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (3500 > 512). Running this sequence through the model will result in indexing errors
Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (2449 > 512). Running this sequence through the model will result in indexing errors
Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (4909 > 512). Running this sequence through the model will result in indexing errors
Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (3805 > 512). Running this sequence through the model will result in indexing errors
Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (3398 > 512). Running this sequence through t

20 queries answered
1). What are the symptoms of Breast Cancer ?
Answer: Signs of breast cancer include a lump or change in the breast. These and other signs may be caused by breast cancer or by other conditions. Check with your doctor if you have any of the following symptoms:
2). What are the symptoms of Thyrotoxic periodic paralysis ?
Answer: Signs and Symptoms Approximate number of patients (when available) Abnormality of the fontanelles or cranial sutures 90% Abnormality of the liver 90% Abnormality of the tongue 90% Aplasia/Hypoplasia of the abdominal wall musculature 90% Coarse facial features 90% Constipation 90% Muscular hypothyroidism 90% Sleep disturbance 90% Umbilical hernia 90% Abnormality of metabolism/homeostasis
3). What are the symptoms of Duane syndrome type 3 ?
Answer: Signs and Symptoms of Duane syndrome type 3. Signs and Symptoms of Duane syndrome type 2. Signs and Symptoms of Duane syndrome type 2. Signs and Symptoms of Duane syndrome type 1. Signs and Symptoms of