In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# `%%capture` has to stay on the first line of the cell; it keeps pip's output out of the notebook.
# The first command installs Unsloth with its "colab-new" extra straight from the GitHub repository.
# NOTE(review): that install is unpinned, so a later re-run may pull a different revision.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# `--no-deps` installs only the packages listed here; xformers and trl are capped below the given versions.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options for the chat model; kept as module-level names so other cells can read them.
max_seq_length = 2048
dtype = None  # None for auto detection.
load_in_4bit = True  # 4bit quantization.

# Collect the keyword arguments first, then load the model and its tokenizer in one call.
load_options = dict(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

Connecting to the vector store (MongoDB Atlas)

In [4]:
import os
from getpass import getpass

import pymongo
from pymongo import MongoClient

# The Atlas connection string embeds the database user's password, so it must not be
# stored in the notebook. It is taken from the ATLAS_CONNECTION_STRING environment
# variable; when that is missing or empty, it is asked for with a hidden prompt.
# NOTE(review): any password that was ever committed with this notebook should be rotated.
ATLAS_CONNECTION_STRING = os.environ.get("ATLAS_CONNECTION_STRING") or getpass(
    "MongoDB Atlas connection string: "
)

client = MongoClient(ATLAS_CONNECTION_STRING)

# Database / collection holding the medical Q&A documents and their embeddings.
db_name = "Project_work"
collection_name = "medical_data"
collection = client[db_name][collection_name]

# Same value that the `$vectorSearch` stages pass as `index`.
vector_search_index = "vector_index"

In [5]:
# Connectivity check: open a client with a 5 s server-selection timeout and force one
# round trip, so a wrong URI or an unreachable cluster is reported here instead of
# surfacing later in the middle of a query.
# The URI is reused from ATLAS_CONNECTION_STRING so the credentials live in one place only.
# Note that `client` is rebound here, while `collection` stays bound to the client
# created in the connection cell.
try:
    client = MongoClient(ATLAS_CONNECTION_STRING, serverSelectionTimeoutMS=5000)
    client.server_info()  # Trigger a server request
    print("Connection successful")
except Exception as e:
    # Deliberately best-effort: report the failure without stopping the notebook.
    print("Connection failed:", e)

Connection successful


Using the BERT model for vector embedding

In [None]:
from transformers import BertModel, BertTokenizer

# BERT tokenizer + encoder used to embed questions for vector search. The `_2` suffix
# keeps them apart from the Llama `model` / `tokenizer` loaded earlier.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer_2 = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model_2 = BertModel.from_pretrained(BERT_CHECKPOINT)

In [106]:
# The question being answered. It is embedded for retrieval, expanded into paraphrases,
# and inserted into the grading and answer prompts further down.
user_question = "Hello doctor, I have acne scars. What medical treatments should I use?"

Single-query implementation

In [107]:
# Embed the question with BERT. The hidden state at the first token position
# (BERT's [CLS] token) is used as the query vector.
# This is inference only, so no_grad() avoids building an autograd graph.
with torch.no_grad():
    inputs_1 = tokenizer_2(user_question, return_tensors='pt',
                           truncation=True, padding=True)
    outputs_1 = model_2(**inputs_1)
user_embedding = outputs_1.last_hidden_state[:, 0, :].numpy().flatten()

# Atlas Vector Search: consider 100 candidates and return the 10 best matches.
# The index name comes from `vector_search_index` instead of a repeated literal.
pipeline = [
    {
        "$vectorSearch": {
            "index": vector_search_index,
            "path": "embedding",
            "queryVector": user_embedding.tolist(),
            "numCandidates": 100,
            "limit": 10,
        }
    }
]

results = collection.aggregate(pipeline)

# One text block per hit: the stored question, a blank line, then the stored answer.
a = [
    result["input"] + "\n\n" + result["answer_chatdoctor"]
    for result in results
]

def format_docs(docs):
    """Join the given document strings into one text, separated by blank lines."""
    separator = "\n\n"
    return separator.join(docs)

# Context string for the single-query variant: every retrieved question/answer block,
# separated by blank lines.
context = format_docs(a)

Multi-query implementation

In [109]:
# Multi-query prompt template. It has two positional `{}` slots filled with str.format():
# first the user question, then the response (passed as "" so the model writes it).
user_prompt = """
You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Only output the questions.

### User question:
{}

### Response:
{}
"""

In [110]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
    [
        user_prompt.format(
            user_question,  # input
            "",  # output - leave this blank for generation!
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=256, use_cache=True)

# Decode only the newly generated tokens (everything after the prompt) and drop special
# tokens. Otherwise markers such as <|eot_id|> stay glued to the last question and end
# up inside the text that is embedded for retrieval.
prompt_length = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# If the model keeps going with another "### ..." section, keep only the text before it.
response = generated_text.split("###")[0].strip()
print(response)

# One query per non-empty line, plus the original question itself.
multi_query = [line.strip() for line in response.split("\n") if line.strip()]
multi_query.append(user_question)

What are the most effective treatments for acne scars?
What treatments can I use to remove acne scars?
What are the best treatments for acne scars?
What medical treatments can help reduce acne scars?
What are the most popular treatments for acne scars?<|eot_id|>


Retrieving the 10 most relevant documents for each of the above questions, ranked by cosine similarity.

In [57]:
# `q_text` / `a_text` are filled by the next cell; `combine` collects every raw hit.
q_text = []
a_text = []
combine = []

for query in multi_query:
    # Embed the query with BERT (hidden state at the first token position).
    # Inference only, so no_grad() avoids building an autograd graph.
    with torch.no_grad():
        inputs_1 = tokenizer_2(query, return_tensors='pt',
                               truncation=True, padding=True)
        outputs_1 = model_2(**inputs_1)
    user_embedding = outputs_1.last_hidden_state[:, 0, :].numpy().flatten()

    # Atlas Vector Search: 100 candidates, 10 best matches per query.
    # The index name comes from `vector_search_index` instead of a repeated literal.
    pipeline = [
        {
            "$vectorSearch": {
                "index": vector_search_index,
                "path": "embedding",
                "queryVector": user_embedding.tolist(),
                "numCandidates": 100,
                "limit": 10,
            }
        }
    ]

    results = collection.aggregate(pipeline)

    # Keep the full documents; the same document may be returned for several queries.
    combine.extend(results)

Removing duplicate documents

In [58]:
# Split the retrieved documents into parallel question / answer lists.
# Both lists are rebuilt from `combine` on every run, so executing this cell a second
# time does not append the same documents again.
q_text = [doc["input"] for doc in combine]
a_text = [doc["answer_chatdoctor"] for doc in combine]

In [64]:
# De-duplicate the retrieved documents while keeping question and answer paired.
# Questions and answers must be filtered together: later cells look up the answer with
# the same index as the question, and filtering the two lists independently lets them
# drift apart whenever two documents share a question or an answer.
q_text_2 = []
a_text_2 = []
seen_pairs = set()
for question, answer in zip(q_text, a_text):
    pair = (question, answer)
    if pair in seen_pairs:
        continue
    seen_pairs.add(pair)
    q_text_2.append(question)
    a_text_2.append(answer)

Active RAG

Grading each retrieval and filtering to include only the documents that are relevant to the initial question, as determined by the model.

In [None]:
# Prompt: two positional slots, the retrieved document and then the user question.
grade_prompt = """You are a grader assessing the relevance of a retrieved document to a user question.
If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant.
Output 'yes' or 'no' to indicate whether the document is relevant to the question. Do not output anything else.

### Document:
{}

### Question:
{}

### Response:
"""

# Only the start of the reply is inspected, so a small generation budget is enough.
GRADE_MAX_NEW_TOKENS = 16

# One normalized verdict ('yes' or 'no') per entry of `q_text_2`, in the same order.
grading = []

# Switching the model to inference mode does not depend on the document: do it once.
FastLanguageModel.for_inference(model)

# NOTE(review): only the stored question text is graded, not the stored answer.
for i, document in enumerate(q_text_2):
    print(i)
    inputs = tokenizer(
        [
            grade_prompt.format(
                document,  # document
                user_question,  # question
            )
        ],
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=GRADE_MAX_NEW_TOKENS, use_cache=True)

    # Decode only the generated tokens and drop special tokens; otherwise a reply such
    # as "yes<|eot_id|>" would never compare equal to 'yes'.
    prompt_length = inputs["input_ids"].shape[1]
    reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Normalize case and surrounding quotes so "Yes", "yes." and "'yes'" all count.
    verdict = reply.strip().strip("'\"").lower()
    grading.append("yes" if verdict.startswith("yes") else "no")


In [74]:
# Positions of the documents the grader marked as relevant.
relevant_positions = [pos for pos, verdict in enumerate(grading) if verdict == 'yes']

# Questions and answers of those documents, kept in the same order.
subset_q_text = [q_text_2[pos] for pos in relevant_positions]
subset_a_text = [a_text_2[pos] for pos in relevant_positions]


In [77]:
# One context entry per relevant document: the question, a blank line, then its answer.
final_context = [
    question + "\n\n" + answer
    for question, answer in zip(subset_q_text, subset_a_text)
]


Web search if we have fewer than 5 relevant documents

In [78]:
from langchain_community.tools.tavily_search import TavilySearchResults
import os

# Below this many graded-relevant documents, the context is topped up from the web.
MIN_RELEVANT_DOCS = 5

if len(subset_q_text) < MIN_RELEVANT_DOCS:
    # Prefer a key that is already exported in the environment.
    # NOTE(review): API_KEY is not defined in any cell visible in this notebook; on a
    # fresh kernel without TAVILY_API_KEY set, the next line raises NameError.
    if "TAVILY_API_KEY" not in os.environ:
        os.environ['TAVILY_API_KEY'] = API_KEY

    web_search_tool = TavilySearchResults()

    web_info = web_search_tool.invoke({"query": user_question})

    # Append the text of every web hit to the retrieved documents.
    final_context.extend(hit["content"] for hit in web_info)

# Build the context string unconditionally. The generation cells read `final` even when
# enough relevant documents were found and no web search was needed.
final = "\n".join(final_context)

Generation

Simple Generation without checks

In [115]:
# Answer prompt template. It has three positional `{}` slots filled with str.format():
# the context, the user question, and the response (passed as "" so the model writes it).
final_prompt = """
Follow the following rules:
- Use the following pieces of context to answer the users question.
- Always start your answer saying "Hello, welcome to the medical chat forum"
- If you don't know the answer, just say that you don't know, don't try to make up an answer.

### context:
{}

### question:
{}

### Response:
{}
"""

In [None]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
    [
        final_prompt.format(
            #context, #single-query
            final,  # multi-query
            user_question,  # input
            "",  # output - leave this blank for generation!
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=300, use_cache=True)

# Decode only the newly generated tokens and drop special tokens. Splitting the full
# decode on "### Response:" picks the wrong part when the retrieved context contains
# that marker, and it leaves tokens such as <|eot_id|> inside the answer.
prompt_length = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Get the response part and clean it
llm_generation = generated_text.split("### Input:")[0].strip()
print(llm_generation)

Hallucination Check

In [96]:
from langchain_cohere import ChatCohere
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain import PromptTemplate, LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)


import os
from getpass import getpass

# The Cohere key is a secret: take it from the environment, or ask for it with a hidden
# prompt, rather than storing it in the notebook.
# NOTE(review): any key that was ever committed with this notebook should be revoked.
if not os.environ.get("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass("Cohere API key: ")

# Schema handed to `llm.with_structured_output(...)` below in this cell.
# NOTE(review): the docstring and the field description are presumably sent to the LLM
# as part of the schema, so treat their wording as prompt text — confirm before editing.
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in generation answer."""

    # Per the description below: 'yes' when the answer is grounded in the facts, otherwise 'no'.
    binary_score: str = Field(
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )


# Instructions given to the grader model as its preamble.
preamble = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n
Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""

# Human message: the retrieved facts followed by the answer under test.
human_template = "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"
hallucination_prompt = ChatPromptTemplate.from_messages([("human", human_template)])

# Cohere chat model whose reply is returned as a GradeHallucinations object.
llm = ChatCohere(model="command-r", temperature=0)
structured_llm_grader = llm.with_structured_output(
    GradeHallucinations,
    preamble=preamble,
)

# Chain the prompt into the grader and grade the first answer against the context.
hallucination_grader = hallucination_prompt | structured_llm_grader
grader_inputs = {"documents": final, "generation": llm_generation}
score = hallucination_grader.invoke(grader_inputs)

In [97]:
# Display the grader's verdict for the current answer.
score

GradeHallucinations(binary_score='yes')

Regenerates responses until the grader finds no hallucination in the response

In [98]:
# Upper bound on retries, so a grader that keeps answering 'no' cannot loop forever
# (each retry costs one local generation plus one Cohere call).
MAX_REGENERATIONS = 5


def _is_hallucinated(grade):
    """Return True when the grader's verdict is 'no' (answer not grounded in the facts).

    The comparison ignores case and surrounding whitespace. Anything that is not a
    recognisable 'no' (including a missing grade) counts as "not hallucinated", so the
    loop below stops instead of retrying.
    """
    verdict = getattr(grade, "binary_score", None)
    return isinstance(verdict, str) and verdict.strip().lower() == "no"


# Switching the model to inference mode does not depend on the attempt: do it once.
FastLanguageModel.for_inference(model)

attempts = 0
while _is_hallucinated(score) and attempts < MAX_REGENERATIONS:
    attempts += 1
    inputs = tokenizer(
        [
            final_prompt.format(
                #context, #single-query
                final,  # multi-query
                user_question,  # input
                "",  # output - leave this blank for generation!
            )
        ],
        return_tensors="pt",
    ).to("cuda")

    # NOTE(review): the first generation uses max_new_tokens = 300; confirm 256 is intended here.
    outputs = model.generate(**inputs, max_new_tokens=256, use_cache=True)

    # Decode only the newly generated tokens and drop special tokens such as <|eot_id|>.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Get the response part and clean it
    llm_generation = generated_text.split("### Input:")[0].strip()
    print(llm_generation)

    score = hallucination_grader.invoke({"documents": final, "generation": llm_generation})

if _is_hallucinated(score):
    print(f"Warning: answer still graded as not grounded after {MAX_REGENERATIONS} regenerations.")

Final output by the model

In [None]:
# Display the answer that came out of the generation / hallucination-check cells.
llm_generation