In [6]:
# Notebook-only shell escape (not valid plain Python): installs the packages
# imported further down in this cell. `-q` keeps pip's output quiet.
!pip install unsloth datasets langchain faiss-gpu transformers sentence-transformers langchain-community -q

import torch
from google.colab import drive
# Mount Google Drive at /content/drive so the model path below is readable.
drive.mount('/content/drive')
# Location of the LoRA model on Drive; used as `model_name` when loading.
drive_path = "/content/drive/My Drive/MedQA-Llama3.1-8B_LoRA_Model/lora_model"

from unsloth import FastLanguageModel
# Load the model and its tokenizer from the Drive path.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=drive_path,
    max_seq_length=2048,  # same value the LLM wrapper below passes as tokenizer max_length
    dtype=None,  # presumably lets Unsloth pick the dtype itself -- confirm against Unsloth docs
    load_in_4bit=True,
    device_map="auto"
)
# Put the loaded model into Unsloth's inference mode before it is used for
# generation (exact effects are defined by Unsloth, not visible here).
FastLanguageModel.for_inference(model)

from datasets import load_dataset

# Training split of the labeled PubMedQA fold used as the retrieval corpus.
# NOTE(review): trust_remote_code=True permits code shipped with the dataset
# repository to run locally -- confirm the source is trusted.
dataset = load_dataset(
    "bigbio/pubmed_qa",
    name="pubmed_qa_labeled_fold0_source",
    split="train",
    trust_remote_code=True,
)

# One record per example: a stable id derived from the row position and the
# example's LONG_ANSWER text (empty string when that field is missing/falsy).
docs = [
    {"id": f"pubmed_qa_train_{row_no}", "text": record["LONG_ANSWER"] or ""}
    for row_no, record in enumerate(dataset)
]

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured for pieces of at most 1000 characters with a
# 100-character overlap between consecutive pieces.
ts = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Flatten every document into its pieces; each piece id is the parent
# document id plus the piece's position within that document.
chunks = []
for doc in docs:
    pieces = ts.split_text(doc["text"])
    chunks.extend(
        {"id": f"{doc['id']}_chunk_{piece_no}", "text": piece}
        for piece_no, piece in enumerate(pieces)
    )

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embedding model used to vectorise every chunk for the index.
e = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build the parallel text / metadata lists in a single pass over the chunks;
# the metadata records which chunk id each indexed text came from.
texts, meta = [], []
for piece in chunks:
    texts.append(piece["text"])
    meta.append({"source": piece["id"]})

# FAISS vector store over all chunk texts.
db = FAISS.from_texts(texts, embedding=e, metadatas=meta)

from pydantic import PrivateAttr
from langchain.llms.base import LLM
from typing import Any, Optional, List

class LoRAMedicalLLM(LLM):
    """LangChain ``LLM`` adapter around an already-loaded model and tokenizer.

    The wrapped objects are stored as private attributes so they are not
    treated as model fields. ``_call`` tokenizes one prompt, runs
    ``model.generate`` and returns only the newly generated text.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=..., use_cache=...)``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        max_new_tokens: Upper bound on generated tokens per call.
        device: Device the tokenized inputs are moved to before generation.
        **kwargs: Forwarded to the ``LLM`` base-class constructor.
    """

    _model: Any = PrivateAttr()
    _tokenizer: Any = PrivateAttr()
    # Kept equal to the __init__ default: __init__ always supplies a value,
    # so a different number here would only mislead readers.
    max_new_tokens: int = 256
    device: str = "cuda"

    def __init__(self, model, tokenizer, max_new_tokens=256, device="cuda", **kwargs):
        # Declared fields go through the base constructor; the private
        # attributes can only be assigned once the instance exists.
        super().__init__(max_new_tokens=max_new_tokens, device=device, **kwargs)
        self._model = model
        self._tokenizer = tokenizer

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses to label this LLM implementation."""
        return "lora-medical-llm"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Fully formatted prompt text.
            stop: Optional stop sequences; the completion is cut just before
                the earliest one that occurs in it.
            run_manager: Accepted for compatibility with the LangChain base
                class; not used here.
            **kwargs: Accepted for compatibility; not used here.

        Returns:
            The generated continuation only, without the prompt text.
        """
        inp = self._tokenizer(
            [prompt],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=2048,
        ).to(self.device)
        # The generated sequence starts with the prompt tokens; remember how
        # many there are so only the continuation is decoded.
        prompt_len = inp["input_ids"].shape[1]
        with torch.no_grad():
            out_toks = self._model.generate(
                **inp, max_new_tokens=self.max_new_tokens, use_cache=True
            )
        completion = self._tokenizer.decode(
            out_toks[0][prompt_len:], skip_special_tokens=True
        )
        if stop:
            # Honour caller-supplied stop sequences (empty strings ignored).
            hits = [completion.find(marker) for marker in stop if marker]
            hits = [pos for pos in hits if pos != -1]
            if hits:
                completion = completion[: min(hits)]
        return completion

# Wrap the loaded model/tokenizer so LangChain chains can call it.
llm = LoRAMedicalLLM(model=model, tokenizer=tokenizer)

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Prompt with two placeholders: {context} for the retrieved text and
# {question} for the user's query. The wording is part of runtime behaviour.
pt = """You are a medical QA system.
Use the following context to answer the question concisely, dont repeat the context or any lines in the final answer:
{context}

Question: {question}

Answer:
"""
pr = PromptTemplate(input_variables=["context", "question"], template=pt)

# Similarity search over the vector store, returning the top 3 chunks.
retriever = db.as_retriever(search_kwargs={"k": 3}, search_type="similarity")

# Retrieval QA chain combining the retriever, the custom prompt and the LLM.
rag = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": pr},
)

# Run one sample question through the retrieval QA chain and print the answer.
q = "What is the role of insulin in managing diabetes?"
# Calling a chain object directly is deprecated in LangChain; invoke() is the
# supported entry point and takes the same input dict. The generated text is
# read from the "result" key of the returned dict.
r = rag.invoke({"query": q})
raw_answer = r["result"]
# Keep only what follows the last "Answer:" marker, in case the result still
# contains the prompt text. rpartition returns the whole string as its final
# element when the marker is absent, so no separate membership test is needed.
ans = raw_answer.rpartition("Answer:")[2].strip()

print("Question:", q)
print("Answer:", ans)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Question: What is the role of insulin in managing diabetes?
Answer: Insulin is the only treatment for diabetes that can normalize both blood glucose and blood pressure. It is also the only treatment that can prevent microvascular complications. In this context, insulin is a first-line treatment for type 2 diabetes. It is also indicated as an adjunct to oral antidiabetic drugs in type 1 diabetes and typ