## **WHO** Fact Sheet Extraction

In [None]:
import requests
from bs4 import BeautifulSoup
import pdfkit
import os
import urllib.parse

url = "https://www.who.int/news-room/fact-sheets"

# Seconds to wait for the server before giving up; without a timeout
# `requests.get` can block the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Extract links using BeautifulSoup
try:
    # Set a user-agent to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)  # Fetch the HTML content.
    response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
except requests.exceptions.RequestException as e:
    print(f"Error fetching URL {url}: {e}")  # Handle request errors
    # Re-raise instead of calling `exit()`: inside a notebook `exit()` asks the
    # kernel to shut down, while re-raising only stops this cell.
    raise

soup = BeautifulSoup(response.text, 'html.parser')  # Parse the HTML content.

# Collect fact-sheet detail links as absolute URLs, in page order.
found_links = []
for link_tag in soup.find_all('a', href=True):  # Only anchors that carry an href.
    href = link_tag['href']

    # Keep links that point at a fact-sheet detail page; skip in-page anchors.
    if "fact-sheets/detail" in href and not href.startswith('#'):
        # urljoin resolves relative paths and leaves absolute URLs untouched.
        found_links.append(urllib.parse.urljoin(url, href))

# Remove duplicates while keeping first-seen order, so repeated runs process
# the links in the same sequence (a plain set() would scramble the order).
links = list(dict.fromkeys(found_links))
print(f"Found {len(links)} unique fact sheet links.")

# 3. Convert each link to a PDF
output_folder = "data"  # Directory that receives the generated PDFs.
os.makedirs(output_folder, exist_ok=True)  # Create the folder if it doesn't exist.

# Location of the wkhtmltopdf binary. Set the WKHTMLTOPDF_PATH environment
# variable to override; the default is the standard Windows install location.
path_wkhtmltopdf = os.environ.get(
    "WKHTMLTOPDF_PATH",
    r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe",
)
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

# File names already handed out in this run, so two links that reduce to the
# same slug do not overwrite each other's PDF.
used_filenames = set()

for link in links:
    try:
        # Get the path part of the URL (e.g., /news-room/fact-sheets/detail/anaemia)
        url_path = urllib.parse.urlparse(link).path

        # Extract the last part of the path as the filename slug
        file_slug = os.path.basename(os.path.normpath(url_path))  # tolerates trailing slashes

        # If the slug is missing or generic, derive one from the full path instead
        if not file_slug or file_slug == 'detail':
            file_slug = url_path.strip('/').replace('/', '_')

        # Keep only letters, digits, '_' and '-' so the name is valid on any filesystem
        safe_filename = "".join(
            c for c in file_slug if c.isalpha() or c.isdigit() or c in ('_', '-')
        )

        # Sanitizing can strip everything (e.g. a slug of "."); never emit a bare ".pdf"
        if not safe_filename:
            safe_filename = "fact_sheet"

        # Disambiguate repeated slugs with a numeric suffix
        unique_name = safe_filename
        suffix = 2
        while unique_name in used_filenames:
            unique_name = f"{safe_filename}_{suffix}"
            suffix += 1
        used_filenames.add(unique_name)

        # Create the final PDF filename
        pdf_filename = f"{unique_name}.pdf"
        output_path = os.path.join(output_folder, pdf_filename)

        print(f"Converting {link} to {output_path}...")

        # Convert the URL to a PDF
        pdfkit.from_url(link, output_path, configuration=config)
        print(f"Successfully created {output_path}")

    except Exception as e:
        # Best effort: report the failure and continue with the remaining links.
        print(f"Could not convert {link}. Error: {e}")

print("\nPDF conversion process completed.")


# **Fine-Tuning**

In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps trl peft accelerate bitsandbytes
!pip install protobuf==3.20.3

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-f5wmbh0g/unsloth_aece11d7fb4e44a98d9bcbe2798d2122
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-f5wmbh0g/unsloth_aece11d7fb4e44a98d9bcbe2798d2122
  Resolved https://github.com/unslothai/unsloth.git to commit 5266ead104938c4908c7f2d2a60526555faf7e85
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.7.11 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.7.11-py3-none-any.whl.metadata (8.1 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.g

In [None]:
# For GPU check
import torch
# Report whether CUDA is usable and, if so, the name of device 0.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    gpu_label = torch.cuda.get_device_name(0)
else:
    gpu_label = 'None'
print(f"CUDA available: {cuda_ready}")
print(f"GPU: {gpu_label}")

CUDA available: True
GPU: Tesla T4


In [None]:
import torch
from datasets import load_dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
import json
import os

# --- Dataset 1: MedQuAD (Medical Question Answering) ---
# Load the "train" split of the MedQuAD question/answer dataset.
MEDQUAD_REPO_ID = "keivalya/MedQuad-MedicalQnADataset"
print("Downloading MedQuAD dataset...")
medquad_dataset = load_dataset(path=MEDQUAD_REPO_ID, split="train")

def format_medquad(example):
    """Map one MedQuAD row onto the shared instruction/output schema.

    Args:
        example: Mapping that provides "Question" and "Answer" entries.

    Returns:
        dict: ``{"instruction": <question>, "output": <answer>}``.
    """
    question = example["Question"]
    answer = example["Answer"]
    return dict(instruction=question, output=answer)
# Drop *every* original column, not just Question/Answer. MedQuAD also carries a
# `qtype` column; leaving it in place gives the two datasets different schemas
# and produces a mostly-None `qtype` feature after concatenation.
medquad_dataset = medquad_dataset.map(
    format_medquad,
    remove_columns=medquad_dataset.column_names,
)
print(f"Loaded {len(medquad_dataset)} examples from MedQuAD.")


# --- Dataset 2: AI Medical Chatbot (Conversational) ---
print("\nDownloading AI Medical Chatbot conversational dataset...")
medical_chatbot_dataset = load_dataset("ruslanmv/ai-medical-chatbot", split="train")

def format_chatbot_dataset(example):
    """Map one AI Medical Chatbot row onto the shared instruction/output schema.

    The instruction combines the case description and the patient's message;
    the doctor's reply becomes the output.

    Args:
        example: Mapping that provides "Description", "Patient" and "Doctor".

    Returns:
        dict: ``{"instruction": <combined prompt>, "output": <doctor reply>}``.
    """
    description = example['Description']
    patient_message = example['Patient']
    doctor_reply = example["Doctor"]
    combined_prompt = "Patient Description: {}\n\nPatient Dialogue: {}".format(
        description, patient_message
    )
    return dict(instruction=combined_prompt, output=doctor_reply)
# Convert the chatbot rows to the shared schema and drop their source columns.
medical_chatbot_dataset = medical_chatbot_dataset.map(
    format_chatbot_dataset,
    remove_columns=["Description", "Patient", "Doctor"],
)
print(f"Loaded {len(medical_chatbot_dataset)} examples from AI Medical Chatbot.")


# --- Combine the datasets ---
print("\nCombining datasets...")
# Take the first 1000 rows of each source, stack them, then shuffle with a fixed seed.
medquad_subset = medquad_dataset.select(range(1000))
chatbot_subset = medical_chatbot_dataset.select(range(1000))
stacked = concatenate_datasets([medquad_subset, chatbot_subset])
combined_dataset = stacked.shuffle(seed=42)

print("\nCombined dataset created successfully:")
print(combined_dataset)
first_example = combined_dataset[0]
print(f"\nExample from combined set: {first_example}")

Downloading MedQuAD dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/233 [00:00<?, ?B/s]

medDataset_processed.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

Map:   0%|          | 0/16407 [00:00<?, ? examples/s]

Loaded 16407 examples from MedQuAD.

Downloading AI Medical Chatbot conversational dataset...


README.md:   0%|          | 0.00/863 [00:00<?, ?B/s]

dialogues.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/256916 [00:00<?, ? examples/s]

Map:   0%|          | 0/256916 [00:00<?, ? examples/s]

Loaded 256916 examples from AI Medical Chatbot.

Combining datasets...

Combined dataset created successfully:
Dataset({
    features: ['qtype', 'instruction', 'output'],
    num_rows: 2000
})

Example from combined set: {'qtype': None, 'instruction': 'Patient Description: Q. Why are there abdominal pain with loose motion and fever?\n\nPatient Dialogue: Hello doctor, There is abdomen pain since three days. Pain is not continous but really painful. Little loose motion, light headache when gets out of bed, little fever on day 1 and 2.', 'output': 'Hi. Such pain in abdomen with stool discomfort can happen as a result of stomach infection. Important here is clinical examination to know the site of the pain. So it is advisable to see your doctor. If he would feel suspicious, he would send you for an ultrasound abdomen. If all normal, you would be treated with antibiotics like Ofloxacin. Till you get medical help, you may take a combination of Pantoprazole and Domperidone that would reduce y

In [None]:
import unsloth
from unsloth import FastLanguageModel
import torch

# Checkpoint to fine-tune (a bnb 4-bit Phi-3 mini variant, going by its name).
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"

max_seq_length = 2048  # sequence length
dtype = None  # Auto detection

# Load model and tokenizer
load_options = {
    "model_name": model_name,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.11: Fast Mistral patching. Transformers: 4.53.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Add LoRA adapters
# Projection layers that receive adapters: attention (q/k/v/o) and MLP (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_settings = dict(
    r=64,                                  # LoRA rank -> higher = more capacity, more memory
    target_modules=lora_target_modules,
    lora_alpha=128,                        # LoRA scaling factor (usually 2x rank)
    lora_dropout=0,                        # Supports any, but = 0 is optimized
    bias="none",                           # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized version
    random_state=3407,
    use_rslora=False,                      # Rank stabilized LoRA
    loftq_config=None,                     # LoftQ
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2025.7.11 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Chat layout used for every training example: first slot is the user turn,
# second slot is the assistant turn.
prompt_template = """<|user|>
{}<|end|>
<|assistant|>
{}<|end|>"""

# 1. Create a function to format the dataset into a single 'text' column
def create_final_text(example):
    """Add a "text" entry holding the fully formatted chat prompt.

    Args:
        example: Mutable mapping with "instruction" and "output" entries.

    Returns:
        The same ``example`` object, updated in place with a "text" key.
    """
    user_turn, assistant_turn = example["instruction"], example["output"]
    example["text"] = prompt_template.format(user_turn, assistant_turn)
    return example

# 2. Apply this function to your combined dataset
final_dataset = combined_dataset.map(create_final_text)

# Check the new structure: dataset summary first, then the first formatted prompt.
print("\nDataset after final formatting:")
print(final_dataset)
first_text = final_dataset[0]['text']
print(f"\nExample text field: {first_text}")

def pass_through_formatting_func(example):
    """Return the pre-built "text" value wrapped in a one-element list.

    Args:
        example: Mapping with a "text" entry.

    Returns:
        list: ``[example["text"]]`` -- the list wrapper is what the trainer's
        formatting-function API expects.
    """
    prepared_text = example["text"]
    return [prepared_text]

# Training arguments optimized for Unsloth
# Use bf16 when the GPU supports it, otherwise fall back to fp16.
bf16_supported = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    num_train_epochs=1,
    learning_rate=2e-5,
    fp16=not bf16_supported,
    bf16=bf16_supported,
    logging_steps=25,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    save_strategy="epoch",
    save_total_limit=2,
    dataloader_pin_memory=False,
    report_to="none",  # Disable Weights & Biases logging
)

# The dataset already has a ready-made "text" column, so the formatting
# function simply passes it through.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=final_dataset,
    formatting_func=pass_through_formatting_func,
    max_seq_length=max_seq_length,
    dataset_num_proc=None,
    args=training_args,
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]


Dataset after final formatting:
Dataset({
    features: ['qtype', 'instruction', 'output', 'text'],
    num_rows: 2000
})

Example text field: <|user|>
Patient Description: Q. Why are there abdominal pain with loose motion and fever?

Patient Dialogue: Hello doctor, There is abdomen pain since three days. Pain is not continous but really painful. Little loose motion, light headache when gets out of bed, little fever on day 1 and 2.<|end|>
<|assistant|>
Hi. Such pain in abdomen with stool discomfort can happen as a result of stomach infection. Important here is clinical examination to know the site of the pain. So it is advisable to see your doctor. If he would feel suspicious, he would send you for an ultrasound abdomen. If all normal, you would be treated with antibiotics like Ofloxacin. Till you get medical help, you may take a combination of Pantoprazole and Domperidone that would reduce your acidity and pain. Take light food and drink plenty of fluids. Stomach infection. Ultrasoun

Unsloth: Tokenizing ["text"]:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Train the model
# Runs the fine-tuning loop configured on `trainer`; the value returned by
# `train()` is kept in `trainer_stats` for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664 of 3,940,617,216 (3.03% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,7.8686
50,4.6838
75,2.1021
100,0.9247
125,0.1619


Step,Training Loss
25,7.8686
50,4.6838
75,2.1021
100,0.9247
125,0.1619
150,0.0623
175,0.0279
200,0.019
225,0.0259
250,0.0158


In [None]:
# Export the fine-tuned model (with its tokenizer) to GGUF under "gguf_model",
# quantized with the q4_k_m method.
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="q4_k_m")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.3G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.49 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:01<00:00, 19.29it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving gguf_model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving gguf_model/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting mistral model. Can use fast conversion = True.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at gguf_model into f16 GGUF format.
The output location will be /content/gguf_model/unsloth.F16.gguf
This might take 3 minutes...


Unsloth: Extending gguf_model/tokenizer.model with added_tokens.json.
Originally tokenizer.model is of size (32000).
But we need to extend to sentencepiece vocab size (32011).


INFO:hf-to-gguf:Loading model: gguf_model
INFO:hf-to-gguf:Model architecture: MistralForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00002.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {3072, 32064}
INFO:hf-to-gguf:blk.0.attn_q.weight,         torch.float16 --> F16, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float16 --> F16, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.attn_v.weight,         torch.float16 --> F16, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.float16 --> F16, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:

In [None]:
# download the fine tuned model to run it locally using ollama
from google.colab import files
import os

GGUF_DIR = "gguf_model"
# Same tag as the quantization_method used when the model was exported.
PREFERRED_QUANT = "q4_k_m"

# os.listdir order is arbitrary, and the export directory can hold more than one
# .gguf (e.g. the intermediate F16 file). Sort for determinism and prefer the
# quantized file so the small model is the one downloaded.
gguf_files = sorted(f for f in os.listdir(GGUF_DIR) if f.endswith(".gguf"))
quantized_files = [f for f in gguf_files if PREFERRED_QUANT in f.lower()]

if gguf_files:
    chosen = (quantized_files or gguf_files)[0]
    gguf_file = os.path.join(GGUF_DIR, chosen)
    print(f"Downloading: {gguf_file}")
    files.download(gguf_file)
else:
    # Make the no-op visible instead of finishing silently.
    print(f"No .gguf files found in '{GGUF_DIR}'; nothing to download.")

# **RAG**

In [None]:
# -----------------------------------------------------------------------
# SECTION 1: INSTALL ALL DEPENDENCIES
# -----------------------------------------------------------------------
print("STEP 1: Installing all required dependencies...")
# Force-reinstall the PyTorch stack from the CUDA 12.1 (cu121) wheel index.
%pip install --upgrade --force-reinstall torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Model loading / quantization libraries.
%pip install --upgrade transformers accelerate bitsandbytes
# UI, embeddings, LangChain integrations, PDF/OCR support, database and fake-data tooling.
%pip install --upgrade gradio sentence-transformers langchain langchain-huggingface langchain-chroma pypdf pytesseract sqlalchemy faker
print("Dependencies installed successfully.")

STEP 1: Installing all required dependencies...
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.4/780.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp312-cp312-linux_x86_64.whl (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m120.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m114.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from torch)
  Downloading https://download.pytorch.org/whl/filelock-3.13.1-py3-none-

Collecting transformers
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.56.0-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [None]:
! pip install pytesseract langchain_huggingface langchain_chroma langchain_community sqlalchemy bitsandbytes

Collecting langchain_community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.70 (from langchain_huggingface)
  Downloading langchain_core-0.3.75-py3-none-any.whl.metadata (5.7 kB)
Collecting requests<3,>=2.32.5 (from langchain_community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.6.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.6.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.6.7->langchain_community)
  Downloading mypy_extensions-1.1.0-py3-none-any

In [None]:
# ----------------------------------------
# SECTION 2: IMPORTING NECESSARY LIBRARIES
# ----------------------------------------
import os
import gradio as gr
import pytesseract
from datetime import datetime, timedelta
import torch
import warnings
from google.colab import userdata
from huggingface_hub import login
from pydantic import BaseModel
from sqlalchemy import create_engine, Column, func, Integer, String, Date, Boolean, ForeignKey, Table, DateTime
from sqlalchemy.orm import sessionmaker, declarative_base, relationship
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline as hf_pipeline
from langchain.tools import Tool
from langchain.agents import create_react_agent, AgentExecutor
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.messages import HumanMessage, AIMessage
import gc
import re
from PIL import Image
import random
from faker import Faker



# Turn off LangChain tracing and silence library warnings to keep cell output readable.
os.environ["LANGCHAIN_TRACING_V2"] = "false"
warnings.filterwarnings("ignore")

# Authenticate with the Hugging Face Hub using the Colab secret named HF_TOKEN.
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
    print("Hugging Face login successful.")
except Exception as e:
    print("ERROR: Hugging Face token not found. Please add your token named HF_TOKEN.")
    # Surface the real cause as well: the failure may be a missing secret, a
    # denied secret access, or a rejected token, and the fix differs for each.
    print(f"Details: {type(e).__name__}: {e}")

# ---------------------------------------------------
# SECTION 3: SYNTHETIC DATA CREATION (DATABASE SETUP)
# ---------------------------------------------------
print("\nSTEP 2: Setting up the database...")
DATABASE_URL = "sqlite:///./pharmacy_database.db"
Base = declarative_base()

# Association tables (many-to-many links between users and their medical attributes)
user_allergies = Table(
    'user_allergies',
    Base.metadata,
    Column('user_id', String, ForeignKey('users.user_id')),
    Column('allergy_id', Integer, ForeignKey('allergies.id')),
)
user_conditions = Table(
    'user_conditions',
    Base.metadata,
    Column('user_id', String, ForeignKey('users.user_id')),
    Column('condition_id', Integer, ForeignKey('chronic_conditions.id')),
)
# SQLAlchemy Models
class User(Base):
    """ORM model for the ``users`` table: one row per pharmacy customer.

    Holds basic profile/contact fields and links to the user's allergies and
    chronic conditions (many-to-many) and to their purchases and
    prescriptions (one-to-many).
    """
    __tablename__ = 'users'
    # String primary key supplied by the caller.
    user_id = Column(String, primary_key=True)
    name = Column(String)
    # Single-character gender code.
    gender = Column(String(1))
    date_of_birth = Column(Date)
    address = Column(String)
    email = Column(String)
    phone_number = Column(String)
    # Many-to-many via the user_allergies / user_conditions association tables.
    allergies = relationship("Allergy", secondary=user_allergies, back_populates="users")
    chronic_conditions = relationship("ChronicCondition", secondary=user_conditions, back_populates="users")
    # One-to-many collections; the other side is the `user` attribute on each model.
    purchases = relationship("PurchaseHistory", back_populates="user")
    prescriptions = relationship("Prescription", back_populates="user")

class Allergy(Base):
    """ORM model for the ``allergies`` master table (one row per allergy name)."""
    __tablename__ = 'allergies'
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Allergy names are unique.
    name = Column(String, unique=True)
    # Users that have this allergy (many-to-many via user_allergies).
    users = relationship("User", secondary=user_allergies, back_populates="allergies")

class ChronicCondition(Base):
    """ORM model for the ``chronic_conditions`` master table (one row per condition name)."""
    __tablename__ = 'chronic_conditions'
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Condition names are unique.
    name = Column(String, unique=True)
    # Users that have this condition (many-to-many via user_conditions).
    users = relationship("User", secondary=user_conditions, back_populates="chronic_conditions")

class Medicine(Base):
    """ORM model for the ``medicines`` table: the product catalogue."""
    __tablename__ = 'medicines'
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Unique, indexed name; other tables reference medicines through this column.
    drug_name = Column(String, unique=True, index=True)
    # Units in stock; new rows start at 1000.
    stock_quantity = Column(Integer, default=1000)
    # True for prescription-only products; defaults to False.
    is_prescription = Column(Boolean, default=False)
    # Purchases of this medicine (other side: PurchaseHistory.medicine).
    purchases = relationship("PurchaseHistory", back_populates="medicine")

class PurchaseHistory(Base):
    """ORM model for the ``purchase_history`` table: one row per order line."""
    __tablename__ = 'purchase_history'
    # String primary key supplied by the caller.
    order_id = Column(String, primary_key=True)
    user_id = Column(String, ForeignKey('users.user_id'))
    # References the medicine by its unique name rather than its integer id.
    drug_name = Column(String, ForeignKey('medicines.drug_name'))
    quantity = Column(Integer)
    purchase_date = Column(DateTime)
    # Optional pharmacy name.
    pharmacy = Column(String, nullable=True)
    # Flag marking a prescription purchase; defaults to False.
    is_prescription_purchase = Column(Boolean, default=False)
    user = relationship("User", back_populates="purchases")
    medicine = relationship("Medicine", back_populates="purchases")

class Prescription(Base):
    """ORM model for the ``prescriptions`` table: one row per issued prescription."""
    __tablename__ = 'prescriptions'
    # String primary key supplied by the caller.
    prescription_id = Column(String, primary_key=True)
    user_id = Column(String, ForeignKey('users.user_id'))
    # References the medicine by its unique name rather than its integer id.
    drug_name = Column(String, ForeignKey('medicines.drug_name'))
    doctor_name = Column(String)
    issue_date = Column(Date)
    # Free-text dosage instruction.
    dosage = Column(String)
    refills_remaining = Column(Integer)
    is_auto_refill_enabled = Column(Boolean)
    # Optional date on which the next refill is due.
    refill_due_date = Column(Date, nullable=True)
    # Free-text status; new rows default to "Initial Order Placed".
    order_status = Column(String, default="Initial Order Placed")
    user = relationship("User", back_populates="prescriptions")
    # `backref` also creates a `prescriptions` collection on Medicine.
    medicine = relationship("Medicine", backref="prescriptions")

# Number of synthetic users to generate.
NUM_USERS = 100
# Allergy names used to seed the allergies master table and sampled per user.
COMMON_ALLERGIES = [
    "Penicillin", "Sulfa drugs", "Aspirin", "Ibuprofen", "Codeine", "Latex",
    "Peanuts", "Shellfish", "Pollen", "Dust Mites"
]
# Condition names used to seed the chronic_conditions master table and sampled per user.
CHRONIC_CONDITIONS = [
    "Hypertension", "Type 2 Diabetes", "Asthma", "Arthritis", "Hypothyroidism",
    "Chronic Kidney Disease", "Depression", "Migraine"
]
# Product catalogue grouped by category. Each entry has a display "name" and a
# "type"; entries whose type is "Rx" are stored as prescription medicines.
PHARMA_PRODUCTS = {
    # OTC = Over-the-counter, Rx = Prescription
    "Pain Relief": [
        {"name": "Paracetamol 500mg", "type": "OTC"},
        {"name": "Ibuprofen 200mg", "type": "OTC"},
        {"name": "Aspirin 75mg", "type": "OTC"},
        {"name": "Tramadol 50mg", "type": "Rx"},
    ],
    "Allergy": [
        {"name": "Cetirizine 10mg", "type": "OTC"},
        {"name": "Loratadine 10mg", "type": "OTC"},
        {"name": "Fexofenadine 180mg", "type": "Rx"},
    ],
    "Diabetes": [
        {"name": "Metformin 500mg", "type": "Rx"},
        {"name": "Gliclazide 80mg", "type": "Rx"},
        {"name": "Insulin Glargine", "type": "Rx"},
    ],
    "Hypertension": [
        {"name": "Amlodipine 5mg", "type": "Rx"},
        {"name": "Lisinopril 10mg", "type": "Rx"},
        {"name": "Losartan 50mg", "type": "Rx"},
    ],
    "Vitamins": [
        {"name": "Vitamin D3 1000 IU", "type": "OTC"},
        {"name": "Vitamin C 500mg", "type": "OTC"},
        {"name": "Multivitamin Complex", "type": "OTC"},
    ]
}

# Dosage texts; one is picked at random for each generated prescription.
DOSAGE_INSTRUCTIONS = [
    "1 tablet daily",
    "1 tablet twice daily",
    "1 tablet at night",
    "2 tablets daily after meals",
    "1 tablet every 8 hours",
    "As needed for pain",
    "1 tablet 30 minutes before food"
]

# Seed both random sources so the generated names, choices and samples repeat
# from run to run (values derived from the current time will still differ).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
Faker.seed(RANDOM_SEED)
fake = Faker()

def get_or_create(session, model, **kwargs):
    """Fetch the first row of ``model`` matching ``kwargs``, creating it if absent.

    Args:
        session: Database session used for the lookup and, if needed, the insert.
        model: Mapped class to query or instantiate.
        **kwargs: Column values used both as the filter and as constructor arguments.

    Returns:
        The existing instance when one is found; otherwise a newly created
        instance that has been added to the session and committed.
    """
    found = session.query(model).filter_by(**kwargs).first()
    if found:
        return found

    # Nothing matched: build the row from the same keyword values and persist it.
    created = model(**kwargs)
    session.add(created)
    session.commit()
    return created

# Engine for the SQLite file named in DATABASE_URL. `check_same_thread=False`
# is passed through to the SQLite driver so connections may be used from
# threads other than the one that created them.
engine = create_engine(
    DATABASE_URL,
    connect_args={"check_same_thread": False},
)
# Session factory bound to the engine; sessions neither autoflush nor autocommit.
SessionLocal = sessionmaker(
    bind=engine,
    autoflush=False,
    autocommit=False,
)

def populate_database():
    """Create the schema and fill an empty database with synthetic pharmacy data.

    The allergy, chronic-condition and medicine master tables are topped up
    on every call via ``get_or_create``. If any user already exists the
    function then returns without generating anything. Otherwise it creates
    ``NUM_USERS`` users, each with random allergies, conditions, purchases
    and one prescription per distinct prescription drug purchased, and
    commits all of the generated rows together.

    Raises:
        Exception: any error raised while populating is re-raised after the
            pending transaction has been rolled back.
    """
    Base.metadata.create_all(bind=engine)
    db = SessionLocal()
    try:
        # Pre-populate master tables (idempotent)
        for allergy_name in COMMON_ALLERGIES:
            get_or_create(db, Allergy, name=allergy_name)
        for condition_name in CHRONIC_CONDITIONS:
            get_or_create(db, ChronicCondition, name=condition_name)
        for category in PHARMA_PRODUCTS.values():
            for drug in category:
                get_or_create(db, Medicine, drug_name=drug['name'], is_prescription=(drug['type'] == 'Rx'))

        # Check for existing data BEFORE generating anything. Checking after the
        # users have been added always reports "already populated" and, on a
        # second run, re-inserting USER_0001.. collides with the existing keys.
        if db.query(User).count() > 0:
            print("Database already populated. Skipping.")
            return
        print("Database is empty. Populating with sample data...")

        # Calculate the starting prescription number: ids look like "RX_<n>",
        # so the numeric part starts at character 4.
        max_id_query = db.query(func.max(func.cast(func.substr(Prescription.prescription_id, 4), Integer)))
        max_id = max_id_query.scalar()
        next_prescription_num = (max_id + 1) if max_id is not None else 101
        print(f"Starting new prescriptions from ID: RX_{next_prescription_num}")

        # Separate medicines into Rx and OTC lists for better generation
        all_medicines = db.query(Medicine).all()
        rx_medicines = [m for m in all_medicines if m.is_prescription]
        otc_medicines = [m for m in all_medicines if not m.is_prescription]

        # Generate User Data
        for i in range(1, NUM_USERS + 1):
            profile = fake.profile()
            user = User(
                user_id=f"USER_{i:04d}", name=profile['name'], gender=profile.get('sex', 'O'),
                date_of_birth=profile['birthdate'], address=profile['address'].replace('\n', ', '),
                email=profile['mail'], phone_number=fake.phone_number()
            )
            # No per-user commit: everything is committed once at the end so a
            # failure cannot leave a half-populated database behind.
            db.add(user)

            # Add Medical Info
            num_allergies = random.randint(0, 3)
            if num_allergies > 0:
                chosen_allergies = random.sample(COMMON_ALLERGIES, k=num_allergies)
                user.allergies.extend(db.query(Allergy).filter(Allergy.name.in_(chosen_allergies)).all())

            num_conditions = random.randint(0, 2)
            if num_conditions > 0:
                chosen_conditions = random.sample(CHRONIC_CONDITIONS, k=num_conditions)
                user.chronic_conditions.extend(
                    db.query(ChronicCondition).filter(ChronicCondition.name.in_(chosen_conditions)).all()
                )

            # Generate purchase history: a few Rx purchases, the rest OTC
            num_purchases = random.randint(5, 15)
            num_rx_purchases = random.randint(2, 5)

            purchases_to_add = [random.choice(rx_medicines) for _ in range(num_rx_purchases)]
            purchases_to_add.extend(
                random.choice(otc_medicines) for _ in range(num_purchases - num_rx_purchases)
            )
            random.shuffle(purchases_to_add)

            for j, medicine in enumerate(purchases_to_add):
                purchase_date = fake.date_time_between(start_date="-2y", end_date="now")
                purchase = PurchaseHistory(
                    order_id=f"ORD_{i:04d}_{j+1:03d}", user_id=user.user_id,
                    drug_name=medicine.drug_name, quantity=random.randint(1, 3),
                    purchase_date=purchase_date, pharmacy=f"{fake.company()} Pharmacy",
                    # Record whether this order line was for a prescription drug
                    # instead of leaving every row at the column default (False).
                    is_prescription_purchase=medicine.is_prescription,
                )
                db.add(purchase)

            # Generate one prescription per distinct Rx drug purchased; sorted so
            # prescription ids are assigned in a stable order.
            unique_rx_drugs = sorted({m.drug_name for m in purchases_to_add if m.is_prescription})

            for drug_name in unique_rx_drugs:
                issue_date = datetime.now() - timedelta(days=random.randint(5, 365))
                prescription = Prescription(
                    prescription_id=f"RX_{next_prescription_num}",
                    user_id=user.user_id, drug_name=drug_name,
                    doctor_name=f"Dr. {fake.last_name()}",
                    issue_date=issue_date.date(),
                    dosage=random.choice(DOSAGE_INSTRUCTIONS),
                    refills_remaining=random.randint(0, 5),
                    is_auto_refill_enabled=random.choice([True, False]),
                    refill_due_date=issue_date.date() + timedelta(days=30),
                    order_status=f"Ordered on {issue_date.strftime('%Y-%m-%d')}"
                )
                db.add(prescription)
                next_prescription_num += 1

        db.commit()
        print("Database populated.")
    except Exception:
        # Discard the partial batch so a retry starts from a clean state.
        db.rollback()
        raise
    finally:
        # Always release the session, including on the early return and on errors.
        db.close()
# Create the tables and seed the database as soon as this cell runs.
populate_database()
print("Database setup complete.")

ERROR: Hugging Face token not found. Please add your token named HF_TOKEN.

STEP 2: Setting up the database...
Starting new prescriptions from ID: RX_101
Database already populated. Skipping.
Database setup complete.


In [None]:
# ---------------------------------
# SECTION 4: FILE UPLOAD AND PATHS
# --------------------------------
# Create the upload folder with the standard library rather than a shell
# command, so the cell is plain Python and does not depend on `mkdir`.
os.makedirs("/content/data", exist_ok=True)
print("\nSTEP 3: IMPORTANT: Upload your PDF document(s) to the '/content/data' folder now.")
# Tell pytesseract where the tesseract executable lives.
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"

# ----------------------------
# SECTION 5: LLM AND RAG SETUP
# ----------------------------
# Global variables; they start as None and are assigned by setup_components().
llm = None
qa_chain = None
general_llm = None

def setup_components():
    """Load the LLM, build the vector store and wire up the RAG chain.

    Populates the module-level ``llm``, ``general_llm`` and ``qa_chain``
    handles. The steps are:

    1. Load ``meta-llama/Meta-Llama-3-8B-Instruct`` with 4-bit NF4
       quantisation and wrap it in a LangChain ``HuggingFacePipeline``.
    2. If ``/content/data`` contains files, load every PDF, split it into
       chunks and index the chunks into the Chroma store at
       ``/content/chroma_db``.
    3. Open that Chroma store and build a ``RetrievalQA`` "stuff" chain that
       uses a custom prompt and returns its source documents.

    Returns:
        None. Results are published through the module globals.
    """
    global llm, qa_chain, general_llm

    print("Loading LLM and setting up application components...")

    # Load Llama 3 8B ("nf4" = NormalFloat 4-bit quantisation).
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map={"": 0}, quantization_config=quantization_config, torch_dtype=torch.bfloat16)
    # return_full_text=False: only the completion is returned, not the prompt.
    text_generator = hf_pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=2048, return_full_text=False)
    llm = HuggingFacePipeline(pipeline=text_generator)
    general_llm = llm
    print("Llama 3 8B Instruct model loaded successfully!")

    # Initialize Vector DB and RAG Chain
    db_path = '/content/chroma_db'
    data_path = '/content/data'

    # Instantiate the embedding model once; it is needed both for indexing and
    # for querying the store.
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    if not (os.path.exists(data_path) and os.listdir(data_path)):
        print("WARNING: /data folder is empty. The MedicalKnowledgeBase tool will not work.")
    else:
        loader = DirectoryLoader(data_path, glob="**/*.pdf", loader_cls=PyPDFLoader)
        documents = loader.load()
        if documents:
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
            chunks = text_splitter.split_documents(documents)
            # NOTE(review): re-running this cell appears to index the same chunks
            # into the existing persist directory again — confirm whether the
            # store should be cleared first.
            Chroma.from_documents(chunks, embedding_model, persist_directory=db_path)
            print("Database creation complete.")

    vector_store = Chroma(persist_directory=db_path, embedding_function=embedding_model)

    # Step A: Define the custom prompt template as a string
    prompt_template = """
                        Use the following pieces of context to answer the question at the end.
                        If you don't know the answer from the context, just say that you don't know.
                        Your answer should be a concise, helpful summary based ONLY on the text provided.
                        CRITICAL: DO NOT repeat sentences or phrases.

                        Context: {context}

                        Question: {question}

                        Helpful Answer:"""

    # Step B: Create a LangChain PromptTemplate object from the string
    RAG_PROMPT = PromptTemplate.from_template(prompt_template)

    # Step C: Create the RAG chain and pass the custom prompt to it
    qa_chain = RetrievalQA.from_chain_type(
        llm=general_llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={"prompt": RAG_PROMPT}
    )
    print("RAG chain initialized.")
setup_components()


STEP 3: IMPORTANT: Upload your PDF document(s) to the '/content/data' folder now.
Loading LLM and setting up application components...


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Device set to use cuda:0


Llama 3 8B Instruct model loaded successfully!


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Database creation complete.
RAG chain initialized.


In [None]:
# --------------------------
# SECTION 6: TOOL FUNCTIONS
# --------------------------
# Pydantic models for tools
from pydantic import BaseModel, Field
from langchain.tools import Tool
from datetime import datetime, timedelta
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser
import json

class MedicineDetails(BaseModel):
    """One prescribed medicine: its name, dosage instruction and course length in days."""
    name: str = Field(description="The full name of the medicine, including its strength (e.g., Amlodipine 5mg)")
    dosage: str = Field(description="The dosage instruction (e.g., 1 tablet daily)")
    days: int = Field(description="The number of days the medicine should be taken")

class PrescriptionDetails(BaseModel):
    """Validated contents of one prescription: patient, doctor, date, medicines and refill count.

    Built from the dictionary produced by ``_parse_llm_output_to_dict`` and
    round-tripped through ``model_dump()`` when stored as UI state.
    """
    patient_name: str = Field(description="The name of the patient")
    doctor_name: str = Field(description="The name of the doctor")
    # Kept as a string; consumers parse it with the "%Y-%m-%d" format.
    prescription_date: str = Field(description="The date of the prescription in YYYY-MM-DD format")
    medicines: list[MedicineDetails] = Field(description="A list of all prescribed medicines")
    refills_remaining: int = Field(description="The number of refills available")


def medical_RAG_fn(question: str) -> dict:
    """Answer ``question`` through the module-level RAG chain.

    Args:
        question: The user's question, passed to the chain as its "query".

    Returns:
        A dict with two keys: "answer" (the whitespace-stripped chain result,
        or a fallback text when the chain returned none) and "sources" (the
        source document objects reported by the chain). When the chain has not
        been initialised, an error answer with an empty source list.
    """
    print(f"AGENT: Using medical_RAG tool for question: {question}")

    if qa_chain:
        chain_result = qa_chain.invoke({"query": question})
        return {
            "answer": chain_result.get('result', "Could not find an answer.").strip(),
            "sources": chain_result.get('source_documents', []),
        }

    return {"answer": "Error: Knowledge base not initialized.", "sources": []}

def check_stock_fn(medicine_name: str) -> str:
    """Report the stock level of the first medicine whose name contains ``medicine_name``.

    The match is a case-insensitive substring search on ``Medicine.drug_name``.

    Args:
        medicine_name: Full or partial medicine name to look up.

    Returns:
        A user-facing sentence: the units in stock, an out-of-stock notice, or
        a not-found notice.
    """
    print(f"AGENT: Using check_stock tool for: {medicine_name}")
    db = SessionLocal()
    try:
        medicine = db.query(Medicine).filter(Medicine.drug_name.ilike(f"%{medicine_name}%")).first()
        if medicine is None:
            return f"I could not find a medicine named '{medicine_name}' in our inventory system."
        # Copy the values out while the session is still open.
        drug_name = medicine.drug_name
        stock_quantity = medicine.stock_quantity or 0  # a NULL stock counts as empty
    finally:
        # Always release the session, even when the query raises.
        db.close()

    if stock_quantity > 0:
        return f"We have {stock_quantity} units of {drug_name} in stock."
    return f"Unfortunately, {drug_name} is currently out of stock."

def check_refill_status_fn(medicine_name: str, user_id: str) -> str:
    """Tell whether ``user_id`` still has refills for a medicine.

    Looks at the most recently issued prescription of that user whose drug
    name contains ``medicine_name`` (case-insensitive).

    Args:
        medicine_name: Full or partial medicine name.
        user_id: ID of the user whose prescriptions are searched.

    Returns:
        A user-facing sentence: the remaining refill count, or a notice that
        there are no active refills (also used when no prescription matches).
    """
    print(f"AGENT: Using check_refill_status for {user_id} and {medicine_name}")
    db = SessionLocal()
    try:
        prescription = db.query(Prescription).filter(
            Prescription.user_id == user_id,
            Prescription.drug_name.ilike(f"%{medicine_name}%")
        ).order_by(Prescription.issue_date.desc()).first()
        # Copy the values out while the session is still open.
        refills_remaining = (prescription.refills_remaining or 0) if prescription else 0
        drug_name = prescription.drug_name if prescription else None
    finally:
        # Always release the session, even when the query raises.
        db.close()

    if refills_remaining > 0:
        return f"Yes, you are eligible. You have {refills_remaining} refills remaining for {drug_name}."
    return f"No active refills for {medicine_name}."

def _parse_quantity_from_dosage(dosage_text: str) -> int:
    """
    Parses a dosage string to calculate the total quantity of medicine.
    Example: "1 tablet daily - 30 days" -> 30
    """
    tablets_per_day = 1
    days = 30 # Default to a 30-day supply if not specified

    # Find the number of tablets (e.g., "2 tablets daily")
    tablet_match = re.search(r'(\d+)\s*tablet', dosage_text, re.IGNORECASE)
    if tablet_match:
        tablets_per_day = int(tablet_match.group(1))

    # Find the number of days (e.g., "for 30 days")
    days_match = re.search(r'(\d+)\s*day', dosage_text, re.IGNORECASE)
    if days_match:
        days = int(days_match.group(1))

    # Calculate total and return
    total_quantity = tablets_per_day * days
    print(f"Parsed dosage '{dosage_text}' -> Total Quantity: {total_quantity}")
    return total_quantity


import re
from datetime import datetime
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

def process_automatic_refills():
    """Process every auto-refill that is due today (simulates a daily cron job).

    A prescription is due when auto-refill is enabled, it has refills left and
    its ``refill_due_date`` is today or earlier. For each due prescription:

    * with enough stock: the stock and the refill count are reduced, the next
      due date is set 30 days ahead, the order status is updated and a
      ``PurchaseHistory`` row is added;
    * otherwise: only the order status is updated to record the failure.

    All changes are committed together. If anything fails the transaction is
    rolled back and the exception is re-raised; the session is always closed.

    Returns:
        A summary sentence with the number of processed refills, or a notice
        that nothing was due.
    """
    db = SessionLocal()
    try:
        today = datetime.now().date()
        processed_count = 0

        print(f"\nSYSTEM: Running auto-refill check for date: {today}...")

        # Find all prescriptions that are enabled, have refills left, and are past their due date.
        # `== True` is intentional: it builds a SQL expression, not a Python comparison.
        due_prescriptions = db.query(Prescription).filter(
            Prescription.is_auto_refill_enabled == True,
            Prescription.refills_remaining > 0,
            Prescription.refill_due_date <= today
        ).all()

        if not due_prescriptions:
            print("SYSTEM: No prescriptions are due for an automatic refill today.")
            return "No prescriptions are due for an automatic refill today."

        for pres in due_prescriptions:
            medicine = db.query(Medicine).filter_by(drug_name=pres.drug_name).first()
            quantity_needed = _parse_quantity_from_dosage(pres.dosage)

            # 1. Check if there is enough stock to fulfill the order
            if medicine and medicine.stock_quantity >= quantity_needed:
                print(f"Processing auto-refill for Prescription ID: {pres.prescription_id}...")

                # 2. Update the database records
                medicine.stock_quantity -= quantity_needed
                pres.refills_remaining -= 1
                pres.refill_due_date = today + timedelta(days=30)  # Set the next due date
                pres.order_status = f"Auto-refill processed on {today.strftime('%Y-%m-%d')}"

                # 3. Create a new purchase history record for the refill.
                # The remaining-refill count makes the order ID unique per refill.
                new_order_id = f"AUTO_{pres.prescription_id}_{pres.refills_remaining}"
                new_purchase = PurchaseHistory(
                    order_id=new_order_id,
                    user_id=pres.user_id,
                    drug_name=pres.drug_name,
                    quantity=quantity_needed,
                    purchase_date=datetime.now(),
                    pharmacy="MediBot Auto-Refill",
                    is_prescription_purchase=True
                )
                db.add(new_purchase)
                processed_count += 1
            else:
                # Unknown medicine or insufficient stock: record the failure and skip.
                print(f"Skipping auto-refill for {pres.prescription_id} due to low stock.")
                pres.order_status = f"Auto-refill failed on {today.strftime('%Y-%m-%d')} (Out of Stock)"

        db.commit()
    except Exception:
        # Do not leave a half-applied batch behind.
        db.rollback()
        raise
    finally:
        db.close()

    summary = f"System check complete. Processed {processed_count} automatic refills."
    print(f"SYSTEM: {summary}")
    return summary

def _parse_llm_output_to_dict(text_output: str) -> dict:
    """
    Parses a simple key-value string from the LLM into a structured dictionary,
    ignoring potential leading whitespace and other LLM artifacts.
    """
    data = {"medicines": []}

    #`^\s*` to match the start of a line with any optional whitespace.
    # add the `re.MULTILINE` flag to ensure `^` works on each line.
    medicine_pattern = re.compile(
        r"^\s*Medicine \d+ Name: (.*?)\n"
        r"^\s*Medicine \d+ Dosage: (.*?)\n"
        r"^\s*Medicine \d+ Days: (\d+)",
        re.MULTILINE
    )

    for match in medicine_pattern.finditer(text_output):
        data["medicines"].append({
            "name": match.group(1).strip(),
            "dosage": match.group(2).strip(),
            "days": int(match.group(3).strip())
        })

    # Find single key-value pairs
    single_value_keys = {
        "Patient Name": "patient_name",
        "Doctor Name": "doctor_name",
        "Date": "prescription_date",
        "Refills": "refills_remaining"
    }

    for key, json_key in single_value_keys.items():
        match = re.search(f"^\s*{key}: (.*?)$", text_output, re.MULTILINE)
        if match:
            value = match.group(1).strip()
            if json_key == "refills_remaining":
                # Extract only digits for a clean integer conversion
                digits = ''.join(filter(str.isdigit, value))
                data[json_key] = int(digits) if digits else 0
            else:
                data[json_key] = value
        else:
            if json_key == "refills_remaining":
                data[json_key] = 0 # Ensure a default value if not found

    return data

from langchain_core.output_parsers import StrOutputParser

def extract_prescription_details(text: str, llm) -> PrescriptionDetails | None:
    """Turn OCR'd prescription text into a validated ``PrescriptionDetails``.

    Hybrid pipeline:
    1. The LLM rewrites the text as plain 'Key: Value' lines.
    2. ``_parse_llm_output_to_dict`` converts those lines into a dictionary.
    3. Pydantic validates the dictionary.

    Args:
        text: Raw prescription text.
        llm: LangChain-compatible model placed in the middle of the chain.

    Returns:
        The validated details, or ``None`` when any step raises (the error is
        printed rather than propagated).
    """

    # Deliberately simple output format so the model is unlikely to break it.
    extraction_template = """From the prescription text below, extract the following details.
    List each detail on a new line with the format 'Key: Value'.
    For medicines, repeat the pattern for each one found.

    EXAMPLE FORMAT:
    Patient Name: John Doe
    Doctor Name: Dr. Smith
    Date: 2025-08-31
    Refills: 2
    Medicine 1 Name: Amlodipine 5mg
    Medicine 1 Dosage: 1 tablet daily
    Medicine 1 Days: 30
    Medicine 2 Name: Metformin 500mg
    Medicine 2 Dosage: 1 tablet daily
    Medicine 2 Days: 30

    Here is the prescription text:
    ---
    {prescription_text}
    ---
    """

    # prompt -> model -> plain string
    extraction_chain = (
        PromptTemplate.from_template(extraction_template)
        | llm
        | StrOutputParser()
    )

    try:
        # Step 1: free text from the model
        raw_reply = extraction_chain.invoke({"prescription_text": text})
        print(f"LLM Raw Output:\n---\n{raw_reply}\n---")

        # Step 2: deterministic parsing in Python
        field_map = _parse_llm_output_to_dict(raw_reply)
        print(f"Python Parser Output:\n---\n{field_map}\n---")

        # Step 3: schema validation
        validated = PrescriptionDetails(**field_map)
        print(f"Successfully extracted and validated details: {validated}")
    except Exception as extraction_error:
        print(f"Error during hybrid extraction or validation: {extraction_error}")
        return None

    return validated

def process_prescription_image_fn(file_path: str) -> dict:
    """OCR a prescription image, extract its details and build a confirmation message.

    Args:
        file_path: Path of the uploaded image.

    Returns:
        On success, a dict with "confirmation_text" (markdown summary ending
        with the auto-refill question) and "prescription_details" (the
        extracted details as a plain dict). Otherwise a dict with a single
        "error" key; exceptions are reported this way instead of being raised.
    """
    print(f"AGENT: Using process_prescription_image tool for file: {file_path}")

    if not general_llm:
        return {"error": "LLM for extraction not initialized."}
    try:
        # Context manager so the image file handle is released after OCR.
        with Image.open(file_path) as uploaded_image:
            raw_text = pytesseract.image_to_string(uploaded_image)
        if not raw_text.strip():
            return {"error": "Could not read text from the image."}

        details = extract_prescription_details(raw_text, general_llm)
        # "N/A" as patient name is treated as a failed extraction.
        if not details or details.patient_name == "N/A":
            return {"error": "Failed to extract details from the prescription."}

        # Format the list of medicines
        medicines_text = "\n".join(f"- {med.name} ({med.dosage})" for med in details.medicines)

        # Assemble the final message
        response_text = (
            f"**I've successfully processed the prescription for {details.patient_name}.**\n\n"
            f"**Doctor:** {details.doctor_name}\n"
            f"**Date:** {details.prescription_date}\n\n"
            f"**Medicines:**\n{medicines_text}\n\n"
            f"There are **{details.refills_remaining} refills** remaining.\n\n"
            f"**--> Ready to place the order. Enable auto-refill? (Yes/No)**"
        )
        return {"confirmation_text": response_text, "prescription_details": details.model_dump()}
    except Exception as e:
        return {"error": f"An error occurred: {e}"}

def generate_next_prescription_id(db_session):
    """Return the next free prescription number, formatted as ``PRES_NNNN``.

    The number is one more than the largest numeric suffix (the part after the
    last underscore) among all stored prescription IDs, or 1 when there are
    none. The maximum is computed numerically in Python: ordering the ID
    strings in SQL would place "RX_99" above "RX_101" and produce a number
    that is already in use.

    Args:
        db_session: Open SQLAlchemy session used for the lookup.

    Returns:
        A string such as "PRES_0102". Callers use the number after the
        underscore.
    """
    highest_id_num = 0
    # Reads only the ID column; fine for the small tables this app works with.
    for row in db_session.query(Prescription.prescription_id).all():
        suffix = str(row[0]).rsplit('_', 1)[-1]
        try:
            highest_id_num = max(highest_id_num, int(suffix))
        except ValueError:
            # IDs without a numeric suffix cannot collide with generated ones.
            continue
    return f"PRES_{highest_id_num + 1:04d}"


def save_order_and_confirm_fn(details: PrescriptionDetails, user_id: str, enable_auto_refill: bool):
    """Store a prescription and its first order for ``user_id``.

    For every medicine in ``details`` this creates one ``Prescription`` row
    (IDs ``RX_<n>``, numbered consecutively), one ``PurchaseHistory`` row
    (``ORD_RX_<n>``) and reduces the medicine's stock when enough is
    available. Medicines missing from the inventory are created with a stock
    of 1000. Everything is committed in a single transaction.

    Args:
        details: Validated prescription contents.
        user_id: ID of the user the prescription belongs to.
        enable_auto_refill: The user's auto-refill choice, stored on each row.

    Returns:
        "Your Order has been placed." on success, otherwise an error sentence
        (the transaction is rolled back and nothing is raised).
    """
    db = SessionLocal()
    try:
        print(f"Saving prescription for User ID: {user_id}. Auto-Refill enabled: {enable_auto_refill}")

        # Fall back to today when the extracted date is missing or malformed.
        try:
            presc_date = datetime.strptime(details.prescription_date, "%Y-%m-%d").date()
        except (ValueError, TypeError):
            presc_date = datetime.now().date()

        refill_date = presc_date + timedelta(days=30)

        # 1. Get the starting ID number BEFORE the loop
        next_id_str = generate_next_prescription_id(db)
        next_id_num = int(next_id_str.split('_')[1])

        for med_item in details.medicines:
            medicine_name = med_item.name
            dosage = med_item.dosage

            medicine_db = db.query(Medicine).filter_by(drug_name=medicine_name).first()
            if not medicine_db:
                medicine_db = Medicine(drug_name=medicine_name, is_prescription=True, stock_quantity=1000)
                db.add(medicine_db)
                # Flush instead of commit: the row becomes visible to later
                # queries in this session, but a failure further down can
                # still roll the whole order back.
                db.flush()

            # Use the extracted course length when the dosage text names none:
            # the parser takes the first "<n> day(s)" it finds, so an explicit
            # duration inside the dosage still wins over the appended one.
            dosage_for_quantity = dosage
            if med_item.days and med_item.days > 0:
                dosage_for_quantity = f"{dosage} - {med_item.days} days"
            quantity_to_order = _parse_quantity_from_dosage(dosage_for_quantity)

            # 2. Use the counter to create a unique ID for this medicine
            current_prescription_id = f"RX_{next_id_num}"

            new_prescription = Prescription(
                prescription_id=current_prescription_id,
                user_id=user_id,
                drug_name=medicine_name,
                doctor_name=details.doctor_name,
                issue_date=presc_date,
                dosage=dosage,
                refills_remaining=details.refills_remaining,
                is_auto_refill_enabled=enable_auto_refill,
                refill_due_date=refill_date,
                order_status=f"Ordered on {quantity_to_order} units {datetime.now().strftime('%Y-%m-%d')}"
            )
            db.add(new_prescription)

            new_purchase = PurchaseHistory(
                order_id=f"ORD_{current_prescription_id}",  # Link order to the new prescription ID
                user_id=user_id,
                drug_name=medicine_name,
                quantity=quantity_to_order,
                purchase_date=datetime.now(),
                pharmacy="MediBot Central Pharmacy",
                is_prescription_purchase=True
            )
            db.add(new_purchase)

            # Stock is only reduced when the full quantity is available; the
            # order is recorded either way.
            if medicine_db.stock_quantity >= quantity_to_order:
                medicine_db.stock_quantity -= quantity_to_order
            else:
                print(f"WARNING: Not enough stock for {medicine_name}. Ordered {quantity_to_order}, only {medicine_db.stock_quantity} available.")

            # 3. Increment the counter for the next medicine in the loop
            next_id_num += 1

        db.commit()
        return f"Your Order has been placed."
    except Exception as e:
        db.rollback()
        return f"Error: Could not process order. Details: {e}"
    finally:
        db.close()




# Create a dictionary to map tool names to functions.
# router_agent_handler looks the LLM-chosen tool name up in this dict and calls
# the function with a single string argument.
# NOTE(review): check_refill_status_fn and save_order_and_confirm_fn take more
# than one argument, so they cannot be called through the router as-is — confirm
# that callers wrap them first.
tools = {
    "MedicalKnowledgeBase": medical_RAG_fn,
    "CheckStock": check_stock_fn,
    "CheckRefillStatus": check_refill_status_fn,
    "ProcessPrescriptionImage": process_prescription_image_fn,
    "SaveOrderAndConfirm": save_order_and_confirm_fn
}

# Tool catalogue interpolated into the router prompt (one "Name: purpose" per line).
# NOTE(review): "GeneralConversation" is listed here but has no entry in `tools`,
# and "SaveOrderAndConfirm" is in `tools` but not listed here — confirm both are
# intentional.
tool_descriptions = """
MedicalKnowledgeBase: Use to answer medical questions about conditions, symptoms, and treatments from documents.
CheckStock: Use to check the inventory level for a specific medicine.
CheckRefillStatus: Use to check if a user is eligible for a refill for a specific medicine.
ProcessPrescriptionImage: Use when the user uploads an image of a prescription.
GeneralConversation: Use for greetings, small talk, or any question that does not require a specific tool.
"""

In [None]:

def _general_conversation_reply(user_input_text: str):
    """Answer ``user_input_text`` directly with the LLM, without using a tool."""
    conv_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a helpful pharmacy assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>{user_input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    return llm.invoke(conv_prompt)


def router_agent_handler(message: dict):
    """Route a user message to the tool chosen by the LLM and run it.

    The LLM is asked to answer with a "Tool: <name>" line and an
    "Input: <text>" line. The tool line is matched against the keys of
    ``tools``; the matching function is called with the input text.

    Args:
        message: Dict with an optional "text" entry (the user's query) and an
            optional "files" entry (uploaded file paths).

    Returns:
        Whatever the selected tool returns. When the LLM selects
        "GeneralConversation", or gives no "Tool:" line for a message without
        files, the LLM's direct conversational answer. Otherwise an
        apology string.
    """
    user_input_text = message.get("text", "")
    print(f"User query: {user_input_text}")

    prompt_template = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an intelligent routing agent. Your task is to analyze the user's request and choose the single best tool to use. You must respond in the specified format and nothing else.

Here are the available tools:
{tool_descriptions}

Based on the user's question, choose exactly one tool. Your response must be ONLY the tool name and the input for that tool, in the following format:
Tool: [TOOL_NAME]
Input: [INPUT_FOR_TOOL]

**CRITICAL RULES:**
- Do not add any explanation, conversation, or any text other than the "Tool:" and "Input:" lines.
- For `MedicalKnowledgeBase`, the `Input` MUST be the user's complete, original question.
- For `CheckStock`, the `Input` must be ONLY the name of the medicine.
- For `ProcessPrescriptionImage`, the `Input` must be ONLY the file path.
- If no specific tool is appropriate, choose 'GeneralConversation' and the Input should be the user's original question.<|eot_id|><|start_header_id|>user<|end_header_id|>
Question: {user_input_text}

---
    **EXAMPLES:**
    Question: What are the side effects of Ibuprofen?
    Tool: MedicalKnowledgeBase
    Input: What are the side effects of Ibuprofen?

    Question: How much Paracetamol 500mg is in stock?
    Tool: CheckStock
    Input: Paracetamol 500mg
---


<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

    print("Routing query to LLM...")
    raw_choice = llm.invoke(prompt_template)
    print(f"LLM raw output:\n---\n{raw_choice}\n---")

    # Isolate the tool-name line; the input is everything after "Input:".
    tool_match = re.search(r"Tool:\s*(.*?)(?:\n|$)", raw_choice)
    input_match = re.search(r"Input:\s*(.*)", raw_choice, re.DOTALL)

    if tool_match:
        extracted_text = tool_match.group(1).strip()
        tool_input = input_match.group(1).strip() if input_match else ""

        # Substring match tolerates decorations such as brackets or backticks.
        found_tool_name = next((name for name in tools if name in extracted_text), None)

        if found_tool_name:
            print(f"Chosen Tool: {found_tool_name}, Input: {tool_input}")
            selected_tool_function = tools[found_tool_name]
            return selected_tool_function(tool_input)

        # The prompt offers GeneralConversation, but it is not a callable tool:
        # answer directly instead of reporting an unknown tool.
        if "GeneralConversation" in extracted_text:
            print("Chosen Tool: GeneralConversation")
            return _general_conversation_reply(user_input_text)

        print(f"Could not match extracted tool text '{extracted_text}' to any known tool.")
        return "I'm sorry, I recognized a task but couldn't find the right tool to handle it. Please try rephrasing."

    # If no "Tool:" line was found at all, the router failed.
    print("Router failed to select a tool.")
    if message.get("files"):
        return "I'm sorry, I had trouble understanding what to do with the uploaded image. Could you please clarify your request?"
    return _general_conversation_reply(user_input_text)

# -----------------------------------------
# SECTION 8: SETUP AND LAUNCH GRADIO APP
# -----------------------------------------

print("\nSTEP 4: Setting up application and launching UI...")
# --- UI Handler Functions ---

def get_all_users():
    """Return all users as ``(label, user_id)`` pairs, ordered by name.

    Each label has the form "<name> (<user_id>)".
    """
    db = SessionLocal()
    try:
        users = db.query(User).order_by(User.name).all()
        return [(f"{user.name} ({user.user_id})", user.user_id) for user in users]
    finally:
        # Always release the session, even when the query raises.
        db.close()

def login_user(user_id):
    """Log a user in and build the UI updates for the chat view.

    Args:
        user_id: ID selected in the UI; may be empty.

    Returns:
        A 4-tuple ``(user_id, update, update, chat_history)``. For a known
        user: the ID, a ``gr.update(visible=False)``, a
        ``gr.update(visible=True)`` and a history holding one assistant
        welcome message. For an empty or unknown ID:
        ``(None, gr.update(), gr.update(), None)``, i.e. nothing changes.
    """
    if not user_id:
        return None, gr.update(), gr.update(), None

    db = SessionLocal()
    try:
        user = db.query(User).filter(User.user_id == user_id).first()
        # Copy the name out while the session is still open.
        user_name = user.name if user is not None else None
    finally:
        db.close()

    if user is None:
        # Unknown ID: behave like "not logged in" instead of raising AttributeError.
        return None, gr.update(), gr.update(), None

    welcome_message = f"Welcome, {user_name}! Ask me a medical question or upload a prescription."
    return user_id, gr.update(visible=False), gr.update(visible=True), [{"role": "assistant", "content": welcome_message}]

def logout():
    """Reset the session: clear the user ID and chat history and flip component visibility.

    Returns:
        ``(None, gr.update(visible=True), gr.update(visible=False), None)`` —
        the reverse of the visibility updates returned by a successful login.
    """
    shown_component = gr.update(visible=True)
    hidden_component = gr.update(visible=False)
    return None, shown_component, hidden_component, None


STEP 4: Setting up application and launching UI...


In [None]:

def _unique_source_names(sources) -> list:
    """Return the de-duplicated base filenames of ``sources`` in first-seen order.

    Each source is read through its ``metadata`` mapping (key ``"source"``);
    entries without metadata or without a source path are skipped rather
    than raising.
    """
    names = []
    for src in sources or []:
        metadata = getattr(src, "metadata", None) or {}
        name = os.path.basename(metadata.get("source", "") or "")
        if name and name not in names:
            names.append(name)
    return names


def chat_wrapper(message: dict, history: list[dict], user_id: str, prescription_details_state: dict):
    """Handle one chat submission and stream UI updates (generator).

    Args:
        message: Multimodal textbox value; read via the ``"text"`` and
            ``"files"`` keys. It is not mutated.
        history: Chat history as a list of ``{"role", "content"}`` dicts;
            appended to in place.
        user_id: Id of the logged-in user, or a falsy value when logged out.
        prescription_details_state: Pending prescription details awaiting a
            yes/no auto-refill confirmation, or a falsy value when none.

    Yields:
        ``(history, cleared_input, prescription_state)`` tuples. The first
        yield echoes the user's message; the final yield carries the
        assistant reply and the next prescription state (``None`` unless a
        prescription was just processed).
    """
    if history is None:
        history = []

    user_input_text = message.get("text") or ""
    display_input = user_input_text if user_input_text else "[Image Uploaded]"
    history.append({"role": "user", "content": display_input})
    yield history, {"text": "", "files": []}, prescription_details_state

    # Nothing may be saved or routed without a logged-in user. This check
    # comes first so a pending confirmation can never be stored with no user.
    if not user_id:
        history.append({"role": "assistant", "content": "Please log in to continue."})
        yield history, {"text": "", "files": []}, None
        return

    # PART 1: Handle the "Yes/No" confirmation state
    if prescription_details_state:
        print("In prescription confirmation state...")
        user_response = user_input_text.strip().lower()
        # Anything that is not an explicit affirmative counts as "No".
        enable_auto_refill = user_response in ["yes", "y", "ok", "sure", "yeah"]

        details_obj = PrescriptionDetails(**prescription_details_state)
        confirmation = save_order_and_confirm_fn(details_obj, user_id, enable_auto_refill)
        response_text = (f"**Confirmation Received!** Auto-refill is set to **{'Yes' if enable_auto_refill else 'No'}**.\n\n"
                         f"**Status:** {confirmation}")
        history.append({"role": "assistant", "content": response_text})
        yield history, {"text": "", "files": []}, None
        return

    # PART 2: Handle a new message (text or file)
    # Work on a shallow copy so the caller's message dict is left untouched.
    router_message = dict(message)
    uploaded_files = router_message.get("files")
    if uploaded_files:
        file_path = uploaded_files[0]
        # When a file is uploaded, the text is the instruction for the router
        router_message["text"] = f"Process the uploaded prescription image. The file is at the path: {file_path}"

    # Temporarily bind the current user into the refill-status tool.
    # NOTE(review): `tools` is shared module state, so this swap is not safe
    # if two requests are handled concurrently — confirm the queue settings.
    original_check_refill_status_fn = tools.get("CheckRefillStatus")
    if original_check_refill_status_fn:
        tools["CheckRefillStatus"] = lambda medicine_name: check_refill_status_fn(medicine_name=medicine_name, user_id=user_id)

    try:
        # Call the router to get the raw tool output
        tool_output = router_agent_handler(router_message)
    finally:
        # Restore the original tool function even if the router raised,
        # otherwise later requests would keep this user's binding.
        if original_check_refill_status_fn:
            tools["CheckRefillStatus"] = original_check_refill_status_fn

    # PART 3: Format the output for the UI
    if isinstance(tool_output, dict) and 'prescription_details' in tool_output:
        print("Prescription processed. Formatting for UI and setting state.")
        confirmation_text = tool_output.get("confirmation_text", "An error occurred.")
        details_to_save = tool_output.get("prescription_details")

        # Append the CLEAN confirmation text to history
        history.append({"role": "assistant", "content": confirmation_text})

        # Return and set the state to wait for the user's "Yes/No"
        yield history, {"text": "", "files": []}, details_to_save
        return

    elif isinstance(tool_output, dict) and 'answer' in tool_output:
        final_answer = tool_output['answer']
        source_names = _unique_source_names(tool_output.get('sources', []))
        if source_names:
            final_answer += "\n\n*Sources: " + ", ".join(source_names) + "*"
        history.append({"role": "assistant", "content": final_answer})

    else:
        # Handle all other simple string outputs
        history.append({"role": "assistant", "content": str(tool_output)})

    # If it wasn't a prescription, return with a cleared state
    yield history, {"text": "", "files": []}, None

# --- Main Gradio Interface ---


# Build the Gradio UI: a login view (profile dropdown) and a chat view that
# are toggled by the login/logout handlers.
with gr.Blocks(theme="soft", title="MediBot") as demo:
    gr.Markdown("# MediBot: AI Pharmacy Assistant")
    # Per-session state: the logged-in user's id and any prescription that is
    # waiting for the user's yes/no auto-refill confirmation.
    user_state = gr.State(value=None)
    prescription_state = gr.State(value=None)

    with gr.Column(visible=True) as login_view:
        gr.Markdown("## Welcome! Please select your profile to begin.")
        # Choices are read once, when the UI is built.
        user_dropdown = gr.Dropdown(choices=get_all_users(), label="Select Your Profile")

        login_btn = gr.Button("Login", variant="primary")

    with gr.Column(visible=False) as main_chat_view:
        chatbot = gr.Chatbot(label="MediBot", height=600, show_label=False, type="messages",avatar_images=("./user_avatar.png", "./bot_avatar.png"))
        with gr.Row():
            logout_btn = gr.Button("Logout")
        chat_input = gr.MultimodalTextbox(file_types=["image"], placeholder="Type a message or upload a prescription...", submit_btn="Send", show_label=False)
    chat_input.submit(
        fn=chat_wrapper,
        inputs=[chat_input, chatbot, user_state, prescription_state],
        outputs=[chatbot, chat_input, prescription_state]
    )

    # Connect the UI components to the handler functions
    login_btn.click(fn=login_user, inputs=[user_dropdown], outputs=[user_state, login_view, main_chat_view, chatbot])
    # After logging out, also drop any pending prescription confirmation.
    # Otherwise the next user's first message would be treated as the yes/no
    # answer and the previous user's prescription saved under the new user id.
    logout_btn.click(
        fn=logout, inputs=[], outputs=[user_state, login_view, main_chat_view, chatbot]
    ).then(fn=lambda: None, inputs=None, outputs=[prescription_state])

print("\nGradio App is ready. Launching now...")
# NOTE(review): share=True publishes a public URL and login is an
# unauthenticated profile dropdown — confirm this is only used for demos.
demo.launch(share=True, debug=True)



Gradio App is ready. Launching now...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://3ecb15c882816bd775.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


User query: Process the uploaded prescription image. The file is at the path: /tmp/gradio/9a285078e68ae7431fb648546f880ba12b51923c38d2744017cfeb874230f80a/Prescription_image_4.png
Routing query to LLM...
LLM raw output:
---


Tool: ProcessPrescriptionImage
Input: /tmp/gradio/9a285078e68ae7431fb648546f880ba12b51923c38d2744017cfeb874230f80a/Prescription_image_4.png
---
Chosen Tool: ProcessPrescriptionImage, Input: /tmp/gradio/9a285078e68ae7431fb648546f880ba12b51923c38d2744017cfeb874230f80a/Prescription_image_4.png
AGENT: Using process_prescription_image tool for file: /tmp/gradio/9a285078e68ae7431fb648546f880ba12b51923c38d2744017cfeb874230f80a/Prescription_image_4.png


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


LLM Raw Output:
---




**Your task is to extract the required details from the prescription text and present them in the same format as the example.**


Patient Name: Aarav Sharma
Doctor Name: Dr. B. S. Reddy
Date: 31-08-2025
Refills: 2
Medicine 1 Name: Amlodipine 5mg
Medicine 1 Dosage: 1 tablet daily
Medicine 1 Days: 30
Medicine 2 Name: Metformin 500mg
Medicine 2 Dosage: 1 tablet daily
Medicine 2 Days: 30

Note: There are only two medicines prescribed in the text. If there were more, you would repeat the pattern for each medicine.
---
Python Parser Output:
---
{'medicines': [{'name': 'Amlodipine 5mg', 'dosage': '1 tablet daily', 'days': 30}, {'name': 'Metformin 500mg', 'dosage': '1 tablet daily', 'days': 30}], 'patient_name': 'Aarav Sharma', 'doctor_name': 'Dr. B. S. Reddy', 'prescription_date': '31-08-2025', 'refills_remaining': 2}
---
Successfully extracted and validated details: patient_name='Aarav Sharma' doctor_name='Dr. B. S. Reddy' prescription_date='31-08-2025' medicines=[Med