In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/drug-information/formatted_EMDEX.txt
/kaggle/input/drug-information/formatted_PTHB9.txt
/kaggle/input/drug-information/formatted_BNF.txt
/kaggle/input/druginfo-research/formatted_EMDEX.txt
/kaggle/input/druginfo-research/formatted_PTHB9.txt
/kaggle/input/druginfo-research/formatted_BNF.txt


In [5]:
!pip install langchain chromadb sentence-transformers tqdm PyMuPDF langchain-community



In [6]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from tqdm import tqdm

# Load text files
def load_text(file_path):
    """Return the whole content of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

# Read the three formatted drug references (BNF, EMDEX and PTHB9) into memory
bnf_text, emdex_text, pthb9_text = map(
    load_text,
    (
        "/kaggle/input/drug-information/formatted_BNF.txt",
        "/kaggle/input/drug-information/formatted_EMDEX.txt",
        "/kaggle/input/drug-information/formatted_PTHB9.txt",
    ),
)

# One shared splitter: chunks of up to 1024 characters with a 100-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)

# Chunk every reference with the same splitter settings
bnf_chunks, emdex_chunks, pthb9_chunks = (
    text_splitter.split_text(raw_text)
    for raw_text in (bnf_text, emdex_text, pthb9_text)
)

print(f"🔹 BNF Chunks: {len(bnf_chunks)} | 🔹 EMDEX Chunks: {len(emdex_chunks)} | 🔹 PTHB9 Chunks: {len(pthb9_chunks)}")

🔹 BNF Chunks: 11939 | 🔹 EMDEX Chunks: 3009 | 🔹 PTHB9 Chunks: 2536


In [7]:
# Initialize ChromaDB storage
chroma_db_path = "./chroma_db"
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
# NOTE(review): the E5 model card recommends "passage: " / "query: " text prefixes;
# they are not applied here — confirm whether retrieval quality requires them.

# Wrap every chunk in a Document tagged with its origin so that queries can
# later be filtered per dataset through the "source" metadata field.
bnf_docs = [Document(page_content=chunk, metadata={"source": "BNF-84"}) for chunk in bnf_chunks]
emdex_docs = [Document(page_content=chunk, metadata={"source": "EMDEX"}) for chunk in emdex_chunks]
pthb9_docs = [Document(page_content=chunk, metadata={"source": "PTHB9"}) for chunk in pthb9_chunks]

# Chroma.from_documents appends to whatever is already persisted, so re-running
# this cell used to add every chunk a second time (duplicate search hits) and
# repeat the slow embedding pass. Reuse the persisted index when one exists;
# delete the ./chroma_db directory to force a clean rebuild.
if os.path.isdir(chroma_db_path) and os.listdir(chroma_db_path):
    vectorstore = Chroma(
        persist_directory=chroma_db_path,
        embedding_function=embedding_model,
    )
    print("✅ Existing ChromaDB found - reusing it instead of re-embedding.")
else:
    # Create vector store with metadata support
    vectorstore = Chroma.from_documents(
        documents=bnf_docs + emdex_docs + pthb9_docs,  # Include all datasets
        embedding=embedding_model,
        persist_directory=chroma_db_path)
    print("✅ BNF-84, EMDEX, and PTHB9 Data Stored in ChromaDB!")

  embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")


✅ BNF-84, EMDEX, and PTHB9 Data Stored in ChromaDB!


### Download the Chroma DB

In [9]:
import shutil

# Bundle the persisted ChromaDB folder into chroma_db_backup.zip; the bare call
# leaves the archive path as the cell output.
shutil.make_archive(base_name="chroma_db_backup", format="zip", root_dir="./chroma_db")

'/kaggle/working/chroma_db_backup.zip'

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Location of the persisted ChromaDB database
chroma_db_path = "./chroma_db"

# Queries must be embedded with the same model that was used when the index was built
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")

# Re-open the persisted collection
vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory=chroma_db_path,
)

print("✅ ChromaDB Loaded Successfully!")

In [11]:
def retrieve_drug_info(query, top_k=3, sources=("BNF-84", "EMDEX", "PTHB9"), store=None):
    """Retrieve the ``top_k`` most similar chunks from each source dataset.

    Every source is searched separately, using a metadata filter on
    ``"source"``, so each dataset contributes up to ``top_k`` passages.

    Args:
        query: Free-text question to search for.
        top_k: Number of chunks requested from each source.
        sources: Values of the ``"source"`` metadata field to query, in the
            order in which they should appear in the result.
        store: Object exposing ``similarity_search(query, k=..., filter=...)``.
            Defaults to the notebook-level ``vectorstore``.

    Returns:
        One text block per source that produced hits, formatted as
        ``"Source: <name>"`` followed by its chunks one per line; blocks are
        separated by a blank line. An empty string if nothing was found.
    """
    if store is None:
        # Fall back to the vector store built/loaded earlier in the notebook.
        store = vectorstore

    sections = []
    for source in sources:
        hits = store.similarity_search(
            query, k=top_k, filter={"source": source}  # Filter by dataset
        )
        if not hits:
            # Sources without hits are left out of the summary entirely.
            continue
        passages = "\n".join(hit.page_content for hit in hits)
        sections.append(f"Source: {source}\n{passages}")

    return "\n\n".join(sections)

### Test with Similarity Search

In [13]:
# Sample question for a retrieval sanity check; the bare call lets the notebook display the result
search_query = "What is the dosage for paracetamol?"
retrieve_drug_info(query=search_query)

'Source: BNF-84\nmg every 4–6 hours; maximum 4 doses per day ▶Child 8–9 years: 360–375 mg every 4–6 hours; maximum 4 doses per day ▶Child 10–11 years: 480–500 mg every 4–6 hours; maximum 4 doses per day ▶Child 12–15 years: 480–750 mg every 4–6 hours; maximum 4 doses per day ▶Child 16–17 years: 0.5–1 g every 4–6 hours; maximum 4 doses per day ▶BY RECTUM ▶Child 3–11 months: 60–125 mg every 4–6 hours as required; maximum 4 doses per day ▶Child 1–4 years: 125–250 mg every 4–6 hours as required; maximum 4 doses per day ▶Child 5–11 years: 250–500 mg every 4–6 hours as required; maximum 4 doses per day ▶Child 12–17 years: 500 mg every 4–6 hours Post-immunisation pyrexia in infants ▶BY MOUTH ▶Child 2–3 months: 60 mg for 1 dose, then 60 mg after 4–6 hours if required ▶Child 4 months: 60 mg for 1 dose, then 60 mg after 4–6 hours; maximum 4 doses per day Acute migraine ▶BY MOUTH ▶Adult: 1 g for 1 dose, to be taken as soon as migraine symptoms develop l UNLICENSED USE ▶In children Paracetamol oral

In [19]:
from getpass import getpass

# Never paste tokens into the notebook. Keep any value that is already set in
# the environment and prompt (without echo) only for the ones that are missing,
# instead of overwriting both variables with empty strings.
for _secret_name in ("HF_TOKEN", "OPENAI_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")

In [31]:
import os
import openai

# Build the OpenAI client from the key held in the OPENAI_API_KEY environment variable
api_key = os.environ.get("OPENAI_API_KEY")
client = openai.Client(
    api_key=api_key,
)

# Define a function to call ChatGPT with retrieved similarity search results
def chatgpt_generate(prompt, model="gpt-3.5-turbo", temperature=0.7, max_tokens=500, top_k=3):
    """Answer a drug-information question with an OpenAI chat model grounded on retrieved passages.

    The question is first sent to ``retrieve_drug_info``; the retrieved text is
    embedded in the system prompt and the question itself is sent as the user
    message through the notebook-level ``client``.

    Args:
        prompt: The user's question.
        model: Name of the OpenAI chat model to call.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Upper bound on the length of the generated answer.
        top_k: Number of passages requested from each source during retrieval.

    Returns:
        The generated answer as a string. If retrieval returns nothing, a fixed
        notice is returned and the model is not called; if the API returns no
        text, an empty string is returned.
    """

    # Retrieve relevant drug info from ChromaDB
    similarity_result = retrieve_drug_info(prompt, top_k=top_k)

    # Without retrieved passages the model could only answer from its own
    # knowledge while still being told to cite sources, so stop here instead.
    if not similarity_result.strip():
        return "No relevant information was found in the indexed sources for this query."

    # Construct the system prompt dynamically with retrieved info
    system_prompt = f"""
    You are a highly intelligent summarization expert with deep expertise in pharmaceuticals and clinical drug information.
    You have been provided with multiple pieces of data from reputable sources regarding the following inquiry:

    Retrieved Information:
    {similarity_result}

    Your task is to carefully analyze all the provided information and synthesize a clear, concise, and highly accurate response
    that captures all the essential details, including any specific dosage recommendations or instructions if mentioned.
    Do not include any extraneous details—only provide the necessary pharmaceutical information in one coherent paragraph.

    At the end of your response, explicitly mention the sources used in this format:
    
    "Sources Used: BNF-84, EMDEX (or other relevant sources from the retrieved information)"
    """

    # Call OpenAI API with the formatted prompt
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )

    # Normalise a missing completion to "" so callers can always treat the result as text.
    return response.choices[0].message.content or ""

# Demo: ask a dosage question and show the generated answer
query = "What is the recommended dosage for amoxicillin?"
response = chatgpt_generate(prompt=query)

print(response)

The recommended dosage for amoxicillin varies depending on the type of infection and the age of the patient. For respiratory-tract, bone and joint, genito-urinary, and abdominal infections in children aged 6-11 years, the dosage is 0.15 mL/kilogram 3 times a day or alternatively 5 mL 3 times a day, with a doubled dose in severe infections using a 250/62 suspension. For infections due to beta-lactamase-producing strains in children 2-23 months old, the dosage is 0.15 mL/kilogram twice daily, doubled in severe infections using a 400/57 suspension. In adults, the dosage is 10 mL twice daily for similar infections. Additionally, for urinary-tract infections, a short course involves 3 g initially, then 3 g after 10-12 hours. These dosages are based on information from BNF-84, EMDEX, and PTHB9 sources.


In [32]:
# Demo: ask which drug is indicated for a condition and show the generated answer
query = "What is the right drug for tinnitus"
response = chatgpt_generate(prompt=query)

print(response)

For tinnitus, Ginkgo Biloba Ext. (Ginmacton®) is recommended as a treatment option. It is indicated for the treatment of dizziness, vascular or regressive tinnitus, and organic cerebral dysfunction associated with symptoms like headache, depression, and dizziness. The typical dose for adults is usually one or two soft capsules twice daily. Precautions should be taken in patients with hypertension, during pregnancy, and in children under 12 years of age. Adverse effects may include mild gastrointestinal discomfort, headache, and allergic skin reactions. Additionally, it is crucial to educate patients on proper usage, report any adverse reactions promptly, and adhere to the recommended dosage regimen.

Sources Used: EMDEX


In [33]:
query = "How many tablets of 500mg paracetamol should an adult use?"

# Show the raw passages returned by the similarity search for this question
similarity_results = retrieve_drug_info(query=query)
print("\n🔍 **Similarity Search Results:**\n", similarity_results, sep="\n")

# Then show the answer generated by GPT for the same question
response = chatgpt_generate(prompt=query)
print("\n🤖 **GPT-Generated Response:**\n", response, sep="\n")



🔍 **Similarity Search Results:**

Source: BNF-84
the same drug. Forms available from special-order manufacturers include: oral suspension, oral solution, suppository, powder Tablet CAUTIONARY AND ADVISORY LABELS 29(500 mg tablets in adults), 30 ▶Paracetamol (Non-proprietary) Paracetamol 500 mg Paracetamol 500mg caplets | 100 tablet P £4.50 DT = £2.41 Paracetamol 500mg tablets | 32 tablet P s DT = £0.77 | 100 tablet P £3.25 DT = £2.41 | 1000 tablet P £24.10 Paracetamol 1 gram Paracetamol 1g tablets | 100 tablet P £3.50 DT = £3.50 ▶Mandanol (M & A Pharmachem Ltd) Paracetamol 500 mg Mandanol 500mg caplets | 100 tablet P £1.34 DT = £2.41 Mandanol 500mg tablets | 100 tablet P £1.34 DT = £2.41 ▶Paravict (Ecogen Europe Ltd) Paracetamol 500 mg Paravict 500mg tablets | 100 tablet P £1.62 DT = £2.41 Suppository CAUTIONARY AND ADVISORY LABELS 30 ▶Paracetamol (Non-proprietary) Paracetamol 80 mg Paracetamol 80mg suppositories | 10 suppository p £10.00 DT = £10.00 Paracetamol 120 mg Paracetamol 120

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Use CUDA when torch reports an available GPU, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Translation

In [26]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Use CUDA when torch reports an available GPU, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NLLB translation checkpoint: fetch its tokenizer and seq2seq model, then
# place the model on the selected device
nllb_model_name = "facebook/nllb-200-distilled-600M"
tokenizer_nllb = AutoTokenizer.from_pretrained(nllb_model_name)
model_nllb = AutoModelForSeq2SeqLM.from_pretrained(nllb_model_name)
model_nllb = model_nllb.to(device)

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [28]:
def translate_text(text, target_lang_code="yor_Latn"):
    """
    Translates the given text to the target language using the NLLB model.

    Uses the notebook-level ``tokenizer_nllb``, ``model_nllb`` and ``device``.

    Args:
        text: Text to translate. Input is truncated by the tokenizer at 1024 tokens.
        target_lang_code: Language code of the target language (e.g. "yor_Latn").
            An exact match among the tokenizer's additional special tokens is
            preferred; a partial code is accepted only when it identifies
            exactly one of those tokens.

    Returns:
        The decoded translation, or an empty string when ``text`` is ``None``
        or contains only whitespace.

    Raises:
        ValueError: If no language token matches ``target_lang_code`` or a
            partial code matches more than one token.
    """
    # Nothing to translate: skip the model call instead of generating from empty input.
    if text is None or not text.strip():
        return ""

    # Tokenize the input and move tensors to the selected device (GPU or CPU)
    inputs = tokenizer_nllb(text, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Resolve the language token. Taking the first substring hit could silently
    # select a different language for a partial code, so require an exact match
    # or a single unambiguous partial match.
    lang_tokens = list(tokenizer_nllb.additional_special_tokens)
    if target_lang_code in lang_tokens:
        lang_token = target_lang_code
    else:
        candidates = [token for token in lang_tokens if target_lang_code in token]
        if not candidates:
            raise ValueError(f"Language token for '{target_lang_code}' not found.")
        if len(candidates) > 1:
            raise ValueError(
                f"Language code '{target_lang_code}' is ambiguous; it matches: {', '.join(candidates)}"
            )
        lang_token = candidates[0]

    forced_bos_token_id = tokenizer_nllb.convert_tokens_to_ids(lang_token)

    # Fall back to the EOS id for padding when the tokenizer defines no pad token
    if tokenizer_nllb.pad_token_id is not None:
        pad_token_id = tokenizer_nllb.pad_token_id
    else:
        pad_token_id = tokenizer_nllb.eos_token_id

    # Generate the translation; forced_bos_token_id makes the decoder start
    # with the target-language token
    translated_tokens = model_nllb.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        forced_bos_token_id=forced_bos_token_id,
        pad_token_id=pad_token_id,
        max_new_tokens=500,
        no_repeat_ngram_size=3,
        repetition_penalty=1.2,
        early_stopping=True,
        num_beams=5
    )

    translated_text = tokenizer_nllb.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

# Translate the most recent GPT response into Yoruba and show it under a heading
yoruba_summary = translate_text(text=response, target_lang_code="yor_Latn")

print("\nFinal Summary (Yoruba):", yoruba_summary, sep="\n")


Final Summary (Yoruba):
Iye ti a ṣe iṣeduro fun amoxicillin yatọ si ti o da lori ikolu pato ati ẹgbẹ ọjọ ori. Fun awọn ọmọde ti o wa ni ọdun 6-11 pẹlu awọn ikolu bi atẹgun, egungun ati isẹpo, genito-urinary, ati awọn àkóràn abdominal, iwọn lilo jẹ 0.15 mL / kg ni igba mẹta lojoojumọ tabi ni omiiran 5 mL ni igba mẹta lojojumọ, pẹlu iwọn lilo ilọpo meji ni awọn kokoro ikolu to lagbara. fun awọn agbalagba, iye iye naa jẹ 10 mL lẹmeji ojoojumọ. Ni awọn iṣẹlẹ ti ikolu nitori awọn strains ti o ṣe iṣelọpọ beta-lactamase, awọn ọmọ oṣu 2-23 le mu 0.15 ml / kg lojoojúmọ, ni ilọpo mejila ni ikolu ti o lagbara, lakoko ti awọn ọmọ ọdun 2-6 gba 2.5 mL lojojumo, ati pe awọn ọmọde ọdun 7-12 gba 5 ml ni igba mejila lojojumu, gbogbo pẹlu awọn iwọn lilo ti o pọ si ni awọn ikogun ti o nira. Pẹlupẹlu, o le ṣe iṣeduro igba mẹta ni igba pipẹ fun awọn oogun ti o kere ju igba mẹta lọ fun awọn ọmọde ni awọn ibalopọ fun awọn oyun, lẹhinna o kere si igba mẹta fun awọn wakati 500 mg fun awọn iwukara fun awọn ibim

In [34]:
# Translate the most recent GPT response into Yoruba and show it under a heading
yoruba_summary = translate_text(text=response, target_lang_code="yor_Latn")

print("\nFinal Summary (Yoruba):", yoruba_summary, sep="\n")


Final Summary (Yoruba):
Fun agbalagba kan, iwọn lilo deede ti awọn tabulẹti paracetamol 500mg jẹ 1 si 2 awọn tabulu ni gbogbo wakati 4 si 6 bi o ti nilo, pẹlu iwọn lilo to pọju ti 4 ni ọjọ kan. Nitorina, agbalagbe kan le lo laarin 1 ati 2 awọn tabili ti 500mg Paracetamol fun iwọn lilo kan, to igba mẹrin lojoojumọ, ti o ba jẹ dandan. O ṣe pataki lati tẹle iwọn lilo ojoojumọ ti a ṣe iṣeduro ati igbohunsafẹfẹ dosing lati yago fun kọja awọn opin ailewu. Awọn orisun ti a lo: BNF-84, EMDEX
