In [1]:
import torch

print("CUDA:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))

CUDA: True
GPU: NVIDIA GeForce RTX 5070 Ti Laptop GPU


In [2]:
from pathlib import Path
from pypdf import PdfReader
import re
import json

RAW_PDF_DIR = Path("data/raw_pdfs")
OUTPUT_PATH = Path("data/extracted/clean_text.json")

def clean_text(text: str) -> str:
    """
    Minimal but effective cleaning:
    - remove page numbers
    - remove extra whitespace
    - normalize line breaks
    """
    text = re.sub(r"\n\s*\d+\s*\n", "\n", text)   # page numbers
    text = re.sub(r"[ \t]+", " ", text)           # extra spaces
    text = re.sub(r"\n{3,}", "\n\n", text)        # large gaps
    return text.strip()

documents = []

for pdf_path in RAW_PDF_DIR.glob("*.pdf"):
    reader = PdfReader(pdf_path)
    full_text = []

    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text.append(page_text)

    cleaned = clean_text("\n".join(full_text))

    documents.append({
        "source": pdf_path.name,
        "text": cleaned
    })

OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(documents, f, ensure_ascii=False, indent=2)

print(f"Extracted and cleaned {len(documents)} documents")
print("Saved to:", OUTPUT_PATH)


Extracted and cleaned 6 documents
Saved to: data\extracted\clean_text.json


In [3]:
import json
from pprint import pprint

with open("data/extracted/clean_text.json", "r", encoding="utf-8") as f:
    docs = json.load(f)

print("Number of documents:", len(docs))
print("\n--- SAMPLE ---\n")
print(docs[0]["source"])
print(docs[0]["text"][:1000])

Number of documents: 6

--- SAMPLE ---

actualFA.pdf
Page | 1 
 
An introduction to first aid 
 
Imagine: Whilst feeding your child, they start to gag and appear unable to 
breathe. You have tried slapping them on the back, with no success. They seem 
close to losing consciousness, their lips are turning a definite shade of blue. 
 
People rarely give first aid a thought, until the day they need it. The above 
scenario is the sort of every day occurrence that can so easily lead to tragedy. 
 
However, with the correct first aid training anyone could, in the short term (until 
the arrival of the emergency services) save a life. 
 
These notes have been designed to aid you with your first aid training. It is, 
however, not a substitute for hands on training from a professional first aid 
trainer, but a reference for you to look back on when you need to. 
 
We hope the training you undertake with us will give you the knowledge and 
confidence to, if the worst happens, help keep someone al

In [4]:
import json
from pathlib import Path
import uuid

INPUT_PATH = Path("data/extracted/clean_text.json")
OUTPUT_PATH = Path("data/extracted/chunks.json")

MAX_WORDS = 320      # sweet spot
OVERLAP_WORDS = 40   # light overlap

with open(INPUT_PATH, "r", encoding="utf-8") as f:
    documents = json.load(f)

def chunk_text(text, max_words=MAX_WORDS, overlap=OVERLAP_WORDS):
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks = []
    current = []
    current_len = 0

    for para in paragraphs:
        words = para.split()
        wlen = len(words)

        # Case 1: paragraph itself is too large â†’ split it
        if wlen > max_words:
            for i in range(0, wlen, max_words - overlap):
                sub = words[i:i + max_words]
                chunks.append(" ".join(sub))
            current = []
            current_len = 0
            continue

        # Case 2: normal accumulation
        if current_len + wlen <= max_words:
            current.append(para)
            current_len += wlen
        else:
            # finalize current chunk
            chunks.append(" ".join(current))

            # overlap
            overlap_words = " ".join(" ".join(current).split()[-overlap:])
            current = [overlap_words, para]
            current_len = len(overlap_words.split()) + wlen

    if current:
        chunks.append(" ".join(current))

    return chunks

all_chunks = []

for doc in documents:
    doc_chunks = chunk_text(doc["text"])
    for i, chunk in enumerate(doc_chunks):
        all_chunks.append({
            "chunk_id": f"{doc['source']}_{i}",
            "source": doc["source"],
            "text": chunk
        })

OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

print(f"Created {len(all_chunks)} chunks")
print("Saved to:", OUTPUT_PATH)

Created 1620 chunks
Saved to: data\extracted\chunks.json


In [5]:
import json
from pprint import pprint

with open("data/extracted/chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

print("Total chunks:", len(chunks))
print("\n--- SAMPLE CHUNK ---\n")
pprint(chunks[0]["chunk_id"])
print(chunks[0]["text"][:800])
print("\nWord count:", len(chunks[0]["text"].split()))

Total chunks: 1620

--- SAMPLE CHUNK ---

'actualFA.pdf_0'
Page | 1 An introduction to first aid Imagine: Whilst feeding your child, they start to gag and appear unable to breathe. You have tried slapping them on the back, with no success. They seem close to losing consciousness, their lips are turning a definite shade of blue. People rarely give first aid a thought, until the day they need it. The above scenario is the sort of every day occurrence that can so easily lead to tragedy. However, with the correct first aid training anyone could, in the short term (until the arrival of the emergency services) save a life. These notes have been designed to aid you with your first aid training. It is, however, not a substitute for hands on training from a professional first aid trainer, but a reference for you to look back on when you need to. We hope th

Word count: 320


In [6]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_NAME)
embed_model = AutoModel.from_pretrained(EMBED_MODEL_NAME).to(device)
embed_model.eval()

print("Embedding model loaded on:", device)

  from .autonotebook import tqdm as notebook_tqdm


Embedding model loaded on: cuda


In [7]:
def embed_texts(texts, batch_size=64):
    """
    Embeds a list of texts using BGE-small.
    Returns normalized float32 vectors (cosine-ready).
    """
    all_embeddings = []

    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]

            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            ).to(device)

            outputs = embed_model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0]   # CLS token
            embeddings = F.normalize(embeddings, p=2, dim=1)

            all_embeddings.append(embeddings.cpu())

    return torch.cat(all_embeddings, dim=0).numpy()

In [8]:
import json

with open("data/extracted/chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

texts = [c["text"] for c in chunks]

embeddings = embed_texts(texts)

print("Embeddings shape:", embeddings.shape)

Embeddings shape: (1620, 384)


In [9]:
import faiss
import numpy as np

# Ensure float32 (FAISS requirement)
embeddings_f32 = embeddings.astype("float32")

dim = embeddings_f32.shape[1]

# Inner Product index (cosine similarity because embeddings are normalized)
index = faiss.IndexFlatIP(dim)

index.add(embeddings_f32)

print("FAISS index built")
print("Total vectors in index:", index.ntotal)

FAISS index built
Total vectors in index: 1620


In [10]:
def retrieve(query, k=5):
    q_emb = embed_texts([query]).astype("float32")
    scores, indices = index.search(q_emb, k)
    return [chunks[i] for i in indices[0]]

# Test query
results = retrieve("Someone is choking and cannot breathe", k=5)

for r in results:
    print(r["chunk_id"])
    print(r["text"][:200], "\n")

IndianFAmanual.pdf_52
coughing. Do not do anything else, but stay with the person until he breathes normally again. C.5.2.1.2 WHAT DO I DO IF THE PERSON CANNOT SPEAK, COUGH OR BREATHE? 1. Stand to the side and a little beh 

IndianFAmanual.pdf_51
hanging to a healt hcare facility. C.5 CHOKING When a person is having severe difficulty in breathing because of an obstructed airway or lack of air, he is choking. Coughing is the natural way of clea 

IFRCmanual.pdf_276
breathe Severe choking â€¢ Unable to cough, speak, cry or breathe â€¢ Clutches the throat with one or both hands â€¢ Panic â€¢ Bluish colour to the skin of the lips, ears, fingers and toes â€¢ Becomes unrespons 

StJohnFA.pdf_42
casualty is breathing normally, place them in the recovery position (see pages 21-22) h If the casualty is not breathing normally, perform CPR (see pages 23-28) 2. Dial 999 or 112 for an ambulance 3.  

canadianRCFA.pdf_68
Choking4 partial choking, because it depends on creating pressure behind the bl

In [30]:
def build_prompt(query, retrieved_chunks, max_context_chars=1200):
    context_parts = []
    total_len = 0

    for c in retrieved_chunks:
        text = c["text"].strip()

        if len(text) > 500:
            text = text[:500]

        part = text + "\n"
        if total_len + len(part) > max_context_chars:
            break

        context_parts.append(part)
        total_len += len(part)

    context = "\n".join(context_parts)

    prompt = f"""### SYSTEM
You are a first-aid reference assistant.
Follow the rules strictly.

Rules:
- Use ONLY the context.
- Write numbered steps.
- Use simple language.
- Do NOT repeat instructions.
- Do NOT include sources.
- Stop when finished.
- If the person becomes unconscious, advise calling emergency services.

### CONTEXT
{context}

### QUESTION
{query}

### ANSWER
"""
    return prompt

In [12]:
test_chunks = retrieve("Someone is choking and cannot breathe", k=3)
prompt = build_prompt("Someone is choking and cannot breathe", test_chunks)

print("Prompt length (characters):", len(prompt))
print("\n--- PROMPT PREVIEW ---\n")
print(prompt[:1500])

Prompt length (characters): 1898

--- PROMPT PREVIEW ---

You are a first-aid reference assistant.

Use ONLY the information in the context below.
Write clear, numbered first-aid steps.
Use simple language.
Do NOT diagnose medical conditions.

Context:
[IndianFAmanual.pdf]
coughing. Do not do anything else, but stay with the person until he breathes normally again. C.5.2.1.2 WHAT DO I DO IF THE PERSON CANNOT SPEAK, COUGH OR BREATHE? 1. Stand to the side and a little behind the choking person or child ( aged older than one year). 2. Support the personâ€™s chest with one hand and bend him forward. 3. Give five firm blows between the personâ€™s shoulder blades. To do so, use the heel of your free hand. Verify if the object has come out and the person can breathe again. C.5.2.1.3 WHAT DO I DO IF THE OBJECT DID NOT COME OUT AND THE PERSON IS STILL CHOKING? 1. Stand behind the choking person and put both hands around him, so your hands meet in front of the person. 2. Make a fist and place it

In [21]:
from llama_cpp import Llama

llm = Llama(
    model_path=r"C:\Projects\firstAidChatbot\models\Phi-3-mini-4k-instruct-q4.gguf",

    # Context
    n_ctx=2048,

    # CPU threading (CRITICAL)
    n_threads=24,          # physical cores
    n_threads_batch=24,    # prompt evaluation

    # Batch size (CRITICAL)
    n_batch=1024,          # fast prompt ingestion

    # Memory optimizations
    use_mmap=True,
    use_mlock=True,

    # Generation behavior (first-aid safe)
    temperature=0.0,
    top_p=1.0,
    repeat_penalty=1.08,

    verbose=False
)

llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


In [27]:
def generate_answer(prompt: str) -> str:
    output = llm(
        prompt,
        max_tokens=120,
        stop=[
            "###",
            "<|assistant|>",
            "<|user|>"
        ]
    )
    return output["choices"][0]["text"].strip()

In [15]:
def rag_answer(query, top_k=3):
    retrieved = retrieve(query, k=top_k)
    prompt = build_prompt(query, retrieved)
    answer = generate_answer(prompt)
    return answer, [(c["chunk_id"], c["source"]) for c in retrieved]

In [31]:
import time

query = "Someone is choking and cannot breathe. What should I do?"

start = time.time()
answer, sources = rag_answer(query)
elapsed = time.time() - start

print("Response time:", round(elapsed, 2), "seconds\n")
print("ANSWER:\n", answer)
print("\nSOURCES:")
for s in sources:
    print("-", s[0])

Response time: 6.2 seconds

ANSWER:
 1. Stand to the side and slightly behind the choking person.

2. Support the person's chest with one hand and bend them forward.

3. Give five firm blows between the person's shoulder blades.

4. Verify if the obstruction has been cleared and the person can breathe.

5. If the person does not become unresponsive, continue to monitor them.

If the person becomes unresponsive, advise calling emergency services.

SOURCES:
- IndianFAmanual.pdf_52
- IFRCmanual.pdf_276
- IndianFAmanual.pdf_53


In [39]:
import gradio as gr

# -------------------------------
# STREAMING GENERATION
# -------------------------------

def generate_answer_stream(prompt: str):
    stream = llm(
        prompt,
        max_tokens=150,
        stop=["###", "<|assistant|>", "<|user|>"],
        stream=True
    )

    partial = ""
    for chunk in stream:
        token = chunk["choices"][0]["text"]
        partial += token
        yield partial


def rag_answer_stream(query):
    retrieved = retrieve(query, k=3)
    prompt = build_prompt(query, retrieved)
    return generate_answer_stream(prompt)


# -------------------------------
# GRADIO CHAT STREAM FUNCTION
# -------------------------------

def chat_stream(user_message, history):
    if history is None:
        history = []

    # append user message
    history.append({
        "role": "user",
        "content": user_message
    })

    # append empty assistant message
    history.append({
        "role": "assistant",
        "content": ""
    })

    stream = rag_answer_stream(user_message)

    for partial in stream:
        history[-1]["content"] = partial
        yield history


# -------------------------------
# GRADIO UI
# -------------------------------

with gr.Blocks(title="First Aid Assistant") as demo:
    gr.Markdown("## ðŸ©º First Aid Assistant")
    gr.Markdown(
        "Fast, offline, retrieval-augmented first-aid guidance. "
        "This tool does not replace professional medical care."
    )

    chatbot = gr.Chatbot(height=420)

    msg = gr.Textbox(
        placeholder="Ask a first aid questionâ€¦",
        show_label=False
    )

    msg.submit(
        chat_stream,
        inputs=[msg, chatbot],
        outputs=chatbot,
        queue=True
    )

    msg.submit(lambda: "", None, msg)

demo.launch()

* Running on local URL:  http://127.0.0.1:7864
* To create a public link, set `share=True` in `launch()`.


