# ===== QUERY CONSTRUCTION =====




In [1]:
# INSTALL AND IMPORT THE DEPENDENCIES

!pip install transformers sentencepiece --quiet


from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import numpy as np
import pandas as pd


# =====  MODELS DEFINITION  =====



In [2]:
# QUERY RE-CONSTRUCTION
generator = pipeline("text2text-generation", model="google/flan-t5-base")

# DECISION MAKER
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# DECOMPOSITION
decomposer = pipeline("text2text-generation", model="google/flan-t5-large")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [3]:
# QUERY RECONSTRUCTION (Correct Chronological Order)

# Simulated chat history
chat_history = [
    {"user": "From when did the patient Allen jeffrey started the home health journey with us?"},
    {"assistant": "Allen Jefrey started his journey on 2nd of February"},
    {"user": "What is the recent visit type he had?"}
]

# Function to build context from recent chat turns (ordered as actual conversation)
def build_context(chat_history, max_turns=2):
    context_turns = []
    count = 0
    # Walk backward and collect full user-assistant pairs
    for i in range(len(chat_history) - 1, -1, -1):
        turn = chat_history[i]
        context_turns.insert(0, turn)  # insert at beginning to preserve order
        if "user" in turn:
            count += 1
        if count == max_turns:
            break
    context = ""
    for turn in context_turns:
        if "user" in turn:
            context += f"User: {turn['user']}\n"
        elif "assistant" in turn:
            context += f"Assistant: {turn['assistant']}\n"
    return context.strip()

# Get context and current user query
context = build_context(chat_history)
user_query = chat_history[-1]["user"]

# Output
print("=== Context ===")
print(context)
print("\n=== Original Query ===")
print(user_query)


=== Context ===
User: From when did the patient Allen jeffrey started the home health journey with us?
Assistant: Allen Jefrey started his journey on 2nd of February
User: What is the recent visit type he had?

=== Original Query ===
What is the recent visit type he had?


# QUERY RE-CONSTRUCTION DECISION

In [4]:
# DECISION MAKER
# Example queries

queries = [
    "What did the nurse do during the last visit?",
    "What did Nurse Thomas provide to patient 1287 on July 18?",
    "Show all lab results from the most recent episode 12235",
    "List physical therapy visits for patient 9021.",
    "Why was this not mentioned that day that this did and this did not?",
]

# Labels
labels = ["vague", "clear"]

# Classify
for q in queries:
    result = classifier(q, labels)
    prediction = result['labels'][0]
    print(f"Query: {q}\n → Label: {prediction}\n")

# THESE MODELS CAN BE FINTUNED ACCORDING TO OUR DATA


Query: What did the nurse do during the last visit?
 → Label: vague

Query: What services did Nurse Thomas provide to patient 1287 on July 18?
 → Label: vague

Query: Show all lab results from the most recent episode 12235
 → Label: clear

Query: List physical therapy visits for patient 9021.
 → Label: clear

Query: Why was this not mentioned that day that this did and this did not?
 → Label: vague



# ===== MCG (MULTI QUERY GENEARATION) =====

In [15]:


# Load model
model_id = "mrm8488/t5-base-finetuned-question-generation-ap"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to("cuda" if torch.cuda.is_available() else "cpu")

# Input query
user_query = "What did the nurse do during the last visit?"

# Create paraphrasing prompt
prompt = f"paraphrase: {user_query} </s>"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# Generate multiple paraphrases
outputs = model.generate(
    input_ids=input_ids,
    do_sample=True,
    top_k=50,
    num_return_sequences=6,
    temperature=0.8,
    max_length=64
)

# Decode
multi_queries = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

print("Multi-Queries:\n")
for q in multi_queries:
    print(f"- {q}")


# Multi-Queries:

# - What activities did the nurse perform during the previous visit?
# - What services were carried out by the nurse during the last home visit?
# - What tasks did the nurse complete in the most recent appointment?


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Multi-Queries:

- question: What did the nurse do during the last visit?
- question: What did the nurse do during her last visit?
- paraphrase: What did the nurse take the time to do for the nurse to leave the hospital?
- question: What did the nurse do during the last visit to the hospital?
- question: When did the nurse give the last visit?
- question: What did the nurse say during the last visit to the practice?


# ===== QUERY DECOMPOSITION =====

In [17]:
# DECOMPOSITION
# NA
# Complex query
# !pip install transformers sentencepiece --quiet
# from transformers import pipeline

# Load FLAN-T5 for instruction-style prompting
# Done already in the model definition section at the top

# Complex query for decomposition
complex_query = "What are the essential 5 health care factors every human must know? Can you specify the factors with reasons why it tops the 5?"

# Instruction prompt
prompt = f"Think you are a general physician and have health care domain knowledge - Break this question and provide the decomposed questions assuming it is addressing to the health care domain specific question:\n{complex_query}"

# Generate output
response = decomposer(prompt, max_length=128, do_sample=False)[0]['generated_text']

# Clean and split output
sub_questions = [line.strip(" -") for line in response.split('\n') if line.strip()]

# Display results
print("Decomposed Sub-Questions:\n")
for q in sub_questions:
    print(f"- {q}")



#     Decomposed Sub-Questions example- User query -> What medications did the patient take, Were there any side effects reported?

# - What medications did the patient take?
# - Were there any side effects reported?



Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Decomposed Sub-Questions:

- Why is this period of patient, a LUPA?


# ===== RAG FUSION ======

In [18]:


# Simulated chunks from Federal_Income_Tax.pdf
chunks = [
    "Federal income tax applies to individuals, corporations, and certain estates and trusts.",
    "The tax system in the U.S. uses a progressive tax rate structure based on income brackets.",
    "Standard deductions and personal exemptions reduce taxable income.",
    "Credits such as the Child Tax Credit directly reduce tax owed.",
    "Capital gains and dividends are taxed at different rates based on holding periods.",
    "Filing status determines standard deduction amounts and income thresholds.",
    "Employers withhold federal taxes from wages and report it via Form W-2.",
    "Self-employed individuals must pay estimated taxes quarterly to the IRS."
]

# Multiple Query Variations (MQG step)
queries = [
    "How does federal income tax affect individuals?",
    "Explain income brackets and tax rates in the U.S.",
    "What deductions and credits are available in federal tax?"
]

# Vectorize the document chunks
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(chunks)

# Initialize fusion score array
fusion_scores = np.zeros(len(chunks))

# Reciprocal Rank Fusion parameters
k = 60
all_rankings = []

# For each query, compute similarity and rank chunks
for query in queries:
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, doc_vectors).flatten()
    ranked_indices = np.argsort(-similarities)  # descending
    all_rankings.append(ranked_indices)

    for rank, idx in enumerate(ranked_indices):
        fusion_scores[idx] += 1 / (rank + k)

# Final fused ranking
final_ranking = np.argsort(-fusion_scores)

# Show ranked results
fusion_df = pd.DataFrame({
    "Rank": range(1, len(chunks) + 1),
    "Chunk": [chunks[i] for i in final_ranking],
    "Fusion Score": fusion_scores[final_ranking].round(4)
})

print(fusion_df.to_string(index=False))


 Rank                                                                                      Chunk  Fusion Score
    1   Federal income tax applies to individuals, corporations, and certain estates and trusts.        0.0495
    2 The tax system in the U.S. uses a progressive tax rate structure based on income brackets.        0.0492
    3                             Credits such as the Child Tax Credit directly reduce tax owed.        0.0489
    4                         Standard deductions and personal exemptions reduce taxable income.        0.0469
    5         Capital gains and dividends are taxed at different rates based on holding periods.        0.0464
    6                   Self-employed individuals must pay estimated taxes quarterly to the IRS.        0.0459
    7                    Employers withhold federal taxes from wages and report it via Form W-2.        0.0459
    8                 Filing status determines standard deduction amounts and income thresholds.        0.0457


# ==== HyDE ====

In [12]:
Hypothetical Document Embedding (HyDE)

# Step 1: Your document chunks (can be from any domain)
document_chunks = [
    "Patients with Type 2 diabetes are usually treated with metformin as a first-line medication.",
    "Informed consent ensures patients understand the risks and benefits of treatment.",
    "Hypertension may lead to heart attack, stroke, and kidney failure.",
    "Statins are commonly prescribed to manage high cholesterol.",
    "Clinical trials test the safety and effectiveness of new medical treatments.",
    "Medical records must be stored securely per HIPAA regulations.",
    "Balanced diet and exercise help manage chronic health issues.",
    "Placebo effect can result in perceived improvement in symptoms."
]

# Step 2: User query
user_query = "What is the role of clinical trials in healthcare?"

# Step 3: Simulate a hypothetical answer (LLM would usually generate this)
hypothetical_answer = "Clinical trials are used to assess the safety and efficacy of new treatments before they are approved for public use."

# Step 4: TF-IDF Embedding for both document chunks and hypothetical answer
vectorizer = TfidfVectorizer()
all_text = document_chunks + [hypothetical_answer]
vectors = vectorizer.fit_transform(all_text)

# Step 5: Similarity search using the hypothetical answer
answer_vec = vectors[-1]
doc_vectors = vectors[:-1]
similarities = cosine_similarity(answer_vec, doc_vectors).flatten()
ranked_indices = np.argsort(-similarities)

# Step 6: Print top results
print("\nHyDE Top Matches (Ranked by Similarity to Hypothetical Answer):\n")
for rank, idx in enumerate(ranked_indices[:5]):
    print(f"{rank + 1}. [{similarities[idx]:.4f}] {document_chunks[idx]}")



HyDE Top Matches (Ranked by Similarity to Hypothetical Answer):

1. [0.4397] Clinical trials test the safety and effectiveness of new medical treatments.
2. [0.1520] Statins are commonly prescribed to manage high cholesterol.
3. [0.1146] Informed consent ensures patients understand the risks and benefits of treatment.
4. [0.0719] Hypertension may lead to heart attack, stroke, and kidney failure.
5. [0.0700] Patients with Type 2 diabetes are usually treated with metformin as a first-line medication.


# ==== HyPE ====

In [13]:
Hypothetical Passage Extraction (HyPE)

# Step 1: Simulated retrieved chunks (multi-sentence text blocks)
retrieved_chunks = [
    "Clinical trials are essential for evaluating medical treatments. They involve testing new drugs on volunteers. Results determine if the treatment is safe.",
    "Informed consent is required before participation. It explains risks and procedures. Patients must understand before signing.",
    "HIPAA regulations demand secure data storage. Medical records must be encrypted and access-controlled. Violations can lead to fines."
]

# Step 2: User query
user_query = "Why are clinical trials important in medicine?"

# Step 3: Apply HYPE — Extract best sentence per chunk
hype_outputs = []
for chunk in retrieved_chunks:
    sentences = chunk.split(". ")
    best_sentence = ""
    best_score = -1

    for sentence in sentences:
        if not sentence.strip():
            continue
        # Use TF-IDF on [query, sentence]
        vectorizer = TfidfVectorizer().fit([user_query, sentence])
        vecs = vectorizer.transform([user_query, sentence])
        score = cosine_similarity(vecs[0], vecs[1])[0][0]

        if score > best_score:
            best_score = score
            best_sentence = sentence.strip()

    hype_outputs.append((best_sentence, best_score))

# Step 4: Display top extracted subpassages
print("\HYPE Results – Best Sentence from Each Retrieved Chunk:\n")
for i, (sentence, score) in enumerate(hype_outputs):
    print(f"{i+1}. [{score:.4f}] {sentence}")


\HYPE Results – Best Sentence from Each Retrieved Chunk:

1. [0.2532] Clinical trials are essential for evaluating medical treatments
2. [0.0000] Informed consent is required before participation
3. [0.0000] HIPAA regulations demand secure data storage
