In [None]:
!pip install pinecone-client
!pip install pinecone sentence-transformers datasets google-generativeai pandas tqdm

Collecting pinecone-client
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-6.0.0-py3-none-any.whl (6.7 kB)
Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-client
Successfully installed pinecone-client-6.0.0 pinecone-plugin-interface-0.0.7
Collecting pinecone
  Downloading pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloa

In [None]:
# ✅ Only for re-running securely

from pinecone import Pinecone, ServerlessSpec
import os
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from datasets import load_dataset
from google.colab import drive


# Mount Google Drive at /content/drive so the key files below become readable.
drive.mount('/content/drive')


# API keys are kept in plain-text files on Drive instead of being written into the notebook.
pinecone_key_path = "/content/drive/My Drive/keys/pinecone_key.txt"
gemini_key_path = "/content/drive/My Drive/keys/gemini_key.txt"


with open(pinecone_key_path, "r") as f:
    # strip() removes surrounding whitespace such as a trailing newline in the key file.
    PINECONE_API_KEY = f.read().strip()

with open(gemini_key_path, "r") as f:
    GEMINI_API_KEY = f.read().strip()


# Pinecone client plus a handle to the index used by the upsert and query cells.
# NOTE(review): nothing in this cell creates the index — presumably "umls-ner-rag"
# already exists in the Pinecone project; confirm before a first run.
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "umls-ner-rag"
index = pc.Index(index_name)


# Register the Gemini key with the google.generativeai SDK.
genai.configure(api_key=GEMINI_API_KEY)


# Sentence-embedding model used to encode both the UMLS documents and the search queries.
# NOTE(review): the index dimension must match this model's embedding size — verify.
model = SentenceTransformer("all-MiniLM-L6-v2")



In [None]:
# import gzip

# # Load side effects
# with gzip.open("/content/sider_data/meddra_all_se.tsv.gz", 'rt') as f:
#     se_df = pd.read_csv(f, sep='\t', header=None, names=['stitch_id', 'umls_id', 'meddra_type', 'side_effect'])

# # Load indications
# with gzip.open("/content/sider_data/meddra_all_indications.tsv.gz", 'rt') as f:
#     ind_df = pd.read_csv(f, sep='\t', header=None, names=['stitch_id', 'umls_id', 'meddra_type', 'indication'])

# # Preview
# se_df.head()


In [None]:
# ind_df.head()

In [None]:
# with gzip.open("/content/sider_data/meddra.tsv.gz", 'rt') as f:
#     meddra_lookup = pd.read_csv(f, sep='\t', header=None, names=['umls_id', 'label'])

# # Convert to dict for fast lookup
# umls_to_label = dict(zip(meddra_lookup.umls_id, meddra_lookup.label))

# # Preview few mappings
# list(umls_to_label.items())[:5]


In [None]:
# from datasets import load_dataset

# # Load UMLS dataset from Hugging Face
# umls_dataset = load_dataset("adlbh/umls-concepts", split="train")

# # Build CUI → name dictionary
# cui_to_name = {}
# for item in umls_dataset:
#     cui = item.get("ENTITY", "").strip()
#     name = item.get("NAME", "").strip()
#     if cui and name:
#         cui_to_name[cui] = name

# # Preview
# list(cui_to_name.items())[:5]


In [None]:
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from datasets import load_dataset

# Fetch the "train" split of the UMLS concepts dataset.
umls_dataset = load_dataset(
    path="adlbh/umls-concepts",
    split="train",
)

# Materialize the dataset as a pandas DataFrame for the document-building step.
umls_df = pd.DataFrame(data=umls_dataset)

# Cell output: the first five rows.
umls_df.head(n=5)


Unnamed: 0,ENTITY,DEFINITION,ALIASES,NAME
0,C0003725,Arthropod-borne viruses. A non-taxonomic desig...,Arbovirus (navigational concept)|arbovirus|Arb...,Arboviruses
1,C0039258,,Tahyna virus (organism)|Tahyna virus,Tahyna virus
2,C0318627,,Eyach virus|Eyach virus (organism),Eyach virus
3,C0012634,A definite pathologic process with a character...,Disease|Clinical disease or syndrome|Clinical ...,Disease
4,C0042776,Minute infectious agents whose genomes are com...,"Virus|Virus, NOS|Viruses, General|Virus (organ...",Virus


In [None]:
def _clean_text(value):
    """Return ``value`` as a stripped string; None, NaN and pd.NA become ""."""
    if value is None or value is pd.NA:
        return ""
    if isinstance(value, float) and pd.isna(value):
        # str(nan) would otherwise leak the literal text "nan" into the documents.
        return ""
    return str(value).strip()


def _parse_aliases(raw_aliases):
    """Normalize an ALIASES value (list or pipe-separated string) to a list of names."""
    if isinstance(raw_aliases, str):
        # A string without "|" is a single alias, not "no aliases".
        candidates = raw_aliases.split("|")
    elif isinstance(raw_aliases, (list, tuple)):
        candidates = raw_aliases
    else:
        candidates = []  # None / NaN / unexpected type
    cleaned = (_clean_text(alias) for alias in candidates)
    return [alias for alias in cleaned if alias]


def build_umls_docs(umls_frame):
    """Turn UMLS concept rows into ``{"id", "text"}`` documents ready for embedding.

    Args:
        umls_frame: DataFrame read through its ENTITY, NAME, DEFINITION and ALIASES
            columns; a missing column is treated like a missing value.

    Returns:
        A list of dicts. ``id`` is ``"umls_<CUI>"``. ``text`` is an optional
        ``"Aliases: a,b,c"`` line followed by ``"<name>: <definition>"``, or just
        the name when there is no definition. Rows lacking a CUI or a name are skipped.
    """
    docs = []
    # itertuples avoids building a Series per row, which iterrows does.
    for row in umls_frame.itertuples(index=False):
        cui = _clean_text(getattr(row, "ENTITY", None))
        name = _clean_text(getattr(row, "NAME", None))
        definition = _clean_text(getattr(row, "DEFINITION", None))
        aliases = _parse_aliases(getattr(row, "ALIASES", None))

        # No CUI would collapse the id to "umls_"; no name leaves nothing to embed.
        if not (cui and name):
            continue

        alias_text = f"Aliases: {','.join(aliases)}\n" if aliases else ""
        body_text = f"{name}: {definition}" if definition else name

        docs.append({
            "id": f"umls_{cui}",
            "text": alias_text + body_text
        })
    return docs


umls_docs = build_umls_docs(umls_df)

print(f"✅ Prepared {len(umls_docs)} UMLS documents with aliases and fallback for missing definitions.")


✅ Prepared 474872 UMLS documents with aliases and fallback for missing definitions.


In [None]:
# test_docs = umls_docs[:500]
# batch_size = 100  # safe for Pinecone + Colab memory

# for i in tqdm(range(0, len(test_docs), batch_size)):
#     batch = test_docs[i:i+batch_size]
#     ids = [doc["id"] for doc in batch]
#     texts = [doc["text"] for doc in batch]
#     embeddings = model.encode(texts, show_progress_bar=False).tolist()
#     pinecone_vectors = list(zip(ids, embeddings, batch))
#     index.upsert(vectors=pinecone_vectors, namespace="umls")

100%|██████████| 5/5 [00:04<00:00,  1.10it/s]


In [None]:
batch_size = 100  # safe for Pinecone + Colab memory

# Walk the corpus in fixed-size slices: embed each slice, then upsert it into the
# "umls" namespace.
for start in tqdm(range(0, len(umls_docs), batch_size)):
    chunk = umls_docs[start:start + batch_size]
    vectors = model.encode(
        [doc["text"] for doc in chunk],
        show_progress_bar=False,
    ).tolist()
    # Each record is (id, embedding, metadata). The whole document dict is stored as
    # metadata, which is what makes match["metadata"]["text"] available at query time.
    records = [(doc["id"], vector, doc) for doc, vector in zip(chunk, vectors)]
    index.upsert(vectors=records, namespace="umls")

100%|██████████| 4749/4749 [39:31<00:00,  2.00it/s]


In [None]:
# ✅ Only for re-running securely

from pinecone import Pinecone, ServerlessSpec
import os
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from datasets import load_dataset
from google.colab import drive


# Mount Google Drive at /content/drive so the key files below become readable.
drive.mount('/content/drive')


# API keys are kept in plain-text files on Drive instead of being written into the notebook.
pinecone_key_path = "/content/drive/My Drive/keys/pinecone_key.txt"
gemini_key_path = "/content/drive/My Drive/keys/gemini_key.txt"


with open(pinecone_key_path, "r") as f:
    # strip() removes surrounding whitespace such as a trailing newline in the key file.
    PINECONE_API_KEY = f.read().strip()

with open(gemini_key_path, "r") as f:
    GEMINI_API_KEY = f.read().strip()


# Pinecone client plus a handle to the index queried by the search functions below.
# NOTE(review): nothing in this cell creates the index — presumably "umls-ner-rag"
# already exists in the Pinecone project; confirm before a first run.
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "umls-ner-rag"
index = pc.Index(index_name)


# Register the Gemini key with the google.generativeai SDK.
genai.configure(api_key=GEMINI_API_KEY)


# Sentence-embedding model used to encode the search queries.
# NOTE(review): the index dimension must match this model's embedding size — verify.
model = SentenceTransformer("all-MiniLM-L6-v2")



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def query_umls_rag_filtered(term, top_k=50, print_top_n=50):
    """Print UMLS matches for ``term`` whose text literally contains the query.

    The query is embedded with the notebook-level ``model`` and sent to the
    notebook-level ``index`` ("umls" namespace). A match is kept only when the
    whole query string occurs, case-insensitively, as a substring of its
    metadata text.

    Args:
        term: Query string.
        top_k: Number of nearest neighbours requested from the index.
        print_top_n: Maximum number of kept matches to print.

    Returns:
        None. Results are printed.
    """
    query_vector = model.encode([term])[0].tolist()
    response = index.query(
        vector=query_vector,
        top_k=top_k,
        namespace="umls",
        include_metadata=True,
    )

    # Case-insensitive substring test of the whole query against each match text.
    needle = term.lower()
    hits = []
    for candidate in response["matches"]:
        if needle in candidate["metadata"]["text"].lower():
            hits.append(candidate)

    print(f"🔍 Query: {term}")
    print(f"✅ Showing {min(len(hits), print_top_n)} of {len(hits)} filtered results:\n")

    for rank, hit in enumerate(hits[:print_top_n], start=1):
        print(f"Result {rank} (Score: {hit['score']:.4f}):")
        print(hit["metadata"]["text"])
        print("-" * 100)

# Example: look up a combination-drug name. Only matches whose text contains the
# full query string (case-insensitive) are printed.
query_umls_rag_filtered("Aspirin / Dipyridamole")


🔍 Query: Aspirin / Dipyridamole
✅ Showing 1 of 1 filtered results:

Result 1 (Score: 0.6757):
Aliases: Aspirin- and dipyridamole-containing product,Combination, Aspirin-Dipyridamole Drug,Product containing aspirin and dipyridamole (medicinal product),Drug Combination, Aspirin-Dipyridamole,Aspirin, Dipyridamole Drug Combination,Aspirin/ Dipyridamole,ASPIRIN/DIPYRIDAMOLE,Aspirin-Dipyridamole Drug Combination,Aspirin Dipyridamole Drug Combination
Aspirin / Dipyridamole: A drug combination of aspirin and dipyridamole that functions as a PLATELET AGGREGATION INHIBITOR, used to prevent THROMBOSIS and STROKE in TRANSIENT ISCHEMIC ATTACK patients.
----------------------------------------------------------------------------------------------------


In [None]:
import re

def query_umls_hybrid(term, top_k=1000, return_top_n=1000):
    """Vector search over the "umls" namespace, re-ordered by keyword overlap.

    The query is embedded with the notebook-level ``model`` and sent to the
    notebook-level ``index``. Returned matches are sorted by how many distinct
    word tokens of ``term`` also occur in their metadata text (highest first;
    ties keep the order the index returned). Matches sharing no token with the
    query are never printed.

    Args:
        term: Query string.
        top_k: Number of nearest neighbours requested from the index.
        return_top_n: Maximum number of matches to print.

    Returns:
        None. Results are printed.
    """
    query_vector = model.encode([term])[0].tolist()
    response = index.query(
        vector=query_vector,
        top_k=top_k,
        namespace="umls",
        include_metadata=True,
    )
    candidates = response["matches"]

    wanted = set(re.findall(r'\w+', term.lower()))

    def shared_token_count(text):
        return len(wanted & set(re.findall(r'\w+', text.lower())))

    # Score every candidate once; list.sort is stable, so equal scores keep index order.
    scored = [(shared_token_count(c["metadata"]["text"]), c) for c in candidates]
    scored.sort(key=lambda pair: pair[0], reverse=True)

    print(f"🔍 Hybrid Search for: '{term}'\n")
    printed = 0
    for overlap, candidate in scored:
        if overlap <= 0 or printed >= return_top_n:
            continue
        print(candidate["metadata"]["text"])
        printed += 1

    if printed == 0:
        print("❌ No relevant keyword-containing matches found.")


In [None]:
# Abbreviation lookup: token matching is case-insensitive, so every printed entry
# shares the word token "asa" with the query.
query_umls_hybrid("ASA")

🔍 Hybrid Search for: 'ASA'

ASA intolerant asthma
Aliases: Asa foetida,asafoetida,Asa-foetida,Ferula assa-foetida,Asafetidas,Asafoetida,Asafetida,asafetida,Asa-foetidas,Asafoetidas
Asafoetida
Aliases: ACETYLSALICYLIC ACID INGESTION,ASA INGESTION
ASPIRIN INGESTION
Aliases: ASA-6,Cholan-24-amine, N,N-dimethyl-, (5beta)-
25-azacoprostane
Aliases: American Society of Anesthesiologists physical status class 3 (finding),ASA physical status class 3
American Society of Anesthesiologists physical status class 3


In [None]:
import re

def query_umls_hybrid_inline(term, top_k=1000):
    """Print, as one Python list, the UMLS texts that share a word with ``term``.

    The query is embedded with the notebook-level ``model`` and sent to the
    notebook-level ``index`` ("umls" namespace). Matches are ordered by the
    number of distinct query tokens found in their metadata text (highest
    first; ties keep the order the index returned), and matches sharing no
    token are dropped.

    Args:
        term: Query string.
        top_k: Number of nearest neighbours requested from the index.

    Returns:
        None. Either the list of texts or a "no matches" notice is printed.
    """
    query_vector = model.encode([term])[0].tolist()
    candidates = index.query(
        vector=query_vector,
        top_k=top_k,
        namespace="umls",
        include_metadata=True,
    )["matches"]

    wanted = set(re.findall(r'\w+', term.lower()))

    def shared_token_count(text):
        return len(wanted & set(re.findall(r'\w+', text.lower())))

    # Score every candidate once; list.sort is stable, so equal scores keep index order.
    scored = [(shared_token_count(c["metadata"]["text"]), c) for c in candidates]
    scored.sort(key=lambda pair: pair[0], reverse=True)

    texts = [f"{c['metadata']['text']}" for overlap, c in scored if overlap > 0]

    if not texts:
        print("❌ No relevant keyword-containing matches found.")
        return
    print(texts)


In [None]:
# Same keyword-overlap ranking as query_umls_hybrid, but the matching texts are
# printed as a single Python list instead of one entry per line.
query_umls_hybrid_inline("Aspirin")

['Aspirin therapy', 'aspirin free', 'Aspirin 60 MG', 'Aspirin 500 MG', 'Aspirin 200 MG', 'Aspirin 600 MG', 'Aspirin / butalbital', 'Aspirin 250 MG', 'Aspirin 650 MG', 'Aspirin 1000 MG', 'Aspirin 25 MG', 'Aspirin 150 MG', 'Aspirin 75 MG', 'Aspirin 300 MG', 'Aspirin 162 MG', 'aspirin intolerance', 'Aspirin 227 MG', 'Aspirin adverse reaction', 'ASPIRIN PWDR', 'Aspirin prophylaxis', 'Aspirin 325 MG', 'Aliases: aspirin coating,aspirin coated\ncoated aspirin', 'aspirin sensitivity', 'aspirin eugenol ester', 'aspirin asthma', 'Bayer Plus Aspirin Tablets', 'Aliases: non-asprin,non aspirin,non-aspirin\nNon-Aspirin', 'Aspirin / Calcium Carbonate Oral Product', 'Aliases: Aspirin desensitization therapy (regime/therapy),Aspirin desensitisation therapy\nAspirin desensitization therapy']


In [None]:
from google.colab import drive
import google.generativeai as genai

# Mount Google Drive so the key file below can be read.
drive.mount('/content/drive')

# NOTE(review): this path ("GenAI_keys/gemini_api.txt") differs from the Gemini key path
# used by the earlier setup cells ("keys/gemini_key.txt") — confirm which file is current.
key_path = "/content/drive/My Drive/GenAI_keys/gemini_api.txt"


with open(key_path, "r") as f:
    # strip() removes surrounding whitespace such as a trailing newline in the key file.
    GEMINI_API_KEY = f.read().strip()

# Re-registers the key with the SDK, replacing whatever an earlier cell configured.
genai.configure(api_key=GEMINI_API_KEY)

# Model handle that enhance_ner_output() calls via gemini.generate_content(...).
gemini = genai.GenerativeModel("gemini-1.5-flash")




In [None]:
import re
def _umls_context_for_entity(entity, top_k):
    """Return newline-joined UMLS texts for ``entity``, highest keyword overlap first.

    The entity is embedded with the notebook-level ``model`` and looked up in the
    "umls" namespace of the notebook-level ``index``. Only matches sharing at least
    one word token with the entity are kept; ties keep the order the index returned.
    Returns "" when nothing qualifies.
    """
    emb = model.encode([entity])[0].tolist()
    matches = index.query(
        vector=emb,
        top_k=top_k,
        namespace="umls",
        include_metadata=True
    )["matches"]

    query_tokens = set(re.findall(r'\w+', entity.lower()))

    # Score each match once, drop zero-overlap matches, then stable-sort by overlap.
    scored = []
    for match in matches:
        text = match["metadata"]["text"]
        overlap = len(set(re.findall(r'\w+', text.lower())) & query_tokens)
        if overlap > 0:
            scored.append((overlap, text))
    scored.sort(key=lambda pair: pair[0], reverse=True)

    return "\n".join(text for _, text in scored)


def enhance_ner_output(ner_input_dict, top_k=100):
    """Ask Gemini to expand and normalize NER entities, grounded in UMLS context.

    For every entity, related UMLS texts are retrieved (see
    ``_umls_context_for_entity``) and placed in the prompt as knowledge context.
    Relies on the notebook-level ``model``, ``index`` and ``gemini`` objects.

    Args:
        ner_input_dict: Mapping of NER label -> list of entity strings. A label
            whose value is None or empty contributes no entities.
        top_k: Number of nearest neighbours requested from the index per entity.

    Returns:
        The model's reply as text (``response.text``); the reply may be wrapped in
        a Markdown code fence. When the input has no entities, no retrieval or
        Gemini call is made and a JSON string mapping each input label to an
        empty list is returned.
    """
    import json  # local import: only needed for the empty-input result

    entity_list = [
        (label, entity)
        for label, entities in ner_input_dict.items()
        for entity in (entities or [])
    ]

    if not entity_list:
        # Nothing to enhance: sending Gemini a prompt with no entities would only
        # invite invented output.
        return json.dumps({label: [] for label in ner_input_dict})

    all_context_blocks = []
    for label, entity in entity_list:
        context = _umls_context_for_entity(entity, top_k)
        if context:
            all_context_blocks.append(f"Context for '{entity}' in category '{label}':\n{context}\n")

    # Gemini Prompt
    full_context = "\n\n".join(all_context_blocks)
    entity_string = ", ".join([f"{label}: {ent}" for label, ent in entity_list])

    prompt = f"""
You are a clinical AI assistant.

You are given noisy NER output from a medical note with categories and entities.
Some may be abbreviated, misspelled, or unclear.

Entities:
{entity_string}

Knowledge Context:
{full_context}

Instructions:
- Enhance each entity.
- extract the abbreviations and its fullform. Replace the abbrevation with fullform
  For example ASA is Aspirin
- Normalize drug, symptom, or disease names using the context.
- Keep structure by returning enhanced entities grouped under original labels.

Return JSON like:
{{
  "Symptoms": ["Shortness of breath", "Fever"],
  "Diagnosis": ["Aspirin"]
}}
"""

    response = gemini.generate_content(prompt)
    return response.text


In [None]:
# Smoke test: one abbreviation ("SOB") and two free-text entities, grouped by NER label.
test_ner = {
    "Symptoms": ["SOB", "weakness in wrist flexion"],
    "Diagnosis": ["Dermatome of C3"]
}

# enhance_ner_output returns the model's reply as text, so it is printed as-is.
print(enhance_ner_output(test_ner))

```json
{
  "Symptoms": ["Shortness of breath", "Wrist flexion weakness"],
  "Diagnosis": ["Dermatome of the third cervical nerve"]
}
```

