<a href="https://colab.research.google.com/github/Vinaydubey79/MedDRA-event-coding-and-query/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# -----------------------------------------
# AE Coder Prototype - Complete and Fixed
# Narrative Weighting + Site Queries + Error Handling
# Colab Ready (Single Cell)
# -----------------------------------------

# Step 1: Install dependencies
!pip install -q gradio pandas rapidfuzz sentence-transformers

# Step 2: Write dummy dictionary CSV (clean)
# Columns: id, llt_name (lowest-level term), pt_name (preferred term),
# soc_name (system organ class), is_current (flag parsed when the file is loaded).
dummy_dict_content = """id,llt_name,pt_name,soc_name,is_current
1001,Nausea,Nausea and vomiting,Gastrointestinal disorders,True
1002,Vomiting,Nausea and vomiting,Gastrointestinal disorders,True
1003,Diarrhoea,Diarrhoea,Gastrointestinal disorders,True
1004,Chest pain,Chest pain,Cardiac disorders,True
1005,Myocardial infarction,Myocardial infarction,Cardiac disorders,True
1006,Rash,Rash,Skin and subcutaneous tissue disorders,True
1007,Generalized rash,Rash,Skin and subcutaneous tissue disorders,True
1008,Itchy rash,Pruritus,Skin and subcutaneous tissue disorders,True
1009,Wrist fracture,Fractures,Injury poisoning and procedural complications,True
1010,Fall,Fall,Injury poisoning and procedural complications,True
1011,Headache,Headache,Nervous system disorders,True
1012,Fever,Pyrexia,General disorders and administration site conditions,True
"""

# Pin the encoding so the file does not depend on the platform's locale
# default; pandas reads CSV files as UTF-8 unless told otherwise.
with open("dummy_dict.csv", "w", encoding="utf-8") as f:
    f.write(dummy_dict_content)

# Step 3: Import libraries and load model
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
from rapidfuzz import fuzz, process
import re
import numpy as np

# Load the dictionary; dtype=str keeps every column (ids included) as text.
df = pd.read_csv("dummy_dict.csv", dtype=str)

# Turn the textual flag into a real boolean. A missing flag counts as current,
# and surrounding whitespace (e.g. "True ") no longer flips a term to False.
df["is_current"] = (
    df["is_current"].fillna("True").str.strip().str.lower().isin(["true", "1", "y"])
)

# Clean the dictionary terms with the same rules `normalize` applies to user
# input (lower-case, punctuation -> space, collapse whitespace) so both sides
# of the comparison are in the same form. The rules are inlined because
# `normalize` is defined further down the file and does not exist yet here.
df["norm"] = (
    df["llt_name"]
    .str.lower()
    .str.replace(r"[^a-z0-9\s/+\-]", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

# Embed every dictionary term once up front; vectors are unit-normalised so a
# dot product with another unit-normalised vector is a cosine similarity.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
emb = model.encode(df["norm"].tolist(), normalize_embeddings=True)

def normalize(text):
    """Return *text* lower-cased, stripped of punctuation and extra whitespace.

    Anything other than ``a-z``, digits, whitespace, ``/``, ``+`` and ``-`` is
    replaced by a space; runs of whitespace then collapse to a single space and
    the ends are trimmed. A falsy *text* (``None``, ``""``) yields ``""``.
    """
    lowered = "" if not text else text.lower()
    cleaned = re.sub(r"[^a-z0-9\s/+\-]", " ", lowered)
    return re.sub(r"\s+", " ", cleaned).strip()

def generate_site_queries(verbatim, narrative):
    """Build clarification queries for the site from simple keyword rules.

    Matching is case-insensitive substring matching on the raw texts.

    Args:
        verbatim: Reported event term; a falsy value is treated as empty.
        narrative: Free-text narrative; a falsy value is treated as empty.

    Returns:
        A list of query strings, one per rule that fired, in rule order.
        When no rule fires, a one-element list holding a "no queries" message.
    """
    v = (verbatim or "").lower()
    n = (narrative or "").lower()
    queries = []

    if "myocardial infarction" in n and "chest pain" in v:
        queries.append(
            "Kindly confirm if the verbatim should be updated to 'Myocardial infarction' based on narrative details instead of 'Chest pain'."
        )

    if "sepsis" in n and "fever" in v:
        queries.append(
            "Please confirm if the fever is related to sepsis as described in the narrative."
        )

    # Only ask about the rash extent/location when neither text already says
    # it. The verbatim is checked too: "Generalized rash" answers the question
    # on its own. British spellings are accepted alongside the American ones.
    rash_qualifiers = ("generalized", "generalised", "localized", "localised", "site")
    if "rash" in v and not any(q in v or q in n for q in rash_qualifiers):
        queries.append(
            "Please specify if the rash is localized or generalized, and provide location details."
        )

    return queries or ["No additional queries detected based on input."]

def _narrative_weight(text_v, text_n):
    """Return the share (0..1) of the blended score given to the narrative.

    With both texts present the share grows with narrative length (words / 100)
    and is clamped to [0.3, 0.6]. When one text is empty its scores are all
    zero, so the whole weight goes to the other text instead of being wasted.
    """
    if not text_n:
        return 0.0
    if not text_v:
        return 1.0
    return min(max(len(text_n.split()) / 100, 0.3), 0.6)


def _match_scores(text):
    """Return ``(lexical, semantic)`` score arrays of *text* vs. every dictionary term.

    Each array has one entry per row of ``df``. Lexical scores come from
    RapidFuzz ``token_set_ratio``; semantic scores are the dot product of the
    unit-normalised embeddings scaled by 100. Empty *text* yields zeros.
    """
    if not text:
        return np.zeros(len(df)), np.zeros(len(df))
    lexical = process.cdist(
        [text], df["norm"].tolist(), scorer=fuzz.token_set_ratio
    )[0].astype(float)
    query_vec = np.asarray(model.encode([text], normalize_embeddings=True)).ravel()
    semantic = (emb @ query_vec) * 100.0
    return lexical, semantic


def code(verbatim, narrative):
    """Suggest a dictionary term for an adverse event and list alternates/queries.

    Args:
        verbatim: Reported event term (may be empty or ``None``).
        narrative: Optional free-text narrative (may be empty or ``None``).

    Returns:
        A 3-tuple of strings for the three UI output boxes: the best match with
        its rationale, up to five alternates with a reason each, and the site
        queries joined by newlines. If both inputs are blank, or no current
        term is available, the first string carries a message and the other two
        are empty. On an unexpected error all three strings carry the error text.
    """
    try:
        text_v = normalize(verbatim or "")
        text_n = normalize(narrative or "")
        if not text_v and not text_n:
            return "Please enter verbatim or narrative text.", "", ""

        w_narrative = _narrative_weight(text_v, text_n)
        w_verbatim = 1 - w_narrative

        lex_v, sem_v = _match_scores(text_v)
        lex_n, sem_n = _match_scores(text_n)

        lex_scores = w_verbatim * lex_v + w_narrative * lex_n
        sem_scores = w_verbatim * sem_v + w_narrative * sem_n
        final_score = 0.6 * lex_scores + 0.4 * sem_scores

        # Rank best-first, then keep only terms flagged as current: the
        # rationale below tells the user the selected term is current, so
        # non-current dictionary rows must never be offered.
        ranked = np.argsort(-final_score)
        is_current = df["is_current"].to_numpy(dtype=bool)
        top_idx = ranked[is_current[ranked]][:10]
        if len(top_idx) == 0:
            return "No suitable candidates found.", "", ""

        cands = df.iloc[top_idx].reset_index(drop=True)
        cands["score"] = final_score[top_idx]

        best = cands.iloc[0]
        rationale_best = (
            "This term was selected because it best matches the given verbatim and narrative context "
            "and is currently the preferred coding term."
        )

        # Positions 1..5 of the ranking are the alternates; the reason reports
        # whether wording or meaning contributed the higher blended score.
        alternates_list = []
        for pos, dict_idx in enumerate(top_idx[1:6], start=1):
            r = cands.iloc[pos]
            reason = (
                "The wording closely matches the input."
                if lex_scores[dict_idx] >= sem_scores[dict_idx]
                else "The meaning is closely related to the input narrative."
            )
            alternates_list.append(
                f"- {r['llt_name']} → PT: {r['pt_name']} (Score: {r['score']:.1f})\n  (Reason: {reason})"
            )
        alternates = "\n".join(alternates_list) if alternates_list else "No alternates found."

        site_queries = generate_site_queries(verbatim, narrative)

        return (
            f"{best['llt_name']} → PT: {best['pt_name']} (SOC: {best['soc_name']}) (Score: {best['score']:.1f})\n\n"
            f"Rationale: {rationale_best}",
            alternates,
            "\n".join(site_queries),
        )
    except Exception as e:
        # UI boundary: show the failure in the output boxes rather than letting
        # the callback raise.
        err_msg = f"ERROR: {type(e).__name__}: {e}"
        return err_msg, err_msg, err_msg

# Assemble the demo UI: the two text inputs are passed to `code` in order, and
# the three strings it returns fill the three output boxes in order.
_input_boxes = [
    gr.Textbox(label="Verbatim"),
    gr.Textbox(label="Narrative (optional, can be long)"),
]
_output_boxes = [
    gr.Textbox(label="Best LLT Match with Rationale"),
    gr.Textbox(label="Alternate LLTs with Reasons"),
    gr.Textbox(label="Site Queries / Clarifications"),
]
_demo = gr.Interface(
    fn=code,
    inputs=_input_boxes,
    outputs=_output_boxes,
    title="AE Coder Prototype Demo with Narrative Weighting and Site Queries",
    description="Demo using a dummy dictionary (not MedDRA). Enter verbatim and narrative to get coding suggestions, alternate options, and queries for clarifications.",
)
# share=True requests a temporary public URL in addition to the local server.
_demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cd3bdd21d1ceddf9da.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


