In [83]:
import pandas as pd
import spacy
from spacy.lang.en import English
import medspacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import gradio as gr
import re

try:
    df_icd = pd.read_csv('d_icd_diagnoses.csv')
    # Filter for ICD-10 diagnoses for this example
    df_icd_10 = df_icd[df_icd['icd_version'] == 10]
    print("Successfully loaded ICD diagnoses data.")
except FileNotFoundError:
    print("Error: 'd_icd_diagnoses.csv' not found. Please place it in the same directory.")
    exit()

vague_diagnoses = [
    "pneumonia",
    "heart failure",
    "sepsis",
    "acute kidney injury"
]

mock_data = {
    'note_id': [1, 2, 3, 4, 5, 6],
    'text': [
        "Patient admitted with pneumonia. No pathogen specified.",
        "Patient has systolic heart failure.",
        "Patient presents with septic shock due to E. coli.",
        "Patient with fever and cough, rule out pneumonia.",
        "Patient has chronic systolic heart failure, New York Heart Association Class II.",
        "Discharge summary: Patient was treated for acute kidney injury."
    ],
    'label': [1, 0, 0, 1, 0, 1]  # 1 = needs query, 0 = complete
}
df_notes = pd.DataFrame(mock_data)

nlp = medspacy.load()

icd_diagnosis_list = df_icd_10['long_title'].str.lower().tolist()

def preprocess_text(text):
    text = text.lower()
    # Replace newlines and extra spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # You can add more complex cleaning here if needed
    return text

df_notes['processed_text'] = df_notes['text'].apply(preprocess_text)

vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X = vectorizer.fit_transform(df_notes['processed_text'])
y = df_notes['label']

model = LogisticRegression()
model.fit(X, y)
print("Machine learning model trained successfully.")

def cdi_assistant_logic(note_text):
    """
    Combines rule-based and machine learning approaches to suggest CDI queries.
    """

    clean_text = preprocess_text(note_text)

    for term in vague_diagnoses:
        if term in clean_text:
            # If the vague term is present, check if a more specific ICD code title is also present.
            if not any(specific_term in clean_text for specific_term in icd_diagnoses_list if term in specific_term):
                return f"‚ö†Ô∏è **Query Needed:** The documentation mentions '{term}' without a more specific diagnosis. Please clarify."

            doc = nlp(note_text)
    for ent in doc.ents:
        # Check if a diagnosis is negated
        if ent.label_ in ("DIAGNOSIS", "PROBLEM") and ent._.is_negated:
            return f"‚úÖ **Documentation Appears Complete:** A diagnosis of '{ent.text}' was mentioned but explicitly negated. No query needed."
    vectorized_text = vectorizer.transform([clean_text])
    prediction = model.predict(vectorized_text)[0]

    if prediction == 1:
        return "‚ö†Ô∏è **Query Needed (ML Suggestion):** The model suggests this note may be incomplete or vague. Consider a query."
    else:
        return "‚úÖ **Documentation Appears Complete:** The note is likely specific and complete."

iface = gr.Interface(
    fn=cdi_assistant_logic,
    inputs=gr.Textbox(
        label="Enter a Clinical Note",
        lines=10,
        placeholder="e.g., Patient admitted for heart failure. "
    ),
    outputs=gr.Markdown(),
    title="AI-Powered CDI Assistant ü©∫",
    description="This tool analyzes a clinical note to identify potential documentation gaps that may require a CDI query. It uses a combination of rule-based logic (leveraging ICD codes) and a machine learning model."
)


if __name__ == "__main__":
    iface.launch('share=True')

Successfully loaded ICD diagnoses data.
Machine learning model trained successfully.
* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.
