In [1]:
import pandas as pd

In [2]:
df_val = pd.read_csv('data/combined_insomnia_data_validation.csv')

In [3]:
val_ids = [1280230, 1286616, 1291281, 1293851, 1295870, 1298934, 1300300, 1306672, 1308278, 1313641, 1317793, 1318788, 1330529, 14380, 14435,
       1731, 17455, 2203, 3925, 45066]

In [4]:
# Attach the validation note ids as the leading column of df_val.
# The guard keeps this cell idempotent: DataFrame.insert raises ValueError when
# the column already exists, which is what happens when the cell is re-run.
if 'note_id' not in df_val.columns:
    df_val.insert(0, 'note_id', val_ids)

In [5]:
df = pd.read_csv('data/insomnia_predictions_with_metrics.csv')

In [6]:
# Keep the note text plus the four "* True Text" columns, then prepend the ids.
selected_columns = ['text', 'Definition 1 True Text', 'Definition 2 True Text', 'Rule B True Text', 'Rule C True Text']
df_selected = df.loc[:, selected_columns].copy()
df_selected.insert(loc=0, column='note_ids', value=val_ids)

### Task 2b and 2a - 'Insomnia phrase extraction' followed by 'Multi-label classification'

In [35]:
import requests
import pandas as pd
import re

class VLLMClient:
    """Rule-based insomnia labeller backed by a vLLM completions endpoint.

    Definitions 1 and 2 are decided by asking the model to extract supporting
    phrases from the note; Rules B and C are decided by looking for known
    medication names in the note text.
    """

    # Sentinel that both extraction prompts ask the model to return when
    # nothing in the note matches.
    _NO_MATCH_SENTINEL = "unknown"

    def __init__(self, server_url):
        """Store the server URL and the medication lookup tables.

        :param server_url: Base URL of the vLLM server (no trailing slash);
            requests go to ``{server_url}/v1/completions``.
        """
        self.server_url = server_url
        # Keys are the names searched for in the text; values are the generic
        # name each key maps to (only the keys are used by extract_medications).
        self.primary_medications = {
            "Estazolam": "Estazolam", "Eszopiclone": "Eszopiclone", "Flurazepam": "Flurazepam",
            "Lemborexant": "Lemborexant", "Quazepam": "Quazepam", "Ramelteon": "Ramelteon",
            "Suvorexant": "Suvorexant", "Temazepam": "Temazepam", "Triazolam": "Triazolam",
            "Zaleplon": "Zaleplon", "Zolpidem": "Zolpidem",
            # Brand Names:
            "Ambien": "Zolpidem", "Sonata": "Zaleplon", "Doral": "Quazepam"
        }

        self.secondary_medications = {
            "Acamprosate": "Acamprosate", "Alprazolam": "Alprazolam", "Clonazepam": "Clonazepam",
            "Clonidine": "Clonidine", "Diazepam": "Diazepam", "Diphenhydramine": "Diphenhydramine",
            "Doxepin": "Doxepin", "Gabapentin": "Gabapentin", "Hydroxyzine": "Hydroxyzine",
            "Lorazepam": "Lorazepam", "Melatonin": "Melatonin", "Mirtazapine": "Mirtazapine",
            "Olanzapine": "Olanzapine", "Quetiapine": "Quetiapine", "Trazodone": "Trazodone",
            # Brand Names:
            "Benadryl": "Diphenhydramine", "Restoril": "Temazepam", "Seroquel": "Quetiapine"
        }

    @classmethod
    def _has_evidence(cls, extracted):
        """Return True when a model response contains real extracted text.

        An empty response and the "unknown" sentinel (ignoring case,
        surrounding whitespace, quotes, asterisks and periods) both count as
        "nothing extracted".
        """
        normalized = extracted.strip().strip("\"'`*.").strip().casefold()
        return bool(normalized) and normalized != cls._NO_MATCH_SENTINEL

    def classify(self, text):
        """Classify one clinical note and collect the supporting text.

        :param text: Clinical note to analyse.
        :return: ``(results, extracted_text)``. ``results`` maps each
            definition/rule name to ``"yes"`` or ``"no"``; ``extracted_text``
            maps each ``"... Extracted"`` key to the raw model response or the
            comma-joined medication names ("Rule A Extracted" is always "").
        """
        # Local import so the class does not depend on which cell imported it.
        import warnings

        results = {}  # Store classification labels (yes/no)
        extracted_text = {
            "Definition 1 Extracted": "", "Definition 2 Extracted": "",
            "Rule A Extracted": "", "Rule B Extracted": "", "Rule C Extracted": ""
        }

        def evaluate_text(prompt):
            """Sends prompt to the model & retrieves extracted text."""
            payload = {
                "model": "google/gemma-2-2b-it", # "TechxGenus/Meta-Llama-3-8B-Instruct-AWQ"
                "prompt": prompt,
                "max_tokens": 50,
                "temperature": 0.0,
                "top_p": 0.9,
                "stop": ["Answer:"],
            }

            response = requests.post(f"{self.server_url}/v1/completions", json=payload)

            if response.status_code == 200:
                result = response.json()
                return result["choices"][0]["text"].strip()
            # Keep the best-effort contract (empty extraction) but make the
            # failure visible instead of silently producing a "no" label.
            warnings.warn(
                f"vLLM request failed with status {response.status_code}; "
                "treating the extraction as empty",
                RuntimeWarning,
                stacklevel=2,
            )
            return ""

        # Extract text for Definition 1 (Sleep Difficulty)
        prompt_def1 = f"""
        Identify phrases in the following clinical text that indicate sleep difficulty at night, such as trouble initiating sleep, maintaining sleep, waking up earlier than desired, and explicitly mentioning insomnia.
        Extract only relevant text that matches the above conditions, and do not give anything extra. There is no need for explanations. 
        If nothing matches, please return "unknown" and do not extract anything other than **insomnia indications during the night**.

        Clinical Text: {text}

        Extracted Phrases:
        """
        extracted_text["Definition 1 Extracted"] = evaluate_text(prompt_def1)

        # Extract text for Definition 2 (Daytime Impairment)
        prompt_def2 = f"""
         Identify phrases in the following clinical text that indicate daytime impairment like fatigue or malaise; impaired attention, concentration, or memory; impaired social, family, occupational, or academic performance; excessive daytime sleepiness;
        mood disturbance or irritability; behavioural problems such as hyperactivity, impulsivity, or aggression; decreased motivation, energy, or initiative; 
        proneness to errors or accidents; concerns or dissatisfaction with sleep.
        Extract only relevant text that matches the above conditions, and do not give anything extra. There is no need for explanations.
        If nothing matches, please return "unknown" and do not extract anything other than indications of insomnia during the daytime.

        Clinical Text: {text}

        Extracted Phrases:
        """
        extracted_text["Definition 2 Extracted"] = evaluate_text(prompt_def2)

        # A definition is met only when the model returned real text: the
        # "unknown" sentinel requested by the prompts is non-empty but means
        # "nothing found", so it must not count as a "yes".
        sleep_difficulty = self._has_evidence(extracted_text["Definition 1 Extracted"])
        daytime_impairment = self._has_evidence(extracted_text["Definition 2 Extracted"])
        results["Definition 1 (Sleep Difficulty)"] = "yes" if sleep_difficulty else "no"
        results["Definition 2 (Daytime Impairment)"] = "yes" if daytime_impairment else "no"

        # Rule A: Insomnia Diagnosis (Yes if both Definition 1 & 2 are Yes)
        results["Rule A (Insomnia Diagnosis)"] = "yes" if sleep_difficulty and daytime_impairment else "no"

        # Rule B: Primary Insomnia Medications
        extracted_text["Rule B Extracted"] = self.extract_medications(text, self.primary_medications)
        results["Rule B (Primary Medications)"] = "yes" if extracted_text["Rule B Extracted"] else "no"

        # Rule C: Secondary Insomnia Medications (needs a secondary medication
        # AND at least one of the two definitions)
        extracted_text["Rule C Extracted"] = self.extract_medications(text, self.secondary_medications)
        secondary_found = bool(extracted_text["Rule C Extracted"])
        results["Rule C (Secondary Medications)"] = (
            "yes" if secondary_found and (sleep_difficulty or daytime_impairment) else "no"
        )

        return results, extracted_text

    def extract_medications(self, text, medication_list):
        """Extracts medication names from the text based on given list.

        :param text: Text to search.
        :param medication_list: Iterable of medication names (the dict keys
            when one of the lookup tables is passed).
        :return: Names found as whole words (case-insensitive), in the order
            of ``medication_list``, joined with ", "; "" when none are found.
        """
        # re.escape keeps names containing regex metacharacters literal.
        extracted_meds = [
            med for med in medication_list
            if re.search(rf"\b{re.escape(med)}\b", text, re.IGNORECASE)
        ]
        return ", ".join(extracted_meds)

# --- Load Input Data ---
texts = df_val['text'].tolist()
true_labels = df_val.loc[:, ['Definition 1', 'Definition 2', 'Rule A', 'Rule B', 'Rule C']]

# --- Run Classification ---
vllm_client = VLLMClient(server_url="http://localhost:8000")

# classify() returns a (labels, extracted_text) pair per note; unzip the pairs
# into two parallel lists.
outcomes = [vllm_client.classify(note_text) for note_text in texts]
classification_results = [labels for labels, _ in outcomes]
extracted_texts = [phrases for _, phrases in outcomes]

# --- Convert Results to DataFrames ---
df_classification = pd.DataFrame(classification_results)
df_extracted = pd.DataFrame(extracted_texts)

# --- Combine All DataFrames ---
# Column-wise concat: ids, note text, reference labels, predicted labels, evidence.
frames = [df_val[['note_id']], df_val[['text']], true_labels, df_classification, df_extracted]
df_final = pd.concat(frames, axis=1)

# --- Display the Results ---
print("\nFinal DataFrame Extracted")


Final DataFrame Extracted


In [36]:
df_final 

Unnamed: 0,note_id,text,Definition 1,Definition 2,Rule A,Rule B,Rule C,Definition 1 (Sleep Difficulty),Definition 2 (Daytime Impairment),Rule A (Insomnia Diagnosis),Rule B (Primary Medications),Rule C (Secondary Medications),Definition 1 Extracted,Definition 2 Extracted,Rule A Extracted,Rule B Extracted,Rule C Extracted
0,1280230,female patient in sixties prescribed no drugs\...,yes,no,no,no,no,no,no,no,no,no,,,,,
1,1286616,male patient in fifties prescribed no drugs\n\...,yes,yes,yes,no,yes,no,no,no,no,no,,,,,
2,1291281,female patient in seventies prescribed no drug...,no,no,no,no,no,no,no,no,no,no,,,,,
3,1293851,female patient in seventies prescribed Nitrogl...,yes,yes,yes,yes,yes,no,no,no,yes,no,,,,Zolpidem,Trazodone
4,1295870,male patient in seventies prescribed Pantopraz...,yes,yes,yes,no,yes,no,no,no,no,no,,,,,Lorazepam
5,1298934,female patient in thirties prescribed Hydroxyz...,yes,yes,yes,no,yes,no,no,no,no,no,,,,,"Hydroxyzine, Lorazepam"
6,1300300,female patient in fifties prescribed Milk of M...,yes,no,no,no,yes,no,no,no,no,no,,,,,Clonazepam
7,1306672,male patient in sixties prescribed Acetaminoph...,yes,no,no,yes,no,no,no,no,yes,no,,,,"Zolpidem, Ambien",
8,1308278,"male patient in seventies prescribed D5W, Nore...",yes,no,no,no,yes,no,no,no,no,no,,,,,Diazepam
9,1313641,"male patient in fifties prescribed Insulin, Bi...",no,yes,no,yes,yes,no,no,no,yes,no,,,,Zolpidem,"Gabapentin, Lorazepam"


In [37]:
df_final.columns

Index(['note_id', 'text', 'Definition 1', 'Definition 2', 'Rule A', 'Rule B',
       'Rule C', 'Definition 1 (Sleep Difficulty)',
       'Definition 2 (Daytime Impairment)', 'Rule A (Insomnia Diagnosis)',
       'Rule B (Primary Medications)', 'Rule C (Secondary Medications)',
       'Definition 1 Extracted', 'Definition 2 Extracted', 'Rule A Extracted',
       'Rule B Extracted', 'Rule C Extracted'],
      dtype='object')

In [38]:
# df_final.to_csv("test_data_outcomes.csv", index=False)

In [39]:
# Task 2b view: note id plus the label and evidence columns for Definitions 1/2
# and Rules B/C, renamed to short "<name> Label" / "<name> Text" headers.
selected_columns = ['note_id', 'Definition 1 (Sleep Difficulty)', 'Definition 2 (Daytime Impairment)', 'Rule B (Primary Medications)', 'Rule C (Secondary Medications)', 'Definition 1 Extracted', 'Definition 2 Extracted', 'Rule B Extracted', 'Rule C Extracted']
short_names = {
    "Definition 1 (Sleep Difficulty)": "Definition 1 Label",
    "Definition 2 (Daytime Impairment)": "Definition 2 Label",
    "Rule B (Primary Medications)": "Rule B Label",
    "Rule C (Secondary Medications)": "Rule C Label",
    "Definition 1 Extracted": "Definition 1 Text",
    "Definition 2 Extracted": "Definition 2 Text",
    "Rule B Extracted": "Rule B Text",
    "Rule C Extracted": "Rule C Text"
}
df_2b = df_final[selected_columns].rename(columns=short_names)

In [40]:
# Blank out cells that are exactly the "unknown" sentinel, which the extraction
# prompts ask the model to return when nothing matches, so they read as missing.
# NOTE(review): the pandas docs state that before 1.4.0 an explicit value=None
# was silently ignored and replace() fell back to pad/forward-fill — confirm the
# pandas version in use is >= 1.4.
df_2b_modified = df_2b.replace("unknown", None)

In [41]:
selected_columns = ['Definition 1 Text', 'Definition 2 Text', 'Rule B Text', 'Rule C Text']

# Re-derive every "<name> Label" column from its "<name> Text" column:
# "yes" when some text was extracted, "no" when the cell is missing or empty.
# pd.isna covers both None and NaN, so a NaN cell no longer crashes on len().
for col in selected_columns:
    label_col = col.replace("Text", "Label")
    df_2b_modified[label_col] = [
        "no" if pd.isna(value) or len(value) < 1 else "yes"
        for value in df_2b_modified[col]
    ]

In [42]:
def convert_to_list(text):
    """Split an extracted-text cell into a list of stripped, non-empty lines.

    :param text: Cell value; normally a string, possibly None or NaN.
    :return: One item per non-blank line of ``text``. Returns ``[]`` for
        missing cells (None, NaN or any other non-string) and for text whose
        stripped length is 4 characters or fewer.
    """
    # Missing cells can be None or float NaN; neither has .strip().
    if not isinstance(text, str):
        return []
    # Keep the existing cut-off: very short responses are not kept as evidence.
    if len(text.strip()) <= 4:
        return []
    # Skip blank lines so they do not turn into empty list items.
    return [line.strip() for line in text.split('\n') if line.strip()]

In [43]:
import pandas as pd
import json


# Function to build nested dict per row
def build_entry(row):
    """Build the task 2b JSON entry for one row.

    :param row: Mapping/Series with "note_id" and, for each of Definition 1,
        Definition 2, Rule B and Rule C, a "<name> Label" and "<name> Text" value.
    :return: ``{str(note_id): {"<name>": {"label": ..., "text": [...]}, ...}}``.
    """
    def medication_items(cell):
        # Medication evidence is stored as one ", "-joined string, so it is
        # split on commas; splitting on newlines would keep several drugs
        # together as a single list item.
        if not isinstance(cell, str):
            return []
        return [name.strip() for name in cell.split(",") if name.strip()]

    return {
        # String key, matching the other JSON builders in this notebook; it
        # also avoids json.dump rejecting numpy integer keys.
        str(row["note_id"]): {
            "Definition 1": {
                "label": row["Definition 1 Label"],
                "text": convert_to_list(row["Definition 1 Text"])
            },
            "Definition 2": {
                "label": row["Definition 2 Label"],
                "text": convert_to_list(row["Definition 2 Text"])
            },
            "Rule B": {
                "label": row["Rule B Label"],
                "text": medication_items(row["Rule B Text"])
            },
            "Rule C": {
                "label": row["Rule C Label"],
                "text": medication_items(row["Rule C Text"])
            }
        }
    }

# Build the final JSON structure: merge the single-key dict that build_entry
# returns for every row into one mapping (a later duplicate key wins).
json_data = {
    note_key: entry
    for _, row in df_2b_modified.iterrows()
    for note_key, entry in build_entry(row).items()
}

# Save to file
with open("val_subtask_2b.json", "w") as out_file:
    json.dump(json_data, out_file, indent=4)


In [13]:
### The definition and rule labels are re-derived from the task 2b text extraction: a label is set to "yes" when
### text was extracted for it and to "no" when nothing was extracted. These updated labels are then used as the
### final labels for task 2a.

In [14]:
# Task 2a view: note id plus the five predicted labels, renamed to the short
# "Definition N" / "Rule X" headers.
selected_columns = ['note_id', 'Definition 1 (Sleep Difficulty)', 'Definition 2 (Daytime Impairment)', 'Rule A (Insomnia Diagnosis)', 'Rule B (Primary Medications)', 'Rule C (Secondary Medications)']
short_names = {
    "Definition 1 (Sleep Difficulty)": "Definition 1",
    "Definition 2 (Daytime Impairment)": "Definition 2",
    "Rule A (Insomnia Diagnosis)": "Rule A" ,
    "Rule B (Primary Medications)": "Rule B",
    "Rule C (Secondary Medications)": "Rule C"
}
df_2a = df_final[selected_columns].rename(columns=short_names)

In [15]:
# Overwrite the task 2a labels with the ones re-derived from the extracted text.
df_2a['Definition 1'] = df_2b_modified['Definition 1 Label']
df_2a['Definition 2'] = df_2b_modified['Definition 2 Label']
df_2a['Rule B'] = df_2b_modified['Rule B Label']
df_2a['Rule C'] = df_2b_modified['Rule C Label']

# Rule A is "Definition 1 AND Definition 2" and has no evidence column of its
# own. Recompute it from the updated definitions so it cannot stay "yes" while
# one of the definitions it depends on has just been set to "no".
both_definitions_met = df_2a['Definition 1'].eq('yes') & df_2a['Definition 2'].eq('yes')
df_2a['Rule A'] = both_definitions_met.map({True: 'yes', False: 'no'})

In [16]:
import pandas as pd
import json

# Columns written for every note
cols_to_include = ["Definition 1", "Definition 2", "Rule A", "Rule B", "Rule C"]

# Create the JSON structure: {"<note_id>": {"<column>": "<label>", ...}, ...}
json_dict = {}
for _, record in df_2a.iterrows():
    json_dict[str(record["note_id"])] = {name: record[name] for name in cols_to_include}

# Save to file (optional)
# NOTE(review): this file uses a "test_" prefix while the other outputs in this
# notebook use "val_" — confirm which split is intended.
with open("test_subtask_2a.json", "w") as out_file:
    json.dump(json_dict, out_file, indent=4)

# If you want to print to console
print(json.dumps(json_dict, indent=4))


{
    "20": {
        "Definition 1": "yes",
        "Definition 2": "yes",
        "Rule A": "yes",
        "Rule B": "yes",
        "Rule C": "yes"
    },
    "27": {
        "Definition 1": "yes",
        "Definition 2": "yes",
        "Rule A": "yes",
        "Rule B": "no",
        "Rule C": "yes"
    },
    "28": {
        "Definition 1": "yes",
        "Definition 2": "no",
        "Rule A": "no",
        "Rule B": "no",
        "Rule C": "yes"
    },
    "33": {
        "Definition 1": "yes",
        "Definition 2": "yes",
        "Rule A": "yes",
        "Rule B": "yes",
        "Rule C": "yes"
    },
    "51": {
        "Definition 1": "yes",
        "Definition 2": "yes",
        "Rule A": "yes",
        "Rule B": "no",
        "Rule C": "yes"
    },
    "107": {
        "Definition 1": "yes",
        "Definition 2": "no",
        "Rule A": "no",
        "Rule B": "no",
        "Rule C": "yes"
    },
    "156": {
        "Definition 1": "yes",
        "Definition 2": "yes",


### Task 1: Insomnia prediction (Binary Classification)

In [17]:
## Just as task 2a depends on task 2b, task 1 depends on the outputs of tasks 2b and 2a.
## Overall, the framework works from left to right, i.e. from task 2b to task 2a to task 1.

In [18]:
# A note is labelled insomnia-positive when any of Rule A, Rule B or Rule C is "yes".
columns_to_check = ['Rule A', 'Rule B', 'Rule C']
any_rule_met = (df_2a[columns_to_check] == 'yes').any(axis=1)
df_1 = pd.DataFrame({
    'id': df_2a['note_id'],
    'Insomnia': any_rule_met.map({True: 'yes', False: 'no'})
})
df_1

Unnamed: 0,id,Insomnia
0,20,yes
1,27,yes
2,28,yes
3,33,yes
4,51,yes
...,...,...
1995,2081638,no
1996,2082036,yes
1997,2082044,no
1998,2082673,no


In [19]:
import json

# Convert to desired JSON format: {"<id>": {"Insomnia": "<yes|no>"}, ...}
insomnia_json = {}
for note_id, status in zip(df_1["id"], df_1["Insomnia"]):
    insomnia_json[str(note_id)] = {"Insomnia": status}

# Save to file
with open("val_task_1.json", "w") as out_file:
    json.dump(insomnia_json, out_file, indent=4)

#### Task 1 based on regular binary classification method

In [21]:
import requests

class VLLMClient:
    """Binary insomnia classifier that prompts a vLLM completions endpoint.

    NOTE: this reuses the name of the rule-based client defined earlier in the
    notebook, so running this cell shadows that class.
    """

    def __init__(self, server_url):
        """
        :param server_url: Base URL of the vLLM server (no trailing slash);
            requests go to ``{server_url}/v1/completions``.
        """
        self.server_url = server_url

    def classify(self, text):
        """
        Perform text classification using VLLM for detecting insomnia.
        :param text: Clinical discharge summary to analyze.
        :return: 'yes' or 'no', or 'uncertain' when the model's reply contains
            neither word.
        :raises Exception: when the server answers with a non-200 status.
        """
        # Local import: this cell itself only imports requests.
        import re

        prompt = f"""
            You are a medical AI assistant. Based on the following clinical discharge summary, determine whether the patient has insomnia. 
            Always Answer strictly with 'yes' or 'no'.
            
            Text: {text}
            
            Answer:
            """

        payload = {
            "model": "google/gemma-2-2b-it",
            "prompt": prompt,
            "max_tokens": 5,  # Allow space for a clear response
            "temperature": 0.0,  # Make it deterministic
            "top_p": 0.7,
            "stop": ["Answer"],
        }

        response = requests.post(f"{self.server_url}/v1/completions", json=payload)

        if response.status_code != 200:
            raise Exception(f"Error: {response.status_code} {response.text}")

        # Extract and clean the predicted label
        predicted_label = response.json()["choices"][0]["text"].strip().lower()

        # Match whole words only: a plain substring test finds "no" inside
        # replies such as "unknown", "cannot" or "not". 'yes' keeps priority
        # when both words are present.
        if re.search(r"\byes\b", predicted_label):
            return 'yes'
        if re.search(r"\bno\b", predicted_label):
            return 'no'
        return 'uncertain'

# Initialize the VLLM client
vllm_client = VLLMClient(server_url="http://localhost:8000")  # Ensure VLLM is running

# Prepare data
# NOTE(review): df_test is not defined in the cells visible here — confirm
# where it is loaded before relying on a fresh-kernel run.
note_ids = df_test['note_id'].tolist()
texts = df_test['text'].tolist()

# Perform classification and store one {'ID', 'Insomnia'} record per note
results = [
    {'ID': note_id, 'Insomnia': vllm_client.classify(note_text)}
    for note_id, note_text in zip(note_ids, texts)
]

# Optional: Create a DataFrame to view or analyze results
results_df = pd.DataFrame(results)

# Collect the ids whose prediction could not be mapped to yes/no
uncertain_note_ids = [entry['ID'] for entry in results if entry['Insomnia'] == 'uncertain']

# Print or return the list
print("\n Note IDs with 'uncertain' output:")
print(uncertain_note_ids)


 Note IDs with 'uncertain' output:
[1174, 6522, 7638, 11381, 12664, 13037, 22752, 28729, 29313, 35187, 41108, 44397, 55186, 56117, 56196, 58845, 1309724, 1395906, 1430628, 1459001, 1462250, 1594256, 1727646, 1797852, 1905966, 1927078, 1937419, 1952987, 1970582, 1985867, 2062889, 2076591]


In [22]:
# Replace 'uncertain' with 'no' in the 'label' column
results_df['Insomnia'] = results_df['Insomnia'].replace('uncertain', 'no')

# Optional: check if it worked
print(results_df['Insomnia'].value_counts())

Insomnia
yes    1381
no      619
Name: count, dtype: int64


In [23]:
results_df

Unnamed: 0,ID,Insomnia
0,20,yes
1,27,no
2,28,yes
3,33,no
4,51,yes
...,...,...
1995,2081638,no
1996,2082036,no
1997,2082044,no
1998,2082673,no


In [24]:
import json

# Convert to desired JSON format
# {"<ID>": {"Insomnia": "<label>"}, ...} built from the standalone predictions
insomnia_json = {}
for note_id, status in zip(results_df["ID"], results_df["Insomnia"]):
    insomnia_json[str(note_id)] = {"Insomnia": status}

# Save to file
with open("val_task_1_standalone.json", "w") as out_file:
    json.dump(insomnia_json, out_file, indent=4)