In [1]:
from utils import load_data
from const import (
    EHR_TRAIN_DATA_PATH, 
    EHR_TRAIN_LABEL_PATH, 
    EHR_VALID_DATA_PATH, 
    EHR_VALID_LABEL_PATH, 
    EHR_TEST_DATA_PATH, 
    EHR_TEST_LABEL_PATH, 
    TEST_DATA_PATH, 
    VALID_DATA_PATH, 
    VALID_LABEL_PATH
)
import os
import json
from tqdm import tqdm

# EHR splits: each load_data call returns a (data, labels) pair.
ehr_valid_data, ehr_valid_labels = load_data(EHR_VALID_DATA_PATH, EHR_VALID_LABEL_PATH)
ehr_train_data, ehr_train_labels = load_data(EHR_TRAIN_DATA_PATH, EHR_TRAIN_LABEL_PATH)
ehr_test_data, ehr_test_labels = load_data(EHR_TEST_DATA_PATH, EHR_TEST_LABEL_PATH)

# Task splits: the test split is loaded without a label file (is_test=True),
# so the second element of the returned pair is discarded.
valid_data, valid_labels = load_data(VALID_DATA_PATH, VALID_LABEL_PATH)
test_data, _ = load_data(TEST_DATA_PATH, None, is_test=True)

# Sanity check: number of entries under "data" next to the number of labels.
print("ehr_valid_data", len(ehr_valid_data["data"]), len(ehr_valid_labels))
print("ehr_train_data", len(ehr_train_data["data"]), len(ehr_train_labels))
print("ehr_test_data", len(ehr_test_data["data"]), len(ehr_test_labels))

print("valid_data", len(valid_data["data"]), len(valid_labels))
print("test_data", len(test_data["data"]))


ehr_valid_data 1163 1163
ehr_train_data 5124 5124
ehr_test_data 1167 1167
valid_data 20 20
test_data 1008


In [2]:
import pandas as pd

# Ids of the validation questions whose label is the literal string "null";
# the message below reports these as not answerable.
list_null_labels = [
    question_id
    for question_id, answer in ehr_valid_labels.items()
    if answer == "null"
]
print(f"{len(list_null_labels)}/{len(ehr_valid_labels)} questions are not answerable in the validation set")

# NOTE: despite the `test_` prefix, both frames hold the *validation* split here.
test_df = pd.DataFrame(ehr_valid_data["data"])
null_test_df = test_df.loc[test_df['id'].isin(list_null_labels)]

# Last expression of the cell: rendered as the cell output.
null_test_df.head()

232/1163 questions are not answerable in the validation set


Unnamed: 0,id,question
14,293a491c440d62e67c686f47,What is the maximum number of drugs and their ...
15,fc91b305e4be2838d4a5b0c5,Is there a gender restriction on potassium chl...
19,6d76715f3b8643d188af9795,Do they have a gender limit to lidocaine-prilo...
20,dec6c9f45523ef859e8d0977,Is there any remaining appointment for patient...
31,5497eb668de1ff020fd4e774,Translate icu equipment usage data into a main...


In [None]:
import pandas as pd

# Ids of the EHR test questions whose label is the literal string "null";
# the message below reports these as not answerable.
list_null_labels = [
    question_id
    for question_id, answer in ehr_test_labels.items()
    if answer == "null"
]
print(f"{len(list_null_labels)}/{len(ehr_test_labels)} questions are not answerable in the test set")

# Keep only the rows whose id is in the "null" list.
test_df = pd.DataFrame(ehr_test_data["data"])
null_test_df = test_df.loc[test_df['id'].isin(list_null_labels)]
# Last expression of the cell: rendered as the cell output.
null_test_df.head()

233/1167 questions are not answerable in the test set


Unnamed: 0,id,question
2,3f70d67ccafc181a6d95a5da,When was back to godhead written?
3,a5348117eb65b1c2d5c291ae,Call the it department to report a system issu...
4,8b093fb10a7007c37b5ff6f9,What is primary and non-contributory under the...
13,cc6a1305faf59ceadd9f9270,When does patient 1819 cease to be in a quaran...
19,af5df4194e2b15e8dc1e2251,What are the latest guidelines for managing ic...


In [None]:
import pandas as pd

# Ids of the EHR train questions whose label is the literal string "null";
# the message below reports these as not answerable.
list_null_labels = [
    question_id
    for question_id, answer in ehr_train_labels.items()
    if answer == "null"
]
print(f"{len(list_null_labels)}/{len(ehr_train_labels)} questions are not answerable in the train set")

# Keep only the rows whose id is in the "null" list.
train_df = pd.DataFrame(ehr_train_data["data"])
null_train_df = train_df.loc[train_df['id'].isin(list_null_labels)]
# Last expression of the cell: rendered as the cell output.
null_train_df.head()

In [5]:
import random

# Print one randomly chosen question from null_test_df.
# random.randint(a, b) includes b, so randint(0, len(df)) could produce
# idx == len(df) and make .iloc raise IndexError; randrange excludes the
# upper bound and always yields a valid positional index.
idx = random.randrange(len(null_test_df))
print(null_test_df.iloc[idx]['question'])

Which performing physicians were involved in procedures for patients admitted via emergency room admission?


In [6]:
from const import TEST_DATA_PATH
import os
import json

# Read the test-set JSON file directly; this rebinds the module-level
# `test_data` that the similarity helpers below read.
with open(TEST_DATA_PATH, mode="r") as test_file:
    test_data = json.load(test_file)

In [7]:

# Define a similarity check function
def is_similar(test_question, data, threshold=0.8):
    """
    Find the first question in ``data`` that is similar to ``test_question``.

    Similarity is the ``difflib.SequenceMatcher`` ratio of the two
    lower-cased question strings.

    Args:
        test_question: Question text to look up.
        data: Iterable of dicts, each with a "question" and an "id" entry.
        threshold: Minimum ratio (inclusive) for two questions to count as
            similar.

    Returns:
        ``(True, similarity, question, id)`` for the first item whose ratio
        reaches ``threshold`` (the first hit, not necessarily the best one),
        otherwise ``(False, 0, None, None)``.
    """
    from difflib import SequenceMatcher

    # Loop-invariant: lower-case the probe question once, not per candidate.
    needle = test_question.lower()

    for item in data:
        question, idx = item["question"], item["id"]
        matcher = SequenceMatcher(None, needle, question.lower())
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); a candidate that fails either bound can never reach the
        # threshold, so the expensive ratio() call is skipped for it.
        if matcher.real_quick_ratio() < threshold or matcher.quick_ratio() < threshold:
            continue
        similarity = matcher.ratio()
        if similarity >= threshold:
            return True, similarity, question, idx
    return False, 0, None, None

def _save(file_name, similar_items):
    """
    Write ``similar_items`` to ``tmp/<file_name>`` as JSON and print a summary.

    Args:
        file_name: Name of the JSON file to create inside the ``tmp`` directory.
        similar_items: JSON-serialisable list, stored under the
            "similar_items" key.
    """
    output_path = os.path.join("tmp", file_name)
    # Create the target directory on first use; without this, open() raises
    # FileNotFoundError when tmp/ does not exist yet.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        json.dump({"similar_items": similar_items}, f, indent=2)

    print(f"Saved {len(similar_items)} similar items to {output_path}")

def _check(ehr_data, file_name):
    """
    Find test questions that closely match a question in ``ehr_data``.

    Iterates over the module-level ``test_data["data"]``, compares every
    question against ``ehr_data["data"]`` with ``is_similar`` and stores the
    hits via ``_save``.

    Args:
        ehr_data: Dict whose "data" entry is the list of reference items
            (each with "question" and "id").
        file_name: Name of the JSON file handed to ``_save``.
    """
    similar_items = []
    for item in tqdm(test_data["data"]):
        # `test_id` rather than `id`, so the builtin id() is not shadowed.
        test_id, question = item["id"], item["question"]
        is_sim, score, match, match_id = is_similar(question, ehr_data["data"])

        if is_sim:
            similar_items.append({
                "test_id": test_id,
                "test_question": question,
                "similarity_score": score,
                "matching_question": match,
                "matching_id": match_id
            })

    # The message no longer names a dataset: this function is used with
    # several reference sets, so a hardcoded "ehr_valid_data" was misleading.
    print(f"Found {len(similar_items)} similar items out of {len(test_data['data'])} test items")
    _save(file_name, similar_items)

def _display(file_name, max_examples=0):
    """
    Report how many similar items are stored in ``tmp/<file_name>``.

    Args:
        file_name: Name of a JSON file inside ``tmp`` that holds a
            "similar_items" list (the layout written by ``_save``).
        max_examples: Number of stored items to print in detail after the
            summary line. The default of 0 prints the summary only.
    """
    with open(os.path.join("tmp", file_name), "r") as f:
        similar_items = json.load(f)["similar_items"]

    # The total is taken from the module-level test_data.
    print(f"Found {len(similar_items)} similar items out of {len(test_data['data'])} test items")

    # Optional detail view; max(0, ...) keeps a negative value from turning
    # into an "all but the last k" slice.
    for i, item in enumerate(similar_items[:max(0, max_examples)]):
        print(f"\nSimilar item #{i+1}:")
        print(f"Test ID: {item['test_id']}")
        print(f"Test question: {item['test_question']}")
        print(f"Similarity score: {item['similarity_score']:.2f}")
        print(f"Matching question: {item['matching_question']}")
   

In [12]:
# Run the slow similarity scan only when its cached result is missing, so the
# cell works on a fresh checkout and stays fast on re-runs. Delete the file
# under tmp/ to force a recomputation.
ehr_valid_similar_file = "ehr_valid_data_similar_items.json"
if not os.path.exists(os.path.join("tmp", ehr_valid_similar_file)):
    os.makedirs("tmp", exist_ok=True)  # the result is written into tmp/
    _check(ehr_valid_data, ehr_valid_similar_file)
_display(ehr_valid_similar_file)

Found 129 similar items out of 1008 test items


In [9]:
# Run the slow similarity scan only when its cached result is missing, so the
# cell works on a fresh checkout and stays fast on re-runs. Delete the file
# under tmp/ to force a recomputation.
ehr_test_similar_file = "ehr_test_data_similar_items.json"
if not os.path.exists(os.path.join("tmp", ehr_test_similar_file)):
    os.makedirs("tmp", exist_ok=True)  # the result is written into tmp/
    _check(ehr_test_data, ehr_test_similar_file)
_display(ehr_test_similar_file)

Found 131 similar items out of 1008 test items


In [10]:
# Run the slow similarity scan only when its cached result is missing, so the
# cell works on a fresh checkout and stays fast on re-runs. Delete the file
# under tmp/ to force a recomputation.
ehr_train_similar_file = "ehr_train_data_similar_items.json"
if not os.path.exists(os.path.join("tmp", ehr_train_similar_file)):
    os.makedirs("tmp", exist_ok=True)  # the result is written into tmp/
    _check(ehr_train_data, ehr_train_similar_file)
_display(ehr_train_similar_file)

Found 251 similar items out of 1008 test items
