## Necessary dependency installations

In [None]:
!pip install openai
!pip install nltk
!pip install pandas
!pip install numpy
!pip install spacy
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz
!pip install Bio-Epidemiology-NER
!pip install python-dotenv
!pip install thefuzz

# Dataset Generation
### Note: This code requires an OpenAI API key to run the ensemble methodology, which combines NER and an LLM. Please provide an API key below in order to generate the dataset from the provided raw data.
#### To skip dataset generation, jump to the 'Testing / Validation output' section. Note that dataset generation may take a while to run.

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from dotenv import dotenv_values
from Helpers.load_data import load_ann, load_txt
from Helpers.clinician_note_dataset_handler import ClinicianNoteDataSetHandler
from Helpers.tester import Tester
import pandas as pd
from thefuzz import fuzz

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Paste an OpenAI API key between the quotes, or leave it empty to fall back
# to the OPEN_AI_KEY entry of the .env file below.
OpenAI_Key = ""
# Or use environment variables
config = dotenv_values("/workspaces/codespaces-jupyter/Project/.env")
# Only consult the .env file when no key was entered above, so a manually
# entered key is never overwritten. A missing or empty OPEN_AI_KEY entry
# leaves the key as an empty string, which downstream code treats as
# "no key provided".
if not OpenAI_Key:
    OpenAI_Key = config.get("OPEN_AI_KEY") or ""

In [None]:
# Generate the diagnoses / underlying-factors dataset from the raw data and
# write it to ./Clinical_Note_Diagnoses_Factors_Dataset.csv.
# Provided code block below by Cohere Health
PATH_TO_ZIP = "/workspaces/codespaces-jupyter/Project/RawData"
DATA_PATH = f"{PATH_TO_ZIP}/"

# Fail fast, before any data is loaded, when no key is available. An exception
# is raised instead of calling exit(): inside a Jupyter kernel exit() does not
# reliably stop the current cell, so the statements below could still run with
# an empty key.
if not OpenAI_Key:
    raise ValueError("Please provide an OpenAI API Key and try again")

print(f"Full data path: {DATA_PATH}")
txt_df = load_txt(DATA_PATH)
ent_df, rel_df = load_ann(DATA_PATH)

data_handler = ClinicianNoteDataSetHandler(txt_df, ent_df, rel_df, OpenAI_Key) # type: ignore

output = data_handler.identify_primary_diagnosis_and_underlying_factors()
output.to_csv('./Clinical_Note_Diagnoses_Factors_Dataset.csv', index=False)

# Testing / Validation output

In [2]:
# Read the generated dataset and the three intermediate frames back from disk
# (in the same order as before), then run both validation checks on them.
output_df, txt_df, ent_df, rel_df = map(
    pd.read_csv,
    (
        './Clinical_Note_Diagnoses_Factors_Dataset.csv',
        './Intermediate Data File/txt_df.csv',
        './Intermediate Data File/ent_df.csv',
        './Intermediate Data File/rel_df.csv',
    ),
)

TesterInstance = Tester(txt_df, ent_df, rel_df, output_df)

# Check 1: primary medical diagnosis.
TesterInstance.test_primary_medical_diagnosis()

# Check 2: common underlying factors.
TesterInstance.test_common_underlying_factors()

Testing Primary Medicla diagnoses...
NER Accuracy:  0.4070175438596491
LLM Accuracy:  0.8701754385964913
Ensemble Accuracy:  0.9192982456140351
Testing Common Underlying Factors...
Percentage of Notes with Underlying Factors that Directly Appear in Note 0.7578947368421053


# Diagnosis and Symptom Explorer
#### Please provide a Primary Medical Diagnosis / Condition in the cell below to retrieve its Common Underlying Factors. The code will fuzzy-match your input against the diagnoses in the dataset.

In [3]:
# Free-text condition to look up; the explorer cell fuzzy-matches it against
# the 'primary_diagnosis' column of output_df. Leave as "" to be prompted.
primary_medical_condition_to_search_for = "heart disease"

In [7]:
# Report the first dataset row whose primary diagnosis fuzzy-matches the search
# term: its diagnosis, the ensemble confidence, and its underlying factors.
if not primary_medical_condition_to_search_for.strip():
    # Whitespace-only input is treated the same as an empty search term.
    print("Please enter a primary medical condition to search for")
else:
    # Keep rows whose token-set similarity to the search term exceeds 50.
    match_scores = output_df['primary_diagnosis'].apply(
        lambda x: fuzz.token_set_ratio(x, primary_medical_condition_to_search_for)
    )
    filtered_output_df = output_df[match_scores > 50]
    print(f"Number of diagnoses found: {len(filtered_output_df)}")

    if filtered_output_df.empty:
        # Without this guard the `.values[0]` lookups below raise IndexError.
        print("No matching diagnoses were found; please try a different search term")
    else:
        # Only the first matching row (in dataset order) is reported.
        filtered_output_df = filtered_output_df.head(1)

        file_idx = filtered_output_df['file_idx'].values[0]
        primary_medical_condition = filtered_output_df['primary_diagnosis'].values[0]
        primary_medical_condition_LLM = filtered_output_df['primary_diagnosis_LLM'].values[0]
        primary_medical_condition_NER = filtered_output_df['primary_diagnosis_NER'].values[0]
        confidence = filtered_output_df['confidence'].values[0]
        print(f"The primary medical condition for patient in docuement number {file_idx} was {primary_medical_condition}")
        print(f"This is a {confidence}")
        if confidence == 'Higher Confidence Prediction':
            print("This is because the NER and LLM models used in ensemble identified the same primary medical condition")
        elif confidence == 'Lower Confidence Prediction':
            print("This is because the NER and LLM models used in ensemble identified different primary medical conditions")
            print(f"The LLM model identified the primary medical condition as: {primary_medical_condition_LLM}")
            print(f"The NER model identified the primary medical condition as: {primary_medical_condition_NER}")

        underlying_conditions = filtered_output_df['Common_Underlying_Factors'].values[0]
        print("The underlying conditions for this Primary Medical Diagnosis are:")
        # An empty CSV cell is read back by pandas as NaN (a float), which has
        # no .split(); only iterate when the value is actually a string.
        if isinstance(underlying_conditions, str):
            for underlying_condition in underlying_conditions.split(','):
                print(underlying_condition.strip())
        else:
            print("None recorded")

Number of diagnoses found: 25
The primary medical condition for patient in docuement number 196798 was Congestive Heart Failure
This is a Higher Confidence Prediction
This is because the NER and LLM models used in ensemble identified the same primary medical condition
The underlying conditions for this Primary Medical Diagnosis are:
Hypoxic
Hypoxia
Agitation
Pneumonia
Coronary Artery Disease

