In [1]:
!pip install thefuzz
!pip install nltk
!pip install spacy
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_lg-0.5.3.tar.gz
!pip install openai


Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_lg-0.5.3.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_lg-0.5.3.tar.gz (531.2 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [2]:
import nltk
from load_data import load_ann, load_txt
import pandas as pd
from thefuzz import fuzz
from nltk.stem import WordNetLemmatizer
import spacy
from scispacy.abbreviation import AbbreviationDetector
from openai import OpenAI
from scipy.spatial.distance import cosine
from scispacy.linking import EntityLinker
# from Bio_Epidemiology_NER.bio_recognizer import ner_prediction


nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# abreviation_handler = spacy.load("en_core_sci_lg")
# abreviation_handler.add_pipe("abbreviation_detector")


# abreviation_handler = spacy.load("en_core_sci_lg")
# abreviation_handler.add_pipe("abbreviation_detector")

# TODO: Put in config
client = OpenAI(api_key='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')

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   returnVal = client.embeddings.create(input = [text], model=model).data[0].embedding
   print(returnVal)
   return returnVal

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
PATH_TO_ZIP = "/workspaces/codespaces-jupyter/Project/RawData"
DATA_PATH = f"{PATH_TO_ZIP}/"
print(f"Full data path: {DATA_PATH}")
# read in txt files
txt_df = load_txt(DATA_PATH)
# read in REASONS entities from .ann files
ent_df, rel_df = load_ann(DATA_PATH)

Full data path: /workspaces/codespaces-jupyter/Project/RawData/
Time taken to read .txt files: 0.9435818195343018
Time taken to read .ann files and extract all metadata: 0.1653289794921875


In [4]:
"""
DATA CLEANING
"""

"""
1. Convert 'text' columns to lowercase, in order to facilitate comparison.
"""
# To lowercase 'text' column in ent_df
ent_df['text'] = ent_df['text'].str.lower()
ent_df['text'] = ent_df['text'].str.strip()

# To lowercase 'text' column in txt_df
# txt_df['text'] = txt_df['text'].str.lower()

"""
2. Remove \n ending from 'text' column in ent_df and in 'entity2' column in rel_df.
"""
ent_df['text'] = ent_df['text'].str.rstrip('\n')
rel_df['entity2'] = rel_df['entity2'].str.rstrip('\n')

"""
3. Convert 'start_idx' and 'end_idx' columns in ent_df to int.
"""
# Drop rows that cannot be converted to int TODO: Make this better
ent_df = ent_df[ent_df['start_idx'].str.isnumeric()]
ent_df = ent_df[ent_df['end_idx'].str.isnumeric()]
ent_df['start_idx'] = ent_df['start_idx'].astype(int)
ent_df['end_idx'] = ent_df['end_idx'].astype(int)

# Make new column with lemmatized text of 'text' column called 'lemmatized_text'
ent_df['orig_txt'] = ent_df['text']
def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

ent_df['text'] = ent_df['text'].apply(lemmatize_text)
# Convert ent_df['text'] to string
ent_df['text'] = ent_df['text'].astype(str)


"""
4. Add definition of abbreviations to 'text' in txt_df
"""
# Loop through each row in txt_df and then use abreviation_handler to find abbreviations and then insert definition in parenthesis after abbreviation, for every abbreviation found in the 'text' column.
# for index, row in txt_df.iterrows():
#     doc = abreviation_handler(row['text'])
#     print(len(doc._.abbreviations))
    # for abrv in doc._.abbreviations:
    #     row['text'] = row['text'].replace(f"{abrv}", f"{abrv} ({abrv._.long_form})") 

"\n4. Add definition of abbreviations to 'text' in txt_df\n"

In [5]:
"""
FEATURE ENGINEERING
"""

"""
1. Join the appropriate entity1 and entity2 for each relation in rel_df.
"""
# Remove first 5 letters from 'entity1' and 'entity2' column in rel_df
rel_df['entity1'] = rel_df['entity1'].str[5:]
rel_df['entity2'] = rel_df['entity2'].str[5:]
rel_df = rel_df.merge(ent_df[['entity_id', 'text', 'file_idx']], how='left', left_on=['entity1', 'file_idx'], right_on=['entity_id', 'file_idx'])
rel_df.rename(columns={'text': 'entity1_text'}, inplace=True)
rel_df = rel_df.merge(ent_df[['entity_id', 'text', 'file_idx']], how='left', left_on=['entity2', 'file_idx'], right_on=['entity_id', 'file_idx'])
rel_df.rename(columns={'text': 'entity2_text'}, inplace=True)
rel_df.drop(columns=['entity_id_x', 'entity_id_y'], inplace=True)

# Create column 'entity1_entity2' in rel_df
rel_df['entity1_entity2'] = rel_df['entity1_text'] + rel_df['entity2_text']

"""
2. Get count of text in file_idx for each entity in ent_df. Do the same for the 'entity1_entity2' in rel_df.
"""
ent_df_count = ent_df.groupby(['text', 'file_idx']).size().reset_index(name='count_in_document')
ent_df = ent_df.merge(ent_df_count, how='left', left_on=['text', 'file_idx'], right_on=['text', 'file_idx'])
rel_df_count = rel_df.groupby(['entity1_entity2', 'file_idx']).size().reset_index(name='count_in_document')
rel_df = rel_df.merge(rel_df_count, how='left', left_on=['entity1_entity2', 'file_idx'], right_on=['entity1_entity2', 'file_idx'])

"""
3. Create encoding to represent if entity in ent_df is in the 'Discharge Diagnosis', 'Chief Complaint', or 'History of Present Illness' section of the txt_df.
"""
def find_section_range(row, section_name):
    lines = row['text'].split('\n')
    matches = [(i, fuzz.ratio(line.lower(), section_name.lower())) for i, line in enumerate(lines)]
    matches.sort(key=lambda x: x[1], reverse=True)  # sort by fuzz.ratio in descending order
    if not matches:
        # Raise error if no match is found
        raise ValueError(f"Could not find section {section_name} in file {row['file_idx']}")
    start_line = matches[0][0]  # start of the range is the line with the highest fuzz.ratio
    end_line = start_line
    while end_line < len(lines) and lines[end_line].strip() != '':
        end_line += 1
    # calculate start and end index within the raw text
    start_index = sum(len(line) + 1 for line in lines[:start_line])  # +1 for the newline character
    end_index = sum(len(line) + 1 for line in lines[:end_line])  # +1 for the newline character
    if row['file_idx'] == '100035':
        # print(row['text'][start_index: start_index + 20], '\n', row['text'][end_index: end_index + 20])
        print(row['text'][10559:10563] )
    return (start_index, end_index)

txt_df['DD_Range'] = txt_df.apply(lambda row: find_section_range(row, 'Discharge Diagnosis'), axis=1)
txt_df['CC_Range'] = txt_df.apply(lambda row: find_section_range(row, 'Chief Complaint'), axis=1)
txt_df['HPI_Range'] = txt_df.apply(lambda row: find_section_range(row, 'History of Present Illness'), axis=1)

# Join the 'DD_Range', 'CC_Range', and 'HPI_Range' columns from txt_df to ent_df
ent_df = ent_df.merge(txt_df[['file_idx', 'DD_Range', 'CC_Range', 'HPI_Range']], how='left', left_on=['file_idx'], right_on=['file_idx'])

# Loop through each entity in ent_df and check if it is in the 'Discharge Diagnosis', 'Chief Complaint', or 'History of Present Illness' section of the txt_df.
# If it is, then add the section name to the 'section' column in ent_df.
def find_section(row):
    # Throw error if start_idx is greater than end_idx
    if row['start_idx'] > row['end_idx']:
        raise ValueError(f"start_idx {row['start_idx']} is greater than end_idx {row['end_idx']}")
    # Throw error if start_idx and end_idx are in multiple sections
    if row['start_idx'] >= row['DD_Range'][0] and row['end_idx'] <= row['DD_Range'][1] and row['start_idx'] >= row['CC_Range'][0] and row['end_idx'] <= row['CC_Range'][1]:
        print(row['file_idx'])
        # raise ValueError(f"start_idx {row['start_idx']} and end_idx {row['end_idx']} are in both DD and CC")
    if row['start_idx'] >= row['DD_Range'][0] and row['end_idx'] <= row['DD_Range'][1] and row['start_idx'] >= row['HPI_Range'][0] and row['end_idx'] <= row['HPI_Range'][1]:
        print(row['file_idx'])
        # raise ValueError(f"start_idx {row['start_idx']} and end_idx {row['end_idx']} are in both DD and HPI")
    if row['start_idx'] >= row['CC_Range'][0] and row['end_idx'] <= row['CC_Range'][1] and row['start_idx'] >= row['HPI_Range'][0] and row['end_idx'] <= row['HPI_Range'][1]:
        print(row['file_idx'])
        # raise ValueError(f"start_idx {row['start_idx']} and end_idx {row['end_idx']} are in both CC and HPI")
    
    # If start_idx and end_idx are in one section, return the section name
    if row['start_idx'] >= row['DD_Range'][0] and row['end_idx'] <= row['DD_Range'][1]:
        return 'Discharge Diagnosis'
    elif row['start_idx'] >= row['CC_Range'][0] and row['end_idx'] <= row['CC_Range'][1]:
        return 'Chief Complaint'
    elif row['start_idx'] >= row['HPI_Range'][0] and row['end_idx'] <= row['HPI_Range'][1]:
        return 'History of Present Illness'
    else:
        return 'Other'

ent_df['section'] = ent_df.apply(lambda row: find_section(row), axis=1)
# Drop DD_Range, CC_Range, and HPI_Range columns from ent_df
ent_df.drop(columns=['DD_Range', 'CC_Range', 'HPI_Range'], inplace=True)
# Apply one hot encoding to 'section' column in ent_df
ent_df = pd.get_dummies(ent_df, columns=['section'])

75mg
75mg
75mg
106575
106575
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
134445
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
111542
149687
149687
149687
149687
149687
149687
149687
149687
14968

In [6]:
txt_df_subset = txt_df[txt_df['file_idx'] == '100035']
ent_df_subset = ent_df[ent_df['file_idx'] == '100035']
rel_df_subset = rel_df[rel_df['file_idx'] == '100035']

In [7]:
# # Get DD_Range, CC_Range, and HPI_Range for each file in txt_df_subset and store in tuples
# DD_Range = txt_df_subset['DD_Range'].iloc[0]
# CC_Range = txt_df_subset['CC_Range'].iloc[0]
# HPI_Range = txt_df_subset['HPI_Range'].iloc[0]

# # Create a dict that loops through each line in the txt_df_subset 'text' column in the DD_Range and stores each diagnosis as a key and the value as an embedding of the diagnosis using get_embedding from openai.
# diagnosis_dict = {}
# count_dict = {}
# for line in str(txt_df_subset['text'].iloc[0][DD_Range[0]:DD_Range[1]]).split('\n'):
#     if len(line) > 0:
#         diagnosis_dict[line] = get_embedding(line)
#         count_dict[line] = 0

# curr_text = str(txt_df_subset['text'].iloc[0])
# text_to_analyze = ""
# # Add the text from the CC_Range and HPI_Range to text_to_analyze
# text_to_analyze += curr_text[CC_Range[0]:CC_Range[1]]
# text_to_analyze += curr_text[HPI_Range[0]:HPI_Range[1]]

# # For each sentence in the txt_df_subset 'text' column in the CC_Range and HPI_Range, convert the sentence to an embedding using get_embedding from openai. Get the sentences using nltk
# for line in nltk.sent_tokenize(text_to_analyze):
#     # For each diagnosis in diagnosis_dict, calculate the cosine similarity between the diagnosis embedding and the sentence embedding. Store the diagnosis with the highest cosine similarity in the count_dict.
#     max_sim = 0
#     max_diagnosis = ""
#     for diagnosis, embedding in diagnosis_dict.items():
#         similarity = cosine(embedding, get_embedding(line))
#         if similarity > max_sim:
#             max_diagnosis = diagnosis
#             max_sim = similarity
#     print(max_sim)
#     count_dict[max_diagnosis] += 1

# print(count_dict)


In [8]:
# nlp = spacy.load("en_core_sci_lg")
# nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

# diagnosis_dict = {}
# objects = []
# for line in str(txt_df_subset['text'].iloc[0][DD_Range[0]:DD_Range[1]]).split('\n'):
#     if len(line) > 0:
#         doc = nlp(line)
#         linker = nlp.get_pipe("scispacy_linker")
#         for entity in doc.ents:
#             for umls_ent in entity._.kb_ents:
#                 try:
#                     objects.append(linker.kb.cui_to_entity[umls_ent[0]])
#                     break
#                 except:
#                     print("error")
#                 # diagnosis_dict[ent.entity_id] = ent.canonical_name

In [9]:
# for object in objects:
#     print(object[2])

In [6]:
df = pd.DataFrame(columns=['file_idx', 'primary_diagnosis', 'count'])  # initialize df as an empty DataFrame
def create_freq_dict(df_input):
    nlp = spacy.load("en_core_sci_lg")
    nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
    linker = nlp.get_pipe("scispacy_linker")

    for i, file_idx in enumerate(txt_df['file_idx'].unique()):
        print(i, " out of ", len(txt_df['file_idx'].unique()))
        txt_df_subset = txt_df[txt_df['file_idx'] == file_idx]

        # Get DD_Range, CC_Range, and HPI_Range for each file in txt_df_subset and store in tuples
        DD_Range = txt_df_subset['DD_Range'].iloc[0]
        CC_Range = txt_df_subset['CC_Range'].iloc[0]
        HPI_Range = txt_df_subset['HPI_Range'].iloc[0]

        # Create a dict that loops through each line in the txt_df_subset 'text' column in the DD_Range and stores each diagnosis as a key and the value as an embedding of the diagnosis using get_embedding from openai.
        diagnosis_dict = {}
        for i, line in enumerate(str(txt_df_subset['text'].iloc[0][DD_Range[0]:DD_Range[1]]).split('\n')):
            if i == 0:
                continue
            if len(line) > 0:
                doc = nlp(line)
                for entity in doc.ents:
                    diagnosis_dict[entity] = []
                    for umls_ent in entity._.kb_ents:
                        try:
                            object = linker.kb.cui_to_entity[umls_ent[0]]
                            for alias in object[2]:
                                diagnosis_dict[entity].append(alias)
                        except Exception as e:
                            print(e)

        count_dict = {}
        for key, _ in diagnosis_dict.items():
            count_dict[key] = 0

        curr_text = str(txt_df_subset['text'].iloc[0])
        # text_to_analyze = ""
        # # Add the text from the CC_Range and HPI_Range to text_to_analyze
        # text_to_analyze += curr_text[CC_Range[0]:CC_Range[1]]
        # text_to_analyze += curr_text[HPI_Range[0]:HPI_Range[1]]

        for line in nltk.sent_tokenize(curr_text):
            doc = nlp(line)
            for entity in doc.ents:
                for umls_ent in entity._.kb_ents:
                    try:
                        object = linker.kb.cui_to_entity[umls_ent[0]]
                        for key, arr in diagnosis_dict.items():
                            for alias in object[2]:
                                if alias in arr:
                                    count_dict[key] += 1
                                    break
                    except Exception as e:
                        print(e)

        for key, value in count_dict.items():
            df_input = df_input.append({'file_idx': file_idx, 'primary_diagnosis': key, 'count': value}, ignore_index=True)

    return df_input

df = create_freq_dict(df)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


: 

In [None]:
# Save df to csv
df.to_csv('primary_diagnosis.csv', index=False)

In [35]:
# Get DD_Range, CC_Range, and HPI_Range for each file in txt_df_subset and store in tuples
DD_Range = txt_df_subset['DD_Range'].iloc[0]
CC_Range = txt_df_subset['CC_Range'].iloc[0]
HPI_Range = txt_df_subset['HPI_Range'].iloc[0]

# Create a dict that loops through each line in the txt_df_subset 'text' column in the DD_Range and stores each diagnosis as a key and the value as an embedding of the diagnosis using get_embedding from openai.
nlp = spacy.load("en_core_sci_lg")
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
diagnosis_dict = {}
for i, line in enumerate(str(txt_df_subset['text'].iloc[0][DD_Range[0]:DD_Range[1]]).split('\n')):
    if i == 0:
        continue
    if len(line) > 0:
        doc = nlp(line)
        linker = nlp.get_pipe("scispacy_linker")
        for entity in doc.ents:
            print(entity)
            diagnosis_dict[entity] = []
            for umls_ent in entity._.kb_ents:
                try:
                    for ent in umls_ent:
                        print(ent)
                    object = linker.kb.cui_to_entity[umls_ent[0]]
                    for alias in object[2]:
                        diagnosis_dict[entity].append(alias)
                except Exception as e:
                    print(e)

print(diagnosis_dict)
count_dict = {}
for key, _ in diagnosis_dict.items():
    count_dict[key] = 0

KeyboardInterrupt: 

In [33]:
for key, _ in diagnosis_dict.items():
    count_dict[key] = 0
curr_text = str(txt_df_subset['text'].iloc[0])
# text_to_analyze = ""
# # Add the text from the CC_Range and HPI_Range to text_to_analyze
# text_to_analyze += curr_text[CC_Range[0]:CC_Range[1]]
# text_to_analyze += curr_text[HPI_Range[0]:HPI_Range[1]]

for line in nltk.sent_tokenize(curr_text):
    doc = nlp(line)
    linker = nlp.get_pipe("scispacy_linker")
    for entity in doc.ents:
        for umls_ent in entity._.kb_ents:
            try:
                object = linker.kb.cui_to_entity[umls_ent[0]]
                print(object[2])
                for key, arr in diagnosis_dict.items():
                    for alias in object[2]:
                        if alias in arr:
                            count_dict[key] += 1
                            break
            except Exception as e:
                print(e)

['Chief Complaint', 'Primary Complaint', 'Presenting complaint', '{Chief complaint}', 'Chief complaint', 'Presenting Complaint', 'Chief complaint (finding)', 'Main Complain', 'chief complaint']
['Other Chief Complaint']
['Chief complaint duration', 'Duration of Chief Complaint']
[]
[]
['Post Cardiac Arrest Syndrome', 'Postresuscitation Disease', 'Postresuscitation Diseases', 'Post-Cardiac Arrest Syndromes', 'Postcardiac Arrest Syndromes', 'Postcardiac Arrest Syndrome']
['arrest [as an cardiac arrest]', 'Cardiac arrest (disorder)', 'ARREST CARDIAC', 'Cardiac arrest- asystole', 'arrest', 'cardiac arrest', 'CARDIAC ASYSTOLE', 'ASYSTOLIA', 'Asystole (disorder)', 'ventricular asystole', 'Asystole', 'VENTRICULAR ASYSTOLIA', 'cardiac asystole', 'Heart Arrest', 'Arrest, Heart', 'asystolia', 'Arrest, Cardiac', 'SCA', 'Cardiac Arrest', 'Cardiac arrest', 'asystole', 'Heart stops beating', 'heart arrest', 'Asystoles', 'STANDSTILL CARDIAC', 'Cardiac standstill', 'ASYSTOLE', 'HEART ARREST', 'arreste

In [34]:
print(diagnosis_dict)
print(count_dict)
for key, val in count_dict.items():
    print(f"{key}: {ner_prediction(str(key), compute='cpu')['entity_group'].iloc[0]}: {val}" + '\n')

{Anoxic: [], Brain Injury: ['Brain Damage', 'cerebral injury', 'damage brain', 'brain injury tissue', 'brain injuries', 'brain lesion (from injury)', 'Brain damage, NOS', 'Cerebral damage', 'Brain Injuries', 'brain tissue injury', 'Brain damage', 'Brain Injury', 'injury brain', 'Injuries, Brain', 'cerebral damage', 'Acquired brain injury', 'disorder brain injury', 'Damage, brain', 'brain injury', 'Injury, Brain', 'brain disorders injury', 'brain damages', 'Brain injury', 'brain damage', 'Chronic Brain Injury', 'Chronic Brain Injuries', 'chronic brain injury', 'Brain Injuries, Chronic', 'Injury', 'Traumatic or non-traumatic injury (disorder)', 'Traumatic or non-traumatic injury', 'Trauma', 'Traumatic injury', 'Injury', 'injuries', 'Traumatic injury (disorder)', 'injury', 'Injury, NOS', 'Injury - disorder', 'TRAUMATIC INJURY', 'Injuries', 'INJURY', 'injury from an external force', 'Wound', 'Traumatic Injury'], PEA arrest x2: [], Status Asthmaticus: ['status asthmaticus', 'asthma asthmati

  final_df = final_df.append(disease_df) # adding the disease_df to existing
  master_df = master_df.append(final_df)
  final_df = final_df.append(disease_df) # adding the disease_df to existing
  master_df = master_df.append(final_df)
  final_df = final_df.append(disease_df) # adding the disease_df to existing
  master_df = master_df.append(final_df)
  final_df = final_df.append(disease_df) # adding the disease_df to existing
  master_df = master_df.append(final_df)
  final_df = final_df.append(disease_df) # adding the disease_df to existing
  master_df = master_df.append(final_df)
  final_df = final_df.append(disease_df) # adding the disease_df to existing
  master_df = master_df.append(final_df)
  final_df = final_df.append(disease_df) # adding the disease_df to existing
  master_df = master_df.append(final_df)
  final_df = final_df.append(disease_df) # adding the disease_df to existing
  master_df = master_df.append(final_df)
  final_df = final_df.append(disease_df) # adding the di