## Load in Data
You must provide YOUR OWN PATH to the location of the sampleclinicalnotes.zip file in the `PATH_TO_ZIP` object.

In [213]:
!pip install thefuzz
!pip install nltk



In [214]:
from load_data import load_ann, load_txt
import pandas as pd
from thefuzz import fuzz
import nltk
from nltk.stem import WordNetLemmatizer


nltk.download('wordnet')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [215]:
PATH_TO_ZIP = "/workspaces/codespaces-jupyter/Project/RawData"
DATA_PATH = f"{PATH_TO_ZIP}/"
print(f"Full data path: {DATA_PATH}")
# read in txt files
txt_df = load_txt(DATA_PATH)
# read in REASONS entities from .ann files
ent_df, rel_df = load_ann(DATA_PATH)

Full data path: /workspaces/codespaces-jupyter/Project/RawData/
Time taken to read .txt files: 0.011150598526000977
Time taken to read .ann files and extract all metadata: 0.15177011489868164


In [216]:
# EDA on txt_df
print("txt_df shape:", txt_df.shape)
print("txt_df columns:", txt_df.columns)
print("txt_df head:\n", txt_df.head())

# EDA on ent_df
print("ent_df shape:", ent_df.shape)
print("ent_df columns:", ent_df.columns)
print("ent_df head:\n", ent_df.head())

# EDA on rel_df
print("rel_df shape:", rel_df.shape)
print("rel_df columns:", rel_df.columns)
print("rel_df head:\n", rel_df.head())

txt_df shape: (303, 2)
txt_df columns: Index(['file_idx', 'text'], dtype='object')
txt_df head:
   file_idx                                               text
0   109450  Admission Date:  [**2121-8-7**]              D...
1   103677  Admission Date:  [**2128-12-3**]              ...
2   113824  Admission Date:  [**2200-6-14**]              ...
3   113524  Admission Date:  [**2124-1-14**]              ...
4   115244  Admission Date:  [**2168-4-18**]     Discharge...
ent_df shape: (50951, 6)
ent_df columns: Index(['file_idx', 'entity_id', 'category', 'start_idx', 'end_idx', 'text'], dtype='object')
ent_df head:
   file_idx entity_id  category start_idx end_idx             text
0   120253        T1      Drug     10002   10015  Calcipotriene\n
1   120253        T2  Strength     10016   10021         0.005 \n
2   120253        T3      Form     10024   10029          Cream\n
3   120253        T4    Dosage     10035   10042        One (1)\n
4   120253        T5      Form     10043   10047     

In [217]:
"""
DATA CLEANING
"""

"""
1. Convert 'text' columns to lowercase, in order to facilitate comparison.
"""
# To lowercase 'text' column in ent_df
ent_df['text'] = ent_df['text'].str.lower()
ent_df['text'] = ent_df['text'].str.strip()

# To lowercase 'text' column in txt_df
txt_df['text'] = txt_df['text'].str.lower()

"""
2. Remove \n ending from 'text' column in ent_df and in 'entity2' column in rel_df.
"""
ent_df['text'] = ent_df['text'].str.rstrip('\n')
rel_df['entity2'] = rel_df['entity2'].str.rstrip('\n')

"""
3. Convert 'start_idx' and 'end_idx' columns in ent_df to int.
"""
# Drop rows that cannot be converted to int TODO: Make this better
ent_df = ent_df[ent_df['start_idx'].str.isnumeric()]
ent_df = ent_df[ent_df['end_idx'].str.isnumeric()]
ent_df['start_idx'] = ent_df['start_idx'].astype(int)
ent_df['end_idx'] = ent_df['end_idx'].astype(int)

# Make new column with lemmatized text of 'text' column called 'lemmatized_text'
ent_df['orig_txt'] = ent_df['text']
def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

ent_df['text'] = ent_df['text'].apply(lemmatize_text)

In [218]:
"""
FEATURE ENGINEERING
"""

"""
1. Join the appropriate entity1 and entity2 for each relation in rel_df.
"""
# Remove first 5 letters from 'entity1' and 'entity2' column in rel_df
rel_df['entity1'] = rel_df['entity1'].str[5:]
rel_df['entity2'] = rel_df['entity2'].str[5:]
rel_df = rel_df.merge(ent_df[['entity_id', 'text', 'file_idx']], how='left', left_on=['entity1', 'file_idx'], right_on=['entity_id', 'file_idx'])
rel_df.rename(columns={'text': 'entity1_text'}, inplace=True)
rel_df = rel_df.merge(ent_df[['entity_id', 'text', 'file_idx']], how='left', left_on=['entity2', 'file_idx'], right_on=['entity_id', 'file_idx'])
rel_df.rename(columns={'text': 'entity2_text'}, inplace=True)
rel_df.drop(columns=['entity_id_x', 'entity_id_y'], inplace=True)

# Create column 'entity1_entity2' in rel_df
rel_df['entity1_entity2'] = rel_df['entity1_text'] + rel_df['entity2_text']

"""
2. Get count of text in file_idx for each entity in ent_df. Do the same for the 'entity1_entity2' in rel_df.
"""
ent_df_count = ent_df.groupby(['text', 'file_idx']).size().reset_index(name='count_in_document')
ent_df = ent_df.merge(ent_df_count, how='left', left_on=['text', 'file_idx'], right_on=['text', 'file_idx'])
rel_df_count = rel_df.groupby(['entity1_entity2', 'file_idx']).size().reset_index(name='count_in_document')
rel_df = rel_df.merge(rel_df_count, how='left', left_on=['entity1_entity2', 'file_idx'], right_on=['entity1_entity2', 'file_idx'])

"""
3. Create encoding to represent if entity in ent_df is in the 'Discharge Diagnosis', 'Chief Complaint', or 'History of Present Illness' section of the txt_df.
"""
def find_section_range(row, section_name):
    lines = row['text'].split('\n')
    matches = [(i, fuzz.ratio(line.lower(), section_name.lower())) for i, line in enumerate(lines)]
    matches.sort(key=lambda x: x[1], reverse=True)  # sort by fuzz.ratio in descending order
    if not matches:
        # Raise error if no match is found
        raise ValueError(f"Could not find section {section_name} in file {row['file_idx']}")
    start_line = matches[0][0]  # start of the range is the line with the highest fuzz.ratio
    end_line = start_line
    while end_line < len(lines) and lines[end_line].strip() != '':
        end_line += 1
    # calculate start and end index within the raw text
    start_index = sum(len(line) + 1 for line in lines[:start_line])  # +1 for the newline character
    end_index = sum(len(line) + 1 for line in lines[:end_line])  # +1 for the newline character
    print(start_index, end_index)
    return (start_index, end_index)

txt_df['DD_Range'] = txt_df.apply(lambda row: find_section_range(row, 'Discharge Diagnosis'), axis=1)
txt_df['CC_Range'] = txt_df.apply(lambda row: find_section_range(row, 'Chief Complaint'), axis=1)
txt_df['HPI_Range'] = txt_df.apply(lambda row: find_section_range(row, 'History of Present Illness'), axis=1)

6652 6737
6087 6119
13508 13600
4843 4926
5356 5530
10282 10392
11754 11920
12076 12153
4618 4694
16949 16990
21664 21725
11202 11249
5619 5774
12095 12229
12394 12456
7356 7413
13835 13948
19918 20137
366 496
6442 6467
6015 6102
8337 8601
14427 14837
8560 8850
15693 15767
5759 5805
14200 14283
9527 9698
10221 10330
15341 15419
10504 10652
8229 8313
16358 17061
5007 5138
24387 24786
248 626
2071 2708
23222 23308
28937 29017
10092 10298
4311 4382
4754 4799
10955 11181
5302 5375
10434 10533
5008 5099
7894 7923
20748 20885
5091 5152
8306 8370
7370 7522
13348 13415
12104 12423
4925 5011
11638 11730
4899 4964
5242 5344
9125 9245
11583 11647
17013 17083
17767 17936
10988 11018
15972 16185
9413 9520
12848 13018
9269 9356
12669 12737
18869 18933
15454 15483
22711 22834
924 1122
8632 8668
6708 6861
9786 9875
10844 11041
10965 11081
13546 13672
9282 9441
18087 19009
8941 9378
8850 9051
11888 11917
7033 7093
13516 13571
14405 14491
523 801
18781 18899
24218 24442
8217 8380
14090 14390
16604 16673

In [219]:
"""
# QUESTION: What are the categories of entities and relationships? Do they encapsulate primary medical diagnoses and common unerlying factors?"
"""
# Store unique values in 'category' in ent_df and the count of each, and an example of the category in a DF
ent_df_unique = pd.DataFrame(ent_df['category'].value_counts())
# Get an example of each category and add it as a column to ent_df_unique
ent_df_unique['example'] = ent_df.groupby('category')['text'].apply(lambda x: x.sample(1).values[0])
print(ent_df_unique.head(30))


# Store unique values in 'category' in rel_df and the count of each in a DF
rel_df_unique = pd.DataFrame(rel_df['category'].value_counts())
print(rel_df_unique.head(30))

"""
# LEARNING: 
# 1. The categories of entities and relationships are not mutually exclusive. For example, a patient can have both a primary diagnosis and a secondary diagnosis.
# 2. All relationships tie entities to drugs
"""

"""
# FURTHER QUESTIONS TO EXPLORE:
# 1. Which category types best represent the medical diagnosis and the common underlying factors in ent_df and rel_df?
# 2. Can we derive the primary diagnosis by choosing the most common Reason?
# 3. Can we derive the primary diagnosis by choosing the Reason that is most closely related to most frequently occuring drug?
# 4. Can we derive the primary diagnosis / underlying factors by choosing the Reason that occurs the most in the 'Discharge Diagnosis', 'Chief Complaint', or 'Hisotry of Present Illness' sections?
"""

           category                 example
Drug          15192  fluticasone-salmeterol
Strength       6465                   10 mg
Form           6321                  tablet
Route          5449         area of redness
Frequency      4771              once a day
Dosage         3926               one ( 1 )
Reason         3575          blood pressure
ADE             892          bm suppression
Duration        538                 24 hour
                category
Strength-Drug       6702
Form-Drug           6654
Frequency-Drug      6310
Route-Drug          5538
Reason-Drug         5169
Dosage-Drug         4225
ADE-Drug            1107
Duration-Drug        643


"\n# FURTHER QUESTIONS TO EXPLORE:\n# 1. Which category types best represent the medical diagnosis and the common underlying factors in ent_df and rel_df?\n# 2. Can we derive the primary diagnosis by choosing the most common Reason?\n# 3. Can we derive the primary diagnosis by choosing the Reason that is most closely related to most frequently occuring drug?\n# 4. Can we derive the primary diagnosis / underlying factors by choosing the Reason that occurs the most in the 'Discharge Diagnosis', 'Chief Complaint', or 'Hisotry of Present Illness' sections?\n"

In [220]:
"""
# QUESTION: Which category types best represent the medical diagnosis and the common underlying factors in ent_df and rel_df?
"""

# Create a list of the categories in ent_df_unique, print the top 10 most occuring values in the 'text' for each category and count the number of unique values
print("Entities:\n")
ent_df_unique_list = ent_df_unique.index.tolist()
for category in ent_df_unique_list:
    print(f"Category: {category}")
    print(ent_df[ent_df['category'] == category]['text'].value_counts().head(10))
    print(f"Number of unique values: {ent_df[ent_df['category'] == category]['text'].nunique()}")
    print("\n")

# Create a list of the categories in rel_df_unique, print the top 10 most occuring 'entity1_text' and 'entity2_text' combinations seperated by a '-' and count the number of unique values
print("Relationships:\n")
rel_df_unique_list = rel_df_unique.index.tolist()
for category in rel_df_unique_list:
    print(f"Category: {category}")
    print(rel_df[rel_df['category'] == category]['entity1_text'].str.cat(rel_df[rel_df['category'] == category]['entity2_text'], sep='-').value_counts().head(10))
    print(f"Number of unique values: {rel_df[rel_df['category'] == category]['entity1_text'].str.cat(rel_df[rel_df['category'] == category]['entity2_text'], sep='-').nunique()}")
    print("\n")

"""
# LEARNING:
# 1. 'Reason' category for entities, as Dataset Overview PDF suggests, seems to be the best category to represent the medical diagnosis based on its categories
# 2. 'Reason-Drug' category for relationships seems to be the best category to represent the common underlying factors based on its categories
"""

"""
# HYPOTHESIS:
# 1. The most common 'Reason' category for entities in a document in the 'Discharge Diagnosis' section is the primary diagnosis
# 2. The most prevalent underlying factors are the most common reasons from the most commonly occuring 'Reason-Drug' relationship. This is because the doctor likely is using drugs to treat the most common underlying factors.
"""

"""
# FURTHER QUESTIONS TO EXPLORE:
# 1. Can we derive the primary diagnosis by choosing the most common Reason? Is it more accurate to choose the most common Reason in the 'Discharge Diagnosis' section?
# 2. Can we derive the primary diagnosis by choosing the Reason that is most closely related to most frequently occuring drug?
"""

Entities:

Category: Drug
coumadin      370
vancomycin    265
aspirin       255
lasix         248
antibiotic    222
prednisone    217
heparin       204
insulin       187
lisinopril    176
metoprolol    139
Name: text, dtype: int64
Number of unique values: 1874


Category: Strength
20 mg     323
10 mg     284
100 mg    283
40 mg     256
5 mg      256
25 mg     205
500 mg    197
50 mg     167
325 mg    160
1 mg      132
Name: text, dtype: int64
Number of unique values: 716


Category: Form


tablet                                3362
capsule                                529
solution                               192
tablet , delayed release ( e.c . )     148
tab                                    118
tablet , chewable                      105
puff                                    91
tablet ( s )                            73
appl                                    70
injection                               62
Name: text, dtype: int64
Number of unique values: 207


Category: Route
po              3294
iv               526
by mouth         206
inhalation       185
drip             117
gtt               94
oral              93
subcutaneous      88
topical           83
intravenous       79
Name: text, dtype: int64
Number of unique values: 143


Category: Frequency
daily              1209
daily ( daily )     306
bid                 258
once a day          235
prn                 187
tid                 176
qd                  146
twice a day         130
qhs                 

"\n# FURTHER QUESTIONS TO EXPLORE:\n# 1. Can we derive the primary diagnosis by choosing the most common Reason? Is it more accurate to choose the most common Reason in the 'Discharge Diagnosis' section?\n# 2. Can we derive the primary diagnosis by choosing the Reason that is most closely related to most frequently occuring drug?\n"

In [223]:
"""
# QUESTION: Can we derive the primary diagnosis by choosing the most common Reason? Is it more accurate to choose the most common Reason in the 'Discharge Diagnosis' section?
"""

# Check that every document has a 'Discharge Diagnosis' section
print(txt_df['text'][txt_df['text'].str.contains('diagnosis:')].count() / txt_df.shape[0])
# Check that every document has a 'Chief Complaint' section
print(txt_df['text'][txt_df['text'].str.contains('complaint:')].count() / txt_df.shape[0])
# Check that every document has a 'History of Present Illness' section
print(txt_df['text'][txt_df['text'].str.contains('illness:')].count()/ txt_df.shape[0])

# TODO: Figure out how to fuzzy match discharge diagonosis

# Use file_idx 100035 as an example. Create a txt_df_subset with only file_idx 100035
txt_df_subset = txt_df[txt_df['file_idx'] == '100035']
ent_df_subset = ent_df[ent_df['file_idx'] == '100035']
rel_df_subset = rel_df[rel_df['file_idx'] == '100035']

# Create a list of the categories in ent_df_unique, print the top 10 most occuring values that have category 'Reason'
print("Entities:\n")
ent_df_unique_list = ent_df_unique.index.tolist()
for category in ent_df_unique_list:
    if category == "Reason":
        print(f"Category: {category}")
        print(ent_df_subset[ent_df_subset['category'] == category]['text'].value_counts().head(30))
        print(f"Number of unique values: {ent_df_subset[ent_df_subset['category'] == category]['text'].nunique()}")
        print("\n")

# Print the text in text_df_subset that is in the 'Discharge Diagnosis' section
print("Discharge Diagnosis:\n")
print(txt_df_subset['text'].values[0][txt_df_subset['DD_Range'].values[0][0]:txt_df_subset['DD_Range'].values[0][1]])
# Print DD_Range
print(txt_df_subset['DD_Range'].values[0])

# Print the top 10 most occuring values that have category 'Reason', where the start_idx greater than start_idx of 'Discharge Diagnosis' section and less than end_idx of 'Discharge Diagnosis' section
print("Entities:\n")
ent_df_unique_list = ent_df_unique.index.tolist()
for category in ent_df_unique_list:
    if category == "Reason":
        print(f"Category: {category}")
        print(ent_df_subset[(ent_df_subset['category'] == category) & (ent_df_subset['start_idx'] > txt_df_subset['DD_Range'].values[0][0]) & (ent_df_subset['start_idx'] < txt_df_subset['DD_Range'].values[0][1])]['text'].value_counts().head(10))
        print(f"Number of unique values: {ent_df_subset[(ent_df_subset['category'] == category) & (ent_df_subset['start_idx'] > txt_df_subset['DD_Range'].values[0][0]) & (ent_df_subset['start_idx'] < txt_df_subset['DD_Range'].values[0][1])]['text'].nunique()}")
        print("\n")


0.9141914191419142
0.9042904290429042
0.9438943894389439
Entities:

Category: Reason
pain                               4
asthma                             3
constipation                       2
seizure                            2
fever                              2
agitation                          2
longer than 5 minute               1
back pain                          1
sob                                1
sleep                              1
recurrent seizure                  1
cap                                1
hypertension                       1
low bps                            1
elevated cr 1.9                    1
diuresed                           1
ventilator associated pneumonia    1
wheezing                           1
aggitated                          1
Name: text, dtype: int64
Number of unique values: 19


Discharge Diagnosis:

discharge diagnosis:
anoxic brain injury s/p pea arrest x2
status asthmaticus
ventilator associated pneumonia
chronic systolic heart fa