## Load in Data
Before running the notebook, set the `PATH_TO_ZIP` variable below to YOUR OWN path to the location of the sampleclinicalnotes.zip data.

In [50]:
from load_data import load_ann, load_txt
import pandas as pd

In [51]:
# Location of the sample clinical notes data; change this to your own path.
PATH_TO_ZIP = "/workspaces/codespaces-jupyter/Project/RawData"
# The loaders are handed the same location with a trailing slash appended.
DATA_PATH = f"{PATH_TO_ZIP}/"
print(f"Full data path: {DATA_PATH}")
# Read the .txt files into txt_df (the loader prints how long the read took).
txt_df = load_txt(DATA_PATH)
# Read the .ann files: load_ann returns two frames, the annotated entities
# (ent_df) and the relations between them (rel_df), and prints its own timing.
ent_df, rel_df = load_ann(DATA_PATH)

Full data path: /workspaces/codespaces-jupyter/Project/RawData/
Time taken to read .txt files: 0.008808374404907227
Time taken to read .ann files and extract all metadata: 0.11787986755371094


In [52]:
# Quick structural overview (shape, columns, first rows) of each loaded frame.
# The label is interpolated into every message so the printed text is the same
# as writing the three print statements out by hand for each frame.
frames_to_inspect = {"txt_df": txt_df, "ent_df": ent_df, "rel_df": rel_df}
for frame_label, frame in frames_to_inspect.items():
    print(f"{frame_label} shape:", frame.shape)
    print(f"{frame_label} columns:", frame.columns)
    print(f"{frame_label} head:\n", frame.head())

txt_df shape: (303, 2)
txt_df columns: Index(['file_idx', 'text'], dtype='object')
txt_df head:
   file_idx                                               text
0   109450  Admission Date:  [**2121-8-7**]              D...
1   103677  Admission Date:  [**2128-12-3**]              ...
2   113824  Admission Date:  [**2200-6-14**]              ...
3   113524  Admission Date:  [**2124-1-14**]              ...
4   115244  Admission Date:  [**2168-4-18**]     Discharge...
ent_df shape: (50951, 6)
ent_df columns: Index(['file_idx', 'entity_id', 'category', 'start_idx', 'end_idx', 'text'], dtype='object')
ent_df head:
   file_idx entity_id  category start_idx end_idx             text
0   120253        T1      Drug     10002   10015  Calcipotriene\n
1   120253        T2  Strength     10016   10021         0.005 \n
2   120253        T3      Form     10024   10029          Cream\n
3   120253        T4    Dosage     10035   10042        One (1)\n
4   120253        T5      Form     10043   10047     

In [53]:
"""
DATA CLEANING
"""

"""
1. Convert 'text' columns to lowercase, in order to facilitate comparison.
"""
# To lowercase 'text' column in ent_df
ent_df['text'] = ent_df['text'].str.lower()
ent_df['text'] = ent_df['text'].str.strip()

# To lowercase 'text' column in txt_df
txt_df['text'] = txt_df['text'].str.lower()

"""
2. Remove \n ending from 'text' column in ent_df and in 'entity2' column in rel_df.
"""
ent_df['text'] = ent_df['text'].str.rstrip('\n')
rel_df['entity2'] = rel_df['entity2'].str.rstrip('\n')

In [54]:
"""
FEATURE ENGINEERING
"""

"""
1. Join the appropriate entity1 and entity2 for each relation in rel_df.
"""
# Remove first 5 letters from 'entity1' and 'entity2' column in rel_df
rel_df['entity1'] = rel_df['entity1'].str[5:]
rel_df['entity2'] = rel_df['entity2'].str[5:]
rel_df = rel_df.merge(ent_df[['entity_id', 'text', 'file_idx']], how='left', left_on=['entity1', 'file_idx'], right_on=['entity_id', 'file_idx'])
rel_df.rename(columns={'text': 'entity1_text'}, inplace=True)
rel_df = rel_df.merge(ent_df[['entity_id', 'text', 'file_idx']], how='left', left_on=['entity2', 'file_idx'], right_on=['entity_id', 'file_idx'])
rel_df.rename(columns={'text': 'entity2_text'}, inplace=True)

In [55]:
"""
# QUESTION: What are the categories of entities and relationships? Do they encapsulate primary medical diagnoses and common unerlying factors?"
"""
# Store unique values in 'category' in ent_df and the count of each, and an example of the category in a DF
ent_df_unique = pd.DataFrame(ent_df['category'].value_counts())
# Get an example of each category and add it as a column to ent_df_unique
ent_df_unique['example'] = ent_df.groupby('category')['text'].apply(lambda x: x.sample(1).values[0])
print(ent_df_unique.head(30))


# Store unique values in 'category' in rel_df and the count of each in a DF
rel_df_unique = pd.DataFrame(rel_df['category'].value_counts())
print(rel_df_unique.head(30))

"""
# LEARNING: 
# 1. The categories of entities and relationships are not mutually exclusive. For example, a patient can have both a primary diagnosis and a secondary diagnosis.
# 2. All relationships tie entities to drugs
"""

"""
# FURTHER QUESTIONS TO EXPLORE:
# 1. Which category types best represent the medical diagnosis and the common underlying factors in ent_df and rel_df?
# 2. Can we derive the primary diagnosis by choosing the most common Reason?
# 3. Can we derive the primary diagnosis by choosing the Reason that is most closely related to most frequently occuring drug?
# 4. Can we derive the primary diagnosis / underlying factors by choosing the Reason that occurs the most in the 'Discharge Diagnosis', 'Chief Complaint', or 'Hisotry of Present Illness' sections?
"""

           category              example
Drug          16225           omeprazole
Strength       6691                 40mg
Form           6651    tablet , chewable
Frequency      6281                daily
Route          5476                   iv
Dosage         4221                taper
Reason         3855             pruritis
ADE             959  creatinine increase
Duration        592            for 2 yrs
                category
Strength-Drug       6702
Form-Drug           6654
Frequency-Drug      6310
Route-Drug          5538
Reason-Drug         5169
Dosage-Drug         4225
ADE-Drug            1107
Duration-Drug        643


"\n# FURTHER QUESTIONS TO EXPLORE:\n# 1. Which category types best represent the medical diagnosis and the common underlying factors in ent_df and rel_df?\n# 2. Can we derive the primary diagnosis by choosing the most common Reason?\n# 3. Can we derive the primary diagnosis by choosing the Reason that is most closely related to most frequently occuring drug?\n# 4. Can we derive the primary diagnosis / underlying factors by choosing the Reason that occurs the most in the 'Discharge Diagnosis', 'Chief Complaint', or 'Hisotry of Present Illness' sections?\n"

In [57]:
"""
# QUESTION: Which category types best represent the medical diagnosis and the common underlying factors in ent_df and rel_df?
"""

# Create a list of the categories in ent_df_unique, print the top 10 most occuring values in the 'text' for each category and count the number of unique values
print("Entities:\n")
ent_df_unique_list = ent_df_unique.index.tolist()
for category in ent_df_unique_list:
    print(f"Category: {category}")
    print(ent_df[ent_df['category'] == category]['text'].value_counts().head(10))
    print(f"Number of unique values: {ent_df[ent_df['category'] == category]['text'].nunique()}")
    print("\n")

# Create a list of the categories in rel_df_unique, print the top 10 most occuring 'entity1_text' and 'entity2_text' combinations seperated by a '-' and count the number of unique values
print("Relationships:\n")
rel_df_unique_list = rel_df_unique.index.tolist()
for category in rel_df_unique_list:
    print(f"Category: {category}")
    print(rel_df[rel_df['category'] == category]['entity1_text'].str.cat(rel_df[rel_df['category'] == category]['entity2_text'], sep='-').value_counts().head(10))
    print(f"Number of unique values: {rel_df[rel_df['category'] == category]['entity1_text'].str.cat(rel_df[rel_df['category'] == category]['entity2_text'], sep='-').nunique()}")
    print("\n")

"""
# LEARNING:
# 1. 'Reason' category for entities, as Dataset Overview PDF suggests, seems to be the best category to represent the medical diagnosis based on its categories
# 2. 'Reason-Drug' category for relationships seems to be the best category to represent the common underlying factors based on its categories
"""

"""
# HYPOTHESIS:
# 1. The most common 'Reason' category for entities in a document in the 'Discharge Diagnosis' section is the primary diagnosis
# 2. The most prevalent underlying factors are the most common reasons from the most commonly occuring 'Reason-Drug' relationship. This is because the doctor likely is using drugs to treat the most common underlying factors.
"""

"""
# FURTHER QUESTIONS TO EXPLORE:
# 1. Can we derive the primary diagnosis by choosing the most common Reason? Is it more accurate to choose the most common Reason in the 'Discharge Diagnosis' section?
# 2. Can we derive the primary diagnosis by choosing the Reason that is most closely related to most frequently occuring drug?
"""

Entities:

Category: Drug
coumadin       370
vancomycin     265
aspirin        255
lasix          248
prednisone     217
heparin        204
antibiotics    190
insulin        187
lisinopril     176
metoprolol     139
Name: text, dtype: int64
Number of unique values: 2220


Category: Strength
20 mg     325
100 mg    284
10 mg     284
40 mg     257
5 mg      256
25 mg     205
500 mg    199
50 mg     167
325 mg    160
1 mg      132
Name: text, dtype: int64
Number of unique values: 832


Category: Form
tablet                            3195
capsule                            524
solution                           192
tablet, delayed release (e.c.)     168
tablets                            167
tablet, chewable                   118
tab                                 84
tablet(s)                           73
appl                                70
capsule, delayed release(e.c.)      63
Name: text, dtype: int64
Number of unique values: 264


Category: Frequency
daily                          