In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=False)

Mounted at /content/drive/


In [None]:
%cd /content/drive/MyDrive/CS598-DL-Healthcare

/content/drive/MyDrive/CS598-DL-Healthcare


In [None]:
'''
FINAL DATA PROCESSING FUNCTION

We use the clinical notes of a patient from their admission until one day before the patient’s death.
Therefore, the patients who have only stayed one day are filtered out, because all of their notes are from the date
of death or discharge.

We remove the clinical notes from the day of discharge for =0 + as well as discharge summaries
'''
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

# 1) Load tables
adm = pd.read_csv(
    "./ADMISSIONS.csv",
    parse_dates=["ADMITTIME","DISCHTIME","DEATHTIME"]
)
notes = pd.read_csv(
    "./NOTEEVENTS.csv",
    parse_dates=['CHARTDATE','CHARTTIME'],
    dtype={"ROW_ID":int, "SUBJECT_ID":int},
    infer_datetime_format=True
).rename(columns = {'ROW_ID':'NOTE_ID'})



notes = notes[notes['HADM_ID'].notnull()]
notes['HADM_ID'] = notes['HADM_ID'].astype(int)
notes = notes[notes["CATEGORY"] != "Discharge summary"]

# fill missing CHARTTIME with end of CHARTDATE
end_of_day = notes.CHARTDATE.dt.normalize() + pd.Timedelta(hours=23,minutes=59,seconds=59)
notes['CHARTTIME'] = notes.CHARTTIME.fillna(end_of_day)
notes = notes.sort_values(by=['SUBJECT_ID','HADM_ID','CHARTDATE'])

df_adm_notes = pd.merge(adm[['SUBJECT_ID','HADM_ID','ADMITTIME','DISCHTIME','HOSPITAL_EXPIRE_FLAG']],
                        notes[['NOTE_ID','SUBJECT_ID','HADM_ID','CHARTDATE','CHARTTIME','TEXT']],
                        on = ['SUBJECT_ID','HADM_ID'],
                        how = 'left')

df_adm_notes.DISCHTIME = df_adm_notes.DISCHTIME.apply(lambda x: str(x).split(' ')[0])
df_adm_notes['DISCHTIME'] = pd.to_datetime(df_adm_notes.DISCHTIME, format = '%Y-%m-%d', errors = 'coerce')
df_adm_notes['CHARTDATE'] = pd.to_datetime(df_adm_notes.CHARTDATE, format = '%Y-%m-%d', errors = 'coerce')
df_adm_notes = df_adm_notes[((df_adm_notes.DISCHTIME - df_adm_notes.CHARTTIME).dt.total_seconds() / (86400)) > 0]

df_adm_notes.rename(columns={
        "NOTE_ID": "Note_ID",
        "HADM_ID": "Adm_ID",
        'HOSPITAL_EXPIRE_FLAG': 'Label',
        'CHARTDATE': 'chartdate',
        'CHARTTIME': 'charttime'
    }, inplace=True)

desired_order = ['Adm_ID', 'Note_ID', 'TEXT', 'Label', 'chartdate', 'charttime']
df_adm_notes["Note_ID"] = df_adm_notes["Note_ID"].astype(int)
df_adm_notes[desired_order].to_csv('./mortality.csv', index=False)

  notes = pd.read_csv(
  notes = pd.read_csv(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['HADM_ID'] = notes['HADM_ID'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['CHARTTIME'] = notes.CHARTTIME.fillna(end_of_day)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_adm_notes.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

In [None]:
# Bert-Base-Uncased Performance
Test Patient Level Accuracy: 0.7893738140417458
Test Patient Level F1 Score: 0.7819253438113949
Test Patient Level Precision: 0.8105906313645621
Test Patient Level Recall: 0.7552182163187856
Test Patient Level AUC: 0.8687209473983633
Test Patient Level Matthew's correlation coefficient: 0.580102712061101
Test Patient Level AUPR: 0.8786625176824754
All Finished!

# Bert-Large-Uncased Performance
Test Patient Level Accuracy: 0.5455407969639469
Test Patient Level F1 Score: 0.5838401390095569
Test Patient Level Precision: 0.5384615384615384
Test Patient Level Recall: 0.6375711574952562
Test Patient Level AUC: 0.5744718772616472
Test Patient Level Matthew's correlation coefficient: 0.09266478335988364
Test Patient Level AUPR: 0.5758099861445012
All Finished!

# Clinical Bert
Test Patient Level Accuracy: 0.793168880455408
Test Patient Level F1 Score: 0.781563126252505
Test Patient Level Precision: 0.8280254777070064
Test Patient Level Recall: 0.7400379506641366
Test Patient Level AUC: 0.8816760223095176
Test Patient Level Matthew's correlation coefficient: 0.5896764022644368
Test Patient Level AUPR: 0.8873331184376569
All Finished!

# LSTM
Test Patient Level Accuracy: 0.8889943074003795
Test Patient Level F1 Score: 0.8860759493670886
Test Patient Level Precision: 0.91
Test Patient Level Recall: 0.8633776091081594
Test Patient Level AUC: 0.9566375855600243
Test Patient Level Matthew's correlation coefficient: 0.7790116852629192
Test Patient Level AUPR: 0.9586082350350303
All Finished