Read and describe ADMISSIONS file

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the admissions table from the sample-data directory and
# show its column names as the cell output.
adm = pd.read_csv(filepath_or_buffer = '/content/sample_data/ADMISSIONS.csv')
adm.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
       'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION',
       'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA'],
      dtype='object')

In [None]:
# Number of admission rows for each ADMISSION_TYPE value
adm.groupby('ADMISSION_TYPE').size()

ADMISSION_TYPE
ELECTIVE      7706
EMERGENCY    42071
NEWBORN       7863
URGENT        1336
dtype: int64

In [None]:
# Parse the timestamp columns into datetime64 so they can be subtracted later.
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising.
for col in ['ADMITTIME', 'DISCHTIME', 'DEATHTIME']:
    adm[col] = pd.to_datetime(adm[col], format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')

In [None]:
# Order each patient's admissions chronologically so that "the next row within
# a SUBJECT_ID group" means "that patient's next admission".
adm = adm.sort_values(['SUBJECT_ID','ADMITTIME']).reset_index(drop = True)

# Use shift(-1) within each patient to look up the time and type
# (elective, emergency, ...) of the following admission. The patient's last
# admission gets NaT / NaN.
by_subject = adm.groupby('SUBJECT_ID')
adm['NEXT_ADMTIME'] = by_subject.ADMITTIME.shift(-1)
adm['NEXT_ADMTYPE'] = by_subject.ADMISSION_TYPE.shift(-1)

# Blank out follow-up admissions that are ELECTIVE so they are not used as the
# "next admission" for the current row.
rows = adm.NEXT_ADMTYPE == 'ELECTIVE'
adm.loc[rows,'NEXT_ADMTIME'] = pd.NaT
# np.nan (lower case): the np.NaN alias was removed in NumPy 2.0.
adm.loc[rows,'NEXT_ADMTYPE'] = np.nan

# Back-fill the gaps within each patient: a blanked row takes the next later
# non-blank admission of the same patient, if there is one.
# The frame is still sorted by (SUBJECT_ID, ADMITTIME) from above, so no
# re-sort is needed. GroupBy.bfill() replaces the deprecated
# fillna(method='bfill').
next_cols = ['NEXT_ADMTIME','NEXT_ADMTYPE']
adm[next_cols] = adm.groupby('SUBJECT_ID')[next_cols].bfill()

In [None]:
# Days between this admission's discharge and the next admission
# (NaN when either timestamp is missing).
SECONDS_PER_DAY = 24*60*60
gap_to_next = adm['NEXT_ADMTIME'] - adm['DISCHTIME']
adm['NEXT_ADMDAYS'] = gap_to_next.dt.total_seconds() / SECONDS_PER_DAY

Read and describe NOTES file

In [None]:
# Load the clinical notes table from the sample-data directory and
# show its column names as the cell output.
notes = pd.read_csv(filepath_or_buffer = "/content/sample_data/NOTES.csv")
notes.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME',
       'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT'],
      dtype='object')

In [None]:
# Keep only the notes whose CATEGORY is exactly 'Discharge summary'
is_discharge_summary = notes['CATEGORY'].eq('Discharge summary')
summ = notes[is_discharge_summary]

In [None]:
# Keep a single discharge summary per admission: the last row of each
# (SUBJECT_ID, HADM_ID) group, in the order the rows appear in `summ`.
# NOTE(review): in pandas >= 2.0 GroupBy.nth acts as a filter and keeps the
# original row index, so reset_index() then adds an extra 'index' column;
# only SUBJECT_ID, HADM_ID and TEXT are selected from this frame in the merge
# below — confirm nothing else relies on the frame's exact column layout.
last_per_admission = summ.groupby(['SUBJECT_ID','HADM_ID']).nth(-1)
dissumm = last_per_admission.reset_index()

Merge admissions with discharge summaries and prepare the training set

In [None]:
# Attach each admission's discharge-summary text. A left join keeps every
# admission; admissions without a matching summary get a missing TEXT.
adm_cols = ['SUBJECT_ID','HADM_ID','ADMITTIME','DISCHTIME','NEXT_ADMDAYS','NEXT_ADMTIME','ADMISSION_TYPE','DEATHTIME']
note_cols = ['SUBJECT_ID','HADM_ID','TEXT']
adm_notes = adm[adm_cols].merge(
    dissumm[note_cols],
    on = ['SUBJECT_ID','HADM_ID'],
    how = 'left',
)

In [None]:
# Fraction of admissions with no discharge-summary text, per admission type.
# The mean of the boolean "TEXT is null" flag equals null count / group size.
# It is printed explicitly because a bare expression is only rendered when it
# is the last statement of the cell.
missing_text_rate = adm_notes.TEXT.isnull().groupby(adm_notes.ADMISSION_TYPE).mean()
print(missing_text_rate)

# Exclude NEWBORN admissions from the modelling set; .copy() makes the result
# an independent frame so later column assignments do not touch adm_notes.
# NOTE(review): presumably newborns are dropped because of the missing-text
# rate shown above — confirm.
finaladm_notes = adm_notes.loc[adm_notes.ADMISSION_TYPE != 'NEWBORN'].copy()

In [None]:
# LABEL is 1 when the next admission starts less than 30 days after discharge,
# otherwise 0 (rows with a missing NEXT_ADMDAYS compare False and get 0).
readmitted_within_30d = finaladm_notes['NEXT_ADMDAYS'].lt(30)
finaladm_notes['LABEL'] = readmitted_within_30d.astype('int')

In [None]:
# Shuffle every row reproducibly (fixed seed) and renumber the index 0..n-1
finaladm_notes = (
    finaladm_notes
    .sample(frac = 1.0, random_state = 42)
    .reset_index(drop = True)
)

# Hold out 30% of the shuffled rows (intended for validation and test);
# the rows that were not drawn remain in finaldata.
data = finaladm_notes.sample(frac = 0.30, random_state = 42)
held_out = finaladm_notes.index.isin(data.index)
finaldata = finaladm_notes.loc[~held_out]

In [None]:
# Split the remaining rows into positives (LABEL == 1) and everything else
rows_pos = finaldata['LABEL'].eq(1)
data_pos = finaldata[rows_pos]
data_neg = finaldata[~rows_pos]

# Balance the classes: draw as many negatives as there are positives
# (sampling without replacement, fixed seed), then stack the two groups.
neg_subset = data_neg.sample(n = data_pos.shape[0], random_state = 42)
train = pd.concat([data_pos, neg_subset], axis = 0)

# Shuffle the balanced set so the classes are interleaved, and renumber rows
train = train.sample(frac = 1.0, random_state = 42).reset_index(drop = True)


In [None]:
# Write the balanced training set to disk. The row index is written as the
# first (unnamed) column, which is to_csv's default behaviour.
train.to_csv(path_or_buf = '/content/sample_data/TRAIN.csv', index = True)