In [None]:
import os
from datetime import date

import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import remove_stopwords

Specify the MIMIC-III data folder

In [None]:
# Directory holding the MIMIC-III CSV files read by the cells below
# (a relative path, so it is resolved against the current working directory).
DATA_FOLDER = 'mimic-iii-clinical-database-1.4'

Read in the admission info, the diagnoses with ICD-9 codes, and the clinical notes CSV files

In [None]:
# Resolve the three table paths first, then load each table into its own frame.
admissions_path = f'{DATA_FOLDER}/ADMISSIONS.csv'
diagnoses_path = f'{DATA_FOLDER}/DIAGNOSES_ICD.csv'
notes_path = f'{DATA_FOLDER}/NOTEEVENTS.csv'

admissions = pd.read_csv(admissions_path)
diagnoses_ICD = pd.read_csv(diagnoses_path)
note_events = pd.read_csv(notes_path)

Drop "ROW_ID" column for each CSV file

In [None]:
# Strip the ROW_ID column from every table in one pass.
# NOTE(review): the first target is spelled `addmissions`, so `admissions`
# itself still carries ROW_ID after this cell. The later cell that merges
# admissions with diagnoses drops ROW_ID from the merged frame and therefore
# depends on it still being present - fix both places together if renaming.
addmissions, diagnoses_ICD, note_events = (
    table.drop(columns=["ROW_ID"])
    for table in (admissions, diagnoses_ICD, note_events)
)

Drop rows with NaN values in the clinical notes, and change the dtype of the admission ID from float to int

In [None]:
# A note is only usable when it has a patient ID, an admission ID and a body.
required_columns = ['SUBJECT_ID', 'HADM_ID', 'TEXT']
note_events = (
    note_events
    .dropna(subset=required_columns)
    # With the NaN rows gone the admission ID can be cast to int.
    .astype({'HADM_ID': int})
)

For admissions that have multiple discharge summary notes, keep only the longest discharge summary note

In [None]:
def keep_longest_note(row):
    """Return the longest note text in a group of notes.

    Parameters
    ----------
    row : pandas.Series
        The note texts (strings) belonging to one admission.

    Returns
    -------
    str
        The text with the greatest length. When several texts tie for the
        maximum length, the first one in `row` is returned.
    """
    # max() with key=len scans the list once and returns the first maximal
    # element, instead of recomputing the maximum length for every element.
    return max(row.tolist(), key=len)

# NOTEEVENTS contains every note category (nursing, radiology, ECG, ...).
# Restrict to discharge summaries first so that the "longest note" kept per
# admission really is a discharge summary, as the rest of the notebook assumes.
# .str.strip() guards against trailing whitespace in the category labels.
discharge_summaries = note_events[
    note_events['CATEGORY'].str.strip() == 'Discharge summary'
]

# One row per admission: HADM_ID plus the longest discharge summary text.
note_events = (
    discharge_summaries
    .groupby('HADM_ID')['TEXT']
    .apply(keep_longest_note)
    .reset_index()
)

Merge the admission info and the diagnoses' ICD-9 codes on the unique patient ID and admission ID

In [None]:
# Attach every diagnosis code to its admission (inner join on the
# admission ID / patient ID pair).
admissions_ICD = pd.merge(admissions, diagnoses_ICD, on=["HADM_ID", "SUBJECT_ID"])

# Remove any surviving ROW_ID bookkeeping column. Depending on which input
# frames still carry ROW_ID, the merge yields "ROW_ID", the suffixed pair
# "ROW_ID_x"/"ROW_ID_y", or none at all; errors="ignore" makes the drop work
# in every case instead of raising KeyError when the column is absent.
admissions_ICD = admissions_ICD.drop(
    columns=["ROW_ID", "ROW_ID_x", "ROW_ID_y"], errors="ignore"
)

Specify ICD9 codes related to heart failure, and filter admissions to keep only heart failure admissions

In [None]:
# ICD-9 codes treated as heart failure, grouped by code family.
hf_ICD = [
    '39891',
    '40201', '40211', '40291',
    '40401', '40403', '40411', '40413', '40491', '40493',
    '4280', '4281',
    '42820', '42821', '42822', '42823',
    '42830', '42831', '42832', '42833',
    '42840', '42841', '42842', '42843',
    '4289',
]

# Keep only the admission/diagnosis rows whose code is in the list above.
is_heart_failure = admissions_ICD['ICD9_CODE'].isin(hf_ICD)
hf_admissions = admissions_ICD[is_heart_failure]

Get a list of the admission IDs that have a discharge summary note

In [None]:
# Admission IDs (as plain ints) for which a note is available.
note_admission_ids = note_events['HADM_ID'].astype(int)
hf_admission_with_note = note_admission_ids.tolist()

Filter admissions to keep only admissions with a discharge summary note

In [None]:
# Boolean mask: True where the admission has a note on file.
has_note = hf_admissions['HADM_ID'].isin(hf_admission_with_note)
hf_admissions = hf_admissions.loc[has_note]

Merge admission with notes, and keep only 4 useful columns: patient ID, admission ID, admission time, discharge summary notes

In [None]:
# Join each heart-failure admission to its note, then keep an independent
# copy of just the four columns used downstream.
kept_columns = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'TEXT']
cleaned = hf_admissions.merge(note_events, on=['HADM_ID'])
cleaned = cleaned.loc[:, kept_columns].copy()

Remove duplicate rows with same admission ID

In [None]:
# An admission can match several heart-failure codes; keep its first row only.
cleaned = cleaned.drop_duplicates(subset=['HADM_ID'], keep='first')

Create Python Dict to map from patient ID to admission ID, admission ID to admission time

In [None]:
# patient_to_admission: SUBJECT_ID -> list of that patient's HADM_IDs
# admission_to_time:    HADM_ID    -> admission date (datetime.date)
patient_to_admission = {}
admission_to_time = {}

# Use dedicated names for the column lists: calling them `admissions` / `time`
# would overwrite the `admissions` DataFrame loaded earlier and rebind `time`
# while it is being iterated, which breaks re-running earlier cells.
patient_ids = cleaned['SUBJECT_ID'].to_list()
admission_ids = cleaned['HADM_ID'].to_list()
admit_times = cleaned['ADMITTIME'].to_list()

for patient_id, hadm_id, admit_time in zip(patient_ids, admission_ids, admit_times):
    patient_to_admission.setdefault(patient_id, []).append(hadm_id)
    # ADMITTIME is a string whose first 10 characters are parsed as
    # YYYY-MM-DD; only the calendar date is kept, the time of day is dropped.
    year, month, day = admit_time[:10].split('-')
    admission_to_time[hadm_id] = date(int(year), int(month), int(day))

Convert Dataframe to Python Dict

In [None]:
# {HADM_ID: {column name: value, ...}} - one inner dict per admission.
cleaned_by_admission = cleaned.set_index('HADM_ID')
data = cleaned_by_admission.to_dict(orient='index')

Assign the 30-day readmission label and the general readmission label based on admission time

In [None]:
# Label every admission by looking at the same patient's other admissions.
for hadm_id, record in data.items():
    admitted_on = admission_to_time[hadm_id]
    # Day gaps from this admission to each admission of the same patient
    # (positive = the other admission happened later).
    day_gaps = [
        (admission_to_time[other_id] - admitted_on).days
        for other_id in patient_to_admission[record['SUBJECT_ID']]
    ]
    # 1 when some later admission starts within 30 days, else 0.
    record['30_DAY_READMISSION'] = int(any(0 < gap <= 30 for gap in day_gaps))
    # 1 when any later admission exists at all, else 0.
    record['GENERAL_READMISSION'] = int(any(gap > 0 for gap in day_gaps))

Convert Python Dict to Dataframe

In [None]:
# Rebuild a frame with one row per admission; reset_index() moves the
# admission-ID keys out of the index into an ordinary column.
labelled = pd.DataFrame.from_dict(data, orient='index')
data = labelled.reset_index()

Rename the index column

In [None]:
# The former index column is named "index"; give it its real name back.
data = data.rename({"index": "HADM_ID"}, axis="columns")

Extract the positive examples and sample the same number of negative examples from the dataset

In [None]:
# Fixed seed so the sampled negatives - and therefore the saved dataset -
# are identical on every Restart & Run All.
RANDOM_SEED = 42

# Positive examples for each label.
postive = data[data['GENERAL_READMISSION'] == 1]
postive_30 = data[data['30_DAY_READMISSION'] == 1]

# Down-sample the negatives to the size of the matching positive set.
negative = data[data['GENERAL_READMISSION'] == 0].sample(
    len(postive), random_state=RANDOM_SEED
)
negative_30 = data[data['30_DAY_READMISSION'] == 0].sample(
    len(postive_30), random_state=RANDOM_SEED
)

Concatenate the positive and negative examples to form the datasets for general readmission and 30-day readmission

In [None]:
# Stack positives on top of negatives (row-wise) for each label.
general_parts = [postive, negative]
thirty_day_parts = [postive_30, negative_30]

data = pd.concat(general_parts, axis=0)
data_30 = pd.concat(thirty_day_parts, axis=0)

Convert Dataframe to Python List and get text and labels for general readmission and 30 days readmission

In [None]:
# Note texts and labels for the general-readmission dataset ...
x, y = data['TEXT'].tolist(), data['GENERAL_READMISSION'].tolist()
# ... and for the 30-day-readmission dataset.
x_30, y_30 = data_30['TEXT'].tolist(), data_30['30_DAY_READMISSION'].tolist()

Remove stop words from the text using Gensim

In [None]:
# Apply gensim's stop-word filter to every note of both datasets.
x = list(map(remove_stopwords, x))
x_30 = list(map(remove_stopwords, x_30))

Save the text and labels to text files

In [None]:
OUTPUT_DIR = 'CS598_DATA'


def _write_lines(path, items):
    """Write every element of `items` to `path`, one `str(item)` per line.

    The file is opened in write mode (an existing file is overwritten) with
    an explicit UTF-8 encoding so the output does not depend on the
    platform's default encoding.
    """
    with open(path, 'w', encoding='utf-8') as fd:
        fd.writelines(f'{item}\n' for item in items)


# Create the output folder on a fresh checkout instead of failing in open().
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One file per list; the loop variables are named so that they do not
# overwrite the `data` DataFrame defined earlier in the notebook.
for file_name, items in (
    ('x.txt', x),
    ('x_30.txt', x_30),
    ('y.txt', y),
    ('y_30.txt', y_30),
):
    _write_lines(f'{OUTPUT_DIR}/{file_name}', items)