# Extraction Module

The objective of this module is to extract embeddings from multimodal data, including:

- Medical images in *DICOM* format.
- Text notes.
- Time series events.

In this notebook, we showcase the functionality of the extraction module by generating note and time series embeddings from CSV files of the MIMIC dataset only (no images are processed here). Our work is inspired by the HAIM study.

The following CSV files are required for this notebook and should be placed under the *csv/original_data/* folder:

- For patient linkage:
    - *admissions.csv*
- For note embeddings extraction:
    - *radiology.csv*
- For time series embeddings extraction:
    - *chartevents.csv*
    - *d_items.csv*
    - *d_labitems.csv*
    - *labevents.csv*
    - *procedureevents.csv*

To limit computational time, we have chosen to demonstrate our work using a subset of only a thousand patients.

In [1]:
# Imports

import dask.dataframe as dd
import datetime
import numpy as np
import os
import pandas as pd

# Move into the project's source folder before importing the project-local modules.
# NOTE(review): presumably this is what lets `extraction` and `patient_list` resolve
# from the current directory — confirm against the repository layout.
os.chdir('../src')
import extraction
from patient_list import PATIENT_LIST

## Get data from original CSV files

In [2]:
# Set the working directory to the CSV folder.
# The path is relative: the import cell left the process in '../src', so '../csv'
# resolves to the 'csv' folder that sits next to 'src'. Every CSV path used in the
# following cells ('original_data/...', 'extracted_features/...') is relative to it.
os.chdir('../csv')

In [3]:
# Load the original MIMIC CSV exports (paths are relative to the current working directory).
# Event tables go through dask; the two dictionary tables are read directly with pandas.
df_admissions = dd.read_csv(
    'original_data/admissions.csv',
    assume_missing=True,
    dtype={
        'admission_location': 'object',
        'deathtime': 'object',
        'edouttime': 'object',
        'edregtime': 'object',
    },
)
df_chart_events = dd.read_csv(
    'original_data/chartevents.csv',
    assume_missing=True,
    low_memory=False,
    dtype={'value': 'object', 'valueuom': 'object'},
)

# Item dictionaries (itemid -> label) for chart/procedure events and for lab events
df_items = pd.read_csv('original_data/d_items.csv')
df_labitems = pd.read_csv('original_data/d_labitems.csv')

df_lab_events = dd.read_csv(
    'original_data/labevents.csv',
    assume_missing=True,
    low_memory=False,
    dtype={
        'storetime': 'object',
        'value': 'object',
        'valueuom': 'object',
        'flag': 'object',
        'priority': 'object',
        'comments': 'object',
    },
)
df_procedure_events = dd.read_csv(
    'original_data/procedureevents.csv',
    assume_missing=True,
    dtype={
        'value': 'object',
        'secondaryordercategoryname': 'object',
        'totalamountuom': 'object',
    },
)

# The radiology notes are first read entirely with pandas, then wrapped in a
# dask DataFrame using chunksize=8
df_rad_notes = dd.from_pandas(
    pd.read_csv(
        'original_data/radiology.csv',
        dtype={'charttime': 'object', 'storetime': 'object', 'text': 'object'},
    ),
    chunksize=8,
)

In [4]:
# This cell only exists to cut down computation time; it is not a demonstration
# of the Extraction module functionalities.

# Restrict every patient-level table to the subjects listed in PATIENT_LIST
(
    df_admissions,
    df_chart_events,
    df_lab_events,
    df_procedure_events,
    df_rad_notes,
) = (
    extraction.filter_dataframe_by_patient(frame, 'subject_id', PATIENT_LIST)
    for frame in (
        df_admissions,
        df_chart_events,
        df_lab_events,
        df_procedure_events,
        df_rad_notes,
    )
)

## Prepare data

In [5]:
# Convert the time column of each table using the same timestamp format.
# The helper's return value is not used here, exactly as in the individual calls
# this loop replaces.
for frame, time_column in (
    (df_admissions, 'admittime'),
    (df_chart_events, 'charttime'),
    (df_lab_events, 'charttime'),
    (df_procedure_events, 'starttime'),
    (df_rad_notes, 'charttime'),
):
    extraction.convert_df_time_column_to_format(frame, time_column, format="%Y-%m-%d %H:%M:%S")

In [6]:
# Keep only the columns used in the rest of the notebook and evaluate each
# dask DataFrame with .compute().
# Admissions are additionally ordered by patient, then by admission time.
df_admissions = (
    df_admissions[['subject_id', 'admittime']]
    .compute()
    .sort_values(by=['subject_id', 'admittime'])
)
df_chart_events = (
    df_chart_events[['subject_id', 'charttime', 'itemid', 'valuenum']]
    .compute()
)
df_lab_events = (
    df_lab_events[['subject_id', 'charttime', 'itemid', 'valuenum']]
    .compute()
)
df_procedure_events = (
    df_procedure_events[['subject_id', 'starttime', 'itemid', 'value']]
    .compute()
)
df_rad_notes = (
    df_rad_notes[['subject_id', 'charttime', 'text']]
    .compute()
)

[########################################] | 100% Completed | 731.25 ms
[########################################] | 100% Completed | 160.89 s
[########################################] | 100% Completed | 78.54 s
[########################################] | 100% Completed | 957.37 ms
[########################################] | 100% Completed | 544.38 s


## Time Series embeddings extraction

In [7]:
# Labels of the events for which time series embeddings are generated.
# Each list is passed as-is to extraction.generate_ts_embeddings.

# Chart events
CHARTEVENTS = [
    'Heart Rate',
    'Non Invasive Blood Pressure systolic',
    'Non Invasive Blood Pressure diastolic',
    'Non Invasive Blood Pressure mean',
    'Respiratory Rate',
    'O2 saturation pulseoxymetry',
    'GCS - Verbal Response',
    'GCS - Eye Opening',
    'GCS - Motor Response',
]

# Lab events
LABEVENTS = [
    'Glucose',
    'Potassium',
    'Sodium',
    'Chloride',
    'Creatinine',
    'Urea Nitrogen',
    'Bicarbonate',
    'Anion Gap',
    'Hemoglobin',
    'Hematocrit',
    'Magnesium',
    'Platelet Count',
    'Phosphate',
    'White Blood Cells',
    'Calcium, Total',
    'MCH',
    'Red Blood Cells',
    'MCHC',
    'MCV',
    'RDW',
    'Neutrophils',
    'Vancomycin',
]

# Procedure events
PROCEDUREEVENTS = [
    'Foley Catheter',
    'PICC Line',
    'Intubation',
    'Peritoneal Dialysis',
    'Bronchoscopy',
    'EEG',
    'Dialysis - CRRT',
    'Dialysis Catheter',
    'Chest Tube Removed',
    'Hemodialysis',
]

In [8]:
# Time series embeddings extraction, one call per event table.
# Every call passes, in order: the event table, the patient id column, the list of
# event labels, the time column, the item id column, the value column, a name for
# the event type, a one-day timedelta, and the item dictionary with its id and
# label columns (see extraction.generate_ts_embeddings for the exact semantics).
df_chart_events_embeddings = extraction.generate_ts_embeddings(
    df_chart_events,
    'subject_id',
    CHARTEVENTS,
    'charttime',
    'itemid',
    'valuenum',
    'chartevent',
    datetime.timedelta(days=1),
    df_items,
    'itemid',
    'label',
)
df_lab_events_embeddings = extraction.generate_ts_embeddings(
    df_lab_events,
    'subject_id',
    LABEVENTS,
    'charttime',
    'itemid',
    'valuenum',
    'labevent',
    datetime.timedelta(days=1),
    df_labitems,
    'itemid',
    'label',
)
df_procedure_events_embeddings = extraction.generate_ts_embeddings(
    df_procedure_events,
    'subject_id',
    PROCEDUREEVENTS,
    'starttime',
    'itemid',
    'value',
    'procedureevent',
    datetime.timedelta(days=1),
    df_items,
    'itemid',
    'label',
)

In [9]:
# Save generated time series embeddings as CSV files (without the index column).
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output folder exists first; exist_ok=True keeps the cell safe to re-run.
os.makedirs('extracted_features', exist_ok=True)
df_chart_events_embeddings.to_csv('extracted_features/chart_events.csv', index=False)
df_lab_events_embeddings.to_csv('extracted_features/lab_events.csv', index=False)
df_procedure_events_embeddings.to_csv('extracted_features/procedure_events.csv', index=False)

## Notes embeddings extraction

In [10]:
# Earliest admission time of every patient, as a Series indexed by subject_id.
# This Series is the reference handed to extraction.generate_note_embeddings in
# the notes embeddings extraction cell.
#
# A single groupby replaces the previous per-patient filtering loop:
#   - rows without a patient id or an admission time are discarded first, so
#     patients with no usable admission time do not appear in the result;
#   - taking the minimum does not depend on how df_admissions is ordered;
#   - the Series keeps the dtype of the 'admittime' column instead of starting
#     from a float NaN placeholder (np.NaN no longer exists in NumPy >= 2.0).
# The Series name and index name are cleared so the result is an unnamed Series,
# like the one built previously.
sr_first_visit = (
    df_admissions
    .dropna(subset=['subject_id', 'admittime'])
    .groupby('subject_id')['admittime']
    .min()
    .rename(None)
    .rename_axis(None)
)

In [11]:
# Notes embeddings extraction from the radiology notes.
# Arguments, in order: the notes table, the patient id column, the time column,
# the text column, the per-patient first admission times, and a one-day timedelta
# (see extraction.generate_note_embeddings for the exact semantics).
df_notes_embeddings = extraction.generate_note_embeddings(
    df_rad_notes,
    'subject_id',
    'charttime',
    'text',
    sr_first_visit,
    datetime.timedelta(days=1),
)

In [12]:
# Save generated note embeddings as a CSV file (without the index column).
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output folder exists first; exist_ok=True keeps the cell safe to re-run.
os.makedirs('extracted_features', exist_ok=True)
df_notes_embeddings.to_csv('extracted_features/rad_notes.csv', index=False)