In [1]:
import ipywidgets as widgets
import sys
from pathlib import Path
import os
import importlib

# Make the project's source folders importable. The entries are relative
# paths, so they resolve against the working directory the kernel runs in.
# Each folder is appended at most once, in the order listed.
for module_path in (
    'preprocessing/day_intervals_preproc',
    'utils',
    'preprocessing',
    'model',
):
    if module_path in sys.path:
        continue
    sys.path.append(module_path)

import day_intervals_cohort_v2
from day_intervals_cohort_v2 import *

import data_generation_icu

import feature_selection_icu
from feature_selection_icu import *

import ml_models
from ml_models import *

import dl_train
from dl_train import *

import tokenization
from tokenization import *

import behrt_train
from behrt_train import *

import evaluation
import fairness
import callibrate_output

## 1. DATA EXTRACTION



In [2]:
# Dataset release directory used by the feature-extraction step.
version_path = "mimiciv/2.0"

# Directory part of the notebook's absolute path. The notebook name is
# relative, so this resolves against the current working directory.
root_dir = os.path.split(os.path.abspath('UserInterface.ipynb'))[0]

In [3]:
# Run cohort extraction for the project rooted at `root_dir`. The returned
# value is kept in `cohort_output` and handed to every later pipeline step
# (feature extraction, feature selection, cleaning and data generation).
cohort_output = day_intervals_cohort_v2.extract_data(root_dir)

EXTRACTING FOR: | ICU | READMISSION DUE TO J44 | 30 | 
[ READMISSION DUE TO J44 ]
Index(['subject_id', 'stay_id', 'hadm_id', 'intime', 'outtime', 'los',
       'min_valid_year', 'dod', 'Age', 'gender', 'race', 'insurance'],
      dtype='object')


100%|██████████████████████████████████████| 5217/5217 [00:15<00:00, 326.99it/s]


[ READMISSION LABELS FINISHED ]
[ COHORT SUCCESSFULLY SAVED ]
[ SUMMARY SUCCESSFULLY SAVED ]
Readmission FOR ICU DATA
# Admission Records: 7346
# Patients: 5217
# Positive cases: 1137
# Negative cases: 6209


## 2. FEATURE SELECTION
All features will be saved in **./data/features/**

**Please run below cell to select features**

In [4]:
# Extract features for the cohort built above. `version_path` is the dataset
# release directory ("mimiciv/2.0").
# NOTE(review): `feature_icu` arrives through one of the star imports in the
# first cell (presumably feature_selection_icu) - confirm.
feature_icu(cohort_output, version_path)

[EXTRACTING CHART EVENTS DATA]


32it [05:37, 10.55s/it]


# Unique Events:   412
# Admissions:   7345
Total rows 7568035
[SUCCESSFULLY SAVED CHART EVENTS DATA]


## 3. SUMMARY OF FEATURES

This step will generate a summary of all features extracted so far.<br>
It will save the summary files in **./data/summary/**<br>
- These files provide a summary of the **mean frequency** of medical codes per admission.<br>
- They also provide the **total occurrence count** of each medical code.<br>
- For labs and chart events they will also provide <br>**missing %**, which tells what percentage of rows for a given medical code have a missing value.

Please use this information to further refine your cohort by selecting <br>which medical codes in each feature you want to keep and <br>which codes you would like to remove for downstream analysis tasks.

**Please run below cell to generate summary files**

In [5]:
# Generate the feature summary files described above.
# NOTE(review): the meaning of the positional `True` is not visible in this
# notebook - presumably it enables the chart-events summary; confirm against
# the function's signature.
generate_summary_icu(True)

[GENERATING FEATURE SUMMARY]
[SUCCESSFULLY SAVED FEATURE SUMMARY]


## 4. Feature Selection

Based on the files generated in the previous step and any other information you have gathered,<br>
please select which medical codes you want to include in this study.

In [6]:
# Path to the chart-event summary file written by the summary step.
sum_path = "data/summary/chart_features.csv"

# Load the summary table.
# NOTE(review): `pd` is never imported explicitly in this notebook; it is
# presumably re-exported by one of the star imports in the first cell - confirm.
feats = pd.read_csv(sum_path)

# Item ids (medical codes) to keep for the downstream analysis.
to_keep = [220045, 220051, 230073, 230068, 230070, 230071, 230083, 230084,
           220052, 220061, 220245]

# Boolean mask of rows whose item id is selected. `isin` is vectorised and
# replaces the per-row Python loop with list membership tests.
keeps = feats["itemid"].isin(to_keep)

# Keep only the selected rows.
feat_filt = feats[keeps]

# Overwrite the summary file in place with the selection. The unselected rows
# are dropped from the file; re-run the summary step to get them back.
feat_filt.to_csv(sum_path, index = False)

In [7]:
# Apply the feature selection to the cohort's extracted features, based on the
# subset of item ids kept in the summary file by the previous cell.
# NOTE(review): presumably this re-reads data/summary/chart_features.csv -
# confirm in the function's source.
features_selection_icu(cohort_output)


[FEATURE SELECTION CHART EVENTS DATA]
Total number of rows 887025
[SUCCESSFULLY SAVED CHART EVENTS DATA]


## 5. CLEANING OF FEATURES
Below you will have the option to clean lab and chart events by performing outlier removal and unit conversion.

Outlier removal is performed to remove values higher than selected **right threshold** percentile and lower than selected **left threshold** percentile among all values for each itemid. 

**Please run below cell to select preprocessing for different features**

In [8]:
# Controls for how outliers in chart-event values are handled.
print("Outlier removal in values of chart events ?")

# Full-width, 40px-tall layout for the radio group.
layout = widgets.Layout(height='40px', width='100%')

radio_input5 = widgets.RadioButtons(
    options=[
        'No outlier detection',
        'Impute Outlier (default:98)',
        'Remove outliers (default:98)',
    ],
    value='No outlier detection',
    layout=layout,
)
display(radio_input5)

# Right (upper) percentile threshold: 90-99, default 98.
outlier = widgets.IntSlider(
    value=98, min=90, max=99, step=1,
    disabled=False,
    layout=dict(width='100%'),
)

# Left (lower) percentile threshold: 0-10, default 0.
left_outlier = widgets.IntSlider(
    value=0, min=0, max=10, step=1,
    disabled=False,
    layout=dict(width='100%'),
)

# Show each slider next to a fixed-width label.
display(widgets.HBox([
    widgets.Label('Right Outlier Threshold', layout=dict(width='150px')),
    outlier,
]))
display(widgets.HBox([
    widgets.Label('Left Outlier Threshold', layout=dict(width='150px')),
    left_outlier,
]))

Outlier removal in values of chart events ?


RadioButtons(layout=Layout(height='40px', width='100%'), options=('No outlier detection', 'Impute Outlier (def…

HBox(children=(Label(value='Right Outlier Threshold', layout=Layout(width='150px')), IntSlider(value=98, layou…

HBox(children=(Label(value='Left Outlier Threshold', layout=Layout(width='150px')), IntSlider(value=0, layout=…

In [9]:
# Translate the outlier widgets into arguments for the cleaning step.
# NOTE: `radio_input5` is rebound to the model selector in a later cell, so
# re-run the outlier widget cell before re-running this one.

# Clean at all unless the user picked "No outlier detection".
clean_chart = radio_input5.value != 'No outlier detection'
# Impute outliers rather than remove them only for the "Impute" option.
impute_outlier_chart = radio_input5.value == 'Impute Outlier (default:98)'

# Right and left percentile thresholds taken from the two sliders.
thresh = outlier.value
left_thresh = left_outlier.value

preprocess_features_icu(cohort_output, clean_chart, impute_outlier_chart, thresh, left_thresh)


## 6. Time-Series Representation
In this section, please choose how you want to process and represent time-series data.

- First option is to select the length of time-series data you want to include for this study. (Default is 72 hours)

- The second option is the bucket size, which sets the width of the time windows into which the time-series is divided.<br>
For example, if you select a bucket size of **2**, it will aggregate data for every 2 hours, and <br>a time-series of length 24 hours will be represented as a time-series with 12 time-windows, <br>where the data for every 2 hours is aggregated from the original raw time-series.

During this step, we will also save the time-series data in data dictionaries in the format that can be directly used for following deep learning analysis.

### Imputation
You can also choose whether to impute lab/chart values. Imputation is done by forward fill followed by mean or median imputation.<br>
Values are forward filled first; if no value exists for that admission, the mean or median value for the patient is used.

The data dictionaries will be saved in **./data/dict/**

Please refer to the readme for the structure of the data dictionaries.

**Please run below cell to select time-series representation**

In [10]:
# Controls for the time-series representation: included length, bucket size
# and imputation strategy.
print("=======Time-series Data Represenation=======")

# --- Length of the time-series to include ---
print("Length of data to be included for time-series prediction ?")
radio_input8 = widgets.RadioButtons(
    options=['Last 72 hours', 'Last 48 hours', 'Last 24 hours', 'Custom'],
    value='Last 72 hours',
)
display(radio_input8)

# Slider read only when 'Custom' is selected above (24-72 hours).
text2 = widgets.IntSlider(
    value=72, min=24, max=72, step=1,
    description='Last',
    disabled=False,
)
display(widgets.HBox([
    widgets.Label('Last (in hours):', layout=dict(width='150px')),
    text2,
]))

# --- Bucket size ---
print("What time bucket size you want to choose ?")
radio_input7 = widgets.RadioButtons(
    options=['1 hour', '2 hour', '3 hour', '4 hour', '5 hour', 'Custom'],
    value='1 hour',
)
display(radio_input7)

# Slider read only when 'Custom' is selected above (1-6 hours).
text1 = widgets.IntSlider(value=1, min=1, max=6, step=1, disabled=False)
display(widgets.HBox([
    widgets.Label('Bucket Size (in hours):', layout=dict(width='150px')),
    text1,
]))

# --- Imputation ---
print("Do you want to forward fill and mean or median impute lab/chart values to form continuous data signal?")
radio_impute = widgets.RadioButtons(
    options=['No Imputation', 'forward fill and mean', 'forward fill and median'],
    value='No Imputation',
)
display(radio_impute)

# Created but never displayed here, so it keeps its default of '0 hours'
# when the next cell reads it.
radio_input6 = widgets.RadioButtons(
    options=['0 hours', '2 hours', '4 hours', '6 hours'],
    value='0 hours',
)
print("**Please run below cell to perform time-series represenation and save in data dictionaries**")

Length of data to be included for time-series prediction ?


RadioButtons(options=('Last 72 hours', 'Last 48 hours', 'Last 24 hours', 'Custom'), value='Last 72 hours')

HBox(children=(Label(value='Last (in hours):', layout=Layout(width='150px')), IntSlider(value=72, description=…

What time bucket size you want to choose ?


RadioButtons(options=('1 hour', '2 hour', '3 hour', '4 hour', '5 hour', 'Custom'), value='1 hour')

HBox(children=(Label(value='Bucket Size (in hours):', layout=Layout(width='150px')), IntSlider(value=1, max=6,…

Do you want to forward fill and mean or median impute lab/chart values to form continuous data signal?


RadioButtons(options=('No Imputation', 'forward fill and mean', 'forward fill and median'), value='No Imputati…

**Please run below cell to perform time-series represenation and save in data dictionaries**


In [11]:
# Translate the time-series widgets into arguments for the data generator.
# NOTE: `radio_input6`, `radio_input7` and `radio_input8` are rebound to the
# machine-learning options in a later cell, so re-run the time-series widget
# cell before re-running this one.

# `radio_input6` has no 'Custom' option, so its value is always "<N> hours".
# Parse the leading integer token rather than only the first character, so
# multi-digit options would also work.
predW = int(radio_input6.value.split()[0])

# Bucket size in hours: slider value for 'Custom', else the "<N> hour" option.
if radio_input7.value == 'Custom':
    bucket = int(text1.value)
else:
    bucket = int(radio_input7.value.split()[0])

# Included length in hours: slider value for 'Custom', else the number in
# "Last <N> hours".
if radio_input8.value == 'Custom':
    include = int(text2.value)
else:
    include = int(radio_input8.value.split()[1])

# Imputation mode passed to the generator: 'Mean', 'Median', or False for none.
if radio_impute.value == 'forward fill and mean':
    impute = 'Mean'
elif radio_impute.value == 'forward fill and median':
    impute = 'Median'
else:
    impute = False

gen = data_generation_icu.Generator(cohort_output, impute, include, bucket, predW)


[ READ COHORT ]


1it [00:04,  4.65s/it]


[ READ ALL FEATURES ]
[ PROCESSED TIME SERIES TO EQUAL LENGTH  ]


100%|██████████████████████████████████████████| 72/72 [00:00<00:00, 284.62it/s]


bucket 1
[ PROCESSED TIME SERIES TO EQUAL TIME INTERVAL ]
72


100%|███████████████████████████████████████| 2538/2538 [00:39<00:00, 63.89it/s]

[ SUCCESSFULLY SAVED DATA DICTIONARIES ]





In [12]:
# Controls for the machine-learning run: model, feature layout,
# cross-validation and oversampling.
# NOTE: this cell rebinds radio_input5..radio_input8, which earlier cells used
# for the outlier and time-series options.
print("=======Machine :earning Models=======")
radio_input5 = widgets.RadioButtons(
    options=['Logistic Regression', 'Random Forest', 'Gradient Bossting', 'Xgboost'],
    value='Gradient Bossting',
)
display(radio_input5)

print("Do you wnat to conactenate the time-series feature")
radio_input6 = widgets.RadioButtons(
    options=['Conactenate', 'Aggregate'],
    value='Conactenate',
)
display(radio_input6)

print("Please select below option for cross-validation")
radio_input7 = widgets.RadioButtons(
    options=['No CV', '5-fold CV', '10-fold CV'],
    value='5-fold CV',
)
display(radio_input7)

print("Do you want to do oversampling for minority calss ?")
radio_input8 = widgets.RadioButtons(
    options=['True', 'False'],
    value='True',
)
display(radio_input8)



RadioButtons(index=2, options=('Logistic Regression', 'Random Forest', 'Gradient Bossting', 'Xgboost'), value=…

Do you wnat to conactenate the time-series feature


RadioButtons(options=('Conactenate', 'Aggregate'), value='Conactenate')

Please select below option for cross-validation


RadioButtons(index=1, options=('No CV', '5-fold CV', '10-fold CV'), value='5-fold CV')

Do you want to do oversampling for minority calss ?


RadioButtons(options=('True', 'False'), value='True')

In [13]:
# Train and evaluate the selected model. The import cell at the top of the
# notebook must have been run in this kernel, otherwise `ml_models` is
# undefined.

# Map the cross-validation choice to a fold count (0 disables CV). A lookup
# table avoids leaving `cv` unbound if no branch matches.
cv_folds = {'No CV': 0, '5-fold CV': 5, '10-fold CV': 10}
cv = cv_folds[radio_input7.value]

# `data_icu` was never defined in this notebook, so the call below raised a
# NameError. The cohort extracted above is ICU data, so the flag is set here.
# NOTE(review): assumes the first argument of ML_models is the boolean
# ICU / non-ICU flag - confirm against ml_models.py.
data_icu = True

ml = ml_models.ML_models(
    data_icu,
    cv,
    radio_input5.value,
    concat=radio_input6.value == 'Conactenate',
    oversampling=radio_input8.value == 'True',
)

NameError: name 'ml_models' is not defined