# Pre-processing of the outputevents dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Folder that holds the CSV files this notebook reads and where the processed
# table is written at the end.
file_path = "./files/mimiciii/1.4/"

# Raise pandas' display limits so that wide/long tables are shown untruncated.
for option_name, option_value in (("display.max_rows", 50), ("display.max_columns", 300)):
    pd.set_option(option_name, option_value)

In [2]:
# Admissions table used below to restrict the output events to the selected
# admission ids (HADM_ID).
adm_csv = file_path + "Admissions_processed.csv"
adm = pd.read_csv(adm_csv)

We now consider the OUTPUTEVENTS dataset. We keep only the events that belong to the admissions selected during the admissions pre-processing step (`Admissions_processed.csv`).

In [3]:
# Raw output events table, before any filtering.
outputs_csv = file_path + "OUTPUTEVENTS.csv"
outputs = pd.read_csv(outputs_csv)

In [4]:
# Sanity check: ISERROR must be empty for every row, i.e. no event is flagged
# as an error.
assert outputs["ISERROR"].isnull().all()

# Keep only the events that belong to the previously selected admissions.
adm_ids = list(adm["HADM_ID"])
in_selected_admission = outputs["HADM_ID"].isin(adm_ids)
outputs = outputs.loc[in_selected_admission]

print("Number of patients remaining in the database: ")
print(outputs["SUBJECT_ID"].nunique())

Number of patients remaining in the database: 
24540


We load the D_ITEMS dataframe, which contains the label (name) of each ITEMID, and merge it with the output events.

In [5]:
# D_ITEMS provides the LABEL associated with each ITEMID; only these two
# columns are needed for the merge.
item_id=pd.read_csv(file_path+"D_ITEMS.csv")
item_id_1=item_id[["ITEMID","LABEL"]]

# Attach the label of the recorded item to every output event.
# The join is an inner join (events whose ITEMID is absent from D_ITEMS are
# dropped). validate="many_to_one" makes the merge raise if an ITEMID appears
# more than once in D_ITEMS, which would otherwise silently duplicate events.
outputs_2=pd.merge(outputs,item_id_1,on="ITEMID",how="inner",validate="many_to_one")

print("Number of patients remaining in the database: ")
print(outputs_2["SUBJECT_ID"].nunique())

Number of patients remaining in the database: 
24540


We compute, for each output label, the number of patients who have at least one record with that label, and we keep only the labels that are the most widely represented over the whole data set. For this, we rank the labels by number of patients and select the `n_best` first ones.

In [6]:
n_best=15
# Number of distinct patients (SUBJECT_ID) having at least one event for each label.
pat_for_item=outputs_2.groupby("LABEL")["SUBJECT_ID"].nunique()
# Rank the labels by patient count and keep the n_best most widespread ones.
frequent_labels=pat_for_item.sort_values(ascending=False).head(n_best)

# Keep only the events whose label is among the retained ones.
is_frequent=outputs_2["LABEL"].isin(frequent_labels.index)
outputs_3=outputs_2.loc[is_frequent].copy()

print("Number of patients remaining in the database: ")
print(outputs_3["SUBJECT_ID"].nunique())
print("Number of datapoints remaining in the database: ")
print(len(outputs_3))

print(frequent_labels)

Number of patients remaining in the database: 
24404
Number of datapoints remaining in the database: 
1831977
LABEL
Urine Out Foley                              12682
Foley                                         9674
Stool Out Stool                               5104
Chest Tubes CTICU CT 1                        3500
Gastric Oral Gastric                          3357
OR Urine                                      3355
Void                                          3338
Urine Out Void                                3316
OR Out PACU Urine                             2926
Chest Tube #1                                 2558
Pre-Admission Output Pre-Admission Output     2197
Pre-Admission                                 2006
Oral Gastric                                  1819
OR Out OR Urine                               1767
OR Out EBL                                    1758
Name: SUBJECT_ID, dtype: int64


#### Finally, we select the same labels as in the paper

In [7]:
# Output labels to keep (the same labels as in the paper).
outputs_label_list=[
    'Gastric Gastric Tube',
    'Stool Out Stool',
    'Urine Out Incontinent',
    'Ultrafiltrate Ultrafiltrate',
    'Foley',
    'Void',
    'Condom Cath',
    'Fecal Bag',
    'Ostomy (output)',
    'Chest Tube #1',
    'Chest Tube #2',
    'Jackson Pratt #1',
    'OR EBL',
    'Pre-Admission',
    'TF Residual',
]
has_selected_label=outputs_2["LABEL"].isin(outputs_label_list)
outputs_bis=outputs_2.loc[has_selected_label].copy()

print("Number of patients remaining in the database: ")
print(outputs_bis["SUBJECT_ID"].nunique())
print("Number of datapoints remaining in the database: ")
print(len(outputs_bis))

# From here on outputs_3 holds this selection; it replaces the n_best-based
# selection computed in the previous cell.
outputs_3=outputs_bis.copy()

Number of patients remaining in the database: 
15964
Number of datapoints remaining in the database: 
781675


# Cleaning of the output data

### Units Cleaning

#### 1) Amounts

In [8]:
# Check that, within each output label, all amounts are recorded with a single
# unit (VALUEUOM): every label should appear with exactly one unit in the result.
units_by_label=outputs_3.groupby("LABEL")["VALUEUOM"]
units_by_label.value_counts() #OK

LABEL                        VALUEUOM
Chest Tube #1                mL           59614
Chest Tube #2                mL            7647
Condom Cath                  mL            2439
Fecal Bag                    mL            1278
Foley                        mL          603016
Gastric Gastric Tube         ml            1665
Jackson Pratt #1             mL            8487
OR EBL                       mL            1992
Ostomy (output)              mL            1579
Pre-Admission                mL            2759
Stool Out Stool              ml           30987
TF Residual                  mL           12181
Ultrafiltrate Ultrafiltrate  ml           18850
Urine Out Incontinent        ml             976
Void                         mL           27689
Name: VALUEUOM, dtype: int64

### Check for outliers

#### 1) In amounts

In [9]:
# Summary statistics of the recorded amounts (VALUE) for each label, used to
# spot implausible minima/maxima.
values_by_label=outputs_3.groupby("LABEL")["VALUE"]
values_by_label.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Chest Tube #1,59614.0,37.56018,60.397672,0.0,10.0,30.0,50.0,2500.0
Chest Tube #2,7647.0,31.988361,63.908518,0.0,0.0,20.0,40.0,1900.0
Condom Cath,2439.0,243.642066,222.121737,0.0,100.0,200.0,325.0,2350.0
Fecal Bag,1278.0,445.411581,340.925912,0.0,200.0,350.0,678.75,2500.0
Foley,603016.0,125.559091,5896.453024,0.0,45.0,80.0,150.0,4555555.0
Gastric Gastric Tube,1727.0,116.611465,154.821776,0.0,14.0,60.0,150.0,1600.0
Jackson Pratt #1,8487.0,75.190256,104.936289,0.0,20.0,45.0,90.0,1325.0
OR EBL,1992.0,684.76506,1483.872724,0.0,100.0,250.0,600.0,20000.0
Ostomy (output),1579.0,203.476251,169.09129,0.0,100.0,175.0,300.0,2000.0
Pre-Admission,2759.0,616.539688,769.411777,-689.0,140.0,400.0,800.0,11000.0


In [10]:
# Remove, for each label, all entries whose amount (VALUE) lies more than
# n_std standard deviations above that label's mean.
n_std=4
out_desc=outputs_3.groupby("LABEL")["VALUE"].describe()

# Upper bound per label, computed once from the statistics above.
upper_bound=out_desc["mean"]+n_std*out_desc["std"]

# Look up the bound of each row's label and flag the rows above it in a single
# pass (instead of dropping and copying the whole frame once per label).
# A missing bound (NaN mean or std) compares as False, so such rows are kept.
is_outlier=outputs_3["VALUE"]>outputs_3["LABEL"].map(upper_bound)
outputs_3=outputs_3.loc[~is_outlier].copy()

print("Number of patients remaining in the database: ")
print(outputs_3["SUBJECT_ID"].nunique())
print("Number of datapoints remaining in the database: ")
print(len(outputs_3.index))

Number of patients remaining in the database: 
15934
Number of datapoints remaining in the database: 
780550


In [11]:
# Hand-chosen upper limits on VALUE per label: entries strictly above the limit
# are removed. Some of these labels ("OR Out EBL", "OR Urine",
# "Pre-Admission Output Pre-Admission Output", "Urine Out Foley") are not part
# of outputs_label_list, so they match no rows when the paper's label selection
# is used; they are kept so the cleaning still applies to those labels if a
# different selection is used.
upper_limits={
    "Foley":5500,
    "OR EBL":5000,
    "OR Out EBL":5000,
    "OR Urine":5000,
    "Pre-Admission":5000,
    "Pre-Admission Output Pre-Admission Output":5000,
    "Urine Out Foley":5000,
}
# Labels for which negative amounts are removed.
non_negative_labels=["Pre-Admission","Void"]

# Labels without a limit map to NaN, and a comparison with NaN is False, so
# their rows are never flagged as too large.
too_large=outputs_3["VALUE"]>outputs_3["LABEL"].map(upper_limits)
negative=outputs_3["LABEL"].isin(non_negative_labels)&(outputs_3["VALUE"]<0)

# Drop the flagged rows, then the rows without any recorded VALUE.
outputs_3=outputs_3.loc[~(too_large|negative)].dropna(subset=["VALUE"]).copy()

print("Number of patients remaining in the database: ")
print(outputs_3["SUBJECT_ID"].nunique())
print("Number of datapoints remaining in the database: ")
print(len(outputs_3.index))

Number of patients remaining in the database: 
12987
Number of datapoints remaining in the database: 
756217


As the data is already in timestamp format, we don't need to consider rates.

In [12]:
# Save the cleaned output events next to the input files. With to_csv's default
# settings the DataFrame index is written as the first column.
processed_csv = file_path + "OUTPUTS_processed.csv"
outputs_3.to_csv(processed_csv)