v0.3.0: Adapted pipeline to MIMIC-IV v2.2, improved speed by changing storage of timeseries, fixing bug from Issue #21
USM-CHU-FGuyon · Feb 19, 2024 · fe67a50 · fe67a50
1 parent 871f2ac
commit fe67a50
Show file tree

Hide file tree

Showing 38 changed files with 6,354 additions and 1,044 deletions.
diff --git a/0_prepare_files.py b/0_prepare_files.py
@@ -26,6 +26,11 @@
 
 ingredient_to_drug = om.run()
 
-mm = MedicationMapping(pth_dic)
-
-medication_json = mm.run(load_drugnames=True, fname='medications.json')
+mm = MedicationMapping(pth_dic,
+                       datasets=['hirid', 
+                                 'amsterdam',
+                                 'mimic4',
+                                 'mimic3',
+                                 'eicu'])
+
+medication_json = mm.run(load_drugnames=False, fname='medications.json')
diff --git a/1_extract_hirid.py b/1_extract_hirid.py
@@ -9,11 +9,10 @@
 
 hirid_prep = hiridPreparator(
     variable_ref_path='hirid_variable_reference_v1.csv',
-    raw_ts_path='raw_stage/observation_tables_parquet.tar.gz',
-    raw_pharma_path='raw_stage/pharma_records_parquet.tar.gz',
-    admissions_path='reference_data.tar.gz',
-    imputedstage_path='imputed_stage/imputed_stage_parquet.tar.gz',
-    untar=True)
+    ts_path='observation_tables/parquet/',
+    pharma_path='pharma_records/parquet/',
+    admissions_path='reference_data/general_table.csv',
+    imputedstage_path='imputed_stage/parquet/')
 
 hirid_prep.gen_labels()
 hirid_prep.gen_medication()

diff --git a/1_extract_mimic.py b/1_extract_mimic.py
diff --git a/1_extract_mimic3.py b/1_extract_mimic3.py
@@ -12,6 +12,7 @@
 
 mimic3_prep.load_raw_tables()
 
+mimic3_prep.icustays = mimic3_prep.gen_icustays()
 mimic3_prep.gen_labels()
 mimic3_prep.gen_flat()
 mimic3_prep.gen_medication()

diff --git a/1_extract_mimic4.py b/1_extract_mimic4.py
@@ -0,0 +1,22 @@
+"""
+This code extracts the data from the MIMIC-IV dataset 
+('mimic_source_path' in paths.json).
+
+It creates a set of .parquet files at the specified path 
+('mimic' in paths.json). 
+"""
+from mimic4_preprocessing.mimic4preparator import mimic4Preparator
+
+mimic4_prep = mimic4Preparator(
+    chartevents_pth='/icu/chartevents.csv.gz',                         
+    labevents_pth='/hosp/labevents.csv.gz')
+
+mimic4_prep.load_raw_tables()
+
+mimic4_prep.icustays = mimic4_prep.gen_icustays()
+mimic4_prep.gen_labels()
+mimic4_prep.gen_flat()
+mimic4_prep.gen_medication()
+mimic4_prep.gen_timeseriesoutputs()
+mimic4_prep.gen_timeserieslab()
+mimic4_prep.gen_timeseries()
diff --git a/2_mimic.py → 2_mimic4.py b/2_mimic.py → 2_mimic4.py
@@ -4,17 +4,17 @@
 Note that this produces the 'raw' data of the BlendedICU dataset.
 The preprocessed BlendedICU dataset will then be obtained with 3_blendedICU.py
 """
-from mimic_preprocessing.flat_and_labels import mimic_FLProcessor
-from mimic_preprocessing.timeseries import mimicTSP
+from mimic4_preprocessing.flat_and_labels import mimic4_FLProcessor
+from mimic4_preprocessing.timeseries import mimic4TSP
 
-tsp = mimicTSP(
+tsp = mimic4TSP(
     med_pth='medication.parquet',
     ts_pth='timeseries.parquet',
     tslab_pth='timeserieslab.parquet',
     outputevents_pth='timeseriesoutputs.parquet')
 
 tsp.run()
 
-flp = mimic_FLProcessor()
+flp = mimic4_FLProcessor()
 
 flp.run_labels()
diff --git a/3_blendedICU.py b/3_blendedICU.py
@@ -10,14 +10,13 @@
 from blended_preprocessing.timeseries import blendedicuTSP
 from blended_preprocessing.flat_and_labels import blended_FLProcessor
 
-flp = blended_FLProcessor(datasets=['mimic3',
+flp = blended_FLProcessor(datasets=['mimic4',
+                                    'mimic3',
                                     'hirid',
                                     'amsterdam',
-                                    'mimic',
                                     'eicu'])
-
 flp.run_flat_and_labels()
 
-tsp = blendedicuTSP(recompute_index=False)
+tsp = blendedicuTSP(compute_index=False)
 
 tsp.run()
diff --git a/5_figures_and_tables.py b/5_figures_and_tables.py
@@ -4,7 +4,6 @@
 from figures_and_tables.blendedICU_stats import Blendedicu_stats
 
 s = Blendedicu_stats()
-
 # basic statistics on how many ingredients and drugnames were included
 #s.medication_inclusion_stats()
 # Flat statistics that make the Tables 1 and 2 in the manuscript

diff --git a/Readme.md b/Readme.md
@@ -34,7 +34,9 @@ keywords = {OMOP common data format, Intensive care unit database, Data integrat
 }
 ```
 This repository contains the codes and files that allow the creation of the 
-BlendedICU dataset from the AmsterdamUMCdb, eICU, HiRID, and MIMIC-IV databases.
+BlendedICU dataset from the AmsterdamUMCdb, eICU, HiRID, MIMIC-III and MIMIC-IV databases.
+
+[<img src="plot/kdeplot.png" width="600"/>](plot/kdeplot.png)
 
 Before you begin
 ---

diff --git a/auxillary_files/medication_mapping_files/amsterdam_medications.csv b/auxillary_files/medication_mapping_files/amsterdam_medications.csv
@@ -1,4 +1,4 @@
-drugname;drugcount
+drugname;count
 Drukzak;0.8887301999480655
 NaCl 0,45%/Glucose 2,5%;0.7943391326928071
 Paracetamol;0.78230762572492

diff --git a/auxillary_files/medication_mapping_files/drugnames.parquet b/auxillary_files/medication_mapping_files/drugnames.parquet
diff --git a/auxillary_files/medication_mapping_files/eicu_medications.csv b/auxillary_files/medication_mapping_files/eicu_medications.csv
@@ -1,4 +1,4 @@
-drugname;drugcount
+drugname;count
 ACETAMINOPHEN 325 MG PO TABS;0.10559646319059639
 ACETAMINOPHEN;0.08462652905769719
 ONDANSETRON 2 MG/1 ML 2ML SDV INJ;0.07290188639792093