From the HDF5 container of the mapped OHDSI inpatient data a subset of the data is extracted into two separate matrices
`/independent/core_array` and `/dependent/core_array`.

In [205]:
import h5py
import numpy as np

In [235]:
ohdsi_file_name = "synpuf_inpatient_combined.hdf5"
f5 = h5py.File(ohdsi_file_name, "r")

In [236]:
# Define paths to data in HDF5 container
condition_path = "/ohdsi/condition_occurrence/"
procedure_path = "/ohdsi/procedure_occurrence/"
person_path = "/ohdsi/person/"
visit_occurrence_path = "/ohdsi/visit_occurrence/"
measurement_path = "/ohdsi/measurement/count/"
observation_path = "/ohdsi/observation/count/"
readmission_30_day_path = "/computed/next/30_days/visit_occurrence/"

In [237]:
# Define helper function for joining labels together
def flatten_column_annotations(f5, base_path, abbreviation=None, field_separator="|", first_part=2, second_part=1):
    column_annotations = f5[base_path + "column_annotations"][...]
    number_of_columns = column_annotations.shape[1]
    if abbreviation is not None:
        abbreviation = field_separator + abbreviation
    else:
        abbreviation = ""
    flattened_list = [column_annotations[first_part, i] + field_separator 
                      + column_annotations[second_part, i] + abbreviation 
                      for i in range(number_of_columns)]
    
    cleaned_flattened_list = []
    for name in flattened_list:
        if name[-1] == field_separator:
            name = name.strip()[:-1]
        
        cleaned_flattened_list += [name]
    
    return np.array(cleaned_flattened_list, dtype=column_annotations.dtype)

In [238]:
condition_names = flatten_column_annotations(f5, condition_path, abbreviation="C")
condition_names[0:10]

array(['No matching concept|0|C',
       'Gingival and periodontal disease|132344|C',
       'Staphylococcal scalded skin syndrome|132392|C',
       'Pressure ulcer stage 1|132393|C',
       'Post-laminectomy syndrome|132412|C',
       'Chronic osteomyelitis of hand|132414|C',
       'Congenital anomaly of skin|132446|C',
       'Contusion of scapular region|132491|C',
       'Chronic myeloid leukemia in remission|132572|C',
       'Postablative hypothyroidism|132583|C'], 
      dtype='|S128')

In [239]:
procedure_names = flatten_column_annotations(f5, procedure_path, abbreviation="P")
procedure_names[0:10]

array(['Infusion of drotrecogin alfa (activated)|2000012|P',
       'Injection or infusion of nesiritide|2000014|P',
       'Injection or infusion of oxazolidinone class of antibiotics|2000015|P',
       'High-dose infusion interleukin-2 [IL-2]|2000016|P',
       'Infusion of vasopressor agent|2000018|P',
       'Computer assisted surgery with CT/CTA|2000030|P',
       'Computer assisted surgery with fluoroscopy|2000032|P',
       'Other computer assisted surgery|2000035|P',
       'Procedure on single vessel|2000037|P',
       'Insertion of one vascular stent|2000042|P'], 
      dtype='|S128')

In [240]:
measurement_names = flatten_column_annotations(f5,measurement_path, abbreviation="M", second_part=0)
measurement_names[0:10]

array(['Myelophthisis|134315|M',
       'Abnormal results of cardiovascular function studies|137989|M',
       'Urinalysis, by dip stick or tablet reagent for bilirubin, glucose, hemoglobin, ketones, leukocytes, nitrite, pH, protein, specif',
       'Increased blood lymphocyte number|320074|M',
       'Electrocardiogram abnormal|320536|M', 'Laboratory test|4034850|M',
       'Band neutrophil count above reference range|40481861|M',
       'Type II diabetes mellitus uncontrolled|40482801|M',
       'Type 1 diabetes mellitus uncontrolled|40484648|M',
       'Body mass index 25-29 - overweight|4060705|M'], 
      dtype='|S128')

In [241]:
observation_names = flatten_column_annotations(f5, observation_path, abbreviation="M", second_part=0)
observation_names[0:10]

array(['No matching concept|0|M',
       'Unilateral recurrent femoral hernia with obstruction but no gangrene|196731|M',
       'Unilateral recurrent inguinal hernia with obstruction but no gangrene|197022|M',
       'Unilateral recurrent inguinal hernia|201899|M',
       'Unilateral partial vocal cord paralysis|261047|M',
       'Unilateral complete paralysis of vocal cords|261888|M',
       'Patient need for|4011950|M', 'Palliative care|4014023|M',
       'Consultation|4014829|M', 'Vaccination required|4015724|M'], 
      dtype='|S128')

In [242]:
person_names = flatten_column_annotations(f5, person_path, first_part=0)
person_names

array(['gender_concept_name|FEMALE', 'gender_concept_name|MALE',
       'race_concept_name|Black or African American',
       'race_concept_name|No matching concept', 'race_concept_name|White',
       'ethnicity_concept_name|Hispanic or Latino',
       'ethnicity_concept_name|Not Hispanic or Latino', 'birth_julian_day',
       'birth_date'], 
      dtype='|S128')

In [243]:
visit_names = flatten_column_annotations(f5, visit_occurrence_path, first_part=0)
visit_names

array(['visit_concept_name|Inpatient Visit',
       'visit_type_concept_name|Visit derived from encounter on claim',
       'age_at_visit_start_in_years_int', 'age_at_visit_start_in_days',
       'visit_start_julian_day', 'visit_end_julian_day',
       'visit_start_datetime', 'visit_end_datetime'], 
      dtype='|S128')

In [244]:
readmission_names = np.array(["30-day inpatient readmission"], dtype=visit_names.dtype)
readmission_names

array(['30-day inpatient readmission'], 
      dtype='|S128')

In [245]:
# Helper function for finding
def find_positions(names_array, to_find):
    return np.where(names_array == to_find)[0].tolist()

In [246]:
gender_position = find_positions(person_names, "gender_concept_name|FEMALE")

In [247]:
age_in_years_position = find_positions(visit_names, "age_at_visit_start_in_years_int")

In [248]:
condition_ap = f5[condition_path + "core_array"]
procedure_ap = f5[procedure_path + "core_array"]
measurement_ap = f5[measurement_path + "core_array"]
observation_ap = f5[observation_path + "core_array"]
visit_occurrence_ap = f5[visit_occurrence_path + "core_array"]
person_ap = f5[person_path + "core_array"]
readmission_30_day_ap = f5[readmission_30_day_path + "core_array"]

In [249]:
number_of_inpatient_stays = condition_ap.shape[0]
number_of_inpatient_stays

66700

In [250]:
# First two positions age and gender
number_of_columns = 2 + condition_names.shape[0] + procedure_names.shape[0] + measurement_names.shape[0] + observation_names.shape[0]
number_of_columns

5684L

In [251]:
hdf5_file_to_write_to = "inpatient_readmission_analysis.hdf5"
w5 = h5py.File(hdf5_file_to_write_to, "w")

IOError: Unable to create file (Unable to truncate a file which is already open)

In [252]:
independent_array_ds = w5.create_dataset("/independent/core_array", shape=(number_of_inpatient_stays, number_of_columns), 
                                      dtype="i", compression="gzip")

ValueError: Not a location id (Invalid object id)

In [253]:
independent_array_ds[:, 0] = visit_occurrence_ap[:, age_in_years_position[0]]

ValueError: Not a dataset (Not a dataset)

In [225]:
independent_array_ds[:, 1] = person_ap[:, gender_position[0]]

ValueError: Not a dataset (Not a dataset)

In [226]:
offset = 2
independent_array_ds[:, offset:(offset + condition_names.shape[0])] = condition_ap[...]
offset += condition_names.shape[0]

ValueError: Not a dataset (Not a dataset)

In [182]:
independent_array_ds[:, offset:(offset + procedure_names.shape[0])] = procedure_ap[...]
offset += procedure_names.shape[0]

In [183]:
independent_array_ds[:, offset:(offset + measurement_names.shape[0])] = measurement_ap[...]
offset += measurement_names.shape[0]

In [184]:
independent_array_ds[:, offset:(offset + observation_names.shape[0])] = observation_ap[...]
offset += observation_names.shape[0]

In [186]:
# For non 0 and 1 values set to 1
core_dummy_variables = w5["/independent/core_array"][:, 1:]
core_dummy_variables[core_dummy_variables > 1] = 1

In [187]:
independent_name_array = np.concatenate((person_names[gender_position], 
                visit_names[age_in_years_position],
                condition_names,
                procedure_names,
                measurement_names,
                observation_names
                ), axis=0)  

In [188]:
independent_name_array.shape

(5684L,)

In [189]:
independent_name_array_ds = w5.create_dataset("/independent/column_annotations", shape=(number_of_columns,), 
                                              dtype=independent_name_array.dtype)

In [190]:
independent_name_array_ds[...] = independent_name_array[...]

In [191]:
dependent_array_ds = w5.create_dataset("/dependent/core_array", shape=(number_of_inpatient_stays, 1), dtype="i", 
                                       compression="gzip")

In [192]:
dependent_array_ds[...] = readmission_30_day_ap[...]

In [193]:
dependent_array_name_ds = w5.create_dataset("/dependent/column_annotations", shape=(1,),
                                            dtype=readmission_names.dtype)

In [194]:
dependent_array_name_ds[...] = readmission_names[...]

In [232]:
w5.close()

ValueError: Not a file id (Not a file id)

In [233]:
ff5 = h5py.File(hdf5_file_to_write_to, 'r')

In [234]:
list(ff5["/"])

[u'dependent', u'independent']

In [203]:
np.sum(ff5["/independent/core_array"][:,1:])

665114

In [204]:
f5.close()
ff5.close()

ValueError: Not a file id (Not a file id)