In [51]:
import pandas as pd

# Load the required MIMIC-IV CSV files
try:
    patients = pd.read_csv('PATIENTS.csv')
    admissions = pd.read_csv('ADMISSIONS.csv')
    diagnoses_icd = pd.read_csv('DIAGNOSES_ICD.csv')
    d_icd_diagnoses = pd.read_csv('D_ICD_DIAGNOSES.csv')
    labevents = pd.read_csv('LABEVENTS.csv')
    d_labitems = pd.read_csv('D_LABITEMS.csv')
    vitalsign = pd.read_csv('vitalsign.csv')  # ✅ Use vitalsign.csv instead of CHARTEVENTS
except FileNotFoundError:
    print("❌ Error: One or more required CSV files not found.")
    exit()

# 1. Demographics: Gender, Age (using anchor_age)
patient_info = pd.merge(
    patients[['subject_id', 'gender', 'anchor_age']],
    admissions[['subject_id', 'hadm_id']],
    on='subject_id',
    how='inner'
)
patient_info = patient_info.rename(columns={'anchor_age': 'age'})

# 2. Lab Measurements (FBC items)
fbc_itemids = {
    50811: 'Hemoglobin',
    51221: 'Hematocrit',
    51250: 'MCV',
    51249: 'MCHC',
    51248: 'MCH',
    51279: 'RBC',
    51277: 'RDW',
    50903: 'Cholesterol'
}

fbc_data = labevents[labevents['itemid'].isin(fbc_itemids.keys())]
fbc_data = fbc_data[['subject_id', 'hadm_id', 'itemid', 'valuenum']].dropna(subset=['valuenum'])
fbc_data['lab_name'] = fbc_data['itemid'].map(fbc_itemids)

fbc_pivot = fbc_data.pivot_table(
    index=['subject_id', 'hadm_id'],
    columns='lab_name',
    values='valuenum',
    aggfunc='mean'
).reset_index()

# 3. Diagnosis (Only long_title available)
diagnosis_data = pd.merge(diagnoses_icd, d_icd_diagnoses, on='icd_code', how='left')
diagnosis_data = diagnosis_data.sort_values(by='seq_num')
first_diagnosis = diagnosis_data.groupby(['subject_id', 'hadm_id']).first().reset_index()
first_diagnosis = first_diagnosis[['subject_id', 'hadm_id', 'long_title']]

# 4. Blood Pressure from vitalsign.csv
vitalsign = vitalsign.rename(columns={'sbp': 'Systolic_BP', 'dbp': 'Diastolic_BP'})

# Add dummy hadm_id if not present
if 'hadm_id' not in vitalsign.columns:
    vitalsign['hadm_id'] = None

# Ensure correct data types before merging
vitalsign['hadm_id'] = pd.to_numeric(vitalsign['hadm_id'], errors='coerce')
bp_pivot = vitalsign[['subject_id', 'hadm_id', 'Systolic_BP', 'Diastolic_BP']].drop_duplicates()

# 5. Diabetes flag (ICD9 codes starting with '250')
d_icd_diagnoses['icd_code'] = d_icd_diagnoses['icd_code'].astype(str)
diabetes_icd9_codes = d_icd_diagnoses[d_icd_diagnoses['icd_code'].str.startswith('250')]['icd_code'].unique()
diabetes_flag = diagnoses_icd[diagnoses_icd['icd_code'].astype(str).isin(diabetes_icd9_codes)].copy()
diabetes_flag['diabetes'] = 1
diabetes_flag = diabetes_flag[['subject_id', 'hadm_id', 'diabetes']].drop_duplicates()

# Merge all dataframes
for df in [fbc_pivot, first_diagnosis, bp_pivot, diabetes_flag]:
    df['hadm_id'] = pd.to_numeric(df['hadm_id'], errors='coerce')

patient_info['hadm_id'] = pd.to_numeric(patient_info['hadm_id'], errors='coerce')

dataset = pd.merge(patient_info, fbc_pivot, on=['subject_id', 'hadm_id'], how='left')
dataset = pd.merge(dataset, first_diagnosis, on=['subject_id', 'hadm_id'], how='left')
dataset = pd.merge(dataset, bp_pivot, on=['subject_id', 'hadm_id'], how='left')
dataset = pd.merge(dataset, diabetes_flag, on=['subject_id', 'hadm_id'], how='left')

dataset['diabetes'] = dataset['diabetes'].fillna(0).astype(int)

# Columns order
columns_order = [
    'subject_id', 'hadm_id', 'gender', 'age', 'Hemoglobin', 'Hematocrit', 'MCV', 'MCHC', 'MCH', 'RBC', 'RDW',
    'Systolic_BP', 'Diastolic_BP', 'Cholesterol', 'diabetes', 'long_title'
]
available_cols = [col for col in columns_order if col in dataset.columns]

# Final dataset
final_dataset = dataset[available_cols].copy()

# Fill missing BP and Cholesterol columns with 0 if not present
for col in ['Systolic_BP', 'Diastolic_BP', 'Cholesterol']:
    if col not in final_dataset.columns:
        final_dataset[col] = 0

# Add BP column (1 if Systolic > 130 or Diastolic > 80)
final_dataset['BP'] = ((final_dataset['Systolic_BP'] > 130) | (final_dataset['Diastolic_BP'] > 80)).astype(int)

# Add cholesterol1 column (1 if Cholesterol > 240)
final_dataset['cholesterol1'] = (final_dataset['Cholesterol'] > 240).astype(int)

# Save the dataset
final_dataset.to_csv('mimiciv_dataset_final.csv', index=False)
print("✅ Dataset saved as mimiciv_dataset_final.csv")
print(final_dataset.head())


✅ Dataset saved as mimiciv_dataset_final.csv
   subject_id   hadm_id gender  age  Hemoglobin  Hematocrit        MCV  \
0    10014729  23300884      F   21         NaN   23.325000  88.666667   
1    10014729  28889419      F   21       10.15   29.533333  88.571429   
2    10003400  23559586      F   72        9.30   23.759574  91.377778   
3    10003400  20214994      F   72        8.84   26.750000  90.187500   
4    10003400  27296885      F   72         NaN   25.133333  87.666667   

        MCHC        MCH       RBC        RDW  Systolic_BP  Diastolic_BP  \
0  33.300000  29.500000  2.637500  12.766667          NaN           NaN   
1  34.071429  30.142857  3.315714  12.057143          NaN           NaN   
2  32.900000  30.046667  2.614444  15.780000          NaN           NaN   
3  34.831250  31.334375  2.977500  17.334375          NaN           NaN   
4  32.333333  28.366667  2.870000  20.600000          NaN           NaN   

   Cholesterol  diabetes                                   

In [41]:
import pandas as pd

# Step 1: Load your ICU subject list with ICD codes
icu_df = pd.read_csv("icustays.csv")  # or your ICU CSV file
subject_ids = icu_df['subject_id'].unique()

# Step 2: Load lab and chart event data
labevents = pd.read_csv("labevents.csv", low_memory=False)
d_labitems = pd.read_csv("d_labitems.csv")
chartevents = pd.read_csv("chartevents.csv", low_memory=False)
d_items = pd.read_csv("d_items.csv")

# Step 3: Define the list of lab tests and vitals you need
lab_keywords = ['hemoglobin', 'hematocrit', 'mcv', 'mch', 'mchc', 'rbc', 'rdw', 'cholesterol']
vital_keywords = ['non invasive bp systolic', 'non invasive bp diastolic']

# Step 4: Filter relevant lab itemids
lab_ids = d_labitems[d_labitems['label'].str.lower().isin(lab_keywords)][['itemid', 'label']]
lab_filtered = labevents[
    (labevents['itemid'].isin(lab_ids['itemid'])) & 
    (labevents['subject_id'].isin(subject_ids)) & 
    (labevents['valuenum'].notnull())
]

# Step 5: Pivot lab results
lab_merged = lab_filtered.merge(lab_ids, on="itemid")
lab_pivot = lab_merged.pivot_table(
    index="subject_id",
    columns="label",
    values="valuenum",
    aggfunc="max"
).reset_index()

# Step 6: Filter relevant vital itemids
vital_ids = d_items[d_items['label'].str.lower().isin(vital_keywords)][['itemid', 'label']]
vital_filtered = chartevents[
    (chartevents['itemid'].isin(vital_ids['itemid'])) & 
    (chartevents['subject_id'].isin(subject_ids)) & 
    (chartevents['valuenum'].notnull())
]

# Step 7: Pivot vital results
vital_merged = vital_filtered.merge(vital_ids, on="itemid")
vital_pivot = vital_merged.pivot_table(
    index="subject_id",
    columns="label",
    values="valuenum",
    aggfunc="max"
).reset_index()

# Step 8: Merge all data together
final_dataset = icu_df[['subject_id', 'icd9_code']].drop_duplicates()
final_dataset = final_dataset.merge(lab_pivot, on="subject_id", how="left")
final_dataset = final_dataset.merge(vital_pivot, on="subject_id", how="left")

# Step 9: Save final dataset
final_dataset.to_csv("final_lab_vital_data.csv", index=False)
print("✅ Final dataset saved as final_lab_vital_data.csv")
print(final_dataset.head())


KeyError: "['icd9_code'] not in index"