In [61]:
import pandas as pd


# Load every source table up front; the rest of the script needs all of them.
try:
    patients = pd.read_csv('PATIENTS.csv')
    admissions = pd.read_csv('ADMISSIONS.csv')
    diagnoses_icd = pd.read_csv('DIAGNOSES_ICD.csv')
    d_icd_diagnoses = pd.read_csv('D_ICD_DIAGNOSES.csv')
    labevents = pd.read_csv('LABEVENTS.csv')
    d_labitems = pd.read_csv('D_LABITEMS.csv')
    vitalsign = pd.read_csv('vitalsign.csv')  # vitalsign.csv has no hadm_id
except FileNotFoundError as err:
    print("❌ Error: One or more required CSV files not found.")
    # Name the file that failed so the user does not have to guess which one.
    print(f"   Missing file: {err.filename}")
    # `exit()` is an interactive helper injected by `site` and reports status 0;
    # raise SystemExit directly with a non-zero status so the failure is visible.
    raise SystemExit(1) from err


# One row per admission: patient demographics joined to each admission of that
# patient. The inner join drops patients without admissions and vice versa.
demographics = patients[['subject_id', 'gender', 'anchor_age']]
admission_ids = admissions[['subject_id', 'hadm_id']]
patient_info = (
    demographics
    .merge(admission_ids, on='subject_id', how='inner')
    .rename(columns={'anchor_age': 'age'})
)


# Lab item IDs to extract, mapped to the column name each one gets in the output.
fbc_itemids = {
    51222: 'Hemoglobin',
    51221: 'Hematocrit',
    51250: 'MCV',
    51249: 'MCHC',
    51248: 'MCH',
    51279: 'RBC',
    51277: 'RDW',
    50907: 'Cholesterol',
}

# Keep only the wanted lab items that have a numeric value, and label each row
# with its readable lab name.
wanted_lab_rows = labevents['itemid'].isin(list(fbc_itemids))
fbc_data = (
    labevents.loc[wanted_lab_rows, ['subject_id', 'hadm_id', 'itemid', 'valuenum']]
    .dropna(subset=['valuenum'])
    .assign(lab_name=lambda labs: labs['itemid'].map(fbc_itemids))
)

# Reshape to one row per (subject_id, hadm_id) with one column per lab;
# repeated measurements of the same lab within an admission are averaged.
fbc_pivot = pd.pivot_table(
    fbc_data,
    values='valuenum',
    index=['subject_id', 'hadm_id'],
    columns='lab_name',
    aggfunc='mean',
).reset_index()

# Diagnosis: title of the lowest-`seq_num` diagnosis for each admission.
#
# Join each coded diagnosis to its dictionary entry. When both tables carry an
# `icd_version` column it is made part of the join key: joining on `icd_code`
# alone can pair a code with a same-spelled entry from another ICD revision
# (duplicating rows) and leaves suffixed `icd_version_x` / `icd_version_y`
# columns in the result.
icd_join_keys = ['icd_code']
if 'icd_version' in diagnoses_icd.columns and 'icd_version' in d_icd_diagnoses.columns:
    icd_join_keys.append('icd_version')
diagnosis_data = pd.merge(diagnoses_icd, d_icd_diagnoses, on=icd_join_keys, how='left')
# Stable sort so rows with equal `seq_num` keep a deterministic relative order.
diagnosis_data = diagnosis_data.sort_values(by='seq_num', kind='stable')

# Keep the entire first row per admission. `groupby(...).first()` picks the
# first non-null value column by column, so an admission whose first diagnosis
# has no title would silently receive the title of a later diagnosis.
diagnosis_group_keys = ['subject_id', 'hadm_id']
first_diagnosis = (
    diagnosis_data
    .dropna(subset=diagnosis_group_keys)  # groupby also discarded NaN keys
    .drop_duplicates(subset=diagnosis_group_keys, keep='first')
    .sort_values(by=diagnosis_group_keys)  # same row order groupby produced
    .reset_index(drop=True)
)
first_diagnosis = first_diagnosis[['subject_id', 'hadm_id', 'long_title']]

# Blood pressure from vitalsign.csv: mean systolic and diastolic reading per
# patient (grouped on subject_id only).
bp_column_names = {'sbp': 'Systolic_BP', 'dbp': 'Diastolic_BP'}
vitalsign = vitalsign.rename(columns=bp_column_names)
readings_by_patient = vitalsign.groupby('subject_id')
bp_pivot = readings_by_patient[['Systolic_BP', 'Diastolic_BP']].mean().reset_index()

# Diabetes flag: one row (diabetes = 1) per admission that has at least one
# diagnosis whose code is a dictionary code starting with '250'.
# NOTE(review): only the '250' prefix is matched; if the data also contains
# ICD-10 coded admissions they will never be flagged here - confirm intent.
dictionary_codes = d_icd_diagnoses['icd_code'].astype(str)
diabetes_icd9_codes = dictionary_codes[dictionary_codes.str.startswith('250')].tolist()

is_diabetes_row = diagnoses_icd['icd_code'].astype(str).isin(diabetes_icd9_codes)
diabetes_flag = (
    diagnoses_icd.loc[is_diabetes_row, ['subject_id', 'hadm_id']]
    .assign(diabetes=1)
    .drop_duplicates()
)

# Combine everything onto the admission-level frame. Left joins keep every
# admission even when a source has no matching row.
admission_key_cols = ['subject_id', 'hadm_id']
dataset = (
    patient_info
    .merge(fbc_pivot, on=admission_key_cols, how='left')
    .merge(first_diagnosis, on=admission_key_cols, how='left')
    # Blood pressure is per patient, so it joins on subject_id alone and the
    # same averages repeat across all of that patient's admissions.
    .merge(bp_pivot, on='subject_id', how='left')
    .merge(diabetes_flag, on=admission_key_cols, how='left')
)

# Admissions with no matching diabetes row come back as NaN; make them 0.
diabetes_filled = dataset['diabetes'].fillna(0)
dataset['diabetes'] = diabetes_filled.astype(int)

# Preferred output column order; columns absent from `dataset` are skipped.
columns_order = [
    'subject_id', 'hadm_id', 'gender', 'age',
    'Hemoglobin', 'Hematocrit', 'MCV', 'MCHC', 'MCH', 'RBC', 'RDW',
    'Systolic_BP', 'Diastolic_BP', 'Cholesterol',
    'diabetes', 'long_title',
]
available_cols = [name for name in columns_order if name in dataset.columns]

# Final dataset: an independent copy restricted to the available columns.
final_dataset = dataset.loc[:, available_cols].copy()

# The flag columns derived later need these three inputs; if one is missing
# entirely, create it filled with 0 (it is appended after the other columns).
for required_col in ('Systolic_BP', 'Diastolic_BP', 'Cholesterol'):
    if required_col in final_dataset.columns:
        continue
    final_dataset[required_col] = 0


# Binary blood-pressure flag: 1 when mean systolic is above 130 or mean
# diastolic is above 80. Missing values compare as False, so they yield 0.
high_systolic = final_dataset['Systolic_BP'].gt(130)
high_diastolic = final_dataset['Diastolic_BP'].gt(80)
final_dataset['BP'] = (high_systolic | high_diastolic).astype(int)

# Binary cholesterol flag: 1 when the mean value is above 240, else 0
# (missing values also yield 0).
high_cholesterol = final_dataset['Cholesterol'].gt(240)
final_dataset['cholesterol1'] = high_cholesterol.astype(int)

# Write the result to disk without the index column, then show a short preview.
output_csv = 'mimiciv_dataset_final.csv'
final_dataset.to_csv(output_csv, index=False)
print("✅ Dataset saved as mimiciv_dataset_final.csv")
preview = final_dataset.head()
print(preview)


✅ Dataset saved as mimiciv_dataset_final.csv
   subject_id   hadm_id gender  age  Hemoglobin  Hematocrit        MCV  \
0    10014729  23300884      F   21    7.783333   23.325000  88.666667   
1    10014729  28889419      F   21    9.985714   29.533333  88.571429   
2    10003400  23559586      F   72    7.797826   23.759574  91.377778   
3    10003400  20214994      F   72    9.312500   26.750000  90.187500   
4    10003400  27296885      F   72    8.100000   25.133333  87.666667   

        MCHC        MCH       RBC        RDW  Systolic_BP  Diastolic_BP  \
0  33.300000  29.500000  2.637500  12.766667   117.000000     67.200000   
1  34.071429  30.142857  3.315714  12.057143   117.000000     67.200000   
2  32.900000  30.046667  2.614444  15.780000   114.227273     68.704545   
3  34.831250  31.334375  2.977500  17.334375   114.227273     68.704545   
4  32.333333  28.366667  2.870000  20.600000   114.227273     68.704545   

   Cholesterol  diabetes                                   