<a href="https://colab.research.google.com/github/XORbit01/IHCA_data_cleaning/blob/main/IHCA_ENCODING_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [128]:
!pip install pandas



In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

DATA_PATH = "ihca_cleaned.xlsx"



In [130]:
df = pd.read_excel(DATA_PATH)
df.columns = df.columns.str.lower().str.strip()

print("Shape:", df.shape)
df.head()


Shape: (873, 28)


Unnamed: 0,mrn,gender,birth_year,arrest_year,age,coronary_artery_disease,heart_failure,heart_disease,hypertension,copd,...,survival_24h,survival_to_discharge,origin,cpr_dt,rosc_dt,arrest_dt,cpr_duration,arrest_to_cpr,shock_count,max_energy
0,3143444,M,1967.0,2018,51.0,N,N,N,Y,N,...,Y,Y,Ward,2018-06-26 15:15:00,2018-06-26 15:30:00,2018-06-26 15:15:00,-0.489583,0.0,0,0
1,3151161,F,1963.0,2018,55.0,N,N,N,N,N,...,N,N,ICU,2018-07-14 14:32:00,2018-07-14 15:10:00,2018-07-14 14:30:00,0.026389,0.001389,0,0
2,3151503,F,1965.0,2018,53.0,N,N,N,Y,N,...,N,N,Ward,2018-07-30 15:30:00,2018-07-30 15:30:00,2018-07-30 15:30:00,0.0,0.0,0,0
3,3193356,F,1940.0,2018,78.0,N,N,N,Y,N,...,Y,Y,ICU,2018-10-14 14:00:00,NaT,2018-10-14 14:00:00,,0.0,0,0
4,3197363,M,1944.0,2018,74.0,N,N,N,Y,N,...,N,N,Ward,2018-06-11 04:15:00,2018-11-06 04:30:00,2018-06-11 04:15:00,148.010417,0.0,0,0


## ENCODING SECTION

List All Y/N Columns

In [131]:
binary_cols = [
    'coronary_artery_disease',
    'heart_failure',
    'heart_disease',
    'hypertension',
    'copd',
    'diabetes',
    'cancer',
    'covid_on_admission',
    'rosc',
    'survival_24h',
    'survival_to_discharge',
    'smoking'   # now only Y/N/U
]


In [132]:
binary_yes_no = [
    'coronary_artery_disease',
    'heart_failure',
    'heart_disease',
    'hypertension',
    'copd',
    'diabetes',
    'cancer',
    'covid_on_admission',
    'rosc',
    'survival_24h',
    'survival_to_discharge'
]


In [133]:
yes_no_map = {'Y': 1, 'N': 0, 'U':-1}

df[binary_yes_no] = df[binary_yes_no].replace(yes_no_map)


In [134]:
# Smoking contains:
# Y → 1
# N → 0
# U → keep as separate category
# To encode:
df['smoking'] = df['smoking'].replace({'Y': 1, 'N': 0, 'U': -1})

In [135]:
df['gender'] = df['gender'].map({'M': 1, 'F': 0, 'U':-1})

### Shockable rhythm flag

In [136]:
shockable = ['VF', 'VT']

df['shockable_rhythm'] = df['initial_rhythm'].isin(shockable).astype(int)

In [137]:
# One-Hot Encode Initial Rhythm ===
rhythm_dummies = pd.get_dummies(
    df['initial_rhythm'],
    prefix='rhythm',
    dummy_na=True,
    dtype=int
)
df = pd.concat([df.drop(columns=['initial_rhythm']), rhythm_dummies], axis=1)

In [138]:
# === 6) One-Hot Encode Event Location ===
event_loc_dummies = pd.get_dummies(
    df['event_location'],
    prefix='loc',
    dtype=int
)

In [139]:
df = pd.concat([df.drop(columns=['event_location']), event_loc_dummies], axis=1)

In [140]:
# Origin frequently conflicts with event_location (ICU origin but event in CCU, etc.)
df = df.drop(columns=['origin'])

In [141]:
# === 8) Shock Count & Max Energy → numeric ===
df['shock_count'] = pd.to_numeric(df['shock_count'], errors='coerce')
df['max_energy']  = pd.to_numeric(df['max_energy'],  errors='coerce')

In [142]:
# === 9) Datetime columns & clean durations ===
df['cpr_dt']    = pd.to_datetime(df['cpr_dt'])
df['rosc_dt']   = pd.to_datetime(df['rosc_dt'])
df['arrest_dt'] = pd.to_datetime(df['arrest_dt'])

In [143]:
# === 9) Datetime columns & clean durations ===
df['cpr_duration_min'] = (df['rosc_dt'] - df['cpr_dt']).dt.total_seconds() / 60
df['arrest_to_cpr_min'] = (df['cpr_dt'] - df['arrest_dt']).dt.total_seconds() / 60


In [144]:
for col in ['cpr_duration', 'arrest_to_cpr']:
    if col in df.columns:
        df = df.drop(columns=[col])

In [145]:
encoding_info = {
    "binary_yes_no": binary_yes_no,
    "binary_yes_no_map": yes_no_map,
    "smoking_encoding": {"Y": 1, "N": 0, "U": -1},
    "gender_encoding": {"M": 1, "F": 0},
    "shockable_rhythm": {"VF/VT": 1, "other": 0},
    "initial_rhythm_dummies": list(rhythm_dummies.columns),
    "event_location_dummies": list(event_loc_dummies.columns),
    "duration_columns": ["cpr_duration_min", "arrest_to_cpr_min"],
}

encoding_info

{'binary_yes_no': ['coronary_artery_disease',
  'heart_failure',
  'heart_disease',
  'hypertension',
  'copd',
  'diabetes',
  'cancer',
  'covid_on_admission',
  'rosc',
  'survival_24h',
  'survival_to_discharge'],
 'binary_yes_no_map': {'Y': 1, 'N': 0, 'U': -1},
 'smoking_encoding': {'Y': 1, 'N': 0, 'U': -1},
 'gender_encoding': {'M': 1, 'F': 0},
 'shockable_rhythm': {'VF/VT': 1, 'other': 0},
 'initial_rhythm_dummies': ['rhythm_AF',
  'rhythm_Asystole',
  'rhythm_Bradycardia',
  'rhythm_PEA',
  'rhythm_Sinus/Other',
  'rhythm_VF',
  'rhythm_VT',
  'rhythm_nan'],
 'event_location_dummies': ['loc_CCU',
  'loc_COVID_WARD',
  'loc_CVU',
  'loc_DSU',
  'loc_FMS',
  'loc_ICU',
  'loc_MMS',
  'loc_NEONATAL_ICU',
  'loc_OBSTETRICS',
  'loc_OPERATING_ROOM',
  'loc_PEDIATRIC',
  'loc_PEDIATRIC_ICU',
  'loc_POST_ICU',
  'loc_UNKNOWN'],
 'duration_columns': ['cpr_duration_min', 'arrest_to_cpr_min']}

In [146]:
# we want for EDA this dataframe
# Columns we *explicitly* keep
base_eda_cols = [
    # Demographics
    "age", "gender", "smoking",

    # Comorbidities
    "coronary_artery_disease",
    "heart_failure",
    "heart_disease",
    "hypertension",
    "copd",
    "diabetes",
    "cancer",
    "covid_on_admission",

    # Arrest characteristics
    "shock_count",
    "max_energy",
    "shockable_rhythm",

    # Outcomes
    "rosc",
    "survival_24h",
    "survival_to_discharge",

    # Timing
    "cpr_duration_min",
    "arrest_to_cpr_min",
    "cpr_dt",
    "arrest_dt",
    "rosc_dt",
]

# Add all rhythm_* and loc_* dummies automatically
rhythm_cols = [c for c in df.columns if c.startswith("rhythm_")]
loc_cols    = [c for c in df.columns if c.startswith("loc_")]

eda_cols = base_eda_cols + rhythm_cols + loc_cols

# Final EDA dataframe
df_eda = df[eda_cols].copy()


In [147]:
df_eda.head(5)

Unnamed: 0,age,gender,smoking,coronary_artery_disease,heart_failure,heart_disease,hypertension,copd,diabetes,cancer,...,loc_FMS,loc_ICU,loc_MMS,loc_NEONATAL_ICU,loc_OBSTETRICS,loc_OPERATING_ROOM,loc_PEDIATRIC,loc_PEDIATRIC_ICU,loc_POST_ICU,loc_UNKNOWN
0,51.0,1,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,55.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,53.0,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,78.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,74.0,1,0,0,0,0,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [148]:
!pip install ydata-profiling



In [149]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df_eda, title="IHCA EDA Report")
profile.to_file("ihca_eda_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/44 [00:00<?, ?it/s][A
 14%|█▎        | 6/44 [00:00<00:00, 50.63it/s][A
 27%|██▋       | 12/44 [00:00<00:00, 44.65it/s][A
 39%|███▊      | 17/44 [00:00<00:00, 44.81it/s][A
 52%|█████▏    | 23/44 [00:00<00:00, 49.41it/s][A
 66%|██████▌   | 29/44 [00:00<00:00, 47.74it/s][A
 77%|███████▋  | 34/44 [00:00<00:00, 42.42it/s][A
 89%|████████▊ | 39/44 [00:00<00:00, 41.18it/s][A
100%|██████████| 44/44 [00:01<00:00, 41.88it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [150]:
df['coronary_artery_disease'].unique(), df['coronary_artery_disease'].dtype

(array([0, 1, -1], dtype=object), dtype('O'))

In [153]:
# export
df_eda.to_excel("ihca_eda_encoded.xlsx", index=False)