# Synthetic Dataset Expansion using Synthetic Minority Over-sampling Technique-Eigenvector Centrality (SMOTE-EC) method.

The cleaned dataset ICU_Cleaned.csv is used for expansion, adding additional columns

In [2]:
# ==========================================================
# ICU pipeline:
# 1) Add 'PH of blood' and 'Oxygen level' (2 decimals)
# 2) Expand dataset to EXACTLY 1200 rows (SMOTE-EC style)
# 3) Ensure ID, AGE, SYS_BP, HEART_RATE are integers
# 4) Guarantee unique IDs (no duplicates)
# Output saved in SAME folder as notebook.
# ==========================================================

import os
import numpy as np
import pandas as pd

np.random.seed(1442)

In [3]:
# --------- helpers ---------
def _is_cat(s):
    return s.dtype == "object" or str(s.dtype) == "category"

def _synthesize_between(a, b, noise_scale=0.02):
    lam = np.random.rand()
    base = a + lam * (b - a)
    span = abs(b - a)
    noise = np.random.normal(0, max(noise_scale * (span if span>0 else 1.0), 1e-6))
    return base + noise

def _expand_class(df_class, add_n, num_cols, cat_cols, exclude_cols=None):
    exclude_cols = set(exclude_cols or [])
    rows = []
    if len(df_class) == 0:
        return pd.DataFrame(columns=df_class.columns)
    idx = df_class.index.to_numpy()
    for _ in range(add_n):
        i1, i2 = np.random.choice(idx, size=2, replace=True)
        r1, r2 = df_class.loc[i1], df_class.loc[i2]
        new = {}
        for c in df_class.columns:
            if c in exclude_cols:
                new[c] = r1[c]
            elif c in num_cols:
                v1, v2 = r1[c], r2[c]
                if pd.isna(v1): v1 = df_class[c].median(skipna=True)
                if pd.isna(v2): v2 = df_class[c].median(skipna=True)
                try:
                    new[c] = float(_synthesize_between(float(v1), float(v2)))
                except Exception:
                    med = df_class[c].median(skipna=True)
                    new[c] = float(med) if pd.notna(med) else 0.0
            elif c in cat_cols:
                v = np.random.choice([r1[c], r2[c]])
                if pd.isna(v):
                    m = df_class[c].mode(dropna=True)
                    v = m.iloc[0] if not m.empty else "Unknown"
                new[c] = v
            else:
                new[c] = r1[c]
        rows.append(new)
    return pd.DataFrame(rows)

In [4]:
def add_columns_then_expand(
    input_csv= r'..\Data_Cleaning\ICU_Cleaned.csv',
    output_name="ICU_Expanded_raw.csv",
    target_total=1200,
    id_col="ID",
    int_cols=("AGE","SYS_BP","HEART_RATE")
):
    base_dir = os.getcwd()
    output_path = os.path.join(base_dir, output_name)

    # ---- 1) Load ----
    df = pd.read_csv(input_csv)
    if df.empty:
        raise ValueError("ICU.csv has 0 rows.")
    df = df.loc[:, df.notna().any()]  # drop empty columns

    # ---- 2) Add two new numeric columns (rounded to 2 decimals) ----
    ph = np.random.normal(loc=7.40, scale=0.04, size=len(df))
    spo2 = np.random.normal(loc=96.0, scale=2.5, size=len(df))

    outcome_candidates = [
        "Outcome","outcome","Label","label","Class","class",
        "Mortality","mortality","Death","death","Deceased","deceased",
        "Survived","survived","ICU_outcome","icu_outcome","Status","status"
    ]
    y_like = next((c for c in outcome_candidates if c in df.columns), None)
    if y_like is not None:
        bad_tokens = ["death","deceased","expired","mortality","critical","severe","icu","yes","1","readmit"]
        ys = df[y_like].astype(str).str.lower()
        mask = ys.isin(bad_tokens) | ys.str.contains("|".join(bad_tokens), na=False)
        if mask.sum() > 0:
            ph[mask] -= np.random.normal(0.03, 0.01, size=mask.sum())
            spo2[mask] -= np.random.normal(4.0, 1.5, size=mask.sum())

    ph = np.clip(ph, 7.0, 7.7)
    spo2 = np.clip(spo2, 70.0, 100.0)
    df["PH of blood"] = np.round(ph, 2)
    df["Oxygen level"] = np.round(spo2, 2)

    # ---- 3) Prepare column types ----
    cat_cols = [c for c in df.columns if _is_cat(df[c])]
    num_cols = [c for c in df.columns if c not in cat_cols and pd.api.types.is_numeric_dtype(df[c])]
    exclude_cols = []
    if id_col in df.columns:
        if id_col in num_cols:
            num_cols.remove(id_col)
        exclude_cols.append(id_col)

    # ---- 4) Create label for SMOTE-EC ----
    y_col = y_like
    created_pseudo = False
    if y_col is None:
        y_col = None
        for c in cat_cols:
            if 2 <= df[c].nunique(dropna=True) <= 10:
                y_col = c
                break
        if y_col is None:
            created_pseudo = True
            y_col = "__pseudo_label__"
            num_choice = None
            for c in num_cols:
                if df[c].nunique(dropna=True) >= 10:
                    num_choice = c
                    break
            if num_choice is None:
                if len(num_cols) == 0:
                    df["synthetic_metric"] = np.random.normal(0, 1, size=len(df))
                    num_cols.append("synthetic_metric")
                    num_choice = "synthetic_metric"
                else:
                    num_choice = num_cols[0]
            df[y_col] = pd.qcut(df[num_choice], q=3, labels=["low","mid","high"], duplicates="drop")
            cat_cols.append(y_col)

    # ---- 5) Expand to target_total ----
    counts = df[y_col].value_counts(dropna=False)
    current_total = int(counts.sum())

    if current_total >= target_total:
        df_final = df.sample(n=target_total, random_state=1442).reset_index(drop=True)
        is_synth_flags = np.zeros(len(df_final), dtype=bool)
    else:
        to_add_total = target_total - current_total
        classes = counts.index.tolist()
        props = (counts / counts.sum()).to_numpy()
        base = np.floor(props * to_add_total).astype(int)
        remainder = to_add_total - base.sum()
        fracs = (props * to_add_total) - base
        order = np.argsort(-fracs)
        for i in range(remainder):
            base[order[i % len(order)]] += 1

        synth_parts = []
        for cls, add_n in zip(classes, base):
            if add_n <= 0:
                continue
            df_cls = df[df[y_col] == cls].copy()
            synth = _expand_class(df_cls, add_n, num_cols, cat_cols, exclude_cols)
            synth_parts.append(synth)

        df_synth = pd.concat(synth_parts, ignore_index=True)
        df_final = pd.concat([df, df_synth], ignore_index=True).reset_index(drop=True)
        is_synth_flags = np.zeros(len(df_final), dtype=bool)
        is_synth_flags[-len(df_synth):] = True

    # ---- 6) Integer columns and bounds ----
    def _coerce_int(series, lo=None, hi=None, fill=0):
        s = pd.to_numeric(series, errors="coerce").fillna(fill)
        if lo is not None: s = s.clip(lower=lo)
        if hi is not None: s = s.clip(upper=hi)
        return s.round(0).astype(int)

    if "AGE" in df_final.columns:
        df_final["AGE"] = _coerce_int(df_final["AGE"], 0, 120, 30)
    if "SYS_BP" in df_final.columns:
        df_final["SYS_BP"] = _coerce_int(df_final["SYS_BP"], 50, 250, 110)
    if "HEART_RATE" in df_final.columns:
        df_final["HEART_RATE"] = _coerce_int(df_final["HEART_RATE"], 30, 220, 80)

    # ---- 7) Unique integer IDs ----
    if id_col in df_final.columns:
        ids = pd.to_numeric(df_final[id_col], errors="coerce")
        if ids.notna().sum() > 0:
            ids = ids.fillna(-1).astype(int)
            orig_ids = ids[~is_synth_flags]
            if orig_ids.duplicated().any() or (orig_ids < 0).any():
                df_final[id_col] = np.arange(1, len(df_final) + 1)
            else:
                max_id = int(orig_ids.max()) if len(orig_ids) else 0
                next_id = max_id + 1
                for i, is_syn in enumerate(is_synth_flags):
                    if is_syn:
                        ids.iat[i] = next_id
                        next_id += 1
                df_final[id_col] = ids.astype(int)
        else:
            df_final[id_col] = np.arange(1, len(df_final) + 1)
    else:
        df_final[id_col] = np.arange(1, len(df_final) + 1)

    # ---- 8) Final cleanup ----
    if "__pseudo_label__" in df_final.columns:
        df_final = df_final.drop(columns=["__pseudo_label__"])

    df_final["PH of blood"] = df_final["PH of blood"].round(2)
    df_final["Oxygen level"] = df_final["Oxygen level"].round(2)

    # ---- 9) Save ----
    df_final.to_csv(output_path, index=False)
    print(f"✅ Saved file: {output_path}  (rows={len(df_final)})")
    print(f"• Unique IDs: {not df_final[id_col].duplicated().any()}")
    print(f"• pH range: {df_final['PH of blood'].min()} – {df_final['PH of blood'].max()}")
    print(f"• SpO₂ range: {df_final['Oxygen level'].min()} – {df_final['Oxygen level'].max()}")

    return df_final

In [5]:
# --------- RUN IT ----------
_ = add_columns_then_expand(r'..\Data_Cleaning\ICU_Cleaned.csv')

✅ Saved file: D:\SIDDHARTHA\MASTER'S TAMUCC\MATH 1442 Labs Modernization Project\Github\MATH-1442-Statistics-For-Life-Labs-Modernization-Project\Lab-09\Data_Independent_Sample\Data_Expansion\ICU_Expanded_raw.csv  (rows=1200)
• Unique IDs: True
• pH range: 7.29 – 7.5
• SpO₂ range: 89.61 – 100.15


# Cleaning the resultant dataset again to remove any inconsistencies

In [11]:
# Step-1: Importing the raw dataset.
# --------------------------------------------------
import pandas as pd
ICU_E = pd.read_csv('ICU_Expanded_raw.csv')
ICU_E

Unnamed: 0,ID,STATUS,AGE,SEX,SERVICE,CPR,SYS_BP,HEART_RATE,PREV_ICU,CONSCIOUS,PH of blood,Oxygen level
0,8,Lived,27,Female,Medical,No,142,88,No,Conscious,7.43,97.02
1,12,Lived,59,Male,Medical,No,112,80,Yes,Conscious,7.39,98.01
2,14,Lived,77,Male,Surgical,No,100,70,No,Conscious,7.41,93.22
3,28,Lived,54,Male,Medical,No,142,103,No,Conscious,7.35,95.70
4,32,Lived,87,Female,Surgical,No,110,154,Yes,Conscious,7.35,97.11
...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1925,Died,40,Male,Surgical,No,129,60,No,Coma,7.35,98.86
1196,1926,Died,51,Female,Medical,No,124,81,No,Coma,7.44,97.10
1197,1927,Died,75,Female,Medical,No,101,96,Yes,Conscious,7.42,95.28
1198,1928,Died,66,Male,Medical,No,62,103,No,Coma,7.36,95.82


In [13]:
# Step-2: Renaming the column names
# --------------------------------------------------
rename_map = {'PH of blood': 'PH_BLOOD',
              'Oxygen level': 'SPO2'}

ICU_E_1 = ICU_E.rename(columns=rename_map)

ICU_E_1.head()


Unnamed: 0,ID,STATUS,AGE,SEX,SERVICE,CPR,SYS_BP,HEART_RATE,PREV_ICU,CONSCIOUS,PH_BLOOD,SPO2
0,8,Lived,27,Female,Medical,No,142,88,No,Conscious,7.43,97.02
1,12,Lived,59,Male,Medical,No,112,80,Yes,Conscious,7.39,98.01
2,14,Lived,77,Male,Surgical,No,100,70,No,Conscious,7.41,93.22
3,28,Lived,54,Male,Medical,No,142,103,No,Conscious,7.35,95.7
4,32,Lived,87,Female,Surgical,No,110,154,Yes,Conscious,7.35,97.11


In [15]:
# Step-3: Exporting file as CSV to the current working directory
# --------------------------------------------------
output_file_name = 'ICUAdmissions.csv'
ICU_E_1.to_csv(output_file_name, index=False)

print(f"Dataset successfully cleaned and saved to {output_file_name}")

Dataset successfully cleaned and saved to ICUAdmissions.csv
