In [19]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
def load_data(base_dir="./data"):
    """Read the synthetic source tables from ``base_dir`` into DataFrames.

    Parameters
    ----------
    base_dir : str or path-like, default "./data"
        Directory that contains the ``syn_*.csv`` files.

    Returns
    -------
    dict
        Maps the table names ``patients``, ``admissions``, ``diagnoses``,
        ``icustays``, ``labevents``, ``chartevents`` and ``outputevents``
        to the DataFrame read from the corresponding CSV file.
    """
    root = Path(base_dir)
    # (table name, CSV file name) pairs, read in this order.
    csv_by_table = (
        ("patients", "syn_patients.csv"),
        ("admissions", "syn_admissions.csv"),
        ("diagnoses", "syn_diagnoses_icd.csv"),
        ("icustays", "syn_icustays.csv"),
        ("labevents", "syn_labevents.csv"),
        ("chartevents", "syn_chartevents.csv"),
        ("outputevents", "syn_outputevents.csv"),
    )
    tables = {}
    for table_name, csv_name in csv_by_table:
        tables[table_name] = pd.read_csv(root / csv_name)
    print("Data loaded")
    return tables

In [21]:
def extract_demographics(patients, admissions):
    """Build basic demographic and admission-related features per admission.

    Parameters
    ----------
    patients : pandas.DataFrame
        Must contain ``subject_id``, ``sex`` and ``anchor_age``.
    admissions : pandas.DataFrame
        Must contain ``subject_id``, ``hadm_id``, ``admission_type``,
        ``insurance`` and ``ethnicity``.

    Returns
    -------
    pandas.DataFrame
        One row per ``hadm_id`` (sorted by ``hadm_id``) with ``anchor_age``,
        ``sex`` and one float 0/1 indicator column per category, named
        ``<column>_<category>``.
    """
    demo = admissions.merge(patients, on="subject_id", how="left")

    # Age and sex. Sex values other than "M"/"F" become NaN here.
    demo["sex"] = demo["sex"].map({"M": 1, "F": 0})
    demo["anchor_age"] = demo["anchor_age"].fillna(demo["anchor_age"].median())

    # Categorical features (e.g. insurance type, admission type).
    cat_cols = ["admission_type", "insurance", "ethnicity"]
    demo[cat_cols] = demo[cat_cols].fillna("Unknown")

    # One-hot encode with pandas instead of OneHotEncoder(sparse=False): the
    # `sparse` keyword no longer exists in recent scikit-learn releases.
    # `columns=cat_cols` forces encoding even for numeric-coded columns, and
    # dtype=float keeps the float indicator columns the encoder produced.
    encoded_df = pd.get_dummies(demo[cat_cols], columns=cat_cols, dtype=float)

    # Both pieces derive from `demo`, so they share its index and align row-wise.
    demo_features = pd.concat(
        [demo[["hadm_id", "anchor_age", "sex"]], encoded_df],
        axis=1,
    )

    # Collapse to a single row per admission.
    demo_features = demo_features.groupby("hadm_id").first().reset_index()
    return demo_features


def aggregate_labevents(labevents):
    """Summarize lab values per admission (``hadm_id``) and lab item.

    Computes mean, std, min, max and count of ``valuenum`` for every
    (``hadm_id``, ``itemid``) pair and pivots the items into columns named
    ``<stat>_<itemid>``. Pairs that do not occur are filled with 0.

    Returns a DataFrame with ``hadm_id`` as a regular column.
    """
    stats = ["mean", "std", "min", "max", "count"]
    per_item = labevents.groupby(["hadm_id", "itemid"])["valuenum"].agg(stats)
    wide = per_item.unstack(fill_value=0)
    # Flatten the (stat, itemid) column MultiIndex into plain strings.
    wide.columns = [f"{stat}_{item}" for stat, item in wide.columns]
    return wide.reset_index()


def aggregate_chartevents(chartevents):
    """Summarize chart events per admission, in the same way as lab events.

    Computes mean, std, min and max of ``valuenum`` for every
    (``hadm_id``, ``itemid``) pair and pivots the items into columns named
    ``chart_<stat>_<itemid>``. Pairs that do not occur are filled with 0.

    Returns a DataFrame with ``hadm_id`` as a regular column.
    """
    stats = ["mean", "std", "min", "max"]
    per_item = chartevents.groupby(["hadm_id", "itemid"])["valuenum"].agg(stats)
    wide = per_item.unstack(fill_value=0)
    # Flatten the (stat, itemid) column MultiIndex into plain strings.
    wide.columns = [f"chart_{stat}_{item}" for stat, item in wide.columns]
    return wide.reset_index()


def extract_diagnosis_features(diagnoses):
    """Turn ICD diagnoses into a one-hot style indicator table.

    Each distinct ``icd_code`` becomes a float column ``diag_<code>`` that is
    1 when the admission carries that code and 0 otherwise. Columns follow the
    sorted order of the codes; ``hadm_id`` is appended as the last column.
    """
    per_admission = diagnoses.groupby("hadm_id")["icd_code"].apply(list).reset_index()
    vocabulary = sorted({code for codes in per_admission["icd_code"] for code in codes})
    column_of = dict(zip(vocabulary, range(len(vocabulary))))

    indicator = np.zeros((len(per_admission), len(vocabulary)))
    for row, codes in enumerate(per_admission["icd_code"]):
        # Set all of this admission's code columns at once; repeated codes
        # simply write the same 1 again.
        indicator[row, [column_of[code] for code in codes]] = 1

    diag_feat = pd.DataFrame(indicator, columns=[f"diag_{code}" for code in vocabulary])
    diag_feat["hadm_id"] = per_admission["hadm_id"]
    return diag_feat


def extract_icustay_features(icustays):
    """Build ICU-stay features per admission.

    Returns a DataFrame with one row per ``hadm_id`` holding ``icu_count``
    (count of ``stay_id`` values) and ``total_icu_los`` (sum of ``los``).
    """
    summary_spec = {
        "icu_count": pd.NamedAgg(column="stay_id", aggfunc="count"),
        "total_icu_los": pd.NamedAgg(column="los", aggfunc="sum"),
    }
    per_admission = icustays.groupby("hadm_id").agg(**summary_spec)
    return per_admission.reset_index()


def aggregate_outputevents(outputs):
    """Summarize output-event values per admission.

    Returns a DataFrame with one row per ``hadm_id`` and the columns
    ``output_sum``, ``output_mean`` and ``output_count`` computed from
    ``value``.
    """
    values_by_admission = outputs.groupby("hadm_id")["value"]
    # Named aggregation yields the final column names directly.
    summary = values_by_admission.agg(
        output_sum="sum",
        output_mean="mean",
        output_count="count",
    )
    return summary.reset_index()

In [22]:
def merge_features(demo, diag, labs, charts, icu, outputs):
    """Outer-join all per-admission feature tables on ``hadm_id``.

    The tables are merged left to right in argument order, so an admission
    present in any table is kept. Every value still missing after the joins
    is replaced with 0.
    """
    merged = demo
    for table in (diag, labs, charts, icu, outputs):
        merged = merged.merge(table, on="hadm_id", how="outer")
    return merged.fillna(0)


def scale_features(df):
    """Standardize every feature column of ``df`` except ``hadm_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing a ``hadm_id`` column; all other columns are
        passed to the scaler.

    Returns
    -------
    tuple
        ``(scaled, scaler)`` where ``scaled`` is a DataFrame of the
        standardized columns and ``scaler`` is the fitted ``StandardScaler``.
    """
    numeric = df.drop(columns=["hadm_id"])
    scaler = StandardScaler()
    X = scaler.fit_transform(numeric)
    # Keep df's row labels so the scaled rows still line up with df (and its
    # hadm_id column) when df does not carry a default RangeIndex.
    scaled = pd.DataFrame(X, columns=numeric.columns, index=numeric.index)
    return scaled, scaler

In [23]:
def compute_similarity(X_scaled, hadm_ids):
    """Compute the pairwise cosine similarity between the rows of ``X_scaled``.

    Returns a square DataFrame whose row and column labels are both
    ``hadm_ids``.
    """
    pairwise = cosine_similarity(X_scaled)
    return pd.DataFrame(data=pairwise, index=hadm_ids, columns=hadm_ids)

In [24]:

if __name__ == "__main__":
    # Load every source table from ./data into a dict of DataFrames.
    dfs = load_data("./data")

    # Build one feature table per data source; each carries a hadm_id column.
    demo = extract_demographics(dfs["patients"], dfs["admissions"])
    diag = extract_diagnosis_features(dfs["diagnoses"])
    labs = aggregate_labevents(dfs["labevents"])
    charts = aggregate_chartevents(dfs["chartevents"])
    icu = extract_icustay_features(dfs["icustays"])
    outputs = aggregate_outputevents(dfs["outputevents"])

    # Join the feature tables on hadm_id, standardize the feature columns,
    # and compute the admission-by-admission cosine similarity matrix.
    features = merge_features(demo, diag, labs, charts, icu, outputs)
    X_scaled, scaler = scale_features(features)
    sim_df = compute_similarity(X_scaled, features["hadm_id"])

    # Save the (unscaled) feature table and the similarity matrix as CSV;
    # the similarity matrix is written together with its hadm_id index.
    features.to_csv("./patient_features.csv", index=False)
    sim_df.to_csv("./patient_similarity_matrix.csv")
    print("Feature extraction complete")

Data loaded
Feature extraction complete
