In [1]:
import numpy as np
import pandas as pd

References:
* https://hbiostat.org/data/repo/rhc
* https://search.r-project.org/CRAN/refmans/ATbounds/html/RHC.html

Note that two sources have different number of columns and column descriptions. Data is only available in the first link. However, most research papers use the second link as column name references. So this notebook tend to create a cleaned version of rhc dataset according to the criteria in the second link.

* "survival" column, which is Y, is a binary classifier. But we want time here to do survival analysis, so I use "t3d30" instead.

In [2]:
df = pd.read_csv('data/raw_rhc.csv')

print(f"Shape of the dataset: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head()

Shape of the dataset: (5735, 63)
Columns: ['Unnamed: 0', 'cat1', 'cat2', 'ca', 'sadmdte', 'dschdte', 'dthdte', 'lstctdte', 'death', 'cardiohx', 'chfhx', 'dementhx', 'psychhx', 'chrpulhx', 'renalhx', 'liverhx', 'gibledhx', 'malighx', 'immunhx', 'transhx', 'amihx', 'age', 'sex', 'edu', 'surv2md1', 'das2d3pc', 't3d30', 'dth30', 'aps1', 'scoma1', 'meanbp1', 'wblc1', 'hrt1', 'resp1', 'temp1', 'pafi1', 'alb1', 'hema1', 'bili1', 'crea1', 'sod1', 'pot1', 'paco21', 'ph1', 'swang1', 'wtkilo1', 'dnr1', 'ninsclas', 'resp', 'card', 'neuro', 'gastr', 'renal', 'meta', 'hema', 'seps', 'trauma', 'ortho', 'adld3p', 'urin1', 'race', 'income', 'ptid']


Unnamed: 0.1,Unnamed: 0,cat1,cat2,ca,sadmdte,dschdte,dthdte,lstctdte,death,cardiohx,...,meta,hema,seps,trauma,ortho,adld3p,urin1,race,income,ptid
0,1,COPD,,Yes,11142,11151.0,,11382,No,0,...,No,No,No,No,No,0.0,,white,Under $11k,5
1,2,MOSF w/Sepsis,,No,11799,11844.0,11844.0,11844,Yes,1,...,No,No,Yes,No,No,,1437.0,white,Under $11k,7
2,3,MOSF w/Malignancy,MOSF w/Sepsis,Yes,12083,12143.0,,12400,No,0,...,No,No,No,No,No,,599.0,white,$25-$50k,9
3,4,ARF,,No,11146,11183.0,11183.0,11182,Yes,0,...,No,No,No,No,No,,,white,$11-$25k,10
4,5,MOSF w/Sepsis,,No,12035,12037.0,12037.0,12036,Yes,0,...,No,No,No,No,No,,64.0,white,Under $11k,11


In [3]:
df.columns

Index(['Unnamed: 0', 'cat1', 'cat2', 'ca', 'sadmdte', 'dschdte', 'dthdte',
       'lstctdte', 'death', 'cardiohx', 'chfhx', 'dementhx', 'psychhx',
       'chrpulhx', 'renalhx', 'liverhx', 'gibledhx', 'malighx', 'immunhx',
       'transhx', 'amihx', 'age', 'sex', 'edu', 'surv2md1', 'das2d3pc',
       't3d30', 'dth30', 'aps1', 'scoma1', 'meanbp1', 'wblc1', 'hrt1', 'resp1',
       'temp1', 'pafi1', 'alb1', 'hema1', 'bili1', 'crea1', 'sod1', 'pot1',
       'paco21', 'ph1', 'swang1', 'wtkilo1', 'dnr1', 'ninsclas', 'resp',
       'card', 'neuro', 'gastr', 'renal', 'meta', 'hema', 'seps', 'trauma',
       'ortho', 'adld3p', 'urin1', 'race', 'income', 'ptid'],
      dtype='object')

In [4]:
# Treatment A
A_raw = df["swang1"]
if A_raw.dtype == "O":
    A = (A_raw == "RHC").astype(int)
else:
    A = (A_raw.astype(float) > 0).astype(int)

# Outcome Y
Y = df["t3d30"]

# Covariates X - all columns except Y, A, Z, W
exclude_cols = ["swang1", "t3d30", "pafi1", "paco21", "ph1", "hema1", "Unnamed: 0"]
X = df.drop(columns=exclude_cols).copy()

# Create binary columns for cat1 (primary disease category, omitted = ARF)
if "cat1" in X.columns:
    X["cat1_CHF"] = (X["cat1"] == "CHF").astype(int)
    X["cat1_Cirrhosis"] = (X["cat1"] == "Cirrhosis").astype(int)
    X["cat1_Colon_Cancer"] = (X["cat1"] == "Colon Cancer").astype(int)
    X["cat1_Coma"] = (X["cat1"] == "Coma").astype(int)
    X["cat1_COPD"] = (X["cat1"] == "COPD").astype(int)
    X["cat1_Lung_Cancer"] = (X["cat1"] == "Lung Cancer").astype(int)
    X["cat1_MOSF_Malignancy"] = (X["cat1"] == "MOSF w/Malignancy").astype(int)
    X["cat1_MOSF_Sepsis"] = (X["cat1"] == "MOSF w/Sepsis").astype(int)
    X = X.drop(columns=["cat1"])

# Create binary columns for cat2 (secondary disease category, omitted = NA)
if "cat2" in X.columns:
    X["cat2_Cirrhosis"] = (X["cat2"] == "Cirrhosis").astype(int)
    X["cat2_Colon_Cancer"] = (X["cat2"] == "Colon Cancer").astype(int)
    X["cat2_Coma"] = (X["cat2"] == "Coma").astype(int)
    X["cat2_Lung_Cancer"] = (X["cat2"] == "Lung Cancer").astype(int)
    X["cat2_MOSF_Malignancy"] = (X["cat2"] == "MOSF w/Malignancy").astype(int)
    X["cat2_MOSF_Sepsis"] = (X["cat2"] == "MOSF w/Sepsis").astype(int)
    X = X.drop(columns=["cat2"])

# Create binary columns for income (omitted category = Under $11k)
if "income" in X.columns:
    X["income1"] = (X["income"] == "$11-$25k").astype(int)
    X["income2"] = (X["income"] == "$25-$50k").astype(int)
    X["income3"] = (X["income"] == "> $50k").astype(int)
    X = X.drop(columns=["income"])

# Create binary columns for cancer (omitted category = no cancer)
if "ca" in X.columns:
    X["ca_Yes"] = (X["ca"] == "Yes").astype(int)
    X["ca_Metastatic"] = (X["ca"] == "Metastatic").astype(int)
    X = X.drop(columns=["ca"])

# Create binary columns for insurance (omitted category = Private)
if "ninsclas" in X.columns:
    X["ninsclas_Medicaid"] = (X["ninsclas"] == "Medicaid").astype(int)
    X["ninsclas_Medicare"] = (X["ninsclas"] == "Medicare").astype(int)
    X["ninsclas_Medicare_and_Medicaid"] = (X["ninsclas"] == "Medicare & Medicaid").astype(int)
    X["ninsclas_No_insurance"] = (X["ninsclas"] == "No Insurance").astype(int)
    X["ninsclas_Private_and_Medicare"] = (X["ninsclas"] == "Private & Medicare").astype(int)
    X = X.drop(columns=["ninsclas"])

# Convert categorical variables if needed
if "sex" in X.columns and X["sex"].dtype == "O":
    X["sex"] = (X["sex"] == "Female").astype(int)

# Create binary columns for race (omitted category = White)
if "race" in X.columns and X["race"].dtype == "O":
    X["race_black"] = (X["race"] == "black").astype(int)
    X["race_other"] = (X["race"] == "other").astype(int)
    X = X.drop(columns=["race"])

# Proxies Z & W
Z = df[["pafi1", "paco21"]].copy() 
W = df[["ph1", "hema1"]].copy()  

analysis_cols = pd.concat(
    [
        Y.rename("Y"),
        A.rename("A"),
        X,
        Z.rename(columns={"pafi1": "pafi1", "paco21": "paco21"}),
        W.rename(columns={"ph1": "ph1", "hema1": "hema1"}),
    ],
    axis=1,
)

analysis_df = analysis_cols.copy()

# Overwrite with cleaned arrays
Y = analysis_df["Y"].values
A = analysis_df["A"].values.astype(int)
X_colnames = [col for col in analysis_df.columns if col not in ["Y", "A", "pafi1", "paco21", "ph1", "hema1"]]
X = analysis_df[X_colnames]
Z = analysis_df[["pafi1", "paco21"]]
W = analysis_df[["ph1", "hema1"]]

print(analysis_df.shape)
analysis_df.head()


(5735, 82)


Unnamed: 0,Y,A,sadmdte,dschdte,dthdte,lstctdte,death,cardiohx,chfhx,dementhx,...,ninsclas_Medicare,ninsclas_Medicare_and_Medicaid,ninsclas_No_insurance,ninsclas_Private_and_Medicare,race_black,race_other,pafi1,paco21,ph1,hema1
0,30,0,11142,11151.0,,11382,No,0,0,0,...,1,0,0,0,0,0,68.0,40.0,7.359375,58.0
1,30,1,11799,11844.0,11844.0,11844,Yes,1,1,0,...,0,0,0,1,0,0,218.3125,34.0,7.329102,32.5
2,30,1,12083,12143.0,,12400,No,0,0,0,...,0,0,0,0,0,0,275.5,16.0,7.359375,21.097656
3,30,0,11146,11183.0,11183.0,11182,Yes,0,0,0,...,0,0,0,1,0,0,156.65625,30.0,7.459961,26.296875
4,2,1,12035,12037.0,12037.0,12036,Yes,0,0,0,...,1,0,0,0,0,0,478.0,17.0,7.229492,24.0


In [5]:
analysis_df.columns

Index(['Y', 'A', 'sadmdte', 'dschdte', 'dthdte', 'lstctdte', 'death',
       'cardiohx', 'chfhx', 'dementhx', 'psychhx', 'chrpulhx', 'renalhx',
       'liverhx', 'gibledhx', 'malighx', 'immunhx', 'transhx', 'amihx', 'age',
       'sex', 'edu', 'surv2md1', 'das2d3pc', 'dth30', 'aps1', 'scoma1',
       'meanbp1', 'wblc1', 'hrt1', 'resp1', 'temp1', 'alb1', 'bili1', 'crea1',
       'sod1', 'pot1', 'wtkilo1', 'dnr1', 'resp', 'card', 'neuro', 'gastr',
       'renal', 'meta', 'hema', 'seps', 'trauma', 'ortho', 'adld3p', 'urin1',
       'ptid', 'cat1_CHF', 'cat1_Cirrhosis', 'cat1_Colon_Cancer', 'cat1_Coma',
       'cat1_COPD', 'cat1_Lung_Cancer', 'cat1_MOSF_Malignancy',
       'cat1_MOSF_Sepsis', 'cat2_Cirrhosis', 'cat2_Colon_Cancer', 'cat2_Coma',
       'cat2_Lung_Cancer', 'cat2_MOSF_Malignancy', 'cat2_MOSF_Sepsis',
       'income1', 'income2', 'income3', 'ca_Yes', 'ca_Metastatic',
       'ninsclas_Medicaid', 'ninsclas_Medicare',
       'ninsclas_Medicare_and_Medicaid', 'ninsclas_No_insurance

In [6]:
cols_to_drop = ["adld3p", "urin1", "dthdte", "dschdte", "sadmdte", "death", "dth30", "ptid"]
analysis_df = analysis_df.drop(columns=[col for col in cols_to_drop if col in analysis_df.columns])
analysis_df.shape

(5735, 74)

In [7]:
# Convert all Yes/No columns to 1/0

for col in analysis_df.columns:
    try:
        # Check if column has object dtype
        if analysis_df[col].dtype == object:
            unique_vals = analysis_df[col].dropna().unique()
            # Check if column contains Yes/No values
            if len(unique_vals) > 0 and set(unique_vals).issubset({'Yes', 'No'}):
                analysis_df[col] = (analysis_df[col] == 'Yes').astype(int)
                print(f"Converted {col}: Yes → 1, No → 0")
    except Exception as e:
        print(f"Error processing column {col}: {e}")

print(f"\nShape after conversion: {analysis_df.shape}")
print(f"Data types:\n{analysis_df.dtypes.value_counts()}")


Converted dnr1: Yes → 1, No → 0
Converted resp: Yes → 1, No → 0
Converted card: Yes → 1, No → 0
Converted neuro: Yes → 1, No → 0
Converted gastr: Yes → 1, No → 0
Converted renal: Yes → 1, No → 0
Converted meta: Yes → 1, No → 0
Converted hema: Yes → 1, No → 0
Converted seps: Yes → 1, No → 0
Converted trauma: Yes → 1, No → 0
Converted ortho: Yes → 1, No → 0

Shape after conversion: (5735, 74)
Data types:
int64      57
float64    17
Name: count, dtype: int64


In [8]:
# Rename diagnosis columns
diagnosis_rename = {
    'resp': 'resp_Yes',
    'card': 'card_Yes',
    'neuro': 'neuro_Yes',
    'gastr': 'gastr_Yes',
    'renal': 'renal_Yes',
    'meta': 'meta_Yes',
    'hema': 'hema_Yes',
    'seps': 'seps_Yes',
    'trauma': 'trauma_Yes',
    'ortho': 'ortho_Yes',
    'sex': 'sex_Female'
}

cols_to_rename = {k: v for k, v in diagnosis_rename.items() if k in analysis_df.columns}
analysis_df = analysis_df.rename(columns=cols_to_rename)

analysis_df.columns


Index(['Y', 'A', 'lstctdte', 'cardiohx', 'chfhx', 'dementhx', 'psychhx',
       'chrpulhx', 'renalhx', 'liverhx', 'gibledhx', 'malighx', 'immunhx',
       'transhx', 'amihx', 'age', 'sex_Female', 'edu', 'surv2md1', 'das2d3pc',
       'aps1', 'scoma1', 'meanbp1', 'wblc1', 'hrt1', 'resp1', 'temp1', 'alb1',
       'bili1', 'crea1', 'sod1', 'pot1', 'wtkilo1', 'dnr1', 'resp_Yes',
       'card_Yes', 'neuro_Yes', 'gastr_Yes', 'renal_Yes', 'meta_Yes',
       'hema_Yes', 'seps_Yes', 'trauma_Yes', 'ortho_Yes', 'cat1_CHF',
       'cat1_Cirrhosis', 'cat1_Colon_Cancer', 'cat1_Coma', 'cat1_COPD',
       'cat1_Lung_Cancer', 'cat1_MOSF_Malignancy', 'cat1_MOSF_Sepsis',
       'cat2_Cirrhosis', 'cat2_Colon_Cancer', 'cat2_Coma', 'cat2_Lung_Cancer',
       'cat2_MOSF_Malignancy', 'cat2_MOSF_Sepsis', 'income1', 'income2',
       'income3', 'ca_Yes', 'ca_Metastatic', 'ninsclas_Medicaid',
       'ninsclas_Medicare', 'ninsclas_Medicare_and_Medicaid',
       'ninsclas_No_insurance', 'ninsclas_Private_and_Medic

In [9]:
analysis_df.head()

Unnamed: 0,Y,A,lstctdte,cardiohx,chfhx,dementhx,psychhx,chrpulhx,renalhx,liverhx,...,ninsclas_Medicare,ninsclas_Medicare_and_Medicaid,ninsclas_No_insurance,ninsclas_Private_and_Medicare,race_black,race_other,pafi1,paco21,ph1,hema1
0,30,0,11382,0,0,0,0,1,0,0,...,1,0,0,0,0,0,68.0,40.0,7.359375,58.0
1,30,1,11844,1,1,0,0,0,0,0,...,0,0,0,1,0,0,218.3125,34.0,7.329102,32.5
2,30,1,12400,0,0,0,0,0,0,0,...,0,0,0,0,0,0,275.5,16.0,7.359375,21.097656
3,30,0,11182,0,0,0,0,0,0,0,...,0,0,0,1,0,0,156.65625,30.0,7.459961,26.296875
4,2,1,12036,0,0,0,0,0,0,0,...,1,0,0,0,0,0,478.0,17.0,7.229492,24.0


In [10]:
import os

data_folder = "data"

output_path = os.path.join(data_folder, "cleaned_rhc.csv")
analysis_df.to_csv(output_path, index=False)
