In [1]:
import numpy as np
import pandas as pd
# Data sources / documentation for the support2 dataset:
#   https://hbiostat.org/data/repo/csupport2
#   https://hbiostat.org/data/repo/supportdesc
#   https://archive.ics.uci.edu/dataset/880/support2
columns_to_drop = ['dnrday', 'adlp', 'adls', 'totcst', 'totmcst']
df = pd.read_csv('support2.csv').drop(columns=columns_to_drop)
print(df.shape)
# Keep only the rows that have a recorded `charges` value.
df = df[df['charges'].notna()]
print(df.shape)

(9105, 42)
(8933, 42)


  from pandas.core import (


In [2]:
# Display the column labels of the current frame.
df.columns

Index(['age', 'death', 'sex', 'hospdead', 'slos', 'd.time', 'dzgroup',
       'dzclass', 'num.co', 'edu', 'income', 'scoma', 'charges', 'avtisst',
       'race', 'sps', 'aps', 'surv2m', 'surv6m', 'hday', 'diabetes',
       'dementia', 'ca', 'prg2m', 'prg6m', 'dnr', 'meanbp', 'wblc', 'hrt',
       'resp', 'temp', 'pafi', 'alb', 'bili', 'crea', 'sod', 'ph', 'glucose',
       'bun', 'urine', 'sfdm2', 'adlsc'],
      dtype='object')

In [3]:
# Normal fill-in values used when a baseline variable is missing.
#
# Baseline variable          Normal fill-in value
# Serum albumin              3.5
# PaO2/FiO2 ratio (pafi)     333.3
# Bilirubin                  1.01
# Creatinine                 1.01
# BUN                        6.51
# White blood count          9 (thousands)
# Urine output               2502
fill_values = {
    "alb": 3.5,      # Serum albumin
    "pafi": 333.3,   # PaO2/FiO2 ratio
    "bili": 1.01,    # Bilirubin
    "crea": 1.01,    # Creatinine
    "bun": 6.51,     # Blood Urea Nitrogen (BUN)
    "wblc": 9,       # White blood count (in thousands)
    "urine": 2502    # Urine output
}
# Fill every listed column in a single call and rebind `df`.
# `df[col].fillna(value, inplace=True)` mutates an intermediate object;
# pandas warns that this form will no longer update `df` in pandas 3.0.
df = df.fillna(value=fill_values)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(value, inplace=True)


In [4]:
# Show the rows that are missing a value in any of these columns,
# restricted to those same columns.
inspect_cols = ['scoma', 'sps', 'aps', 'surv2m', 'surv6m',
                'meanbp', 'hrt', 'resp', 'temp', 'sod']
df.loc[df[inspect_cols].isna().any(axis=1), inspect_cols]

Unnamed: 0,scoma,sps,aps,surv2m,surv6m,meanbp,hrt,resp,temp,sod
5441,0.0,31.398438,60.0,0.7229,0.638916,,,,,


In [5]:
# Drop every row that has a missing value in any of the listed columns,
# printing the frame shape before and after.
required_cols = ['scoma', 'sps', 'aps', 'surv2m', 'surv6m',
                 'meanbp', 'hrt', 'resp', 'temp', 'sod', 'dnr']
print(df.shape)
df = df.dropna(subset=required_cols)
print(df.shape)

(8933, 42)
(8928, 42)


In [6]:
# Print the level counts (NaN included, ascending by count) of each
# categorical column, separated by a dashed rule.
categorical_cols = ['sex', 'dzgroup', 'dzclass', 'income', 'race', 'ca', 'dnr', 'sfdm2']
rule = "-" * 30
for name in categorical_cols:
    level_counts = df[name].value_counts(dropna=False).sort_values()
    print(f"Summary for column: {name}")
    print(level_counts)
    print(rule)

Summary for column: sex
sex
female    3895
male      5033
Name: count, dtype: int64
------------------------------
Summary for column: dzgroup
dzgroup
Cirrhosis             501
Colon Cancer          502
Coma                  584
MOSF w/Malig          694
Lung Cancer           893
COPD                  956
CHF                  1371
ARF/MOSF w/Sepsis    3427
Name: count, dtype: int64
------------------------------
Summary for column: dzclass
dzclass
Coma                   584
Cancer                1395
COPD/CHF/Cirrhosis    2828
ARF/MOSF              4121
Name: count, dtype: int64
------------------------------
Summary for column: income
income
>$50k          672
$25-$50k      1028
$11-$25k      1496
under $11k    2803
NaN           2929
Name: count, dtype: int64
------------------------------
Summary for column: race
race
NaN           42
asian         79
other        111
hispanic     286
black       1352
white       7058
Name: count, dtype: int64
------------------------------
Summary 

In [7]:
# Count the missing values per column for the columns listed below.
audit_cols = [
    'age', 'death', 'sex', 'hospdead', 'slos', 'd.time', 'num.co', 'edu',
    'scoma', 'charges', 'avtisst', 'sps', 'aps', 'surv2m', 'surv6m', 'hday',
    'diabetes', 'dementia', 'prg2m', 'prg6m', 'meanbp', 'wblc', 'hrt',
    'resp', 'temp', 'pafi', 'alb', 'bili', 'crea', 'sod', 'ph', 'glucose',
    'bun', 'urine', 'adlsc',
]
missing_per_column = df[audit_cols].isna().sum()
print(missing_per_column)

age            0
death          0
sex            0
hospdead       0
slos           0
d.time         0
num.co         0
edu         1609
scoma          0
charges        0
avtisst       81
sps            0
aps            0
surv2m         0
surv6m         0
hday           0
diabetes       0
dementia       0
prg2m       1604
prg6m       1588
meanbp         0
wblc           0
hrt            0
resp           0
temp           0
pafi           0
alb            0
bili           0
crea           0
sod            0
ph          2239
glucose     4435
bun            0
urine          0
adlsc          0
dtype: int64


In [8]:
# Drop rows missing `prg2m` or `avtisst`, then mean-impute `edu`, `ph`
# and `glucose`, and finally re-print the per-column missing counts.
print(df.shape)
df = df.dropna(subset=['prg2m', 'avtisst'])
print(df.shape)
# The means are computed on the rows that survive the drop above.
# Passing a Series to DataFrame.fillna fills each column named in the
# Series index with its own value. Rebinding `df` avoids the chained
# `df[col].fillna(..., inplace=True)` form, which mutates an intermediate
# object and which pandas warns will stop updating `df` in pandas 3.0.
mean_imputed_cols = ['edu', 'ph', 'glucose']
df = df.fillna(df[mean_imputed_cols].mean())
print(df[['age', 'death', 'sex', 'hospdead', 'slos', 'd.time', 'num.co', 'edu',
       'scoma', 'charges', 'avtisst', 'sps', 'aps', 'surv2m', 'surv6m', 'hday',
       'diabetes', 'dementia', 'prg2m', 'prg6m', 'meanbp', 'wblc', 'hrt',
       'resp', 'temp', 'pafi', 'alb', 'bili', 'crea', 'sod', 'ph', 'glucose',
       'bun', 'urine', 'adlsc']].isna().sum())


(8928, 42)
(7270, 42)
age         0
death       0
sex         0
hospdead    0
slos        0
d.time      0
num.co      0
edu         0
scoma       0
charges     0
avtisst     0
sps         0
aps         0
surv2m      0
surv6m      0
hday        0
diabetes    0
dementia    0
prg2m       0
prg6m       0
meanbp      0
wblc        0
hrt         0
resp        0
temp        0
pafi        0
alb         0
bili        0
crea        0
sod         0
ph          0
glucose     0
bun         0
urine       0
adlsc       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['edu'].fillna(df['edu'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ph'].fillna(df['ph'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [9]:
# Print the level counts (NaN included, ascending by count) of each
# listed column, separated by a dashed rule.
summary_cols = ['death', 'dzgroup', 'dzclass', 'income', 'race', 'ca', 'dnr', 'sfdm2']
rule = "-" * 30
for name in summary_cols:
    level_counts = df[name].value_counts(dropna=False).sort_values()
    print(f"Summary for column: {name}")
    print(level_counts)
    print(rule)

Summary for column: death
death
0    2333
1    4937
Name: count, dtype: int64
------------------------------
Summary for column: dzgroup
dzgroup
Colon Cancer          392
Cirrhosis             408
Coma                  458
MOSF w/Malig          563
Lung Cancer           707
COPD                  824
CHF                  1020
ARF/MOSF w/Sepsis    2898
Name: count, dtype: int64
------------------------------
Summary for column: dzclass
dzclass
Coma                   458
Cancer                1099
COPD/CHF/Cirrhosis    2252
ARF/MOSF              3461
Name: count, dtype: int64
------------------------------
Summary for column: income
income
>$50k          563
$25-$50k       835
$11-$25k      1216
NaN           2251
under $11k    2405
Name: count, dtype: int64
------------------------------
Summary for column: race
race
NaN           33
asian         60
other         73
hispanic     233
black       1094
white       5777
Name: count, dtype: int64
------------------------------
Summary for co

In [10]:
# One-hot encode the multi-level categorical columns. Each dummy frame is
# built once and reused for both the join and the preview print.
# pd.get_dummies is called with its defaults, so a NaN in the source
# column produces no indicator column of its own.
onehot_cols = ['dzgroup', 'dzclass', 'income', 'race', 'dnr', 'sfdm2']
for c in onehot_cols:
    dummies = pd.get_dummies(df[c], prefix=c)
    df = df.join(dummies)
    print(dummies.head())
# Binary-encode sex: 1 where the value is 'male', 0 otherwise.
df['sex'] = np.where(df['sex'] == 'male', 1, 0)
# Two indicators derived from `ca`: `ca_metastatic` is 1 only for
# "metastatic"; `ca_yes` is 1 for both "yes" and "metastatic".
# Any `ca` value outside the mapping becomes NaN.
df['ca_metastatic'] = df["ca"].map({"metastatic": 1, "no": 0, "yes": 0})
df['ca_yes'] = df["ca"].map({"metastatic": 1, "no": 0, "yes": 1})
# Remove the original categorical columns now that they are encoded.
df = df.drop(columns=onehot_cols + ['ca'])
df.columns

   dzgroup_ARF/MOSF w/Sepsis  dzgroup_CHF  dzgroup_COPD  dzgroup_Cirrhosis  \
1                      False        False         False              False   
3                      False        False         False               True   
4                      False        False         False              False   
5                       True        False         False              False   
6                      False        False         False              False   

   dzgroup_Colon Cancer  dzgroup_Coma  dzgroup_Lung Cancer  \
1                 False         False                 True   
3                 False         False                False   
4                 False         False                 True   
5                 False         False                False   
6                 False          True                False   

   dzgroup_MOSF w/Malig  
1                 False  
3                 False  
4                 False  
5                 False  
6                 False  
  

Index(['age', 'death', 'sex', 'hospdead', 'slos', 'd.time', 'num.co', 'edu',
       'scoma', 'charges', 'avtisst', 'sps', 'aps', 'surv2m', 'surv6m', 'hday',
       'diabetes', 'dementia', 'prg2m', 'prg6m', 'meanbp', 'wblc', 'hrt',
       'resp', 'temp', 'pafi', 'alb', 'bili', 'crea', 'sod', 'ph', 'glucose',
       'bun', 'urine', 'adlsc', 'dzgroup_ARF/MOSF w/Sepsis', 'dzgroup_CHF',
       'dzgroup_COPD', 'dzgroup_Cirrhosis', 'dzgroup_Colon Cancer',
       'dzgroup_Coma', 'dzgroup_Lung Cancer', 'dzgroup_MOSF w/Malig',
       'dzclass_ARF/MOSF', 'dzclass_COPD/CHF/Cirrhosis', 'dzclass_Cancer',
       'dzclass_Coma', 'income_$11-$25k', 'income_$25-$50k', 'income_>$50k',
       'income_under $11k', 'race_asian', 'race_black', 'race_hispanic',
       'race_other', 'race_white', 'dnr_dnr after sadm', 'dnr_dnr before sadm',
       'dnr_no dnr', 'sfdm2_<2 mo. follow-up', 'sfdm2_Coma or Intub',
       'sfdm2_SIP>=30', 'sfdm2_adl>=4 (>=5 if sur)',
       'sfdm2_no(M2 and SIP pres)', 'ca_metastati

In [11]:
# Display the processed frame.
df

Unnamed: 0,age,death,sex,hospdead,slos,d.time,num.co,edu,scoma,charges,...,dnr_dnr after sadm,dnr_dnr before sadm,dnr_no dnr,sfdm2_<2 mo. follow-up,sfdm2_Coma or Intub,sfdm2_SIP>=30,sfdm2_adl>=4 (>=5 if sur),sfdm2_no(M2 and SIP pres),ca_metastatic,ca_yes
1,62.84998,0,1,0,5,2029,0,11.000000,0.0,9715.0,...,False,False,True,False,False,False,False,False,1,1
3,52.74698,1,0,0,17,47,2,12.000000,0.0,41094.0,...,False,False,True,True,False,False,False,False,0,0
4,42.38498,1,0,0,3,133,2,11.000000,0.0,3075.0,...,False,False,True,False,False,False,False,True,1,1
5,79.88495,0,0,0,16,2029,1,11.745941,26.0,50127.0,...,False,False,True,False,False,False,False,True,0,0
6,93.01599,1,1,1,4,4,1,14.000000,55.0,6884.0,...,False,False,True,True,False,False,False,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9099,70.42297,1,1,0,15,17,4,12.000000,0.0,80504.0,...,True,False,False,True,False,False,False,False,0,1
9101,66.07300,0,1,0,23,350,1,8.000000,0.0,52870.0,...,False,False,True,False,False,False,False,False,0,0
9102,55.15399,0,0,0,29,347,1,11.000000,41.0,35377.0,...,False,False,True,False,False,False,False,False,0,0
9103,70.38196,0,1,0,8,346,1,11.745941,0.0,46564.0,...,False,False,True,False,False,False,False,False,0,0


In [12]:
# df.to_csv('support2_processed.csv', index=False)