## Draft Code for PHS-7020 Final Project

In [58]:
# Load all packages here. 
import pandas as pd
import numpy as np
from sklearn import preprocessing
import fancyimpute
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import statsmodels as sm
import statsmodels.imputation.mice as mice
from sklearn.preprocessing import LabelEncoder


In [114]:
# Load in the NSQIP data.
acsd = pd.read_csv("NSQIP_12_17_Outpatient_2.csv", header = 0)


In [61]:
# Check into the data. 
acsd.head(5)


Unnamed: 0.1,Unnamed: 0,SEX,RACE_NEW,WORKRVU,TRANST,Age,AdmYR,ANESTHES,SURGSPEC,HEIGHT,...,PRWBC,PRHCT,PRPLATE,PRPTT,PRINR,PRPT,EMERGNCY,WNDCLAS,ASACLAS,READMPODAYS1
0,2,male,White,7.96,Not transferred (admitted from home),69,2012,General,General Surgery,71,...,4.9,41.2,240.0,-99.0,-99.0,-99.0,No,1-Clean,2-Mild Disturb,-99
1,3,female,White,11.76,Not transferred (admitted from home),51,2012,General,General Surgery,61,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,No,2-Clean/Contaminated,2-Mild Disturb,-99
2,4,female,White,11.76,Not transferred (admitted from home),30,2011,General,General Surgery,70,...,8.4,38.8,253.0,-99.0,-99.0,-99.0,No,2-Clean/Contaminated,2-Mild Disturb,-99
3,5,female,Unknown/Not Reported,6.59,Not transferred (admitted from home),35,2012,General,General Surgery,61,...,6.3,40.9,231.0,-99.0,-99.0,-99.0,No,1-Clean,2-Mild Disturb,-99
4,6,female,Black or African American,6.14,Not transferred (admitted from home),23,2012,General,General Surgery,63,...,4.4,39.9,255.0,-99.0,-99.0,-99.0,No,1-Clean,2-Mild Disturb,-99


In [None]:
# We now begin data cleaning. This data needs to be made analysis-ready. 
# Primarily, the categorical variables need to be updated. We will generate
# a series of indicators.  

In [115]:
# Begin by converting strings telling unknown to NA
# for all columns.

# Convert "NULL", "Unknown", and other strings indicating
# missing to NaN.

acsd['RACE_NEW'] = acsd['RACE_NEW'].replace(to_replace = 'Unknown/Not Reported', value = np.nan)
acsd['TRANST'] = acsd['TRANST'].replace(to_replace = 'Unknown', value = np.nan)
acsd['ANESTHES'] = acsd['ANESTHES'].replace(to_replace = 'Unknown', value = np.nan)
acsd['SURGSPEC'] = acsd['SURGSPEC'].replace(to_replace = 'Unknown', value = np.nan)
acsd['DIABETES'] = acsd['DIABETES'].replace(to_replace = 'NULL', value = np.nan)
acsd['SMOKE'] = acsd['SMOKE'].replace(to_replace = 'NULL', value = np.nan)
acsd['DYSPNEA'] = acsd['DYSPNEA'].replace(to_replace = 'NULL', value = np.nan)
acsd['FNSTATUS2'] = acsd['FNSTATUS2'].replace(to_replace = 'NULL', value = np.nan)
acsd['FNSTATUS2'] = acsd['FNSTATUS2'].replace(to_replace = 'Unknown', value = np.nan)
acsd['VENTILAT'] = acsd['VENTILAT'].replace(to_replace = 'NULL', value = np.nan)
acsd['HXCOPD'] = acsd['HXCOPD'].replace(to_replace = 'NULL', value = np.nan)
acsd['RENAFAIL'] = acsd['RENAFAIL'].replace(to_replace = 'NULL', value = np.nan)
acsd['STEROID'] = acsd['STEROID'].replace(to_replace = 'NULL', value = np.nan)
acsd['WTLOSS'] = acsd['WTLOSS'].replace(to_replace = 'NULL', value = np.nan)
acsd['TRANSFUS'] = acsd['TRANSFUS'].replace(to_replace = 'NULL', value = np.nan)
acsd['PRSEPIS'] = acsd['PRSEPIS'].replace(to_replace = 'NULL', value = np.nan)
acsd['EMERGNCY'] = acsd['EMERGNCY'].replace(to_replace = 'NULL', value = np.nan)
acsd['ASACLAS'] = acsd['ASACLAS'].replace(to_replace = 'None assigned', value = np.nan)



In [116]:
# Now, for all of the continuous variables, a -99 indicates missing.
# We will replace the -99s with NaNs. 
acsd=acsd.replace(-99, np.nan)


In [117]:
# Check data again. 
acsd.head(25)

Unnamed: 0.1,Unnamed: 0,SEX,RACE_NEW,WORKRVU,TRANST,Age,AdmYR,ANESTHES,SURGSPEC,HEIGHT,...,PRWBC,PRHCT,PRPLATE,PRPTT,PRINR,PRPT,EMERGNCY,WNDCLAS,ASACLAS,READMPODAYS1
0,2,male,White,7.96,Not transferred (admitted from home),69,2012,General,General Surgery,71.0,...,4.9,41.2,240.0,,,,No,1-Clean,2-Mild Disturb,
1,3,female,White,11.76,Not transferred (admitted from home),51,2012,General,General Surgery,61.0,...,,,,,,,No,2-Clean/Contaminated,2-Mild Disturb,
2,4,female,White,11.76,Not transferred (admitted from home),30,2011,General,General Surgery,70.0,...,8.4,38.8,253.0,,,,No,2-Clean/Contaminated,2-Mild Disturb,
3,5,female,,6.59,Not transferred (admitted from home),35,2012,General,General Surgery,61.0,...,6.3,40.9,231.0,,,,No,1-Clean,2-Mild Disturb,
4,6,female,Black or African American,6.14,Not transferred (admitted from home),23,2012,General,General Surgery,63.0,...,4.4,39.9,255.0,,,,No,1-Clean,2-Mild Disturb,
5,7,male,White,7.96,Not transferred (admitted from home),66,2012,General,General Surgery,72.0,...,6.4,40.2,315.0,,,,No,1-Clean,2-Mild Disturb,
6,8,male,White,7.96,Not transferred (admitted from home),65,2012,MAC/IV Sedation,General Surgery,72.0,...,7.3,48.0,188.0,,,,No,1-Clean,2-Mild Disturb,
7,9,male,,7.96,Not transferred (admitted from home),70,2012,MAC/IV Sedation,General Surgery,69.0,...,5.2,42.7,284.0,,,,No,1-Clean,2-Mild Disturb,
8,10,male,White,10.6,Not transferred (admitted from home),63,2012,General,General Surgery,70.0,...,12.1,46.9,159.0,,,,No,2-Clean/Contaminated,2-Mild Disturb,
9,11,female,White,12.37,Not transferred (admitted from home),48,2012,General,General Surgery,63.0,...,5.9,40.9,248.0,,,,No,1-Clean,2-Mild Disturb,


In [118]:
# Remove variables with more than 35% missingness. 
acsd = acsd[acsd.columns[acsd.isnull().mean() < 0.35]]

In [119]:
# Remove the admit year column - we will not be using
# this. 
acsd=acsd.drop(columns=['AdmYR'])

In [120]:
# Check. Get column types.
acsd.head()
acsd.dtypes

Unnamed: 0      int64
SEX            object
RACE_NEW       object
WORKRVU       float64
TRANST         object
Age             int64
ANESTHES       object
SURGSPEC       object
HEIGHT        float64
WEIGHT        float64
DIABETES       object
SMOKE          object
DYSPNEA        object
FNSTATUS2      object
VENTILAT       object
HXCOPD         object
ASCITES        object
HXCHF          object
HYPERMED       object
RENAFAIL       object
DIALYSIS       object
DISCANCR       object
WNDINF         object
STEROID        object
WTLOSS         object
BLEEDDIS       object
TRANSFUS       object
PRSEPIS        object
DPRNA         float64
DPRCREAT      float64
DPRWBC        float64
DPRHCT        float64
DPRPLATE      float64
PRSODM        float64
PRCREAT       float64
PRWBC         float64
PRHCT         float64
PRPLATE       float64
EMERGNCY       object
WNDCLAS        object
ASACLAS        object
dtype: object

In [94]:
# START EXAMPLE MI.
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
cdat = acsd[['PRWBC', 'DPRHCT']]
cdat2 = acsd[['RACE_NEW', 'ASACLAS']]

In [103]:

#cdat2.RACE_NEW = pd.Categorical(cdat2.RACE_NEW)
#cdat2.ASACLAS = pd.Categorical(cdat2.ASACLAS)
#cdat2.RACE_NEW = cdat2.RACE_NEW.cat.codes
#cdat2.ASACLAS = cdat2.ASACLAS.cat.codes

cdat2['RACE_NEW'] = pd.factorize(cdat2['RACE_NEW'])[0]
cdat2['ASACLAS'] = pd.factorize(cdat2['ASACLAS'])[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [113]:
cdat2.head(5)

Unnamed: 0,RACE_NEW,ASACLAS
0,0,0
1,0,0
2,0,0
3,1,0
4,2,0


In [105]:
# Try imputation on this testing data. 
timp = mice.MICEData(cdat)
timp2 = mice.MICEData(cdat2)

In [39]:
# Works for the continuous columns. 
for j in range(2):
    timp.update_all()
    timp.data.to_csv('data%02d.csv' % j)

In [51]:
# Now test for new categorical columns. Make this done with polyreg. 
timp2.set_imputer('RACE_NEW', model_class = 'MNLogit')
timp2.set_imputer('ASACLAS', model_class = 'MNLogit')
for j in range(2):
    timp2.update_all()
    timp2.data.to_csv('data%12d.csv' % j)
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
# END EXAMPLE MI. 

In [121]:
# Our data has all columns we want to use for prediction. However, we 
# need to handle missing values. We will use multiple imputation with
# chained equations, which we implement using the statsmodels package.

# Firstly, we will convert all of the currently string/object columns
# to categorical columns with the following loop. 
for col in ['SEX', 'RACE_NEW', 'ANESTHES', 'TRANST', 'SURGSPEC', 
           'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD',
            'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 
            'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'PRSEPIS', 
           'EMERGNCY', 'WNDCLAS', 'ASACLAS']:
    acsd[col] = acsd[col].astype('category')

In [122]:
for col in ['SEX', 'RACE_NEW', 'ANESTHES', 'TRANST', 'SURGSPEC', 
           'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD',
            'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 
            'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'PRSEPIS', 
           'EMERGNCY', 'WNDCLAS', 'ASACLAS']:
    acsd[col] = pd.factorize(acsd[col])[0]

In [125]:
# Doing the above steps forced out negative values. So 
# we now need to replace -1 in any of these categorical
# columns back with NAs. 
for col in ['SEX', 'RACE_NEW', 'ANESTHES', 'TRANST', 'SURGSPEC', 
           'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD',
            'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 
            'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'PRSEPIS', 
           'EMERGNCY', 'WNDCLAS', 'ASACLAS']:
    acsd[col] =  acsd[col].replace(-1, np.nan)



In [128]:
acsd.head(11)

Unnamed: 0.1,Unnamed: 0,SEX,RACE_NEW,WORKRVU,TRANST,Age,ANESTHES,SURGSPEC,HEIGHT,WEIGHT,...,DPRHCT,DPRPLATE,PRSODM,PRCREAT,PRWBC,PRHCT,PRPLATE,EMERGNCY,WNDCLAS,ASACLAS
0,2,0,0.0,7.96,0.0,69,0.0,0.0,71.0,210.0,...,12.0,12.0,142.0,1.14,4.9,41.2,240.0,0.0,0,0.0
1,3,1,0.0,11.76,0.0,51,0.0,0.0,61.0,114.0,...,,,,,,,,0.0,1,0.0
2,4,1,0.0,11.76,0.0,30,0.0,0.0,70.0,258.0,...,0.0,0.0,138.0,0.7,8.4,38.8,253.0,0.0,1,0.0
3,5,1,,6.59,0.0,35,0.0,0.0,61.0,98.0,...,8.0,8.0,,,6.3,40.9,231.0,0.0,0,0.0
4,6,1,1.0,6.14,0.0,23,0.0,0.0,63.0,108.0,...,8.0,8.0,138.0,0.4,4.4,39.9,255.0,0.0,0,0.0
5,7,0,0.0,7.96,0.0,66,0.0,0.0,72.0,212.0,...,28.0,28.0,136.0,0.84,6.4,40.2,315.0,0.0,0,0.0
6,8,0,0.0,7.96,0.0,65,1.0,0.0,72.0,175.0,...,6.0,6.0,140.0,1.05,7.3,48.0,188.0,0.0,0,0.0
7,9,0,,7.96,0.0,70,1.0,0.0,69.0,180.0,...,5.0,5.0,135.0,0.91,5.2,42.7,284.0,0.0,0,0.0
8,10,0,0.0,10.6,0.0,63,0.0,0.0,70.0,231.0,...,0.0,0.0,134.0,1.2,12.1,46.9,159.0,0.0,1,0.0
9,11,1,0.0,12.37,0.0,48,0.0,0.0,63.0,230.0,...,4.0,4.0,136.0,,5.9,40.9,248.0,0.0,0,0.0


In [129]:
# Now, we get our imputation object. 
mimpobj = mice.MICEData(acsd)

In [None]:
# We now one hot encode. We do this the very arduous way because
# we want to keep NAN location for multiple imputation and
# the support online is absolutely terrible for this purpose.

In [6]:
# Start with Race.
race = acsd['RACE_NEW']
raced = pd.get_dummies(race)
raced['Prior_Cat']=acsd['RACE_NEW']
raced.loc[raced.Prior_Cat.isnull(),['American Indian or Alaska Native','Asian','Black or African American','Hispanic Ethnicity','Native Hawaiian or Pacific Islander','White']]=np.nan
race = raced[['American Indian or Alaska Native','Asian','Black or African American','Hispanic Ethnicity','Native Hawaiian or Pacific Islander']]

In [7]:
# Continue with TRANST. 
TRANST = acsd['TRANST']
TRANSTd = pd.get_dummies(TRANST)
TRANSTd['Prior_Cat']=acsd['TRANST']
TRANSTd.loc[TRANSTd.Prior_Cat.isnull(),['From acute care hospital inpatient', 'Not transferred (admitted from home)','Nursing home - Chronic care - Intermediate care','Outside emergency department','Transfer from other']]=np.nan
TRANST = TRANSTd[[ 'Not transferred (admitted from home)','Nursing home - Chronic care - Intermediate care','Outside emergency department','Transfer from other']]

In [9]:
# Now for ANESTHES.
ANESTHES = acsd['ANESTHES']
ANESTHESd = pd.get_dummies(ANESTHES)
ANESTHESd['Prior_Cat']=acsd['ANESTHES']
ANESTHESd.loc[ANESTHESd.Prior_Cat.isnull(),['Epidural','General','Local','MAC/IV Sedation','None','Other','Regional','Spinal']]=np.nan
ANESTHES = ANESTHESd[['Epidural','Local','MAC/IV Sedation','None','Other','Regional','Spinal']]

In [10]:
# Surgical specialty.
SURGSPEC = acsd['SURGSPEC']
SURGSPECd = pd.get_dummies(SURGSPEC)
SURGSPECd['Prior_Cat']=acsd['SURGSPEC']
SURGSPECd.loc[SURGSPECd.Prior_Cat.isnull(),['Cardiac Surgery','General Surgery','Gynecology','Interventional Radiologist','Neurosurgery','Orthopedics','Other','Otolaryngology (ENT)','Plastics','Thoracic','Urology','Vascular']]=np.nan
SURGSPEC = SURGSPECd[['Cardiac Surgery','Gynecology','Interventional Radiologist','Neurosurgery','Orthopedics','Other','Otolaryngology (ENT)','Plastics','Thoracic','Urology','Vascular']]

In [11]:
# DIABETES.
DIABETES = acsd['DIABETES']
DIABETESd = pd.get_dummies(DIABETES)
DIABETESd['Prior_Cat']=acsd['DIABETES']
DIABETESd.loc[DIABETESd.Prior_Cat.isnull(),['INSULIN','NO','NON-INSULIN']]=np.nan
DIABETES = DIABETESd[['INSULIN','NON-INSULIN']]

In [12]:
# SMOKE.
SMOKE = acsd['SMOKE']
SMOKEd = pd.get_dummies(SMOKE)
SMOKEd['Prior_Cat']=acsd['SMOKE']
SMOKEd.loc[SMOKEd.Prior_Cat.isnull(),['No','Yes']]=np.nan
SMOKE = SMOKEd[['Yes']]

In [13]:
# DYSPNEA.
RENAFAIL = acsd['RENAFAIL']
RENAFAILd = pd.get_dummies(RENAFAIL)
RENAFAILd['Prior_Cat']=acsd['RENAFAIL']
RENAFAILd.loc[RENAFAILd.Prior_Cat.isnull(),['No','Yes']]=np.nan
RENAFAIL = RENAFAILd[['Yes']]
DYSPNEA = acsd['DYSPNEA']
DYSPNEAd = pd.get_dummies(DYSPNEA)
DYSPNEAd['Prior_Cat']=acsd['DYSPNEA']
DYSPNEAd.loc[DYSPNEAd.Prior_Cat.isnull(),['AT REST','MODERATE EXERTION','No']]=np.nan
DYSPNEA = DYSPNEAd[['AT REST','MODERATE EXERTION']]

In [14]:
# FNSTATUS2
FNSTATUS2 = acsd['FNSTATUS2']
FNSTATUS2d = pd.get_dummies(FNSTATUS2)
FNSTATUS2d['Prior_Cat']=acsd['FNSTATUS2']
FNSTATUS2d.loc[FNSTATUS2d.Prior_Cat.isnull(),['Independent','Partially Dependent','Totally Dependent']]=np.nan
FNSTATUS2 = FNSTATUS2d[['Partially Dependent','Totally Dependent']]

In [15]:
# VENTILAT
VENTILAT = acsd['VENTILAT']
VENTILATd = pd.get_dummies(VENTILAT)
VENTILATd['Prior_Cat']=acsd['VENTILAT']
VENTILATd.loc[VENTILATd.Prior_Cat.isnull(),['No','Yes']]=np.nan
VENTILAT = VENTILATd[['Yes']]

In [16]:
# HXCOPD
HXCOPD = acsd['HXCOPD']
HXCOPDd = pd.get_dummies(HXCOPD)
HXCOPDd['Prior_Cat']=acsd['HXCOPD']
HXCOPDd.loc[HXCOPDd.Prior_Cat.isnull(),['No','Yes']]=np.nan
HXCOPD = HXCOPDd[['Yes']]

In [17]:
# ASCITES.
ASCITES = acsd['ASCITES']
ASCITESd = pd.get_dummies(ASCITES)
ASCITESd['Prior_Cat']=acsd['ASCITES']
ASCITESd.loc[ASCITESd.Prior_Cat.isnull(),['No','Yes']]=np.nan
ASCITES = ASCITESd[['Yes']]

In [18]:
# HXCHF
HXCHF = acsd['HXCHF']
HXCHFd = pd.get_dummies(HXCHF)
HXCHFd['Prior_Cat']=acsd['HXCHF']
HXCHFd.loc[HXCHFd.Prior_Cat.isnull(),['No','Yes']]=np.nan
HXCHF = HXCHFd[['Yes']]

In [20]:
# HYPERMED
HYPERMED = acsd['HYPERMED']
HYPERMEDd = pd.get_dummies(HYPERMED)
HYPERMEDd['Prior_Cat']=acsd['HYPERMED']
HYPERMEDd.loc[HYPERMEDd.Prior_Cat.isnull(),['No','Yes']]=np.nan
HYPERMED = HYPERMEDd[['Yes']]


In [19]:
# RENAFAIL
RENAFAIL = acsd['RENAFAIL']
RENAFAILd = pd.get_dummies(RENAFAIL)
RENAFAILd['Prior_Cat']=acsd['RENAFAIL']
RENAFAILd.loc[RENAFAILd.Prior_Cat.isnull(),['No','Yes']]=np.nan
RENAFAIL = RENAFAILd[['Yes']]

In [21]:
# DIALYSIS
DIALYSIS = acsd['DIALYSIS']
DIALYSISd = pd.get_dummies(DIALYSIS)
DIALYSISd['Prior_Cat']=acsd['DIALYSIS']
DIALYSISd.loc[DIALYSISd.Prior_Cat.isnull(),['No','Yes']]=np.nan
DIALYSIS = DIALYSISd[['Yes']]

In [22]:
# DISCANCR
DISCANCR = acsd['DISCANCR']
DISCANCRd = pd.get_dummies(DISCANCR)
DISCANCRd['Prior_Cat']=acsd['DISCANCR']
DISCANCRd.loc[DISCANCRd.Prior_Cat.isnull(),['No','Yes']]=np.nan
DISCANCR = DISCANCRd[['Yes']]

In [23]:
# WNDINF
WNDINF = acsd['WNDINF']
WNDINFd = pd.get_dummies(WNDINF)
WNDINFd['Prior_Cat']=acsd['WNDINF']
WNDINFd.loc[WNDINFd.Prior_Cat.isnull(),['No','Yes']]=np.nan
WNDINF = WNDINFd[['Yes']]

In [24]:
# STEROID
STEROID = acsd['STEROID']
STEROIDd = pd.get_dummies(STEROID)
STEROIDd['Prior_Cat']=acsd['STEROID']
STEROIDd.loc[STEROIDd.Prior_Cat.isnull(),['No','Yes']]=np.nan
STEROID = STEROIDd[['Yes']]

In [25]:
# WTLOSS
WTLOSS = acsd['WTLOSS']
WTLOSSd = pd.get_dummies(WTLOSS)
WTLOSSd['Prior_Cat']=acsd['WTLOSS']
WTLOSSd.loc[WTLOSSd.Prior_Cat.isnull(),['No','Yes']]=np.nan
WTLOSS = WTLOSSd[['Yes']]

In [26]:
# BLEEDDIS
BLEEDDIS = acsd['BLEEDDIS']
BLEEDDISd = pd.get_dummies(BLEEDDIS)
BLEEDDISd['Prior_Cat']=acsd['BLEEDDIS']
BLEEDDISd.loc[BLEEDDISd.Prior_Cat.isnull(),['No','Yes']]=np.nan
BLEEDDIS = BLEEDDISd[['Yes']]

In [27]:
# TRANSFUS
TRANSFUS = acsd['TRANSFUS']
TRANSFUSd = pd.get_dummies(TRANSFUS)
TRANSFUSd['Prior_Cat']=acsd['TRANSFUS']
TRANSFUSd.loc[TRANSFUSd.Prior_Cat.isnull(),['No','Yes']]=np.nan
TRANSFUS = TRANSFUSd[['Yes']]

In [28]:
# PRSEPIS
PRSEPIS = acsd['PRSEPIS']
PRSEPISd = pd.get_dummies(PRSEPIS)
PRSEPISd['Prior_Cat']=acsd['PRSEPIS']
PRSEPISd.loc[PRSEPISd.Prior_Cat.isnull(),['None','SIRS','Sepsis','Septic Shock']]=np.nan
PRSEPIS = PRSEPISd[['SIRS','Sepsis','Septic Shock']]

In [29]:
# EMERGNCY
EMERGNCY = acsd['EMERGNCY']
EMERGNCYd = pd.get_dummies(EMERGNCY)
EMERGNCYd['Prior_Cat']=acsd['EMERGNCY']
EMERGNCYd.loc[EMERGNCYd.Prior_Cat.isnull(),['No','Yes']]=np.nan
EMERGNCY = EMERGNCYd[['Yes']]

In [30]:
# WNDCLAS
WNDCLAS = acsd['WNDCLAS']
WNDCLASd = pd.get_dummies(WNDCLAS)
WNDCLASd['Prior_Cat']=acsd['WNDCLAS']
WNDCLASd.loc[WNDCLASd.Prior_Cat.isnull(),['1-Clean', '2-Clean/Contaminated', '3-Contaminated', '4-Dirty/Infected']]=np.nan
WNDCLAS = WNDCLASd[['2-Clean/Contaminated', '3-Contaminated', '4-Dirty/Infected']]

In [31]:
# ASACLAS
ASACLAS = acsd['ASACLAS']
ASACLASd = pd.get_dummies(ASACLAS)
ASACLASd['Prior_Cat']=acsd['ASACLAS']
ASACLASd.loc[ASACLASd.Prior_Cat.isnull(),['1-No Disturb', '2-Mild Disturb', '3-Severe Disturb', '4-Life Threat', '5-Moribund']]=np.nan
ASACLAS = ASACLASd[['2-Mild Disturb', '3-Severe Disturb', '4-Life Threat', '5-Moribund']]

In [32]:
# Now, all these new pandas dataframes should be combined with
# the acsd data, which we first remove the original factor variables 
# from. 
acs_cont = acsd[['WORKRVU', 'Age', 'HEIGHT', 'WEIGHT', 'DPRNA', 'DPRBUN',
                'DPRCREAT', 'DPRALBUM', 'DPRBILI', 'DPRSGOT', 'DPRALKPH', 
                'DPRWBC', 'DPRHCT', 'DPRPLATE', 'DPRPTT', 'DPRPT', 'DPRINR',
                'PRSODM', 'PRBUN', 'PRCREAT', 'PRALBUM', 'PRBILI', 'PRSGOT',
                'PRALKPH', 'PRWBC', 'PRHCT', 'PRPLATE', 'PRPTT', 'PRINR', 
                'PRPT', 'READMPODAYS1']]

# Generate an early-readmission (any readmits in 0-7 days) based
# on the days-to-readmission column. 
acs_cont['early_readmission'] = np.where(acs_cont['READMPODAYS1']<=7.0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [None]:
# Before we merge back the categorical data, we will remove the lab
# values with 35% or more missingness. In short, we wish to have an 
# analysis with lab values, but because of missingness, we will 
# do MI. However, MI will not be used on variables with roughly
# more than 1/3 missingness, so we will not perform MI
# to impute on such variables, and will just not use them for
# prediction. 

# We can firstly visualize the percentage of missingness here. 
acs_cont.isnull().mean()

In [33]:
# Now remove those columns where there is greater than 35% missingness.
acs_cont = acs_cont[acs_cont.columns[acs_cont.isnull().mean() < 0.35]]

In [None]:
# Now, check into this data. 
acs_cont.head()

In [None]:
# Also, lets free up some memory space to create our final analysis-ready
# data (prior to imputation).
list_data = [acsd]
del list_data

In [34]:
# Now we will add back the categorical one-hot variables to this 
# dataframe. 
a1 = pd.concat([acs_cont.reset_index(drop=True), race], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), TRANST], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), ANESTHES], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), SURGSPEC], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), DIABETES], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), SMOKE], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), RENAFAIL], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), FNSTATUS2], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), DYSPNEA], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), VENTILAT], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), HXCOPD], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), ASCITES], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), HYPERMED], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), ASACLAS], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), RENAFAIL], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), DIALYSIS], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), DISCANCR], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), STEROID], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), WTLOSS], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), PRSEPIS], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), TRANSFUS], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), WNDCLAS], axis=1)
a1 = pd.concat([a1.reset_index(drop=True), HXCHF], axis=1)

In [38]:
# Check the data. 
a1.head()

Unnamed: 0,WORKRVU,Age,HEIGHT,WEIGHT,DPRNA,DPRCREAT,DPRWBC,DPRHCT,DPRPLATE,PRSODM,...,Yes,Yes.1,SIRS,Sepsis,Septic Shock,Yes.2,2-Clean/Contaminated,3-Contaminated,4-Dirty/Infected,Yes.3
0,7.96,69,71.0,210.0,12.0,12.0,12.0,12.0,12.0,142.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11.76,51,61.0,114.0,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,11.76,30,70.0,258.0,0.0,0.0,0.0,0.0,0.0,138.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,6.59,35,61.0,98.0,,,8.0,8.0,8.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.14,23,63.0,108.0,25.0,25.0,8.0,8.0,8.0,138.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Now, we will conduct a possibly naive version of multiple imputation. 
# We will be using the IterativeImputer function from sklearn. 
# This utilizes a particular model type to regress a column
# with missing on all other available input columns of a dataframe.
# We will pretend all columns are continuous and employ Bayesian
# Ridge regression. We ignore that after performing the one-hot
# encoding there are pre-existing collinearity issues in the
# transformed data. 

In [39]:
# Set up imputation. 

# Here we establish the method.
impguy = IterativeImputer(max_iter = 10, random_state=0, initial_strategy='median')

# Now, get the imputation data. This is going to be all of the data
# except the outcome, which will be left out. 
features = a1.loc[:, a1.columns != 'early_readmission']

In [40]:
# Here, we fit the imputation model object to the features data created
# above. 
impguy.fit(features)



IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='median',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, tol=0.001, verbose=0)

In [41]:
features.head(100)

Unnamed: 0,WORKRVU,Age,HEIGHT,WEIGHT,DPRNA,DPRCREAT,DPRWBC,DPRHCT,DPRPLATE,PRSODM,...,Yes,Yes.1,SIRS,Sepsis,Septic Shock,Yes.2,2-Clean/Contaminated,3-Contaminated,4-Dirty/Infected,Yes.3
0,7.96,69,71.0,210.0,12.0,12.0,12.0,12.0,12.0,142.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11.76,51,61.0,114.0,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,11.76,30,70.0,258.0,0.0,0.0,0.0,0.0,0.0,138.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,6.59,35,61.0,98.0,,,8.0,8.0,8.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.14,23,63.0,108.0,25.0,25.0,8.0,8.0,8.0,138.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,7.96,66,72.0,212.0,28.0,28.0,28.0,28.0,28.0,136.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7.96,65,72.0,175.0,6.0,6.0,6.0,6.0,6.0,140.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7.96,70,69.0,180.0,5.0,5.0,5.0,5.0,5.0,135.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,10.60,63,70.0,231.0,0.0,0.0,0.0,0.0,0.0,134.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,12.37,48,63.0,230.0,4.0,,4.0,4.0,4.0,136.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
