In [13]:
import pandas as pd
import numpy as np
import time

In [None]:
def tribble(columns, *data):
    """Build a DataFrame from a flat, row-by-row listing of cell values.

    The cells are given as positional arguments in row-major order, so a call
    can be laid out in the source like a literal table::

        tribble(["name", "score"],
                "a", 1,
                "b", 2)

    Args:
        columns: Sequence of column labels; its length is the row width.
        *data: Cell values, ``len(columns)`` per row, one row after another.

    Returns:
        pandas.DataFrame with one row per ``len(columns)`` consecutive values.

    Raises:
        ValueError: If the number of cell values is not a whole multiple of
            the number of columns (previously the trailing, incomplete row
            was dropped silently).
    """
    width = len(columns)
    if width == 0:
        # Without columns there is no way to place a cell value.
        if data:
            raise ValueError(
                f"tribble() received {len(data)} cell value(s) but no columns"
            )
        return pd.DataFrame(columns=columns)

    if len(data) % width != 0:
        raise ValueError(
            f"tribble() received {len(data)} cell value(s), which is not a "
            f"multiple of the {width} column(s)"
        )

    # ``data`` is a tuple, so every slice is a tuple as well: one per row.
    rows = [data[start:start + width] for start in range(0, len(data), width)]
    return pd.DataFrame(data=rows, columns=columns)

# Lookup table pairing every known ``surgery_procedure`` value with a risk
# tier label ("low_risk", "med_risk" or "high_risk"). ``preprocess`` joins it
# onto each data split through the ``surgery_procedure`` column.
procedure_risk_groups = tribble(
    ["surgery_procedure", "surgery_risk"],
    *(
        cell
        for procedure_and_risk in (
            ("colon", "high_risk"),
            ("small_bowel", "high_risk"),
            ("exploratory_abdominal", "high_risk"),
            ("rectal", "high_risk"),
            ("gastric", "med_risk"),
            ("bile_duct_liver_pancreatic", "high_risk"),
            ("spleen", "med_risk"),
            ("thoracic", "med_risk"),
            ("limb_amputation", "med_risk"),
            ("av_shunt_dialysis", "low_risk"),
            ("ventricular_shunt", "low_risk"),
            ("abdominal_hysterectomy", "med_risk"),
            ("vaginal_hysterectomy", "med_risk"),
            ("cesarean_section", "med_risk"),
            ("ovarian", "med_risk"),
            ("breast", "low_risk"),
            ("prostate", "med_risk"),
            ("thyroid", "low_risk"),
            ("herniorrhaphy", "high_risk"),
            ("spinal_fusion", "low_risk"),
            ("laminectomy", "low_risk"),
            ("refusion_spine", "low_risk"),
            ("knee_prosthesis", "low_risk"),
            ("hip_prosthesis", "low_risk"),
            ("cardiac", "low_risk"),
            ("abdominal_aortic_aneurysm", "med_risk"),
            ("peripheral_vascular_bypass", "low_risk"),
            ("carotid_endarterectomy", "low_risk"),
            ("coronary_bypass_graft_chest_incision", "low_risk"),
            ("coronary_bypass_chest_donor_incision", "low_risk"),
            ("kidney_transplant", "high_risk"),
            ("gallbladder", "high_risk"),
            ("appendix", "high_risk"),
            ("kidney_surgery", "med_risk"),
            ("pacemaker", "med_risk"),
            ("craniotomy", "low_risk"),
            ("fracture", "low_risk"),
            ("neck", "med_risk"),
        )
        # Flatten the pairs back into the row-major cell list tribble expects.
        for cell in procedure_and_risk
    ),
)

In [14]:
# Ordinal code for each risk label of ``procedure_risk_groups``. "mid_risk" is
# accepted as an alias of "med_risk" for every split.
_RISK_LEVELS = {"low_risk": 1, "mid_risk": 2, "med_risk": 2, "high_risk": 3}

# Race indicator that is rebuilt as "none of the other race indicators is set".
_REFERENCE_RACE_COLUMN = "patient_race_african_american_or_black"

# Pseudo race selector meaning "patient_race_white == 0".
_NON_WHITE_SELECTOR = "patient_race_non-white"

# Race indicator columns that may arrive with capitalised / dotted names.
_RACE_COLUMN_RENAMES = {
    "patient_race_White": "patient_race_white",
    "patient_race_American.Indian.or.Alaska.Native": "patient_race_american_indian_or_alaska_native",
    "patient_race_Asian": "patient_race_asian",
    "patient_race_Hispanic": "patient_race_hispanic",
    "patient_race_Native.Hawaiian.or.other.Pacific.Islander": "patient_race_native_hawaiian_or_other_pacific_islander",
    "patient_race_Other": "patient_race_other",
}

# age_bin label -> (exclusive lower bound, inclusive upper bound) on
# ``patient_age``; a lower bound of None means "no lower bound".
# NOTE(review): the 10.1-wide steps are kept exactly as they were; confirm the
# unit/scale of ``patient_age`` they are meant for.
_AGE_BIN_BOUNDS = {
    "kids": (None, 10.1),
    "teens": (10.1, 20.2),
    "twenties": (20.2, 30.3),
    "thirties": (30.3, 40.4),
    "forties": (40.4, 50.5),
    "fifties": (50.5, 60.6),
    "sixties": (60.6, 70.7),
    "seventies": (70.7, 80.8),
    "eighties": (80.8, 90.9),
    "ninties": (90.9, 101),   # historical (misspelled) label, kept for callers
    "nineties": (90.9, 101),  # correctly spelled alias
}


def _engineer_split(frame, chronic_cols, procedure_levels=None):
    """Apply the feature-engineering steps to one split; return (frame, levels).

    Works on a copy, so the DataFrame passed in is left untouched.
    ``procedure_levels`` is the list of procedure values to one-hot encode;
    when None the levels are taken from ``frame`` itself. The levels actually
    used are returned so that other splits can be encoded identically.
    """
    frame = frame.copy()

    # astype('int') truncates towards zero -- it does not round.
    # NOTE(review): an earlier comment said "round"; confirm which is intended.
    if len(chronic_cols):
        frame[chronic_cols] = frame[chronic_cols].astype("int")

    # Post- minus pre-surgery change of the maximum temperature / pulse.
    frame["vitals_temp_diff"] = (
        frame["vitals_max_temperature_post_surgery"] - frame["vitals_max_temperature_pre_surgery"]
    )
    frame["vitals_pulse_diff"] = (
        frame["vitals_max_pulse_post_surgery"] - frame["vitals_max_pulse_pre_surgery"]
    )

    # Inner join: rows whose surgery_procedure is absent from
    # procedure_risk_groups are dropped here.
    frame = pd.merge(frame, procedure_risk_groups, on=["surgery_procedure"])
    risk_labels = frame["surgery_risk"]
    unknown = set(risk_labels.dropna().unique()) - set(_RISK_LEVELS)
    if unknown:
        raise ValueError(
            "unrecognised surgery_risk label(s): " + ", ".join(sorted(map(str, unknown)))
        )
    frame["surgery_risk"] = risk_labels.map(_RISK_LEVELS)

    # One-hot encode the procedure. With explicit levels every level gets a
    # column (all zeros if unobserved) and values outside the levels produce an
    # all-zero row, so all splits share the same indicator columns.
    procedures = frame["surgery_procedure"]
    if procedure_levels is not None:
        procedures = procedures.astype(pd.CategoricalDtype(categories=procedure_levels))
    onehot = pd.get_dummies(procedures)
    procedure_levels = list(onehot.columns)
    onehot.columns = [f"surgery_procedure_{level}" for level in procedure_levels]
    frame = pd.concat([frame, onehot], axis=1)
    frame = frame.drop(columns=["surgery_procedure", "surgery_hour"])

    frame = frame.dropna()

    # Rebuild the reference race indicator: 1 where no other patient_race*
    # column is set, 0 otherwise (any pre-existing values are overwritten).
    frame = frame.assign(**{_REFERENCE_RACE_COLUMN: 0})
    race_cols = frame.columns[frame.columns.str.startswith("patient_race")]
    frame.loc[frame[race_cols].sum(axis=1) == 0, _REFERENCE_RACE_COLUMN] = 1

    frame["surgery_risk"] = frame["surgery_risk"].astype("int")

    frame = frame.rename(columns=_RACE_COLUMN_RENAMES)
    return frame, procedure_levels


def _filter_subgroup(frame, race, age_bin, gender, apply_non_white=True):
    """Return the rows of ``frame`` that match the race / age / gender selectors."""
    if race == _NON_WHITE_SELECTOR:
        if apply_non_white:
            frame = frame[frame["patient_race_white"] == 0]
    elif race:
        frame = frame[frame[race] == 1]

    # Labels that are not in the table are ignored (no age filtering).
    bounds = _AGE_BIN_BOUNDS.get(age_bin) if isinstance(age_bin, str) else None
    if bounds is not None:
        lower, upper = bounds
        in_bin = frame.patient_age <= upper
        if lower is not None:
            in_bin = in_bin & (frame.patient_age > lower)
        frame = frame[in_bin]

    if gender == "Male":
        frame = frame[frame.patient_gender_male == 1]
    elif gender == "Female":
        frame = frame[frame.patient_gender_male == 0]

    return frame


def preprocess(cov_train, cov_test, cov_val = None, race = None, age_bin = None, gender = None, check = False, processed = False):
    """Engineer features for the data splits and optionally select a subgroup.

    Unless ``processed`` is true, every supplied split goes through the same
    steps (on a copy; the inputs are not modified):

    * ``chronic_condition_*`` columns (names taken from ``cov_train``) are cast
      to int,
    * ``vitals_temp_diff`` / ``vitals_pulse_diff`` are added,
    * ``procedure_risk_groups`` is inner-joined on ``surgery_procedure`` and
      ``surgery_risk`` is encoded as 1 (low), 2 (med/mid) or 3 (high),
    * ``surgery_procedure`` is one-hot encoded into ``surgery_procedure_<x>``
      columns; the test and validation splits use the levels seen in the
      training split so all splits get the same indicator columns,
    * ``surgery_procedure`` and ``surgery_hour`` are dropped,
    * rows containing any NaN are dropped,
    * ``patient_race_african_american_or_black`` is set to 1 where no other
      ``patient_race*`` indicator is set,
    * capitalised / dotted race column names are renamed to snake_case.

    Afterwards the subgroup selectors are applied to the test split, and to the
    training split as well when ``check`` is true. The validation split is
    never subgroup-filtered.

    Args:
        cov_train: Training covariates (DataFrame).
        cov_test: Test covariates (DataFrame).
        cov_val: Optional validation covariates; None skips all validation work.
        race: Name of a race indicator column to keep rows where it equals 1,
            or ``"patient_race_non-white"`` to keep ``patient_race_white == 0``.
        age_bin: One of "kids", "teens", "twenties", "thirties", "forties",
            "fifties", "sixties", "seventies", "eighties", "ninties"
            (alias "nineties"). Any other value applies no age filter.
        gender: "Male" or "Female" (filters on ``patient_gender_male``);
            anything else applies no gender filter.
        check: If true, the training split is filtered like the test split.
            NOTE(review): as before, the ``"patient_race_non-white"`` selector
            is NOT applied to the training split even with ``check`` -- confirm
            whether that is intended.
        processed: If true, the inputs are assumed to be feature-engineered
            already and only the subgroup filtering is performed.

    Returns:
        Tuple ``(cov_train, cov_test, cov_val)``; ``cov_val`` is None when no
        validation split was supplied.

    Raises:
        ValueError: If the joined risk table yields a label other than
            low/med/mid/high_risk.
        KeyError: If a column required by a step is missing from a split.
    """
    if not processed:
        chronic_cols = cov_train.columns[cov_train.columns.str.startswith("chronic_condition_")]

        cov_train, train_procedures = _engineer_split(cov_train, chronic_cols)
        cov_test, _ = _engineer_split(cov_test, chronic_cols, train_procedures)
        if cov_val is not None:
            cov_val, _ = _engineer_split(cov_val, chronic_cols, train_procedures)

    cov_test = _filter_subgroup(cov_test, race, age_bin, gender)
    if check:
        cov_train = _filter_subgroup(cov_train, race, age_bin, gender, apply_non_white=False)

    return (cov_train, cov_test, cov_val)