## IMPORTS

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

## PRE IMPUTATION PROCESSING

In [2]:
## mixtures of hamza's code and mine
df_trauma = pd.read_csv("C:/Users/15125/Desktop/CAPSTONE PROJECT/RAW DATA/TQIP_2010_2016_Merged_MGHTrauma2019Jan.csv")
initial_length = len(df_trauma)
df_trauma.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,inc_key,yobirth,age,gender,race1,race2,ethnic,teachsta,acslevel,acspedl,...,compldes8,complkey9,compldes9,complkey10,compldes10,complkey11,compldes11,complkey12,compldes12,_merge
0,00000ae7-4797,1985.0,28.0,Female,White,Not Known/Not Recorded BIU 2,Not Hispanic or Latino,University,I,Not Applicable,...,,,,,,,,,,matched (3)
1,00001376-9675,1960.0,54.0,Male,White,Not Known/Not Recorded BIU 2,Not Hispanic or Latino,University,I,I,...,,,,,,,,,,matched (3)
2,00001774-08a9,1975.0,40.0,Male,Other Race,Not Known/Not Recorded BIU 2,Hispanic or Latino,University,I,Not Applicable,...,,,,,,,,,,matched (3)
3,00002cba-17d4,1996.0,16.0,Male,Black or African American,Not Known/Not Recorded BIU 2,Not Hispanic or Latino,University,I,Not Applicable,...,,,,,,,,,,matched (3)
4,000048a2-1679,1993.0,20.0,Male,White,Not Known/Not Recorded BIU 2,Not Hispanic or Latino,University,Not Applicable,Not Applicable,...,,,,,,,,,,matched (3)


### General preprocessing

In [3]:
## General preprocessing
allowed_eddisp = [
  "Operating Room",
  # "Transferred to another hospital",
  "Observation unit (unit that provides &lt; 24 hour stays)",
  "Intensive Care Unit (ICU)",
  "Telemetry/step-down unit (less acuity than ICU)",
  "Floor bed (general admission, non specialty unit bed)"
  # "Home without services",
  # "Other (jail, institutional care facility, mental health, etc)",
  # "Home with services",
  # "Left against medical advice",
]
df_trauma = df_trauma[df_trauma["eddisp"].isin(allowed_eddisp)]
df_trauma.loc[(df_trauma.tmode1.isnull()) & ~(df_trauma.tmode2.isnull()), "tmode1"] = df_trauma.loc[
    (df_trauma.tmode1.isnull()) & ~(df_trauma.tmode2.isnull()), "tmode2"
]

comorkeys = [x for x in df_trauma.columns if "comorkey" in x]
complkeys = [x for x in df_trauma.columns if "complkey" in x]
predotkeys = [x for x in df_trauma.columns if "predot" in x]
severitykeys = [x for x in df_trauma.columns if "severity" in x]

## Daisy here refers to Daisy Zhou from Interpretable AI who assisted with the key selection
columns_kept_daisy = (
    comorkeys
    + complkeys
    + predotkeys
    + severitykeys
    + [
        "inc_key", "issais", "age", "gender", "race1", "ethnic", "acslevel",
        "tmode1", "tmode2", "transfer", "alcohol", "drug1", "signsoflife",
        "sbp1", "sbp2", "pulse1", "pulse2", "rr1", "rr2",
        "oxysat1", "oxysat2", "temp1", "gcstot1", "gcstot2",
        "ecode", "icd10_primary_ecode", "icd10_additonal_ecode",
        "eddisp", "hospdisp", "yoadmit", "teachsta", "region", "hemorrhage_ctrl_type",
        
    ]
)
df_trauma = df_trauma[columns_kept_daisy]

### Handling Missing Values

In [4]:
##### Mapping values to NaNs #####
for col_severity in severitykeys:
    df_trauma[col_severity] = df_trauma[col_severity].replace({9: np.nan})
df_trauma = df_trauma.replace({
    "Not Applicable BIU 1": np.nan,
    "Not Known/Not Recorded BIU 2": np.nan,
    "Not Applicable": np.nan,
    -99: np.nan,
    -1: np.nan,
    -2: np.nan,
})

### Comorbodities

In [5]:
for new_column, value in zip(
    ["alcohol_use_disorder", "bleeding_disorder", "current_chemotherapy", "congestive_heart_failure",
    "current_smoker", "chronic_renal_failure", "history_cva", "diabetes", "disseminated_cancer", "copd",
    "steroid", "cirrhosis", "drug_use_disorder", "history_MI", "history_pvd", "hypertension_medication"],
    [2, 4, 5, 7, 8, 9, 10, 11, 12, 23, 24, 25, 28, 17, 18, 19]
):
    df_trauma[new_column] = 0
    df_trauma.loc[
        ((df_trauma["comorkey1"] == value) | (df_trauma["comorkey2"] == value) |
        (df_trauma["comorkey3"] == value) | (df_trauma["comorkey4"] == value) |
        (df_trauma["comorkey5"] == value) | (df_trauma["comorkey6"] == value) |
        (df_trauma["comorkey7"] == value) | (df_trauma["comorkey8"] == value) |
        (df_trauma["comorkey9"] == value) | (df_trauma["comorkey10"] == value) |
        (df_trauma["comorkey11"] == value) | (df_trauma["comorkey12"] == value)),
        new_column
    ] = 1
    print(f"Created new column {new_column}")
df_trauma = df_trauma.loc[:, [col for col in df_trauma.columns if "comorkey" not in col]]

Created new column alcohol_use_disorder
Created new column bleeding_disorder
Created new column current_chemotherapy
Created new column congestive_heart_failure
Created new column current_smoker
Created new column chronic_renal_failure
Created new column history_cva
Created new column diabetes
Created new column disseminated_cancer
Created new column copd
Created new column steroid
Created new column cirrhosis
Created new column drug_use_disorder
Created new column history_MI
Created new column history_pvd
Created new column hypertension_medication


### Morbidities

In [6]:
# 0 means that the patient has no allowed morbidity, 1 means he has one
df_trauma["morbidity"] = 0
morb_lists = []
# allowed_morbidities = [4, 5, 8, 11, 12, 14, 15, 18, 19, 21, 22, 23, 25, 30, 31, 32, 35]
# Modification of allowed morbidities on 15/12/2019: deleted 11, 15, 18, 22, 23, 30, 31,35
allowed_morbidities = [4, 5, 8, 12, 14, 19, 21, 25, 32]
for col_compl in complkeys:
    df_trauma.loc[(df_trauma[col_compl].isin(allowed_morbidities)), "morbidity"] = 1
    for morb in allowed_morbidities:
        morb_list = np.zeros(len(df_trauma.index))
        to_add_idx = df_trauma.loc[(df_trauma[col_compl].isin([morb])),:].index[df_trauma.loc[(df_trauma[col_compl].isin([morb])),:].index < len(df_trauma)]
        morb_list[to_add_idx] = 1
        morb_lists.append(morb_list)

In [7]:
morb_names= ['Acute_Kidney_Injury','ARDS', 'Cardiac_Arrest_Requiring_CPR','Deep_Surgical_Site_Infection',
            'Deep_Vein_Thrombosis', 'Organ_Space_Surgical_Site_Infection','Pulmonary_Embolism','Unplanned_Intubation',
            'Severe_Sepsis']
for name,vals in zip(morb_names, morb_lists):
    df_trauma[name] = vals

## Method of Injury

In [8]:
# Mapping ecode (ecode2 has exactly the sames keys so it's useless to do any mapping)

icd_mapping_ecode = pd.read_csv("C:/Users/15125/Desktop/CAPSTONE PROJECT/RAW DATA/icd.csv", sep=";")
dict_icd_mapping_ecode = {}
for j in range(len(icd_mapping_ecode.columns)):
    injury_type = icd_mapping_ecode.columns[j]
    print(injury_type)
    ecode_values = icd_mapping_ecode.iloc[0, j].split(' ')
    ecode_values = [x[6:-1].split("\n")[0] if "float" in x else x.split("\n")[0] for x in ecode_values]
    ecode_values = [float(x) if ')' not in x else float(x[:-1]) for x in ecode_values]
    dict_icd_mapping_ecode.update({ecode: injury_type for ecode in ecode_values})
print("===> Preprocessed the ICD mapping")
# We replace all the NaNs with "Unknown"
df_trauma["method_of_injury_ecode"] = df_trauma["ecode"].astype(float).round(1).map(dict_icd_mapping_ecode)
df_trauma["method_of_injury_ecode"] = df_trauma["method_of_injury_ecode"].replace({np.nan: "Unknown"})
print(df_trauma["method_of_injury_ecode"].value_counts())


## Mapping icd10_primary_code
# (icd10_additional_code has exactly the sames keys so it's useless to do any mapping)
icd_mapping_primary = pd.read_csv("C:/Users/15125/Desktop/CAPSTONE PROJECT/RAW DATA/icd_primary_ecodes.csv", sep=";")
dict_icd_mapping_primary_icd = {}
for j in range(len(icd_mapping_primary.columns)):
    injury_type = icd_mapping_primary.columns[j]
    print(injury_type)
    primary_icd_values = icd_mapping_primary.iloc[0, j].split(' ')
    primary_icd_values = [x.split("\n")[0] if "\n" in x else x for x in primary_icd_values]
    dict_icd_mapping_primary_icd.update({primary_icd: injury_type for primary_icd in primary_icd_values})
print("===> Preprocessed the ICD mapping primary icd")
# We replace all the NaNs and unknowns (-1, -2) with "Unknown"
dict_icd_mapping_primary_icd.update({-1: "Unknown", -2: "Unknown"})
df_trauma["method_of_injury_icd_primary"] = df_trauma["icd10_primary_ecode"].map(
    dict_icd_mapping_primary_icd
)
df_trauma["method_of_injury_icd_primary"] = df_trauma["method_of_injury_icd_primary"].replace({np.nan: "Unknown"})
print(df_trauma["method_of_injury_icd_primary"].value_counts())


## Creating final method_of_injury from both other columns
# This will be the final method_of_injury column built from the two intermediary columns
df_trauma["method_of_injury"] = df_trauma.method_of_injury_ecode
# Completing ecode with primary_icd10_code
df_trauma.loc[
    (
        (df_trauma.method_of_injury_ecode == "Unknown") &
         (df_trauma.method_of_injury_icd_primary != "Unknown") &
         (df_trauma.method_of_injury_ecode != df_trauma.method_of_injury_icd_primary)
    )
    , "method_of_injury"
] = df_trauma.loc[
    (
        (df_trauma.method_of_injury_ecode == "Unknown") &
         (df_trauma.method_of_injury_icd_primary != "Unknown") &
         (df_trauma.method_of_injury_ecode != df_trauma.method_of_injury_icd_primary)
    )
    , "method_of_injury_icd_primary"
]

# Completing primary_icd10_code with ecode
df_trauma.loc[
    (
        (df_trauma.method_of_injury_ecode != "Unknown") &
        (df_trauma.method_of_injury_icd_primary == "Unknown") &
        (df_trauma.method_of_injury_ecode != df_trauma.method_of_injury_icd_primary)
    )
    , "method_of_injury"
] = df_trauma.loc[
    (
        (df_trauma.method_of_injury_ecode != "Unknown") &
        (df_trauma.method_of_injury_icd_primary == "Unknown") &
        (df_trauma.method_of_injury_ecode != df_trauma.method_of_injury_icd_primary)
    )
    , "method_of_injury_ecode"
]
# Dropping these two intermediary columns
df_trauma.drop(["method_of_injury_ecode", "method_of_injury_icd_primary"], axis=1, inplace=True)
print(df_trauma.method_of_injury.value_counts())

Blunt - Fall
Blunt - Other
Blunt - MVT occupant
Blunt - MVT Pedal cyclist/pedestrian
Blunt - MVT motorcyclist
Penetrating - Gunshot Wound
Penetrating - Stab Wound
Penetrating - Other/Mixed
===> Preprocessed the ICD mapping
Blunt - Fall                            493077
Unknown                                 246166
Blunt - MVT occupant                    240367
Blunt - Other                           147345
Blunt - MVT motorcyclist                 76245
Penetrating - Gunshot Wound              59054
Blunt - MVT Pedal cyclist/pedestrian     52362
Penetrating - Stab Wound                 31874
Penetrating - Other/Mixed                   41
Name: method_of_injury_ecode, dtype: int64
Blunt - Fall
Blunt - Other
Blunt - MVT occupant
Blunt - MVT Pedal cyclist/pedestrian
Blunt - MVT motorcyclist
Penetrating - Gunshot Wound
Penetrating - Stab Wound
Penetrating - Other/Mixed
===> Preprocessed the ICD mapping primary icd
Unknown                                 1077557
Blunt - Fall                

### Alcohol

In [9]:
df_trauma["alcohol"] = df_trauma["alcohol"].map({
    "Yes (confirmed by test [beyond legal limit])": "Alcohol",
    "No (confirmed by test)": "Residual/no alcohol",
    "Yes (confirmed by test [trace levels])": "Residual/no alcohol",
    "No (not tested)": np.nan
})

### Severity of Trauma

In [10]:
# # Predots Cleaning (severity)

ais_inputs = pd.read_csv("C:/Users/15125/Desktop/CAPSTONE PROJECT/RAW DATA//ais_inputs.csv", sep=";")
columns_ais = list(ais_inputs.columns)[1:]
injury_locations = [x[9:] for x in columns_ais]


for col, location in zip(columns_ais, injury_locations):
    vars()[f"dict_ais_inputs_{location}"] = {}
    ais_temp = ais_inputs.loc[~ais_inputs[col].isnull(), ["AIS_Predots", col]]
    predots = ais_temp.AIS_Predots.tolist()
    severity = ais_temp[col].tolist()
    vars()[f"dict_ais_inputs_{location}"].update({k:v for k,v in zip(predots, severity)})


for location in injury_locations:
    print(location)
    df_temp = df_trauma.copy()
    df_temp[f"{location}_severity"] = np.nan
    for predot_col in predotkeys:
        df_temp[predot_col] = df_temp[predot_col].astype(float)
        df_temp[predot_col] = df_temp[predot_col].map(vars()[f"dict_ais_inputs_{location}"])
    df_temp[f"{location}_severity"] = df_temp[predotkeys].max(axis=1)
    df_trauma[f"{location}_severity"] = df_temp[f"{location}_severity"].copy()
    print(df_trauma[f"{location}_severity"].value_counts()/len(df_trauma)*100)
    print("% of NaNs: ",df_trauma[f"{location}_severity"].isnull().sum()/len(df_trauma)*100)
    print('----------------------------------')

# Taking the maximum severity for a patient over all new severity columns
df_trauma["severity_max"] = df_trauma[[f"{location}_severity" for location in injury_locations]].max(axis=1)


# Transfers Cleaning

print(f"Length before transfer cleaning (keeping only 'No'): {len(df_trauma)}")
df_trauma = df_trauma[df_trauma.transfer == "No"]
print(f"Length after transfer cleaning (keeping only 'No'): {len(df_trauma)}")


# Filtering nan hospdisp
print(f"Length before filtering NaNs from hospdisp: {len(df_trauma)}")
df_trauma = df_trauma[~df_trauma.hospdisp.isnull()]
print(f"Length after filtering NaNs from hospdisp: {len(df_trauma)}")


# Dropping the severity 9 values (mean unknown)
index_severity_9 = df_trauma[df_trauma.severity_max == 9].index.values
df_trauma.drop(index_severity_9, inplace=True)

# Dropping the severity 6 values
index_severity_6 = df_trauma[df_trauma.severity_max == 6].index.values
df_trauma.drop(index_severity_6, inplace=True)
df_trauma.reset_index(inplace=True, drop=True)

Face
1.0    22.374903
2.0     6.052961
3.0     0.172814
4.0     0.014185
9.0     0.010323
Name: Face_severity, dtype: float64
% of NaNs:  71.37481424490042
----------------------------------
Neck
1.0    0.770127
3.0    0.552234
2.0    0.246337
4.0    0.036241
9.0    0.007129
5.0    0.006461
6.0    0.000223
Name: Neck_severity, dtype: float64
% of NaNs:  98.38124781382679
----------------------------------
Head
3.0    25.495811
1.0     7.532170
4.0     2.971859
5.0     1.685368
2.0     1.568104
9.0     0.162566
6.0     0.026958
Name: Head_severity, dtype: float64
% of NaNs:  60.557165041131626
----------------------------------
Thorax
3.0    16.765674
2.0     9.480658
1.0     1.938314
5.0     1.555553
4.0     1.006958
6.0     0.617216
9.0     0.065650
Name: Thorax_severity, dtype: float64
% of NaNs:  68.5699772229529
----------------------------------
Abdomen
2.0    6.153962
1.0    4.664950
4.0    2.721140
3.0    2.653856
5.0    0.615508
9.0    0.037801
6.0    0.003565
Name: Abdomen_sev

### Time Split Between Blunt and Injury

In [11]:
blunt_injury = ['Blunt - MVT occupant', 'Blunt - Fall', 'Blunt - MVT motorcyclist', 'Blunt - Other',
               'Blunt - MVT Pedal cyclist/pedestrian']
penetrating_injury = ['Penetrating - Gunshot Wound', 'Penetrating - Stab Wound', 'Penetrating - Other/Mixed']
df_blunt = df_trauma[df_trauma.method_of_injury.isin(blunt_injury)].reset_index(drop=True)
df_penetrating = df_trauma[df_trauma.method_of_injury.isin(penetrating_injury)].reset_index(drop=True)
print(f"Length of blunt dataframe: {len(df_blunt)/len(df_trauma)}")
print(f"Length of penetrating dataframe: {len(df_penetrating)/len(df_trauma)}")

Length of blunt dataframe: 0.8927768765419581
Length of penetrating dataframe: 0.09370986684240765


### Mapping for OptImpute

In [12]:
str_columns_to_map = ["gender", "race1", "acslevel", "signsoflife",
                      "alcohol", "method_of_injury", "eddisp","tmode1","tmode2","drug1"]
# Ordered:
mapping_acslevel = {"I": 1, "II": 2, "III": 3, "Unknown": -1}
#Non-ordered
mapping_gender = {"Female": 1, "Male": 0}
mapping_race1 = {'Other Race': 0, 'Black or African American': 1, 'White': 2, 'American Indian': 3,
               'Native Hawaiian or Other Pacific Islander': 4, 'Asian': 5}
mapping_signsoflife = {'Unknown':-1, 'Arrived with signs of life': 1, 'Arrived with NO signs of life': 0}
mapping_alcohol = {"Alcohol": 1, "Residual/no alcohol": 0, "Unknown": -1}
mapping_method_of_injury_penetrating = {'Penetrating - Stab Wound': 1, 'Penetrating - Gunshot Wound': 2,
                                       'Penetrating - Other/Mixed':3}
mapping_method_of_injury_blunt = {'Blunt - MVT occupant': 1, 'Blunt - Fall': 2, 'Blunt - MVT motorcyclist': 3,
 'Blunt - Other': 4, 'Blunt - MVT Pedal cyclist/pedestrian': 5}
mapping_eddisp = {'Operating Room' :0,
       'Floor bed (general admission, non specialty unit bed)': 1,
       'Telemetry/step-down unit (less acuity than ICU)': 2,
       'Intensive Care Unit (ICU)': 3,
       'Observation unit (unit that provides &lt; 24 hour stays)': 4}

mapping_drug1 = {'No (not tested)': 0,
       'Yes (confirmed by test [illegal use drug])': 1,
       'No (confirmed by test)': 2,
       'Yes (confirmed by test [prescription drug])': 3}

mapping_tmode1 = {'Ground Ambulance': 0, 'Helicopter Ambulance' :1,
       'Private/Public Vehicle/Walk-in': 2, 'Other': 3, 'Police': 4,
       'Fixed-wing Ambulance': 5}

mapping_tmode2 = mapping_tmode1

### Columns to keep, along with train-test split, and categorial variable imputation

In [13]:
# With the new morbidity and acslevel deleted on February 1 2020

dict_injury = {0: "blunt", 1: "penetrating"}
for i, df_injury in enumerate([df_blunt,df_penetrating]):
    print(dict_injury[i])
    columns_to_keep = [
        "inc_key", "age", "gender", "race1",
        # "teachsta", # "region",
        "acslevel",  "tmode1", # "transfer",
        "signsoflife", "sbp1",  "sbp2",
        "pulse1",  "pulse2",
        "oxysat1",  "oxysat2",
        "temp1", "gcstot1",  "gcstot2",
        "alcohol", "bleeding_disorder",
        "current_chemotherapy", "congestive_heart_failure",
        "current_smoker", "chronic_renal_failure",
        "history_cva", "diabetes", "disseminated_cancer",
        "copd", "steroid", "cirrhosis", "history_MI",
        "history_pvd", "hypertension_medication",  "eddisp",
        "method_of_injury",  # new AIS"
        "Head_severity", "Face_severity", "Neck_severity", "Thorax_severity",
        "Abdomen_severity", "Spine_severity",
        "Upper_Extremity_severity", "Lower_Extremity_severity",
        "Pelvis_Perineum_severity", "External_severity", "severity_max","hospdisp",
        "tmode1", "tmode2","alcohol_use_disorder", "drug_use_disorder", "rr2","rr1",
        "issais","morbidity","drug1","acslevel"
        #"hemorrhage_ctrl_type","hospdisp"
    ]
    
    columns_to_keep = df_injury.columns
        
    map_hosdisp= {'Discharged/Transferred to another type of rehabilitation or long term': 'post_acute_care',
             'Discharged/Transferred to Skilled Nursing Facility': 'post_acute_care',
             'Discharged/Transferred to an Intermediate Care Facility (ICF)': 'post_acute_care',
             'Discharged/Transferred to a short-term general hospital for inpatient': 'post_acute_care',
             'Discharged/Transferred to another type of institution not defined else': 'post_acute_care',
             'Discharged/Transferred to inpatient rehab or designated unit': 'post_acute_care',
             'Discharged/Transferred to Long Term Care hospital': 'post_acute_care',
             'Discharged home with no home services': 'home',
             'Discharge/Transferred to home under care of organized home health serv': 'home',
             'Discharged to home or self-care (routine discharge)': 'home',
             'Discharged/Transferred to court/law enforcement': 'home',
             'Expired': 'died',
             'Deceased/Expired': 'died',
             'Left against medical advice or discontinued care': 'exclude',
             'Discharged/Transferred to hospice care': 'exclude'
             }


    df_injury= df_injury.replace(map_hosdisp)

#     len_pre_drop= len(df_injury)
#     print(f'Number of rows pre drop: {len_pre_drop}')

#     df_injury = df_injury[df_injury.hospdisp != 'Left against medical advice or discontinued care']
#     df_injury = df_injury[df_injury.hospdisp != 'Discharged/Transferred to hospice care']


#     len_post_drop= len(df_injury)
#     print(f'Number of rows post drop: {len_post_drop}')
    
    hospdisp_target= df_injury.hospdisp

    

    # Getting indices of the two time periods for train/test split
    index_before_2016 = df_injury[
        df_injury.yoadmit < 2016
        ].index.values
    index_from_2016 = df_injury[
        df_injury.yoadmit >= 2016
        ].index.values

    # Imputing categorical variables missing values with other categories
    severities = [
        "Head_severity",
        "Face_severity",
        "Neck_severity",
        "Thorax_severity",
        "Abdomen_severity",
        "Spine_severity",
        "Upper_Extremity_severity",
        "Lower_Extremity_severity",
        "Pelvis_Perineum_severity",
        "External_severity",
        "severity_max"
    ]
    for col in severities:
        df_injury[col] = df_injury[col].fillna(0).astype(int)
    #df_injury["acslevel"] = df_injury["acslevel"].fillna("Unknown")
    df_injury["race1"] = df_injury["race1"].fillna("Other Race")
    # Not filling signsoflife because will be optimputed
    # df_injury["signsoflife"] = df_injury["signsoflife"].fillna("Unknown")
    df_injury["alcohol"] = df_injury["alcohol"].fillna("Unknown")

    for col in str_columns_to_map:
        if col == "method_of_injury":
#             print(col)
#             print(df_injury[col].isnull().sum())
            df_injury[col] = df_injury[col].replace(vars()[f"mapping_{col}_{dict_injury[i]}"]).astype(int)
        else:
#             print(col)
#             print(df_injury[col].isnull().sum())
            df_injury[col] = df_injury[col].replace(vars()[f"mapping_{col}"])
            try:
                df_injury[col] = df_injury[col].astype(int)
            except:
                continue
                
     # X data train/test split
    df_injury_train = df_injury.iloc[
                      index_before_2016, :
                      ].reset_index(drop=True)
    df_injury_test = df_injury.iloc[
                     index_from_2016, :
                     ].reset_index(drop=True)
    
    df_injury_train = df_injury_train[columns_to_keep]
    df_injury_test = df_injury_test[columns_to_keep]
    
    df_injury_train = df_injury_train[df_injury_train.hospdisp != 'exclude']
    df_injury_test = df_injury_test[df_injury_test.hospdisp != 'exclude']
    df_injury_train = df_injury_train[df_injury_train.hospdisp != 'died']
    df_injury_test = df_injury_test[df_injury_test.hospdisp != 'died']
    
    # Label train/test split
    
    hospdisp_target_train= pd.DataFrame(hospdisp_target[index_before_2016])
    hospdisp_target_test= pd.DataFrame(hospdisp_target[index_from_2016])
    
    print(len(hospdisp_target_train) + len(hospdisp_target_test))
    
    hospdisp_target_train= hospdisp_target_train[hospdisp_target_train.hospdisp != 'exclude']
    hospdisp_target_test= hospdisp_target_test[hospdisp_target_test.hospdisp != 'exclude']
    hospdisp_target_train= hospdisp_target_train[hospdisp_target_train.hospdisp != 'died']
    hospdisp_target_test= hospdisp_target_test[hospdisp_target_test.hospdisp != 'died']
   
    print(len(hospdisp_target_train) + len(hospdisp_target_test))
    
    base_path_pre_process= 'C:/Users/15125/Desktop/CAPSTONE PROJECT/CLEANED DATA/POST PRE PROCESSING'
    
    df_injury_train.to_csv(
        f"{base_path_pre_process}/{dict_injury[i]}/X_train_pre_process_full_{dict_injury[i]}.csv",
        index=False
    )
    
    df_injury_test.to_csv(
        f"{base_path_pre_process}/{dict_injury[i]}/X_test_pre_process_full_{dict_injury[i]}.csv",
        index=False
    )
    
    
    hospdisp_target_train.to_csv(
        f"{base_path_pre_process}/{dict_injury[i]}/y_train_pre_process_full_{dict_injury[i]}.csv",
        index=False
    )
    
    hospdisp_target_test.to_csv(
        f"{base_path_pre_process}/{dict_injury[i]}/y_test_pre_process_full_{dict_injury[i]}.csv",
        index=False
    )
    
    print(f"Size of test set with time separation for injury {dict_injury[i]}: {len(df_injury_test)}")
    print("--------------------------------------------------------")

blunt
845324
791160
Size of test set with time separation for injury blunt: 167081
--------------------------------------------------------
penetrating
88729
79315
Size of test set with time separation for injury penetrating: 17241
--------------------------------------------------------


#### Imputation is done in a Julia file and then post imputation processing will proceed below

## POST IMPUTATION PROCESSING

In [14]:
def save_time_train_test_split(
    injury: str, train_X_time_injury_imputed: pd.DataFrame, test_X_time_injury_imputed: pd.DataFrame
):
    # Time train/test split (saving the inc_keys as well)
    # Saving train set
    base_filepath_timesplit = "C:/Users/15125/Desktop/CAPSTONE PROJECT/CLEANED DATA/POST IMPUTATION PROCESSING"
    inc_keys_train_X_time_injury_imputed = train_X_time_injury_imputed.inc_key
    inc_keys_train_X_time_injury_imputed.to_csv(
        f"{base_filepath_timesplit}/{injury}/inc_keys_train_X_{injury}_final"
    )
    train_X_time_injury_imputed.drop("inc_key", axis=1).to_csv(
        f"{base_filepath_timesplit}/{injury}/train_X_{injury}_final.csv"
    )

    # Saving test set
    inc_keys_test_X_time_injury_imputed = test_X_time_injury_imputed.inc_key
    inc_keys_test_X_time_injury_imputed.to_csv(
        f"{base_filepath_timesplit}/{injury}/inc_keys_test_X_{injury}_final"
    )
    test_X_time_injury_imputed.drop("inc_key", axis=1).to_csv(
        f"{base_filepath_timesplit}/{injury}/test_X_{injury}_final.csv"
    )

In [34]:
def save_random_train_test_split(
    injury: str,
    train_X_time_injury_imputed: pd.DataFrame,
    test_X_time_injury_imputed: pd.DataFrame,
):
    base_filepath_timesplit = 'C:/Users/15125/Desktop/CAPSTONE PROJECT/CLEANED DATA/POST PRE PROCESSING'
    y_train_hospdisp = pd.read_csv(
        f"{base_filepath_timesplit}/{injury}/y_train_pre_process_{injury}.csv",
        header=None,
    )
    y_test_hospdisp = pd.read_csv(
        f"{base_filepath_timesplit}/{injury}/y_test_pre_process_{injury}.csv",
        header=None,
    )
    
    
    X = pd.concat(
        [train_X_time_injury_imputed, test_X_time_injury_imputed]
    ).reset_index(drop=True)
    y = pd.concat(
        [y_train_hospdisp, y_test_hospdisp]
    ).reset_index(drop=True)
    
    
    ## need to do this so the split works
    y= y.rename(columns={0: 'hospdisp'})
    y= y[y.hospdisp != 'hospdisp']
    
    # Filter out severity 6
    X["severity_max"] = X[
        [
            "Head_severity",
            "Face_severity",
            "Neck_severity",
            "Thorax_severity",
            "Abdomen_severity",
            "Spine_severity",
            "Upper_Extremity_severity",
            "Lower_Extremity_severity",
            "Pelvis_Perineum_severity",
            "External_severity",
        ]
    ].max(axis=1)
    indices_severity_6_to_drop = X[X.severity_max == 6].index.values
    X.drop(indices_severity_6_to_drop, axis=0, inplace=True)
    y.drop(indices_severity_6_to_drop, axis=0, inplace=True)
    X.drop("severity_max", axis=1, inplace=True)
    

    # Reset index
    X.reset_index(inplace=True, drop=True)
    y.reset_index(inplace=True, drop=True)
    
    ### to fix some earlier replacement that did not occur
    replace_dict_2= {
        "Discharged/Transferred to Long Term Care Hospital": "post_acute_care",
        "Discharged/transferred to a psychiatric hospital or psychiatric distin": "post_acute_care"
    }
    
    replace_dict_3= {
        'home': 0,
        'post_acute_care': 1
    }
    
    y= y.replace(replace_dict_2)
    y= y.replace(replace_dict_3)
    
    
#### for smote addition refer to colab notebooks

    train_X, test_X, train_y, test_y = train_test_split(
        X, y, stratify=y, random_state=7, train_size=0.8
    )

#### Oversampling stuff
#     X= pd.concat([train_X, train_y], axis=1)
#     home = X[X.hospdisp==0]
#     post_acute_care = X[X.hospdisp==1]
#     post_acute_care_upsampled = resample(post_acute_care,
#                           replace=True, # sample with replacement
#                           n_samples=len(home)/2, # match number in majority class
#                           random_state=1)
    
# #### Undersampling stuff
#     home_downsampled = resample(home,
#                           replace=True, # sample with replacement
#                           n_samples=len(post_acute_care), # match number in majority class
#                           random_state=1)
    
#     # combine majority and upsampled minority
#     upsampled = pd.concat([home, post_acute_care_upsampled])
#     train_y_os= pd.DataFrame(upsampled['hospdisp'])
#     train_X_os = upsampled.drop('hospdisp', axis=1)
    
#     # combine downsampled majority and minority
#     downsampled = pd.concat([home_downsampled, post_acute_care])
#     train_y_ds= pd.DataFrame(downsampled['hospdisp'])
#     train_X_ds = downsampled.drop('hospdisp', axis=1)
 
    train_y.columns = ["label"]
    test_y.columns = ["label"]
    
#     train_y_os.columns = ["label"]
#     print(train_y_os.label.unique())
    
#     train_y_ds.columns = ["label"]
#     print(train_y_ds.label.unique())
    

#     train_y_sm.columns = ["label"]
#     print(train_y.label.unique())
    
#     test_y_sm.columns = ["label"]
#     print(test_y_sm.label.unique())''
    

    base_path_final= 'C:/Users/15125/Desktop/CAPSTONE PROJECT/CLEANED DATA/POST IMPUTATION PROCESSING'
    
#     train_X['inc_key'].to_csv(
#         f"{base_path_final}/{injury}/inc_keys_train_random_final_{injury}.csv",
#         index=False
#     )
    
    
#     test_X['inc_key'].to_csv(
#         f"{base_path_final}/{injury}/inc_keys_test_random_final_{injury}.csv",
#         index=False
#     )
    
#     train_X= train_X.drop(['inc_key'], axis=1)
#     test_X= test_X.drop(['inc_key'], axis=1)
#     train_X_sm= train_X_sm.drop(['inc_key'], axis=1)
#     test_X_sm= test_X_sm.drop(['inc_key'], axis=1)

    train_X['gcstot2'] = train_X['gcstot2'].round(0).astype(int)
    test_X['gcstot2'] =  test_X['gcstot2'].round(0).astype(int)
    train_X['gcstot1'] = train_X['gcstot1'].round(0).astype(int)
    test_X['gcstot1'] =  test_X['gcstot1'].round(0).astype(int)
    
    train_X.to_csv(
        f"{base_path_final}/{injury}/X_train_{injury}_morb_splits.csv",
        index=False
    )
    
    test_X.to_csv(
        f"{base_path_final}/{injury}/X_test_{injury}_morb_splits.csv",
        index=False
    )
    
    train_y.to_csv(
        f"{base_path_final}/{injury}/y_train_{injury}_morb_splits.csv",
        index=False
    )
    
    test_y.to_csv(
        f"{base_path_final}/{injury}/y_test_{injury}_morb_splits.csv",
        index=False
    )
    
#     train_X_os.to_csv(
#         f"{base_path_final}/{injury}/X_train_os_random_final_{injury}.csv",
#         index=False
#     )
    
    
#     train_y_os.to_csv(
#         f"{base_path_final}/{injury}/y_train_os_random_final_{injury}.csv",
#         index=False
#     )
    
#     train_X_ds.to_csv(
#         f"{base_path_final}/{injury}/X_train_ds_random_final_{injury}.csv",
#         index=False
#     )
    
    
#     train_y_ds.to_csv(
#         f"{base_path_final}/{injury}/y_train_ds_random_final_{injury}.csv",
#         index=False

#     # Saving the inc_keys and deleting column for X dataframes
#     data_path_random = f"/C:/Users/15125/Desktop/CAPSTONE PROJECT/CLEANED DATA/POST IMPUTATION PROCESSING/{injury}/"
#     for filename in ["train_X", "test_X"]:
#         inc_keys_filename = vars()[filename].inc_key.reset_index()
#         inc_keys_filename.to_csv(
#             data_path_random + f"inc_keys_{filename}_{injury}_final_random.csv", header=True
#         )
#         vars()[filename].drop("inc_key", axis=1, inplace=True)

#     ## Saving file
#     for filename in [
#         f"train_X_{injury}_final_random",
#         f"test_X_{injury}_final_random",
#     ]:
#         vars()[filename].reset_index(drop=True, inplace=True)
#         vars()[filename].to_csv(
#             data_path_random + f"{filename}.csv", header=True
#         )
#         print(f"Saved file {filename}")

      
    
    

In [35]:
# Re-Processing imputed data
def preprocessing_imputed_data(injury: str):
    # These columns will be remapped the other way around when OptImpute has been performed
#     str_columns_to_map = [
#         "gender",
#         "race1",
#         #"acslevel",
#         "signsoflife",
#         # "alcohol",
#         "method_of_injury",
#         "eddisp"
#     ]
    str_columns_to_map = ["gender", "race1", "signsoflife",
                       "eddisp","tmode1","tmode2","drug1"]
    # Ordered:
    #mapping_acslevel = {"I": 1, "II": 2, "III": 3, "Unknown": -1}
    #mapping_acslevel_other_way = {v: k for k, v in mapping_acslevel.items()}

    # Non-ordered
    mapping_gender = {"Female": 1, "Male": 0}
    mapping_gender_other_way = {v: k for k, v in mapping_gender.items()}

    mapping_race1 = {
        "Other Race": 0,
        "Black or African American": 1,
        "White": 2,
        "American Indian": 3,
        "Native Hawaiian or Other Pacific Islander": 4,
        "Asian": 5,
    }
    mapping_race1_other_way = {v: k for k, v in mapping_race1.items()}

    mapping_signsoflife = {
        "Unknown": -1,
        "Arrived with signs of life": 1,
        "Arrived with NO signs of life": 0,
    }
    mapping_signsoflife_other_way = {v: k for k, v in mapping_signsoflife.items()}

    # mapping_alcohol = {"Alcohol": 1, "Residual/no alcohol": 0, "Unknown": -1}
    # mapping_alcohol_other_way = {v: k for k, v in mapping_alcohol.items()}

    mapping_method_of_injury_penetrating = {
        "Penetrating - Stab Wound": 1,
        "Penetrating - Gunshot Wound": 2,
        "Penetrating - Other/Mixed": 3,
    }
    mapping_method_of_injury_penetrating_other_way = {
        v: k for k, v in mapping_method_of_injury_penetrating.items()
    }

    mapping_method_of_injury_blunt = {
        "Blunt - MVT occupant": 1,
        "Blunt - Fall": 2,
        "Blunt - MVT motorcyclist": 3,
        "Blunt - Other": 4,
        "Blunt - MVT Pedal cyclist/pedestrian": 5,
    }
    mapping_method_of_injury_blunt_other_way = {
        v: k for k, v in mapping_method_of_injury_blunt.items()
    }
    
        
    mapping_eddisp = {'Operating Room' :0,
       'Floor bed (general admission, non specialty unit bed)': 1,
       'Telemetry/step-down unit (less acuity than ICU)': 2,
       'Intensive Care Unit (ICU)': 3,
       'Observation unit (unit that provides &lt; 24 hour stays)': 4}
    
    mapping_eddisp_other_way = {
        v: k for k, v in mapping_eddisp.items()
    }
    
    mapping_drug1 = {'No (not tested)': 0,
       'Yes (confirmed by test [illegal use drug])': 1,
       'No (confirmed by test)': 2,
       'Yes (confirmed by test [prescription drug])': 3}

    mapping_tmode1 = {'Ground Ambulance': 0, 'Helicopter Ambulance' :1,
           'Private/Public Vehicle/Walk-in': 2, 'Other': 3, 'Police': 4,
           'Fixed-wing Ambulance': 5}

    mapping_tmode2 = mapping_tmode1
    
    mapping_drug1_other_way = {
        v: k for k, v in mapping_drug1.items()
    }
    
    mapping_tmode1_other_way = {
        v: k for k, v in mapping_tmode1.items()
    }
    
    mapping_tmode2_other_way = {
        v: k for k, v in mapping_tmode2.items()
    }
    
    

    test_X_time_injury_imputed = pd.read_csv(
        f"C:/Users/15125/Desktop/CAPSTONE PROJECT/CLEANED DATA/POST IMPUTATION/{injury}/{injury}_imputed_test.csv"
    )
    train_X_time_injury_imputed = pd.read_csv(
        f"C:/Users/15125/Desktop/CAPSTONE PROJECT/CLEANED DATA/POST IMPUTATION/{injury}/{injury}_imputed_train.csv"
    )
    
    # Replacing systolic blood pressure of less than 60 by -1 (<=> unknown or error)
    train_X_time_injury_imputed.loc[train_X_time_injury_imputed.sbp1 < 60, "sbp1"] = -1
    test_X_time_injury_imputed.loc[test_X_time_injury_imputed.sbp1 < 60, "sbp1"] = -1
    
    train_X_time_injury_imputed.reset_index(inplace=True, drop=True)
    test_X_time_injury_imputed.reset_index(inplace=True, drop=True)

    # Dropped alcohol on 09/01/2020
    new_columns = [
        "age", "gender", "race1",
        # "teachsta", # "region",
        # "transfer",
        "signsoflife","eddisp", "sbp1",  
        "pulse1",  
        "oxysat1",  
        "temp1", "gcstot1",  
         "bleeding_disorder",
        "current_chemotherapy", "congestive_heart_failure",
        "current_smoker", "chronic_renal_failure",
        "history_cva", "diabetes", "disseminated_cancer",
        "copd", "steroid", "cirrhosis", "history_MI",
        "history_pvd", "hypertension_medication",  
        "method_of_injury",  # new AIS"
        "Head_severity", "Face_severity", "Neck_severity", "Thorax_severity",
        "Abdomen_severity", "Spine_severity",
        "Upper_Extremity_severity", "Lower_Extremity_severity",
        "Pelvis_Perineum_severity", "External_severity", "sbp2", "rr2", "drug_use_disorder",
        "issais","morbidity", "drug1",  "rr1", "pulse2", "acslevel", 
        "tmode1", "gcstot2", "oxysat2","alcohol_use_disorder", "tmode2", "morbidity_splits"
        #"hemorrhage_ctrl_type","hospdisp"
    ]
    
    test_X_time_injury_imputed = test_X_time_injury_imputed.drop(columns= ['x1'])
    train_X_time_injury_imputed = train_X_time_injury_imputed.drop(columns= ['x1'])
    
    test_X_time_injury_imputed.columns = new_columns
    train_X_time_injury_imputed.columns = new_columns

    for t_set in ["train", "test"]:
        str_columns_to_map = ["gender", "race1", "signsoflife","eddisp","tmode1","tmode2","drug1"]# ,"alcohol", "acslevel"]
        vars()[f"{t_set}_X_time_injury_imputed"]["method_of_injury"] = vars()[
            f"{t_set}_X_time_injury_imputed"
        ]["method_of_injury"].map(
            vars()[f"mapping_method_of_injury_{injury}_other_way"]
        )

        # Map the other way around the categorical features that have been imputed
        for col_map in str_columns_to_map:
            vars()[f"{t_set}_X_time_injury_imputed"][col_map] = (
                vars()[f"{t_set}_X_time_injury_imputed"][col_map].round(0).astype(int)
            )
            vars()[f"{t_set}_X_time_injury_imputed"][col_map] = vars()[
                f"{t_set}_X_time_injury_imputed"
            ][col_map].map(vars()[f"mapping_{col_map}_other_way"])
        print(f"Preprocessed {t_set} dataset for {injury} injuries")

#     save_time_train_test_split(injury, train_X_time_injury_imputed, test_X_time_injury_imputed)
    save_random_train_test_split(
        injury, train_X_time_injury_imputed, test_X_time_injury_imputed
    )

In [36]:
preprocessing_imputed_data("penetrating")
preprocessing_imputed_data("blunt")

Preprocessed train dataset for penetrating injuries
Preprocessed test dataset for penetrating injuries


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Preprocessed train dataset for blunt injuries
Preprocessed test dataset for blunt injuries


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user