In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Code for preprocessing

In [5]:
# List the entries of the raw-data directory to check which input files are available.
os.listdir("../Data/original_data/")

['icd_ecode2.csv',
 '.DS_Store',
 'ais_inputs.csv',
 'excel',
 'TQIP_2010_2016_Merged_MGHTrauma2019Jan.csv',
 'icd.csv',
 'icd_additional_ecodes.csv',
 'icd_primary_ecodes.csv']

In [6]:
# Read the first 50,000 rows of the merged TQIP file and record how many rows
# were loaded before any filtering takes place.
df_trauma = pd.read_csv(
    "../Data/original_data/TQIP_2010_2016_Merged_MGHTrauma2019Jan.csv",
    nrows=50000,
)
initial_length = df_trauma.shape[0]
# Preview the first rows of the loaded frame.
df_trauma.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,inc_key,yobirth,age,gender,race1,race2,ethnic,teachsta,acslevel,acspedl,...,compldes8,complkey9,compldes9,complkey10,compldes10,complkey11,compldes11,complkey12,compldes12,_merge
0,00000ae7-4797,1985,28,Female,White,Not Known/Not Recorded BIU 2,Not Hispanic or Latino,University,I,Not Applicable,...,,,,,,,,,,matched (3)
1,00001376-9675,1960,54,Male,White,Not Known/Not Recorded BIU 2,Not Hispanic or Latino,University,I,I,...,,,,,,,,,,matched (3)
2,00001774-08a9,1975,40,Male,Other Race,Not Known/Not Recorded BIU 2,Hispanic or Latino,University,I,Not Applicable,...,,,,,,,,,,matched (3)
3,00002cba-17d4,1996,16,Male,Black or African American,Not Known/Not Recorded BIU 2,Not Hispanic or Latino,University,I,Not Applicable,...,,,,,,,,,,matched (3)
4,000048a2-1679,1993,20,Male,White,Not Known/Not Recorded BIU 2,Not Hispanic or Latino,University,Not Applicable,Not Applicable,...,,,,,,,,,,matched (3)


# General preprocessing

In [7]:
# Values of "eddisp" that are kept; rows with any other value are dropped.
allowed_eddisp = [
    "Operating Room",
    # The observation-unit label is accepted both HTML-escaped (as it was
    # originally written in this list) and with a literal "<", so the filter
    # matches whichever spelling the data actually contains.
    "Observation unit (unit that provides &lt; 24 hour stays)",
    "Observation unit (unit that provides < 24 hour stays)",
    "Intensive Care Unit (ICU)",
    "Telemetry/step-down unit (less acuity than ICU)",
    "Floor bed (general admission, non specialty unit bed)",
    # Values that are not kept:
    # "Transferred to another hospital",
    # "Home without services",
    # "Other (jail, institutional care facility, mental health, etc)",
    # "Home with services",
    # "Left against medical advice",
]
# .copy() detaches the filtered rows from the frame they were sliced from, so
# the assignment below writes to an independent frame instead of triggering
# pandas' SettingWithCopyWarning.
df_trauma = df_trauma[df_trauma["eddisp"].isin(allowed_eddisp)].copy()

# Where tmode1 is missing but tmode2 is present, copy tmode2 into tmode1.
tmode1_missing_tmode2_present = df_trauma["tmode1"].isnull() & ~df_trauma["tmode2"].isnull()
df_trauma.loc[tmode1_missing_tmode2_present, "tmode1"] = df_trauma.loc[
    tmode1_missing_tmode2_present, "tmode2"
]

In [8]:
# Collect the column families by the substring that identifies them.
all_column_names = list(df_trauma.columns)
comorkeys = [name for name in all_column_names if "comorkey" in name]
complkeys = [name for name in all_column_names if "complkey" in name]
predotkeys = [name for name in all_column_names if "predot" in name]
severitykeys = [name for name in all_column_names if "severity" in name]

# Columns retained for the rest of the notebook: the four families above,
# followed by an explicit list of individual columns.
columns_kept_daisy = [
    *comorkeys,
    *complkeys,
    *predotkeys,
    *severitykeys,
    "issais",
    "age",
    "gender",
    "race1",
    "ethnic",
    "acslevel",
    "tmode1",
    "tmode2",
    "transfer",
    "alcohol",
    "drug1",
    "signsoflife",
    "sbp1",
    "sbp2",
    "pulse1",
    "pulse2",
    "rr1",
    "rr2",
    "oxysat1",
    "oxysat2",
    "temp1",
    "gcstot1",
    "gcstot2",
    "ecode",
    "icd10_primary_ecode",
    "icd10_additonal_ecode",
    "eddisp",
    "hospdisp",
    "yoadmit",
    "teachsta",
    "region",
    "hemorrhage_ctrl_type",
]
df_trauma = df_trauma[columns_kept_daisy]

In [39]:
##### Recoding placeholder values as NaN #####
# In the severity columns, the value 9 is turned into NaN.
for severity_column in severitykeys:
    df_trauma[severity_column] = df_trauma[severity_column].replace({9: np.nan})

# Across the whole frame, these textual and numeric placeholders become NaN.
placeholder_values = [
    "Not Applicable BIU 1",
    "Not Known/Not Recorded BIU 2",
    "Not Applicable",
    -99,
    -1,
    -2,
]
df_trauma = df_trauma.replace(dict.fromkeys(placeholder_values, np.nan))

# Comorbidities

In [40]:
# Indicator column name -> code searched for in the comorkey* columns.
# Keeping name and code side by side prevents the two from drifting apart.
comorbidity_codes = {
    "alcohol_use_disorder": 2,
    "bleeding_disorder": 4,
    "current_chemotherapy": 5,
    "congestive_heart_failure": 7,
    "current_smoker": 8,
    "chronic_renal_failure": 9,
    "history_cva": 10,
    "diabetes": 11,
    "disseminated_cancer": 12,
    "copd": 23,
    "steroid": 24,
    "cirrhosis": 25,
    "drug_use_disorder": 28,
    "history_MI": 17,
    "history_pvd": 18,
    "hypertension_medication": 19,
}

# Detect the code columns from the frame instead of hard-coding
# comorkey1..comorkey12, so any number of comorkey columns is handled.
comorbidity_key_columns = [col for col in df_trauma.columns if "comorkey" in col]
if not comorbidity_key_columns:
    # Fail before touching the frame (e.g. when this cell is re-run after the
    # comorkey columns were already dropped) rather than overwriting indicators.
    raise KeyError("No 'comorkey' columns found in df_trauma; comorbidity indicators cannot be built.")

for new_column, value in comorbidity_codes.items():
    # 1 when at least one comorkey column holds the code on that row, else 0.
    df_trauma[new_column] = (
        df_trauma[comorbidity_key_columns].eq(value).any(axis=1).astype(int)
    )
    print(f"Created new column {new_column}")

# Remove the raw code columns now that the indicators exist.
df_trauma = df_trauma.drop(columns=comorbidity_key_columns)

Created new column alcohol_use_disorder
Created new column bleeding_disorder
Created new column current_chemotherapy
Created new column congestive_heart_failure
Created new column current_smoker
Created new column chronic_renal_failure
Created new column history_cva
Created new column diabetes
Created new column disseminated_cancer
Created new column copd
Created new column steroid
Created new column cirrhosis
Created new column drug_use_disorder
Created new column history_MI
Created new column history_pvd
Created new column hypertension_medication


# Morbidities

In [41]:
# "morbidity" is 1 when at least one complkey column holds an allowed code on
# that row, and 0 otherwise.
# Original note: 11, 15, 18, 22, 23, 30, 31 and 35 should be = 0 for Majed (14/12)
allowed_morbidities = [4, 5, 8, 11, 12, 14, 15, 18, 19, 21, 22, 23, 25, 30, 31, 32, 35]
has_allowed_morbidity = df_trauma[complkeys].isin(allowed_morbidities).any(axis=1)
df_trauma["morbidity"] = has_allowed_morbidity.astype(int)

# Blunt/Penetration Feature (method_of_injury)

## Mapping ecode 
(ecode2 has exactly the same keys, so mapping it as well would be redundant)

In [84]:
# Build a lookup from numeric e-code to injury type. Each column header of the
# CSV is an injury type, and the first row of that column holds a
# space-separated list of codes.
# NOTE(review): this reads from "../data/" while other cells read from
# "../Data/original_data/" — confirm the path is intended.
icd_mapping_ecode = pd.read_csv("../data/icd.csv", sep=";")
dict_icd_mapping_ecode = {}
for position, injury_type in enumerate(icd_mapping_ecode.columns):
    print(injury_type)
    ecode_values = []
    for token in icd_mapping_ecode.iloc[0, position].split(' '):
        # Tokens containing "float" lose their first 6 characters and their last one.
        if "float" in token:
            token = token[6:-1]
        # Only the text before the first newline is kept.
        token = token.split("\n")[0]
        # If a ")" is still present, the final character is removed.
        if ')' in token:
            token = token[:-1]
        ecode_values.append(float(token))
    for ecode in ecode_values:
        dict_icd_mapping_ecode[ecode] = injury_type
print("===> Preprocessed the ICD mapping")

Blunt - Fall
Blunt - Other
Blunt - MVT occupant
Blunt - MVT Pedal cyclist/pedestrian
Blunt - MVT motorcyclist
Penetrating - Gunshot Wound
Penetrating - Stab Wound
Penetrating - Other/Mixed
===> Preprocessed the ICD mapping


In [85]:
# Convert "ecode" to float rounded to one decimal, look it up in the e-code
# mapping, and label every code without a match as "Unknown".
ecode_rounded = df_trauma["ecode"].astype(float).round(1)
df_trauma["method_of_injury_ecode"] = ecode_rounded.map(dict_icd_mapping_ecode)
df_trauma["method_of_injury_ecode"] = df_trauma["method_of_injury_ecode"].replace(
    {np.nan: "Unknown"}
)
print(df_trauma["method_of_injury_ecode"].value_counts())

Blunt - Fall                            17490
Unknown                                  8775
Blunt - MVT occupant                     8459
Blunt - Other                            5270
Blunt - MVT motorcyclist                 2784
Penetrating - Gunshot Wound              2075
Blunt - MVT Pedal cyclist/pedestrian     1927
Penetrating - Stab Wound                 1132
Penetrating - Other/Mixed                   1
Name: method_of_injury_ecode, dtype: int64


## Mapping icd10_primary_code
(icd10_additional_code has exactly the same keys, so mapping it as well would be redundant)

In [86]:
# Build a lookup from ICD-10 primary code to injury type. Each column header of
# the CSV is an injury type, and the first row of that column holds a
# space-separated list of codes.
icd_mapping_primary = pd.read_csv("../data/icd_primary_ecodes.csv", sep=";")
dict_icd_mapping_primary_icd = {}
for position, injury_type in enumerate(icd_mapping_primary.columns):
    print(injury_type)
    for raw_code in icd_mapping_primary.iloc[0, position].split(' '):
        # Keep the text before the first newline; a token without a newline is
        # returned unchanged by split.
        primary_icd = raw_code.split("\n")[0]
        dict_icd_mapping_primary_icd[primary_icd] = injury_type
print("===> Preprocessed the ICD mapping primary icd")
# The numeric placeholders -1 and -2 are mapped to "Unknown".
for placeholder in (-1, -2):
    dict_icd_mapping_primary_icd[placeholder] = "Unknown"

Blunt - Fall
Blunt - Other
Blunt - MVT occupant
Blunt - MVT Pedal cyclist/pedestrian
Blunt - MVT motorcyclist
Penetrating - Gunshot Wound
Penetrating - Stab Wound
Penetrating - Other/Mixed
===> Preprocessed the ICD mapping primary icd


In [87]:
# Look up each ICD-10 primary code in the mapping; codes without a match are
# labelled "Unknown".
mapped_primary = df_trauma["icd10_primary_ecode"].map(dict_icd_mapping_primary_icd)
df_trauma["method_of_injury_icd_primary"] = mapped_primary.replace({np.nan: "Unknown"})
print(df_trauma["method_of_injury_icd_primary"].value_counts())

Unknown                                 38319
Blunt - Fall                             4775
Blunt - MVT occupant                     1842
Blunt - Other                            1212
Penetrating - Gunshot Wound               598
Blunt - MVT motorcyclist                  549
Blunt - MVT Pedal cyclist/pedestrian      333
Penetrating - Stab Wound                  280
Penetrating - Other/Mixed                   5
Name: method_of_injury_icd_primary, dtype: int64


## Creating final method_of_injury from all others

In [83]:
# Final injury type: take the e-code based label whenever it is known;
# otherwise fall back to the ICD-10 primary label (which may itself be
# "Unknown"). This single expression replaces two masked assignments whose
# masks were each written twice; the second of those assignments only copied
# the e-code label onto itself.
ecode_label_known = df_trauma["method_of_injury_ecode"] != "Unknown"
df_trauma["method_of_injury"] = df_trauma["method_of_injury_ecode"].where(
    ecode_label_known, df_trauma["method_of_injury_icd_primary"]
)
print(df_trauma.method_of_injury.value_counts())

# Drop the two intermediary columns. The frame is rebound rather than mutated
# in place, so re-running earlier cells never sees a half-modified frame.
df_trauma = df_trauma.drop(
    columns=["method_of_injury_ecode", "method_of_injury_icd_primary"]
)

Blunt - Fall                            21515
Blunt - MVT occupant                    10027
Blunt - Other                            6289
Blunt - MVT motorcyclist                 3258
Penetrating - Gunshot Wound              2588
Blunt - MVT Pedal cyclist/pedestrian     2216
Penetrating - Stab Wound                 1375
Unknown                                   639
Penetrating - Other/Mixed                   6
Name: method_of_injury_final, dtype: int64

# Alcohol

In [10]:
# Collapse the alcohol labels into two categories. "No (not tested)" becomes
# NaN, and so does any label that is not a key of this mapping (Series.map
# returns NaN for unmatched values).
alcohol_recoding = {
    "Yes (confirmed by test [beyond legal limit])": "Alcohol",
    "No (confirmed by test)": "Residual/no alcohol",
    "Yes (confirmed by test [trace levels])": "Residual/no alcohol",
    "No (not tested)": np.nan,
}
df_trauma["alcohol"] = df_trauma["alcohol"].map(alcohol_recoding)

# Predots Cleaning (severity)

In [15]:
# Load the AIS lookup table.
ais_inputs = pd.read_csv("../Data/original_data/ais_inputs.csv", sep=";")
# Every column after the first one is used as a per-location column.
columns_ais = ais_inputs.columns[1:].tolist()
# The location label is the column name without its first 9 characters.
injury_locations = [column_name[9:] for column_name in columns_ais]

In [16]:
# For each location, create a variable named dict_ais_inputs_<location> that
# maps AIS_Predots values to the value in that location's column, using only
# the rows where the location column is not null.
for ais_column, location in zip(columns_ais, injury_locations):
    ais_temp = ais_inputs.loc[~ais_inputs[ais_column].isnull(), ["AIS_Predots", ais_column]]
    predot_codes = ais_temp["AIS_Predots"].tolist()
    location_values = ais_temp[ais_column].tolist()
    vars()[f"dict_ais_inputs_{location}"] = dict(zip(predot_codes, location_values))

In [18]:
# Cast the predot columns to float once: the cast is the same for every
# location, so it does not need to be repeated (nor the whole frame copied)
# inside the loop.
predots_as_float = df_trauma[predotkeys].astype(float)

for location in injury_locations:
    print(location)
    # Fetch the mapping at cell level: inside a comprehension, vars() would
    # refer to the comprehension's own scope on older Python versions.
    location_mapping = vars()[f"dict_ais_inputs_{location}"]
    # Translate every predot column through this location's mapping, then
    # turn the value 9 into NaN.
    location_severities = pd.DataFrame(
        {
            predot_col: predots_as_float[predot_col].map(location_mapping)
            for predot_col in predotkeys
        },
        index=df_trauma.index,
    ).replace({9: np.nan})
    # The location's severity is the row-wise maximum over the predot columns.
    df_trauma[f"{location}_severity"] = location_severities.max(axis=1)
    print(df_trauma[f"{location}_severity"].value_counts()/len(df_trauma)*100)
    print("% of NaNs: ",df_trauma[f"{location}_severity"].isnull().sum()/len(df_trauma)*100)
    print('----------------------------------')

# Taking the maximum severity for a patient over all new severity columns
df_trauma["severity_max"] = df_trauma[[f"{location}_severity" for location in injury_locations]].max(axis=1)

Face
1.0    22.659821
2.0     6.092292
3.0     0.160708
4.0     0.010436
Name: Face_severity, dtype: float64
% of NaNs:  71.07674326383237
----------------------------------
Neck
1.0    0.768059
3.0    0.546824
2.0    0.256715
4.0    0.037568
5.0    0.004174
Name: Neck_severity, dtype: float64
% of NaNs:  98.38665915304823
----------------------------------
Head
3.0    25.698662
1.0     7.607539
4.0     3.059712
2.0     1.604992
5.0     1.586208
6.0     0.025045
Name: Head_severity, dtype: float64
% of NaNs:  60.417840669546884
----------------------------------
Thorax
3.0    16.719888
2.0     9.604909
1.0     1.982760
5.0     1.565337
4.0     0.935028
6.0     0.586480
Name: Thorax_severity, dtype: float64
% of NaNs:  68.60559764573289
----------------------------------
Abdomen
2.0    6.138209
1.0    4.652182
4.0    2.725774
3.0    2.610982
5.0    0.638658
Name: Abdomen_severity, dtype: float64
% of NaNs:  83.23419531233695
----------------------------------
Spine
2.0    17.133137
3.0 

# Transfers Cleaning

In [14]:
# Keep only the rows whose "transfer" value is exactly "No", reporting the row
# count before and after.
print(f"Length before transfer cleaning (keeping only 'No'): {len(df_trauma)}")
not_transferred = df_trauma["transfer"] == "No"
df_trauma = df_trauma[not_transferred]
print(f"Length after transfer cleaning (keeping only 'No'): {len(df_trauma)}")

47913
34110


# Filtering nan hospdisp

In [21]:
# Remove the rows where "hospdisp" is missing, reporting the row count before
# and after.
print(f"Length before filtering NaNs from hospdisp: {len(df_trauma)}")
hospdisp_missing = df_trauma["hospdisp"].isnull()
df_trauma = df_trauma[~hospdisp_missing]
print(f"Length after filtering NaNs from hospdisp: {len(df_trauma)}")

Length before filtering NaNs from hospdisp: 34110
Length after filtering NaNs from hospdisp: 34097


In [28]:
# Order the rows by "yoadmit", renumber the index, and save the full
# preprocessed frame.
df_trauma = df_trauma.sort_values(by="yoadmit").reset_index(drop=True)
df_trauma.to_csv("./trauma_data_preprocessed.csv")

# Targets, computed on the full sorted frame (before the train/test split):
# mortality is 1 when hospdisp is "Expired" or "Deceased/Expired", else 0.
hosp_mortality = df_trauma.hospdisp.isin(["Expired", "Deceased/Expired"]).astype(int)
hosp_morbidity = df_trauma.morbidity

# Feature columns kept in the train and test frames.
# Not kept: teachsta, region, transfer, sbp2, pulse2, oxysat2, gcstot2, eddisp.
# NOTE(review): Head_severity is not in this list although a Head location
# appears in the severity output above — confirm that this is intentional.
columns_to_keep = [
    "age",
    "gender",
    "race1",
    "acslevel",
    "tmode1",
    "signsoflife",
    "sbp1",
    "pulse1",
    "oxysat1",
    "temp1",
    "gcstot1",
    "alcohol",
    # comorbidity indicators
    "bleeding_disorder",
    "current_chemotherapy",
    "congestive_heart_failure",
    "current_smoker",
    "chronic_renal_failure",
    "history_cva",
    "diabetes",
    "disseminated_cancer",
    "copd",
    "steroid",
    "cirrhosis",
    "history_MI",
    "history_pvd",
    "hypertension_medication",
    "method_of_injury",
    # new AIS
    "Face_severity",
    "Neck_severity",
    "Thorax_severity",
    "Abdomen_severity",
    "Spine_severity",
    "Upper_Extremity_severity",
    "Lower_Extremity_severity",
    "Pelvis_Perineum_severity",
    "External_severity",
    "severity_max",
    "hemorrhage_ctrl_type",
]

# Rows with yoadmit before 2016 form the training set; rows with yoadmit of
# 2016 or later form the test set. Both masks are written explicitly because a
# missing yoadmit satisfies neither comparison.
admitted_before_2016 = df_trauma.yoadmit < 2016
admitted_2016_or_later = df_trauma.yoadmit >= 2016
df_trauma_train = df_trauma.loc[admitted_before_2016, columns_to_keep].reset_index(drop=True)
df_trauma_test = df_trauma.loc[admitted_2016_or_later, columns_to_keep].reset_index(drop=True)

In [22]:
# Report how many rows ended up in each split.
print(f"Size of training set: {df_trauma_train.shape[0]}")
print(f"Size of testing set: {df_trauma_test.shape[0]}")

Size of training set: 26605
Size of testing set: 7505
