In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Creating train/test data for blunt/penetrating injuries before imputation

In [153]:
# Load the preprocessed trauma data and preview its first rows.
preprocessed_csv_path = "../Data/trauma_data_preprocessed.csv"
df = pd.read_csv(preprocessed_csv_path)
df.head()

Unnamed: 0,complkey1,complkey2,complkey3,complkey4,complkey5,complkey6,complkey7,complkey8,complkey9,complkey10,...,Neck_severity,Head_severity,Thorax_severity,Abdomen_severity,Spine_severity,Upper_Extremity_severity,Lower_Extremity_severity,Pelvis_Perineum_severity,External_severity,severity_max
0,1.0,,,,,,,,,,...,,,,,,,,,,
1,1.0,,,,,,,,,,...,,,2.0,,,,,,,2.0
2,,,,,,,,,,,...,,,,2.0,,,,,,2.0
3,,,,,,,,,,,...,,,,,,1.0,,,,1.0
4,1.0,,,,,,,,,,...,,,,,,,,,,


In [154]:
# Split the data by mechanism of injury. Rows whose method_of_injury appears in
# neither list end up in neither subset.
blunt_injury = [
    'Blunt - MVT occupant',
    'Blunt - Fall',
    'Blunt - MVT motorcyclist',
    'Blunt - Other',
    'Blunt - MVT Pedal cyclist/pedestrian',
]
penetrating_injury = [
    'Penetrating - Gunshot Wound',
    'Penetrating - Stab Wound',
    'Penetrating - Other/Mixed',
]

is_blunt = df["method_of_injury"].isin(blunt_injury)
is_penetrating = df["method_of_injury"].isin(penetrating_injury)
df_blunt = df.loc[is_blunt].reset_index(drop=True)
df_penetrating = df.loc[is_penetrating].reset_index(drop=True)

# Share of all rows that falls into each subset.
n_rows_total = len(df)
print(len(df_blunt)/n_rows_total)
print(len(df_penetrating)/n_rows_total)

0.8914840906114967
0.09502579546942516


In [155]:
# String-valued columns that are integer-encoded below; the encoding has to be
# reversed once OptImpute has been performed.
str_columns_to_map = [
    "gender",
    "race1",
    "acslevel",
    "signsoflife",
    "alcohol",
    "method_of_injury",
]

# --- Ordered category ---
mapping_acslevel = {
    "I": 1,
    "II": 2,
    "III": 3,
    "Unknown": -1,
}

# --- Non-ordered categories ---
mapping_gender = {
    "Female": 1,
    "Male": 0,
}
mapping_race1 = {
    'Other Race': 0,
    'Black or African American': 1,
    'White': 2,
    'American Indian': 3,
    'Native Hawaiian or Other Pacific Islander': 4,
    'Asian': 5,
}
mapping_signsoflife = {
    'Unknown': -1,
    'Arrived with signs of life': 1,
    'Arrived with NO signs of life': 0,
}
mapping_alcohol = {
    "Alcohol": 1,
    "Residual/no alcohol": 0,
    "Unknown": -1,
}
# method_of_injury has one encoder per injury type.
mapping_method_of_injury_penetrating = {
    'Penetrating - Stab Wound': 1,
    'Penetrating - Gunshot Wound': 2,
    'Penetrating - Other/Mixed': 3,
}
mapping_method_of_injury_blunt = {
    'Blunt - MVT occupant': 1,
    'Blunt - Fall': 2,
    'Blunt - MVT motorcyclist': 3,
    'Blunt - Other': 4,
    'Blunt - MVT Pedal cyclist/pedestrian': 5,
}

In [156]:
#Before the filtering requested by Majed, we had the following class ratios:
#BLUNT - MORTALITY
#0    0.950523
#1    0.049477
#Name: hospdisp, dtype: float64
#BLUNT - MORBIDITY
#0    0.914447
#1    0.085553
#Name: morbidity, dtype: float64
#Size of train set with time separation for injury blunt: 671236
#Size of test set with time separation for injury blunt: 183321

#///////////////////////////////////////////////////////////////////

#PENETRATING - MORTALITY
#0    0.903374
#1    0.096626
#Name: hospdisp, dtype: float64
#PENETRATING - MORBIDITY
#0    0.874219
#1    0.125781
#Name: morbidity, dtype: float64
#Size of train set with time separation for injury penetrating: 71027
#Size of test set with time separation for injury penetrating: 20211

In [157]:
# With the new morbidity !
# For each injury type: derive the mortality/morbidity targets, integer-encode
# the categorical features, and write the pre-imputation time split to disk
# (train: yoadmit < 2016, test: yoadmit >= 2016).
dict_injury = {0: "blunt", 1: "penetrating"}

# --- Loop-invariant configuration -------------------------------------------
# Features written to the X files (commented-out names are excluded).
columns_to_keep = [
    "inc_key","age", "gender", "race1",
    # "teachsta", # "region",
    "acslevel", #"tmode1", # "transfer",
    "signsoflife", "sbp1", # "sbp2",
    "pulse1", # "pulse2",
    "oxysat1", # "oxysat2",
    "temp1", "gcstot1", # "gcstot2",
    "alcohol", "bleeding_disorder",
    "current_chemotherapy", "congestive_heart_failure",
    "current_smoker", "chronic_renal_failure",
    "history_cva", "diabetes", "disseminated_cancer",
    "copd", "steroid", "cirrhosis", "history_MI",
    "history_pvd", "hypertension_medication", # "eddisp",
    "method_of_injury", # new AIS"
    "Face_severity", "Neck_severity", "Thorax_severity",
    "Abdomen_severity", "Spine_severity",
    "Upper_Extremity_severity", "Lower_Extremity_severity",
    "Pelvis_Perineum_severity", "External_severity", "severity_max",
    "hemorrhage_ctrl_type"
]
# Severity columns: a missing value is replaced by 0 and the column cast to int.
severities = [
    "Face_severity",
    "Neck_severity",
    "Thorax_severity",
    "Abdomen_severity",
    "Spine_severity",
    "Upper_Extremity_severity",
    "Lower_Extremity_severity",
    "Pelvis_Perineum_severity",
    "External_severity",
    "severity_max"
]
# Categorical columns whose missing values are replaced by an existing category.
categorical_fill_values = {
    "acslevel": "Unknown",
    "race1": "Other Race",
    "signsoflife": "Unknown",
    "alcohol": "Unknown",
}
# Explicit encoder lookup (instead of resolving variable names through vars()).
shared_mappings = {
    "gender": mapping_gender,
    "race1": mapping_race1,
    "acslevel": mapping_acslevel,
    "signsoflife": mapping_signsoflife,
    "alcohol": mapping_alcohol,
}
method_of_injury_mappings = {
    "blunt": mapping_method_of_injury_blunt,
    "penetrating": mapping_method_of_injury_penetrating,
}
time_split_root = "../Data/time_split_per_injury_new_morbidity"

# NOTE: df_blunt / df_penetrating are modified in place by this loop
# (filled and integer-encoded columns).
for i, df_injury in enumerate([df_blunt, df_penetrating]):
    injury = dict_injury[i]
    print(injury)

    # Creating target values for mortality & morbidity
    hosp_mortality = ((df_injury.hospdisp == "Expired")
                      | (df_injury.hospdisp == "Deceased/Expired"))*1
    print(hosp_mortality.value_counts()/len(hosp_mortality))
    hosp_morbidity = df_injury.morbidity
    print(hosp_morbidity.value_counts()/len(hosp_morbidity))

    # Getting indices of the two time periods for train/test split.
    # df_blunt / df_penetrating were built with reset_index(drop=True), so these
    # index labels are also valid row positions for the .iloc calls below.
    index_before_2016 = df_injury.index[(df_injury.yoadmit < 2016).values].values
    index_from_2016 = df_injury.index[(df_injury.yoadmit >= 2016).values].values

    # Imputing categorical variables missing values with other categories
    for col in severities:
        df_injury[col] = df_injury[col].fillna(0).astype(int)
    for col, fill_value in categorical_fill_values.items():
        df_injury[col] = df_injury[col].fillna(fill_value)

    # Integer-encode the string columns, reporting remaining nulls per column.
    for col in str_columns_to_map:
        print(col)
        print(df_injury[col].isnull().sum())
        if col == "method_of_injury":
            # No fallback here: a value that cannot be cast raises.
            df_injury[col] = df_injury[col].replace(method_of_injury_mappings[injury]).astype(int)
            continue
        df_injury[col] = df_injury[col].replace(shared_mappings[col])
        try:
            df_injury[col] = df_injury[col].astype(int)
        except (ValueError, TypeError):
            # Best effort: a column that still holds values that cannot be cast
            # (e.g. missing values) keeps its current dtype.
            pass

    # X data train/test split
    features = df_injury[columns_to_keep]
    df_injury_train = features.iloc[index_before_2016].reset_index(drop=True)
    df_injury_test = features.iloc[index_from_2016].reset_index(drop=True)

    # Mortality train/test split
    hosp_mortality_train = hosp_mortality.iloc[index_before_2016]
    hosp_mortality_test = hosp_mortality.iloc[index_from_2016]

    # Morbidity train/test split
    hosp_morbidity_train = hosp_morbidity.iloc[index_before_2016]
    hosp_morbidity_test = hosp_morbidity.iloc[index_from_2016]

    output_dir = f"{time_split_root}/{injury}"

    # Saving train data with time split.
    # header=False is passed explicitly for the label Series: the default of
    # Series.to_csv differs between pandas versions, and a header row would be
    # treated as data by a reader using header=None.
    df_injury_train.to_csv(
        f"{output_dir}/trauma_X_train_time_{injury}.csv",
        index=False
    )
    hosp_morbidity_train.to_csv(
        f"{output_dir}/trauma_y_train_morbidity_time_{injury}.csv",
        index=False, header=False
    )
    hosp_mortality_train.to_csv(
        f"{output_dir}/trauma_y_train_mortality_time_{injury}.csv",
        index=False, header=False
    )
    print(f"Size of train set with time separation for injury {injury}: {len(df_injury_train)}")

    # Saving test data with time split
    df_injury_test.to_csv(
        f"{output_dir}/trauma_X_test_time_{injury}.csv",
        index=False
    )
    hosp_morbidity_test.to_csv(
        f"{output_dir}/trauma_y_test_morbidity_time_{injury}.csv",
        index=False, header=False
    )
    hosp_mortality_test.to_csv(
        f"{output_dir}/trauma_y_test_mortality_time_{injury}.csv",
        index=False, header=False
    )
    print(f"Size of test set with time separation for injury {injury}: {len(df_injury_test)}")
    print("--------------------------------------------------------")

blunt
0    0.950792
1    0.049208
Name: hospdisp, dtype: float64
0    0.937696
1    0.062304
Name: morbidity, dtype: float64
gender
281
race1
0
acslevel
0
signsoflife
0
alcohol
0
method_of_injury
0




Size of train set with time separation for injury blunt: 671199




Size of test set with time separation for injury blunt: 179834
--------------------------------------------------------
penetrating
0    0.903863
1    0.096137
Name: hospdisp, dtype: float64
0    0.905737
1    0.094263
Name: morbidity, dtype: float64
gender
28
race1
0
acslevel
0
signsoflife
0
alcohol
0
method_of_injury
0
Size of train set with time separation for injury penetrating: 71026
Size of test set with time separation for injury penetrating: 19688
--------------------------------------------------------


In [158]:
# Percentage of missing values per column of df_injury_train, as left by the
# preceding cell.
missing_counts = df_injury_train.isna().sum()
missing_counts.mul(100).div(len(df_injury_train))

inc_key                      0.000000
age                          0.604004
gender                       0.035198
race1                        0.000000
acslevel                     0.000000
signsoflife                  0.000000
sbp1                         1.650100
pulse1                       1.375553
oxysat1                     10.597528
temp1                       22.804325
gcstot1                      1.386816
alcohol                      0.000000
bleeding_disorder            0.000000
current_chemotherapy         0.000000
congestive_heart_failure     0.000000
current_smoker               0.000000
chronic_renal_failure        0.000000
history_cva                  0.000000
diabetes                     0.000000
disseminated_cancer          0.000000
copd                         0.000000
steroid                      0.000000
cirrhosis                    0.000000
history_MI                   0.000000
history_pvd                  0.000000
hypertension_medication      0.000000
method_of_in

# Re-processing Imputed Data

In [48]:
# Integer encodings of the string columns and, for each one, the inverse lookup
# (code -> original string) needed to undo the encoding once OptImpute has been
# performed.
def _invert_mapping(mapping):
    """Return a new dict with the keys and values of ``mapping`` swapped."""
    return dict(zip(mapping.values(), mapping.keys()))


str_columns_to_map = [
    "gender",
    "race1",
    "acslevel",
    "signsoflife",
    "alcohol",
    "method_of_injury",
]

# --- Ordered category ---
mapping_acslevel = {
    "I": 1,
    "II": 2,
    "III": 3,
    "Unknown": -1,
}
mapping_acslevel_other_way = _invert_mapping(mapping_acslevel)

# --- Non-ordered categories ---
mapping_gender = {
    "Female": 1,
    "Male": 0,
}
mapping_gender_other_way = _invert_mapping(mapping_gender)

mapping_race1 = {
    'Other Race': 0,
    'Black or African American': 1,
    'White': 2,
    'American Indian': 3,
    'Native Hawaiian or Other Pacific Islander': 4,
    'Asian': 5,
}
mapping_race1_other_way = _invert_mapping(mapping_race1)

mapping_signsoflife = {
    'Unknown': -1,
    'Arrived with signs of life': 1,
    'Arrived with NO signs of life': 0,
}
mapping_signsoflife_other_way = _invert_mapping(mapping_signsoflife)

mapping_alcohol = {
    "Alcohol": 1,
    "Residual/no alcohol": 0,
    "Unknown": -1,
}
mapping_alcohol_other_way = _invert_mapping(mapping_alcohol)

mapping_method_of_injury_penetrating = {
    'Penetrating - Stab Wound': 1,
    'Penetrating - Gunshot Wound': 2,
    'Penetrating - Other/Mixed': 3,
}
mapping_method_of_injury_penetrating_other_way = _invert_mapping(mapping_method_of_injury_penetrating)

mapping_method_of_injury_blunt = {
    'Blunt - MVT occupant': 1,
    'Blunt - Fall': 2,
    'Blunt - MVT motorcyclist': 3,
    'Blunt - Other': 4,
    'Blunt - MVT Pedal cyclist/pedestrian': 5,
}
mapping_method_of_injury_blunt_other_way = _invert_mapping(mapping_method_of_injury_blunt)


## Penetrating

In [3]:
# Load the imputed (not yet post-processed) penetrating time-split features.
imputed_dir_penetrating = "../Data/imputed_non_processed/penetrating/"
test_X_time_penetrating_imputed = pd.read_csv(
    imputed_dir_penetrating + "test_X_time_penetrating_imputed.csv"
)
train_X_time_penetrating_imputed = pd.read_csv(
    imputed_dir_penetrating + "train_X_time_penetrating_imputed.csv"
)

for imputed_frame in (train_X_time_penetrating_imputed, test_X_time_penetrating_imputed):
    # Replacing systolic blood pressure of less than 60 by -1 (<=> unknown or error)
    imputed_frame.loc[imputed_frame["sbp1"] < 60, "sbp1"] = -1
    imputed_frame.reset_index(drop=True, inplace=True)

# Rows with a maximum severity of 6 are NOT dropped at this stage (an earlier
# attempt to do so on the train frame was left disabled in this cell).

In [50]:
# Overwrite the column labels of both imputed penetrating frames, by position,
# with the same list of names.
imputed_column_names = [
    'age', 'alcohol', 'gender', 'race1', 'acslevel', 'signsoflife', 'sbp1',
    'pulse1', 'oxysat1', 'temp1', 'gcstot1', 'bleeding_disorder',
    'current_chemotherapy', 'congestive_heart_failure', 'current_smoker',
    'chronic_renal_failure', 'history_cva', 'diabetes',
    'disseminated_cancer', 'copd', 'steroid', 'cirrhosis', 'history_MI',
    'history_pvd', 'hypertension_medication', 'method_of_injury',
    'Face_severity', 'Neck_severity', 'Thorax_severity', 'Abdomen_severity',
    'Spine_severity', 'Upper_Extremity_severity',
    'Lower_Extremity_severity', 'Pelvis_Perineum_severity',
    'External_severity', 'inc_key',
]
for imputed_frame in (test_X_time_penetrating_imputed, train_X_time_penetrating_imputed):
    # Each frame gets its own copy of the list.
    imputed_frame.columns = list(imputed_column_names)

### Time train/test split

In [51]:
# Decode the integer-encoded categorical features of the imputed penetrating
# frames back to their original string labels, then save the time split.
str_columns_to_map = ["gender", "race1", "acslevel", "signsoflife", "alcohol"]
# Explicit lookups (instead of resolving variable names through vars()).
inverse_mappings_penetrating = {
    "gender": mapping_gender_other_way,
    "race1": mapping_race1_other_way,
    "acslevel": mapping_acslevel_other_way,
    "signsoflife": mapping_signsoflife_other_way,
    "alcohol": mapping_alcohol_other_way,
}
imputed_frames_penetrating = {
    "train": train_X_time_penetrating_imputed,
    "test": test_X_time_penetrating_imputed,
}

for t_set, imputed_frame in imputed_frames_penetrating.items():
    # method_of_injury is decoded directly, without rounding.
    imputed_frame["method_of_injury"] = imputed_frame["method_of_injury"].map(
        mapping_method_of_injury_penetrating_other_way
    )

    # Map the other way around the categorical features that have been imputed:
    # round to the nearest integer code first, then look up the original label.
    for col_map in str_columns_to_map:
        codes = imputed_frame[col_map].round(0).astype(int)
        imputed_frame[col_map] = codes.map(inverse_mappings_penetrating[col_map])

    # Series.map turns every code without an entry in the mapping into NaN;
    # report such cases instead of letting them pass silently.
    undecoded_counts = imputed_frame[["method_of_injury"] + str_columns_to_map].isna().sum()
    undecoded_counts = undecoded_counts[undecoded_counts > 0]
    if len(undecoded_counts) > 0:
        print(f"WARNING: values without an inverse mapping in the {t_set} set (now NaN):")
        print(undecoded_counts)
    print(f"Preprocessed {t_set} dataset for penetrating injuries")

output_dir_penetrating = "../Data/imputed_time_split_per_injury_new_morbidity/penetrating/"

# inc_key is saved separately and left out of the feature files.
# header=False is passed explicitly: the default of Series.to_csv differs between
# pandas versions, so the file layout would otherwise depend on the installed one.
# NOTE(review): the inc_keys file names carry no ".csv" extension; kept as-is so
# that existing readers of these paths keep working - confirm whether intended.
inc_keys_test_X_time_penetrating_imputed = test_X_time_penetrating_imputed.inc_key
inc_keys_test_X_time_penetrating_imputed.to_csv(
    output_dir_penetrating + "inc_keys_test_X_time_penetrating_imputed",
    header=False
)
test_X_time_penetrating_imputed.drop("inc_key", axis=1).to_csv(
    output_dir_penetrating + "test_X_time_penetrating_imputed.csv"
)

inc_keys_train_X_time_penetrating_imputed = train_X_time_penetrating_imputed.inc_key
inc_keys_train_X_time_penetrating_imputed.to_csv(
    output_dir_penetrating + "inc_keys_train_X_time_penetrating_imputed",
    header=False
)
train_X_time_penetrating_imputed.drop("inc_key", axis=1).to_csv(
    output_dir_penetrating + "train_X_time_penetrating_imputed.csv"
)

Preprocessed train dataset for penetrating injuries
Preprocessed test dataset for penetrating injuries




### Random train/test split

In [52]:
# Read the penetrating label files. header=None makes pandas treat the first
# line of each file as data rather than as a header.
labels_dir_penetrating = "../Data/imputed_time_split_per_injury_new_morbidity/penetrating/"
# The listing itself is discarded; the call only raises when the directory is missing.
os.listdir(labels_dir_penetrating)

y_train_mortality_time = pd.read_csv(
    labels_dir_penetrating + "trauma_y_train_mortality_time_penetrating.csv", header=None
)
y_test_mortality_time = pd.read_csv(
    labels_dir_penetrating + "trauma_y_test_mortality_time_penetrating.csv", header=None
)
y_train_morbidity_time = pd.read_csv(
    labels_dir_penetrating + "trauma_y_train_morbidity_time_penetrating.csv", header=None
)
y_test_morbidity_time = pd.read_csv(
    labels_dir_penetrating + "trauma_y_test_morbidity_time_penetrating.csv", header=None
)

In [53]:
# Stack the time-split parts (train first, then test) into one dataset; features
# and labels are matched purely by row position.
X = pd.concat(
    [train_X_time_penetrating_imputed, test_X_time_penetrating_imputed], ignore_index=True
)
y_mortality = pd.concat([y_train_mortality_time, y_test_mortality_time], ignore_index=True)
y_morbidity = pd.concat([y_train_morbidity_time, y_test_morbidity_time], ignore_index=True)

# Filter out severity 6: rows whose largest per-region severity equals 6 are
# removed from the features and from both label sets.
severity_columns = [
    'Face_severity', 'Neck_severity', 'Thorax_severity', 'Abdomen_severity',
    'Spine_severity', 'Upper_Extremity_severity',
    'Lower_Extremity_severity', 'Pelvis_Perineum_severity',
    'External_severity',
]
row_max_severity = X[severity_columns].max(axis=1)
indices_severity_6_to_drop = row_max_severity.index[(row_max_severity == 6).values].values

# Drop the rows and renumber the index from 0.
X = X.drop(indices_severity_6_to_drop, axis=0).reset_index(drop=True)
y_mortality = y_mortality.drop(indices_severity_6_to_drop, axis=0).reset_index(drop=True)
y_morbidity = y_morbidity.drop(indices_severity_6_to_drop, axis=0).reset_index(drop=True)

In [54]:
def _as_label_series(y_frame):
    """Rename the single column of ``y_frame`` to "label" and return it as a Series."""
    y_frame.columns = ["label"]
    return y_frame["label"]


# Stratified 80/20 random splits, one per outcome, both drawn with random_state=7.
train_X_morbid, test_X_morbid, train_y_morbid, test_y_morbid = train_test_split(
    X, y_morbidity, train_size=0.8, random_state=7, stratify=y_morbidity
)
train_X_mortal, test_X_mortal, train_y_mortal, test_y_mortal = train_test_split(
    X, y_mortality, train_size=0.8, random_state=7, stratify=y_mortality
)

# Turn each one-column label frame into a Series named "label".
train_y_morbid = _as_label_series(train_y_morbid)
test_y_morbid = _as_label_series(test_y_morbid)
train_y_mortal = _as_label_series(train_y_mortal)
test_y_mortal = _as_label_series(test_y_mortal)

In [55]:
# Saving the inc_keys and deleting column for X dataframes
data_path_random = "../Data/imputed_random_split_per_injury_without_severity_6/penetrating/"

# (output sub-directory, file stem, frame) listed explicitly instead of
# resolving variable names through vars().
inc_key_exports = [
    ("morbidity", "train_X_morbid", train_X_morbid),
    ("morbidity", "test_X_morbid", test_X_morbid),
    ("mortality", "train_X_mortal", train_X_mortal),
    ("mortality", "test_X_mortal", test_X_mortal),
]
for outcome_dir, filename, split_frame in inc_key_exports:
    # reset_index() without drop keeps each row's index label in an "index"
    # column next to its inc_key.
    inc_keys_filename = split_frame.inc_key.reset_index()
    inc_keys_filename.to_csv(data_path_random + f"{outcome_dir}/inc_keys_{filename}.csv", header=True)

# Rebind the names to new frames without inc_key. Dropping in place on the
# outputs of train_test_split is what raised SettingWithCopyWarning.
train_X_morbid = train_X_morbid.drop("inc_key", axis=1)
test_X_morbid = test_X_morbid.drop("inc_key", axis=1)
train_X_mortal = train_X_mortal.drop("inc_key", axis=1)
test_X_mortal = test_X_mortal.drop("inc_key", axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [56]:
# Write the random-split features and labels for penetrating injuries, one
# sub-directory per outcome.
data_path_random = "../Data/imputed_random_split_per_injury_without_severity_6/penetrating/"
random_split_outputs = {
    # Morbidity first
    "morbidity": {
        "train_X_morbid": train_X_morbid,
        "test_X_morbid": test_X_morbid,
        "train_y_morbid": train_y_morbid,
        "test_y_morbid": test_y_morbid,
    },
    # Then Mortality
    "mortality": {
        "train_X_mortal": train_X_mortal,
        "test_X_mortal": test_X_mortal,
        "train_y_mortal": train_y_mortal,
        "test_y_mortal": test_y_mortal,
    },
}
for outcome_dir, named_objects in random_split_outputs.items():
    for filename, split_object in named_objects.items():
        # Renumber rows from 0 in place before writing (the index is written too).
        split_object.reset_index(drop=True, inplace=True)
        split_object.to_csv(data_path_random + f"{outcome_dir}/{filename}.csv", header=True)
        print(f"Saved file {filename}")

Saved file train_X_morbid
Saved file test_X_morbid
Saved file train_y_morbid
Saved file test_y_morbid
Saved file train_X_mortal
Saved file test_X_mortal
Saved file train_y_mortal
Saved file test_y_mortal


## Blunt

### Time train/test split

In [57]:
# Load the imputed (not yet post-processed) blunt time-split features.
imputed_dir_blunt = "../Data/imputed_non_processed/blunt/"
test_X_time_blunt_imputed = pd.read_csv(imputed_dir_blunt + "test_X_time_blunt_imputed.csv")
train_X_time_blunt_imputed = pd.read_csv(imputed_dir_blunt + "train_X_time_blunt_imputed.csv")

for imputed_frame in (train_X_time_blunt_imputed, test_X_time_blunt_imputed):
    # Replacing systolic blood pressure of less than 60 by -1 (<=> unknown or error)
    imputed_frame.loc[imputed_frame["sbp1"] < 60, "sbp1"] = -1
    imputed_frame.reset_index(drop=True, inplace=True)

In [58]:
# Overwrite the column labels of both imputed blunt frames, by position, with
# the same list of names.
imputed_column_names = [
    'age', 'alcohol', 'gender', 'race1', 'acslevel', 'signsoflife', 'sbp1',
    'pulse1', 'oxysat1', 'temp1', 'gcstot1', 'bleeding_disorder',
    'current_chemotherapy', 'congestive_heart_failure', 'current_smoker',
    'chronic_renal_failure', 'history_cva', 'diabetes',
    'disseminated_cancer', 'copd', 'steroid', 'cirrhosis', 'history_MI',
    'history_pvd', 'hypertension_medication', 'method_of_injury',
    'Face_severity', 'Neck_severity', 'Thorax_severity', 'Abdomen_severity',
    'Spine_severity', 'Upper_Extremity_severity',
    'Lower_Extremity_severity', 'Pelvis_Perineum_severity',
    'External_severity', 'inc_key',
]
for imputed_frame in (test_X_time_blunt_imputed, train_X_time_blunt_imputed):
    # Each frame gets its own copy of the list.
    imputed_frame.columns = list(imputed_column_names)

In [59]:
# Decode the integer-encoded categorical features of the imputed blunt frames
# back to their original string labels, then save the time split.
str_columns_to_map = ["gender", "race1", "acslevel", "signsoflife", "alcohol"]
# Explicit lookups (instead of resolving variable names through vars()).
inverse_mappings_blunt = {
    "gender": mapping_gender_other_way,
    "race1": mapping_race1_other_way,
    "acslevel": mapping_acslevel_other_way,
    "signsoflife": mapping_signsoflife_other_way,
    "alcohol": mapping_alcohol_other_way,
}
imputed_frames_blunt = {
    "train": train_X_time_blunt_imputed,
    "test": test_X_time_blunt_imputed,
}

for t_set, imputed_frame in imputed_frames_blunt.items():
    # method_of_injury is decoded directly, without rounding.
    imputed_frame["method_of_injury"] = imputed_frame["method_of_injury"].map(
        mapping_method_of_injury_blunt_other_way
    )

    # Map the other way around the categorical features that have been imputed:
    # round to the nearest integer code first, then look up the original label.
    for col_map in str_columns_to_map:
        codes = imputed_frame[col_map].round(0).astype(int)
        imputed_frame[col_map] = codes.map(inverse_mappings_blunt[col_map])

    # Series.map turns every code without an entry in the mapping into NaN;
    # report such cases instead of letting them pass silently.
    undecoded_counts = imputed_frame[["method_of_injury"] + str_columns_to_map].isna().sum()
    undecoded_counts = undecoded_counts[undecoded_counts > 0]
    if len(undecoded_counts) > 0:
        print(f"WARNING: values without an inverse mapping in the {t_set} set (now NaN):")
        print(undecoded_counts)
    print(f"Preprocessed {t_set} dataset for blunt injuries")

# NOTE(review): unlike the penetrating cell, inc_key is left out of the feature
# files here without being written to a separate file - confirm this is intended.
output_dir_blunt = "../Data/imputed_time_split_per_injury_new_morbidity/blunt/"
test_X_time_blunt_imputed.drop("inc_key", axis=1).to_csv(
    output_dir_blunt + "test_X_time_blunt_imputed.csv"
)
train_X_time_blunt_imputed.drop("inc_key", axis=1).to_csv(
    output_dir_blunt + "train_X_time_blunt_imputed.csv"
)

Preprocessed train dataset for blunt injuries
Preprocessed test dataset for blunt injuries


### Random train/test split

In [60]:
# Read the blunt label files. header=None makes pandas treat the first line of
# each file as data rather than as a header.
labels_dir_blunt = "../Data/imputed_time_split_per_injury_new_morbidity/blunt/"
# The listing itself is discarded; the call only raises when the directory is missing.
os.listdir(labels_dir_blunt)

y_train_mortality_time = pd.read_csv(
    labels_dir_blunt + "trauma_y_train_mortality_time_blunt.csv", header=None
)
y_test_mortality_time = pd.read_csv(
    labels_dir_blunt + "trauma_y_test_mortality_time_blunt.csv", header=None
)
y_train_morbidity_time = pd.read_csv(
    labels_dir_blunt + "trauma_y_train_morbidity_time_blunt.csv", header=None
)
y_test_morbidity_time = pd.read_csv(
    labels_dir_blunt + "trauma_y_test_morbidity_time_blunt.csv", header=None
)

In [61]:
# Stack the time-split parts (train first, then test) into one dataset; features
# and labels are matched purely by row position.
X = pd.concat([train_X_time_blunt_imputed, test_X_time_blunt_imputed], ignore_index=True)
y_mortality = pd.concat([y_train_mortality_time, y_test_mortality_time], ignore_index=True)
y_morbidity = pd.concat([y_train_morbidity_time, y_test_morbidity_time], ignore_index=True)

# Filter out severity 6: rows whose largest per-region severity equals 6 are
# removed from the features and from both label sets.
severity_columns = [
    'Face_severity', 'Neck_severity', 'Thorax_severity', 'Abdomen_severity',
    'Spine_severity', 'Upper_Extremity_severity',
    'Lower_Extremity_severity', 'Pelvis_Perineum_severity',
    'External_severity',
]
row_max_severity = X[severity_columns].max(axis=1)
indices_severity_6_to_drop = row_max_severity.index[(row_max_severity == 6).values].values

# Drop the rows and renumber the index from 0.
X = X.drop(indices_severity_6_to_drop, axis=0).reset_index(drop=True)
y_mortality = y_mortality.drop(indices_severity_6_to_drop, axis=0).reset_index(drop=True)
y_morbidity = y_morbidity.drop(indices_severity_6_to_drop, axis=0).reset_index(drop=True)


In [62]:
def _as_label_series(y_frame):
    """Rename the single column of ``y_frame`` to "label" and return it as a Series."""
    y_frame.columns = ["label"]
    return y_frame["label"]


# Stratified 80/20 random splits, one per outcome, both drawn with random_state=7.
train_X_morbid, test_X_morbid, train_y_morbid, test_y_morbid = train_test_split(
    X, y_morbidity, train_size=0.8, random_state=7, stratify=y_morbidity
)
train_X_mortal, test_X_mortal, train_y_mortal, test_y_mortal = train_test_split(
    X, y_mortality, train_size=0.8, random_state=7, stratify=y_mortality
)

# Turn each one-column label frame into a Series named "label".
train_y_morbid = _as_label_series(train_y_morbid)
test_y_morbid = _as_label_series(test_y_morbid)
train_y_mortal = _as_label_series(train_y_mortal)
test_y_mortal = _as_label_series(test_y_mortal)

In [63]:
# Saving the inc_keys and deleting column for X dataframes
data_path_random = "../Data/imputed_random_split_per_injury_without_severity_6/blunt/"

# (output sub-directory, file stem, frame) listed explicitly instead of
# resolving variable names through vars().
inc_key_exports = [
    ("morbidity", "train_X_morbid", train_X_morbid),
    ("morbidity", "test_X_morbid", test_X_morbid),
    ("mortality", "train_X_mortal", train_X_mortal),
    ("mortality", "test_X_mortal", test_X_mortal),
]
for outcome_dir, filename, split_frame in inc_key_exports:
    # reset_index() without drop keeps each row's index label in an "index"
    # column next to its inc_key.
    inc_keys_filename = split_frame.inc_key.reset_index()
    inc_keys_filename.to_csv(data_path_random + f"{outcome_dir}/inc_keys_{filename}.csv", header=True)

# Rebind the names to new frames without inc_key. Dropping in place on the
# outputs of train_test_split is what raised SettingWithCopyWarning.
train_X_morbid = train_X_morbid.drop("inc_key", axis=1)
test_X_morbid = test_X_morbid.drop("inc_key", axis=1)
train_X_mortal = train_X_mortal.drop("inc_key", axis=1)
test_X_mortal = test_X_mortal.drop("inc_key", axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [64]:
# Write the random-split features and labels for blunt injuries, one
# sub-directory per outcome.
data_path_random = "../Data/imputed_random_split_per_injury_without_severity_6/blunt/"
random_split_outputs = {
    # Morbidity first
    "morbidity": {
        "train_X_morbid": train_X_morbid,
        "test_X_morbid": test_X_morbid,
        "train_y_morbid": train_y_morbid,
        "test_y_morbid": test_y_morbid,
    },
    # Then Mortality
    "mortality": {
        "train_X_mortal": train_X_mortal,
        "test_X_mortal": test_X_mortal,
        "train_y_mortal": train_y_mortal,
        "test_y_mortal": test_y_mortal,
    },
}
for outcome_dir, named_objects in random_split_outputs.items():
    for filename, split_object in named_objects.items():
        # Renumber rows from 0 in place before writing (the index is written too).
        split_object.reset_index(drop=True, inplace=True)
        split_object.to_csv(data_path_random + f"{outcome_dir}/{filename}.csv", header=True)
        print(f"Saved file {filename}")

Saved file train_X_morbid
Saved file test_X_morbid
Saved file train_y_morbid
Saved file test_y_morbid
Saved file train_X_mortal
Saved file test_X_mortal
Saved file train_y_mortal
Saved file test_y_mortal
