In [15]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

# Preprocessed df

In [16]:
df_trauma_preprocessed = pd.read_csv("../Data/trauma_data_preprocessed.csv")
df_trauma_preprocessed.head()

Unnamed: 0,complkey1,complkey2,complkey3,complkey4,complkey5,complkey6,complkey7,complkey8,complkey9,complkey10,...,Neck_severity,Head_severity,Thorax_severity,Abdomen_severity,Spine_severity,Upper_Extremity_severity,Lower_Extremity_severity,Pelvis_Perineum_severity,External_severity,severity_max
0,1.0,,,,,,,,,,...,,,2.0,4.0,3.0,,,2.0,,4.0
1,,,,,,,,,,,...,,,2.0,,,,,,,2.0
2,,,,,,,,,,,...,,3.0,,,2.0,,,,,3.0
3,,,,,,,,,,,...,,,1.0,,2.0,,,,1.0,2.0
4,,,,,,,,,,,...,,,,,,2.0,,,,2.0


## Statistics on Blunt/Penetration

In [17]:
print("########## Statistics on Train+Test (< & >= 2016) ##########")
df_values = df_trauma_preprocessed["method_of_injury"].value_counts().reset_index()
df_values.columns = ["method_of_injury", "count"]
blunt_values = df_values[df_values.method_of_injury.str.contains("Blunt")]["count"].sum()
penetration_values = df_values[df_values.method_of_injury.str.contains("Penetr")]["count"].sum()
print(f"Number of blunt observations: {blunt_values}\nNumber of penetration observations: {penetration_values}")
print("Considering only blunt & penetration methods of injuries, not the category 'Other':")
print(f"Percentage penetration: {penetration_values/(penetration_values+blunt_values)*100} %")
print(f"Percentage blunt: {blunt_values/(penetration_values+blunt_values)*100} %")

########## Statistics on Train+Test (< & >= 2016) ##########
Number of blunt observations: 854557
Number of penetration observations: 91238
Considering only blunt & penetration methods of injuries, not the category 'Other':
Percentage penetration: 9.646699337594299 %
Percentage blunt: 90.3533006624057 %


In [18]:
# Train only
print("########## Statistics on Train only ##########")
df_values_train = df_trauma_preprocessed[
    df_trauma_preprocessed.yoadmit < 2016
]["method_of_injury"].value_counts().reset_index()
df_values_train.columns = ["method_of_injury", "count"]
blunt_values = df_values_train[df_values_train.method_of_injury.str.contains("Blunt")]["count"].sum()
penetration_values = df_values_train[df_values_train.method_of_injury.str.contains("Penetr")]["count"].sum()
print(f"Number of blunt observations: {blunt_values}\nNumber of penetration observations: {penetration_values}")
print("Considering only blunt & penetration methods of injuries, not the category 'Other':")
print(f"Percentage penetration: {penetration_values/(penetration_values+blunt_values)*100} %")
print(f"Percentage blunt: {blunt_values/(penetration_values+blunt_values)*100} %")

########## Statistics on Train only ##########
Number of blunt observations: 671236
Number of penetration observations: 71027
Considering only blunt & penetration methods of injuries, not the category 'Other':
Percentage penetration: 9.568980267102093 %
Percentage blunt: 90.43101973289791 %


In [19]:
# Test only
print("########## Statistics on Test only ##########")
df_values_test = df_trauma_preprocessed[
    df_trauma_preprocessed.yoadmit >= 2016
]["method_of_injury"].value_counts().reset_index()
df_values_test.columns = ["method_of_injury", "count"]
blunt_values = df_values_test[df_values_test.method_of_injury.str.contains("Blunt")]["count"].sum()
penetration_values = df_values_test[df_values_test.method_of_injury.str.contains("Penetr")]["count"].sum()
print(f"Number of blunt observations: {blunt_values}\nNumber of penetration observations: {penetration_values}")
print("Considering only blunt & penetration methods of injuries, not the category 'Other':")
print(f"Percentage penetration: {penetration_values/(penetration_values+blunt_values)*100} %")
print(f"Percentage blunt: {blunt_values/(penetration_values+blunt_values)*100} %")

########## Statistics on Test only ##########
Number of blunt observations: 183321
Number of penetration observations: 20211
Considering only blunt & penetration methods of injuries, not the category 'Other':
Percentage penetration: 9.930133836448322 %
Percentage blunt: 90.06986616355168 %


## Final choice of features & train/test split on time

In [7]:
# Dropping the "Unknown" type of injuries
df_trauma_preprocessed_filtered = df_trauma_preprocessed[
    df_trauma_preprocessed.method_of_injury != "Unknown"
].reset_index(drop=True)

In [15]:
columns_to_keep = [
    "age", "gender", "race1",
    # "teachsta", # "region",
    "acslevel", #"tmode1", # "transfer",
    "signsoflife", "sbp1", # "sbp2",
    "pulse1", # "pulse2",
    "oxysat1", # "oxysat2",
    "temp1", "gcstot1", # "gcstot2",
    "alcohol", "bleeding_disorder",
    "current_chemotherapy", "congestive_heart_failure",
    "current_smoker", "chronic_renal_failure",
    "history_cva", "diabetes", "disseminated_cancer",
    "copd", "steroid", "cirrhosis", "history_MI",
    "history_pvd", "hypertension_medication", # "eddisp",
    "method_of_injury", # new AIS"
    "Face_severity", "Neck_severity", "Thorax_severity",
    "Abdomen_severity", "Spine_severity",
    "Upper_Extremity_severity", "Lower_Extremity_severity",
    "Pelvis_Perineum_severity", "External_severity", "severity_max"
    "hemorrhage_ctrl_type"
]
# Creating target values for mortality & morbidity
hosp_mortality = ((df_trauma_preprocessed_filtered.hospdisp == "Expired") 
                  | (df_trauma_preprocessed_filtered.hospdisp == "Deceased/Expired"))*1
hosp_morbidity = df_trauma_preprocessed_filtered.morbidity

In [20]:
# Getting indices of the two time periods for train/test split
index_before_2016 = df_trauma_preprocessed_filtered[
    df_trauma_preprocessed_filtered.yoadmit < 2016
].index.values
index_from_2016 = df_trauma_preprocessed_filtered[
    df_trauma_preprocessed_filtered.yoadmit >= 2016
].index.values

# X data train/test split
df_trauma_preprocessed_filtered_train = df_trauma_preprocessed_filtered.iloc[
    index_before_2016, :
].reset_index(drop=True)
df_trauma_preprocessed_filtered_test = df_trauma_preprocessed_filtered.iloc[
    index_from_2016, :
].reset_index(drop=True)

# Mortality train/test split
hosp_mortality_train = hosp_mortality[index_before_2016]
hosp_mortality_test = hosp_mortality[index_from_2016]

# Morbidity train/test split
hosp_morbidity_train = hosp_morbidity[index_before_2016]
hosp_morbidity_test = hosp_morbidity[index_from_2016]


# Saving train data with time split
df_trauma_preprocessed_filtered_train = df_trauma_preprocessed_filtered_train[columns_to_keep]
df_trauma_preprocessed_filtered_train.to_csv("../Data/time_split/trauma_X_train_time.csv", index=False)
hosp_morbidity_train.to_csv("../Data/time_split/trauma_y_train_morbidity_time.csv", index=False)
hosp_mortality_train.to_csv("../Data/time_split/trauma_y_train_mortality_time.csv", index=False)
print(f"Size of train set with time separation: {len(df_trauma_preprocessed_filtered_train)}")


# Saving test data with time split
df_trauma_preprocessed_filtered_test = df_trauma_preprocessed_filtered_test[columns_to_keep]
df_trauma_preprocessed_filtered_test.to_csv("../Data/time_split/trauma_X_test_time.csv", index=False)
hosp_morbidity_test.to_csv("../Data/time_split/trauma_y_test_morbidity_time.csv", index=False)
hosp_mortality_test.to_csv("../Data/time_split/trauma_y_test_mortality_time.csv", index=False)
print(f"Size of test set with time separation: {len(df_trauma_preprocessed_filtered_test)}")



Size of train set with time separation: 742263




Size of test set with time separation: 203532


## Train/Test split on Blunt/Penetration

In [20]:
# Dropping the "Unknown" type of injuries
df_trauma_preprocessed_filtered = df_trauma_preprocessed[
    df_trauma_preprocessed.method_of_injury != "Unknown"
].reset_index(drop=True)
columns_to_keep = [
    "age", "gender", "race1",
    # "teachsta", # "region",
    "acslevel", #"tmode1", # "transfer",
    "signsoflife", "sbp1", # "sbp2",
    "pulse1", # "pulse2",
    "oxysat1", # "oxysat2",
    "temp1", "gcstot1", # "gcstot2",
    "alcohol", "bleeding_disorder",
    "current_chemotherapy", "congestive_heart_failure",
    "current_smoker", "chronic_renal_failure",
    "history_cva", "diabetes", "disseminated_cancer",
    "copd", "steroid", "cirrhosis", "history_MI",
    "history_pvd", "hypertension_medication", # "eddisp",
    "method_of_injury", # new AIS"
    #"Face_severity", "Neck_severity", "Thorax_severity",
    #"Abdomen_severity", "Spine_severity",
    #"Upper_Extremity_severity", "Lower_Extremity_severity",
    #"Pelvis_Perineum_severity", "External_severity", 
    "severity_max",
    "hemorrhage_ctrl_type"
]
# Creating target values for mortality & morbidity
hosp_mortality = ((df_trauma_preprocessed_filtered.hospdisp == "Expired") 
                  | (df_trauma_preprocessed_filtered.hospdisp == "Deceased/Expired"))*1
hosp_morbidity = df_trauma_preprocessed_filtered.morbidity

### Mortality

In [22]:
X_train_mortality_strat, X_test_mortality_strat, y_train_mortality_strat, y_test_mortality_strat = train_test_split(
    df_trauma_preprocessed_filtered[columns_to_keep], hosp_mortality, test_size = 0.2, stratify = hosp_mortality
)

In [23]:
X_train_mortality_strat.to_csv("../Data/stratified_method_of_injury_split/mortality/X_train_mortality_strat.csv", index=False)
X_test_mortality_strat.to_csv("../Data/stratified_method_of_injury_split/mortality/X_test_mortality_strat.csv", index=False)
y_train_mortality_strat.to_csv("../Data/stratified_method_of_injury_split/mortality/y_train_mortality_strat.csv", index=False)
y_test_mortality_strat.to_csv("../Data/stratified_method_of_injury_split/mortality/y_test_mortality_strat.csv", index=False)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [24]:
print(y_train_mortality_strat.value_counts()/len(y_train_mortality_strat))
print(y_test_mortality_strat.value_counts()/len(y_test_mortality_strat))

0    0.945974
1    0.054026
Name: hospdisp, dtype: float64
0    0.945977
1    0.054023
Name: hospdisp, dtype: float64


### Morbidity

In [25]:
X_train_morbidity_strat, X_test_morbidity_strat, y_train_morbidity_strat, y_test_morbidity_strat = train_test_split(
    df_trauma_preprocessed_filtered[columns_to_keep], hosp_morbidity, test_size = 0.2, stratify = hosp_morbidity
)

In [26]:
X_train_morbidity_strat.to_csv("../Data/stratified_method_of_injury_split/morbidity/X_train_morbidity_strat.csv", index=False)
X_test_morbidity_strat.to_csv("../Data/stratified_method_of_injury_split/morbidity/X_test_morbidity_strat.csv", index=False)
y_train_morbidity_strat.to_csv("../Data/stratified_method_of_injury_split/morbidity/y_train_morbidity_strat.csv", index=False)
y_test_morbidity_strat.to_csv("../Data/stratified_method_of_injury_split/morbidity/y_test_morbidity_strat.csv", index=False)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [27]:
print(y_train_morbidity_strat.value_counts()/len(y_train_morbidity_strat))
print(y_test_morbidity_strat.value_counts()/len(y_test_morbidity_strat))

0    0.910566
1    0.089434
Name: morbidity, dtype: float64
0    0.910567
1    0.089433
Name: morbidity, dtype: float64


In [34]:
X_train_mortality_strat.severity_max.unique()

array([ 1.,  3.,  2., nan,  4.,  5.,  6.])