In [2]:

import os
os.system('pip3 install Seaborn')
os.system('pip3 install xgboost')

0

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


# Load in all the data
samplesubmission = pd.read_csv('data/samplesubmission.csv')
solution_template = pd.read_csv('data/solution_template.csv')
training = pd.read_csv('data/training_v2.csv')
unlabeled = pd.read_csv('data/unlabeled.csv')
wids = pd.read_csv('data/WiDS_Datathon_2020_Dictionary.csv')

# CSV files must be in folder called 'data' in the same directory as the notebook 

In [4]:
# Find all the categorical features and set them to their appropriate type

num_unique_values = training.nunique().sort_values(ascending=False)
display(num_unique_values)
# After manually inspecting it seems that hospital_admit_source is the last categorical feature
# So we'll convert all columns with less unique values to a categorical feature as well
categorical_features = num_unique_values[num_unique_values <= 15].index
training_typed = training.astype(dict(zip(categorical_features, ['category'] * len(categorical_features))))

encounter_id             91713
patient_id               91713
bmi                      34888
urineoutput_apache       24772
pre_icu_los_days          9757
                         ...  
gcs_unable_apache            2
elective_surgery             2
gender                       2
apache_post_operative        2
readmission_status           1
Length: 186, dtype: int64

In [5]:
# Show the percentage of non missing values for each feature
missing = (training_typed.notna().sum() / len(training)) * 100

# Looking at the data there is a big gap in data availability between wbc_apache and urineoutput_apache, namely 75% and then 46%
# To make analysis easier we'll only consider features with more than 50% data availability
chosen_features = list(missing[missing > 50].index)

In [6]:
df_training = training_typed[chosen_features].fillna(training_typed[chosen_features].mean(numeric_only=True))
# df_training.select_dtypes(include=['category'])
# df_final = pd.get_dummies(df_training)

In [7]:
test_size = 0.2 # proportion for train versus test+val split
val_size = 0.5 # proportion for test versus val split
random_state = 42  # random state is used to set a seed for randomness, which is only relevant for reproducibility purposes

In [8]:
from sklearn.model_selection import train_test_split

X = df_training.copy().drop(['hospital_death', 'patient_id', 'encounter_id', 'hospital_id', 'icu_id', # drop identifiers
                    'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob'], # drop APACHE scores
                   axis=1)
y = df_training['hospital_death'].copy()
y_apache = df_training['apache_4a_hospital_death_prob'].copy()

X = pd.get_dummies(X, drop_first=True)

# split the dataset into train and test+validation set
(X_train, X_test,
 y_train, y_test,
 y_apache_train, y_apache_test
) = train_test_split(X, y, y_apache, test_size=test_size, random_state=random_state)

# split the test set into test + validation set
(X_val, X_test,
 y_val, y_test,
 y_apache_val, y_apache_test,
) = train_test_split(X_test, y_test, y_apache_test, test_size=val_size, random_state=random_state)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
print(f"RFC Accuracy: {accuracy_score(y_test, rfc.predict(X_test))}")

lg = LogisticRegression(solver='liblinear')
lg.fit(X_train, y_train)
print(f"LG Accuracy: {accuracy_score(y_test, lg.predict(X_test))}")

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
print(f"XGB Accuracy: {accuracy_score(y_test, xgb.predict(X_test))}")

RFC Accuracy: 0.9262974269515918
LG Accuracy: 0.9218273004797208
XGB Accuracy: 0.9249890972525077


In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
# Define a dictionary to map ethnicity names to their corresponding boolean columns
ethnicity_columns = {
    'Asian': 'ethnicity_Asian',
    'Caucasian': 'ethnicity_Caucasian',
    'Hispanic': 'ethnicity_Hispanic',
    'Native American': 'ethnicity_Native American',
    'Other/Unknown': 'ethnicity_Other/Unknown'
}

# Loop through each ethnicity and evaluate accuracy
for ethnicity, column_name in ethnicity_columns.items():
    # Filter test data based on the ethnicity column
    mask = X_test[column_name] == 1  # Filter for rows where the ethnicity column is 1 (True)
    X_test_ethnicity = X_test[mask]
    y_test_ethnicity = y_test[mask]
    
    # Calculate accuracy for each model
    rfc_accuracy = accuracy_score(y_test_ethnicity, rfc.predict(X_test_ethnicity))
    lg_accuracy = accuracy_score(y_test_ethnicity, lg.predict(X_test_ethnicity))
    xgb_accuracy = accuracy_score(y_test_ethnicity, xgb.predict(X_test_ethnicity))

    #Calculate Recall for each model
    rfc_recall = recall_score(y_test_ethnicity, rfc.predict(X_test_ethnicity))
    lg_recall = recall_score(y_test_ethnicity, lg.predict(X_test_ethnicity))
    xgb_recall = recall_score(y_test_ethnicity, xgb.predict(X_test_ethnicity))

    # Calculate F1 score for each model
    rfc_f1 = f1_score(y_test_ethnicity, rfc.predict(X_test_ethnicity))
    lg_f1 = f1_score(y_test_ethnicity, lg.predict(X_test_ethnicity))
    xgb_f1 = f1_score(y_test_ethnicity, xgb.predict(X_test_ethnicity))

    # Calculate Area under ROC curve for each model
    rfc_auc = roc_auc_score(y_test_ethnicity, rfc.predict(X_test_ethnicity))
    lg_auc = roc_auc_score(y_test_ethnicity, lg.predict(X_test_ethnicity))
    xgb_auc = roc_auc_score(y_test_ethnicity, xgb.predict(X_test_ethnicity))

    # print(f"RFC Recall: {rfc_recall}")
    # print(f"LG Recall: {lg_recall}")
    # print(f"XGB Recall: {xgb_recall}")

    # print(f"RFC F1 score: {rfc_f1}")
    # print(f"LG F1 score: {lg_f1}")
    # print(f"XGB F1 score: {xgb_f1}")

    print(f"RFC Area under ROC: {rfc_auc}")
    print(f"LG Area under ROC: {lg_auc}")
    print(f"XGB Area under ROC: {xgb_auc}")

In [15]:
wids.iloc[10:20]
wids.loc[16, 'Description']

'The length of stay of the patient between hospital admission and unit admission'