# Early Prediction of Sepsis from Clinical Data

In [1]:
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, roc_curve, precision_score, recall_score, accuracy_score, \
    confusion_matrix
from sklearn.naive_bayes import GaussianNB

## Load Data


In [2]:
train_data = pd.read_csv(f'/Users/ofrihefetz/PycharmProjects/lab2_hw1/data/train_df.csv')
test_data = pd.read_csv('/Users/ofrihefetz/PycharmProjects/lab2_hw1/data/test_df.csv')

# Data Preperations
In this part we will prepare the different forms of the data set, which we will examine over different models.

In [3]:
def data_prep(data):
    #  add a new column for each feature that indicates if the value is missing
    for col in data.columns:
        data[col + '_missing'] = data[col].isna().astype(int)
    return data

##  Train data sets


In [4]:
col_to_drop_new = ['SepsisPatient', 'SepsisPatient_missing', 'SepsisLabel', 'SepsisLabel_missing',
                   'filename_missing', 'filename']
col_to_drop = ['SepsisPatient', 'SepsisLabel', 'filename']
# ------------------------------------------------------------------------------------#
# original data no manipulation
train_data_x = train_data.drop(col_to_drop, axis=1)
train_data_y = train_data['SepsisLabel']
# # ------------------------------------------------------------------------------------#
# original data with additional cols indicating for missing values per feature
train_data_new = data_prep(train_data)
train_data_new_x = train_data_new.drop(col_to_drop_new, axis=1)
train_data_new_y = train_data_new['SepsisPatient']
# # ------------------------------------------------------------------------------------#
# original data aggregated by the patient id (using median values)
train_data_agg_x = train_data.groupby(by=['filename']).median()
train_data_agg_x = train_data_agg_x.drop(['SepsisPatient', 'SepsisLabel'], axis=1)
train_data_agg_y = train_data.groupby(by=['filename']).max()['SepsisPatient']
# # ------------------------------------------------------------------------------------#
# # data with additional cols aggregated  by the patient id (using median values)
train_data_new_agg_x = train_data_new.groupby(by=['filename']).median()
train_data_new_agg_x = train_data_new_agg_x.drop(
    ['SepsisPatient', 'SepsisPatient_missing', 'SepsisLabel', 'SepsisLabel_missing', 'filename_missing'], axis=1)
train_data_new_agg_y = train_data_new.groupby(by=['filename']).max()['SepsisPatient']

## Test data sets

In [5]:
# original data no manipulation
test_data_x = test_data.drop(col_to_drop, axis=1)
test_data_y = test_data['SepsisLabel']
# # ------------------------------------------------------------------------------------#
# original data with additional cols indicating for missing values per feature
test_data_new = data_prep(test_data)
test_data_new_x = test_data_new.drop(col_to_drop_new, axis=1)
test_data_new_y = test_data_new['SepsisPatient']
# # ------------------------------------------------------------------------------------#
# original data aggregated by the patient id (using median values)
test_data_agg_x = test_data.groupby(by=['filename']).median()
test_data_agg_x = test_data_agg_x.drop(['SepsisPatient', 'SepsisLabel'], axis=1)
test_data_agg_y = test_data.groupby(by=['filename']).max()['SepsisPatient']
# # ------------------------------------------------------------------------------------#
# data with additional cols aggregated  by the patient id (using median values)
test_data_new_agg_x = test_data_new.groupby(by=['filename']).median()
test_data_new_agg_x = test_data_new_agg_x.drop(
    ['SepsisPatient', 'SepsisPatient_missing', 'SepsisLabel', 'SepsisLabel_missing', 'filename_missing'], axis=1)
test_data_new_agg_y = test_data_new.groupby(by=['filename']).max()['SepsisPatient']
# # ------------------------------------------------------------------------------------#

# Methods
1. Logistic_Regression
2. Gaussian_Naive_Bayes_Classifier
3. XGBoost
4. DecisionTree

In [14]:
def evaluation(y_test, predicted):
    accuracy_score(y_test, predicted)
    print(f"Accuracy is {accuracy_score(y_test, predicted)}")
    print(f"F1 score is {f1_score(y_test, predicted)}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predicted))

## Gaussian Naive Bayes Classifier

In [15]:
def Gaussian_Naive_Bayes_Classifier(X_train, y_train, X_test, y_test):
    X_train = X_train.fillna(-1)
    X_test = X_test.fillna(-1)
    y_train= y_train.fillna(0)
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predicted = gnb.predict(X_test)
    evaluation(y_test, predicted)

In [16]:
Gaussian_Naive_Bayes_Classifier(train_data_x,train_data_y,test_data_x,test_data_y)

Accuracy is 0.8228807954871805
F1 score is 0.3106644790812141
Confusion Matrix:
[[297169  45400]
 [ 21824  15148]]


In [17]:
Gaussian_Naive_Bayes_Classifier(train_data_new_x,train_data_new_y,test_data_new_x,test_data_new_y)

Accuracy is 0.8096332148568929
F1 score is 0.2939037976662823
Confusion Matrix:
[[292252  50317]
 [ 21935  15037]]


In [18]:
Gaussian_Naive_Bayes_Classifier(train_data_agg_x,train_data_agg_y,test_data_agg_x,test_data_agg_y)

Accuracy is 0.9135
F1 score is 0.3268482490272374
Confusion Matrix:
[[8925  334]
 [ 531  210]]


In [19]:
Gaussian_Naive_Bayes_Classifier(train_data_new_agg_x,train_data_new_agg_y,test_data_new_agg_x,test_data_new_agg_y)

Accuracy is 0.9135
F1 score is 0.3268482490272374
Confusion Matrix:
[[8925  334]
 [ 531  210]]
