(update based on suraj's notebook: https://github.com/spass-med/MLA/blob/classification_poc_surajp/classification/03_POC/surajp/0323032709_ExperimentWithLabels.ipynb)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, gc
import sys
from copy import deepcopy
import random

from tqdm import tqdm
from joblib import Parallel, delayed
import multiprocessing

import lightgbm as lgb
import xgboost as xgb
from scipy import sparse
import pickle
# import catboost as ctb

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
    roc_auc_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

# import optuna

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
sys.path.append('/home/daisy/MLA_test/classification/03_POC/cmkim/')

from features import (
    LaggedFeatures,
    LaggedDiffFeatures,
    running_stats,
    fourier_transform,
    lagged_fourier_transform,
)

import torch

# import shap

# set all random seeds
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

tqdm.pandas()
from timeit import default_timer as timer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from typing import List

### 1. Process eICU-CRD

In [2]:
# Define the data directory and file name
DATA_DIR = "/home/daisy/MLA_dataset/"
FILE_NAME = "sepsis_classification_trainDataset_9hrs.parquet.gzip"

# Load the data
labels_df = pd.read_parquet(os.path.join(DATA_DIR, FILE_NAME))
labels_df.head()

Unnamed: 0,groups,patientunitstayid,observationoffset,gcs,systolicbp,diastolicbp,meanbp,pp,heartrate,respiration,...,category3,category4,label,hospitaladmitoffset,gender,age,ethnicity,admissionweight,dischargeweight,admissionheight
0,0,141288.0,360,15.0,109.0,56.0,73.666664,53.0,101.0,21.0,...,sepsis,,sepsis,-171,Female,61,Caucasian,,92.2,162.6
1,0,141288.0,365,15.0,106.0,55.0,72.0,51.0,102.0,19.0,...,sepsis,,sepsis,-171,Female,61,Caucasian,,92.2,162.6
2,0,141288.0,370,15.0,103.0,53.0,69.666664,50.0,104.0,21.0,...,sepsis,,sepsis,-171,Female,61,Caucasian,,92.2,162.6
3,0,141288.0,375,15.0,107.0,55.0,72.333336,52.0,102.0,21.0,...,sepsis,,sepsis,-171,Female,61,Caucasian,,92.2,162.6
4,0,141288.0,380,15.0,104.0,54.0,70.666664,50.0,102.0,18.0,...,sepsis,,sepsis,-171,Female,61,Caucasian,,92.2,162.6


In [3]:
labels_df[labels_df['patientunitstayid'] == 251494]

Unnamed: 0,groups,patientunitstayid,observationoffset,gcs,systolicbp,diastolicbp,meanbp,pp,heartrate,respiration,...,category3,category4,label,hospitaladmitoffset,gender,age,ethnicity,admissionweight,dischargeweight,admissionheight
6588,61,251494.0,3930,12.0,101.0,43.0,62.333332,58.0,109.0,9.0,...,septic shock,,sepsis,-25,Male,82,Caucasian,113.3,115.4,182.9
6589,61,251494.0,3935,12.0,96.0,40.0,58.666668,56.0,113.0,9.0,...,septic shock,,sepsis,-25,Male,82,Caucasian,113.3,115.4,182.9
6590,61,251494.0,3940,12.0,98.0,43.0,61.333332,55.0,124.0,18.0,...,septic shock,,sepsis,-25,Male,82,Caucasian,113.3,115.4,182.9
6591,61,251494.0,3945,12.0,96.0,40.0,58.666668,56.0,116.0,8.0,...,septic shock,,sepsis,-25,Male,82,Caucasian,113.3,115.4,182.9
6592,61,251494.0,3950,12.0,94.0,40.0,58.000000,54.0,117.0,10.0,...,septic shock,,sepsis,-25,Male,82,Caucasian,113.3,115.4,182.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7015,64,251494.0,7020,12.0,97.0,51.0,66.333336,46.0,89.0,14.0,...,septic shock,,sepsis,-25,Male,82,Caucasian,113.3,115.4,182.9
7016,64,251494.0,7025,12.0,98.0,50.0,66.000000,48.0,86.0,14.0,...,septic shock,,sepsis,-25,Male,82,Caucasian,113.3,115.4,182.9
7017,64,251494.0,7030,12.0,102.0,53.0,69.333336,49.0,88.0,16.0,...,septic shock,,sepsis,-25,Male,82,Caucasian,113.3,115.4,182.9
7018,64,251494.0,7035,12.0,98.0,52.0,67.333336,46.0,87.0,14.0,...,septic shock,,sepsis,-25,Male,82,Caucasian,113.3,115.4,182.9


In [4]:
labels_df.columns

Index(['groups', 'patientunitstayid', 'observationoffset', 'gcs', 'systolicbp',
       'diastolicbp', 'meanbp', 'pp', 'heartrate', 'respiration', 'spo2',
       'activeupondischarge', 'diagnosisoffset', 'icd9code',
       'diagnosispriority', 'category1', 'category2', 'category3', 'category4',
       'label', 'hospitaladmitoffset', 'gender', 'age', 'ethnicity',
       'admissionweight', 'dischargeweight', 'admissionheight'],
      dtype='object')

### 3. Relabel data

In [5]:
# Filter the data to include only male and female patients
labels_df["label_binary"] = labels_df["label"].apply(lambda x: 1 if x == "sepsis" else 0)

# filter features based on types
features_offset = ["observationoffset"]
features_vital = [
    "meanbp",
    "heartrate",
    "respiration"
    # 'meanbp_minmaxed_filter',
    # 'heartrate_minmaxed_filter',
    # 'respiration_minmaxed_filter'
]

In [6]:
original_labels = labels_df[["patientunitstayid", "groups", "label_binary"]].copy()

In [7]:
SAMPLING_RATE_MINUTES = 5 # 5 minutes
HOURS_FOR_INPUT_HR = 6 # initial 3 hours of data for input
HOURS_FOR_LABEL_HR = 3 # lastest 1 hour of data for label
PCT_FOR_QSOFA_POSITIVE = 0.3 # 50% of the data should be positive for QSOFA

In [8]:
gc.collect()
input_df = labels_df.groupby("groups").head(HOURS_FOR_INPUT_HR * 60 // SAMPLING_RATE_MINUTES).copy()
target_df = labels_df.groupby("groups").tail(HOURS_FOR_LABEL_HR * 60 // SAMPLING_RATE_MINUTES).copy()

In [9]:
def calculate_qsofa(row):
    """
    Calculate qsofa score for a given row
    """
    qsofa_score = 0
    if row["respiration"] >= 22:
        qsofa_score += 1
    if row["systolicbp"] < 100:
        qsofa_score += 1
    if row["gcs"] < 15:
        qsofa_score += 1
    return qsofa_score

def calculate_qsofa_label(row):
    """
    Calculate qsofa label for a given row
    """
    qsofa_score = calculate_qsofa(row)
    if qsofa_score >= 2:
        return 1
    else:
        return 0

target_df.loc[:, "label_qsofa"] = target_df.progress_apply(calculate_qsofa_label, axis=1)

  0%|          | 0/282708 [00:00<?, ?it/s]

100%|██████████| 282708/282708 [00:02<00:00, 97820.45it/s] 


In [10]:
new_labels = (target_df.groupby("groups")["label_qsofa"].mean() >= PCT_FOR_QSOFA_POSITIVE).astype(int).rename("new_labels").reset_index()

In [11]:
original_labels = original_labels.merge(new_labels, on="groups", how="left")
input_df = input_df.merge(new_labels, on="groups", how="left")

In [12]:
exact_groups = input_df.groupby("groups").apply(
    lambda x: (x["label_binary"] == x["new_labels"]).mean()
)

exact_groups = exact_groups[exact_groups == 1].index.tolist()
print(len(exact_groups))
input_df = input_df[input_df["groups"].isin(exact_groups)].copy()

4802


In [13]:
input_df[input_df['patientunitstayid'] == 543281]

Unnamed: 0,groups,patientunitstayid,observationoffset,gcs,systolicbp,diastolicbp,meanbp,pp,heartrate,respiration,...,label,hospitaladmitoffset,gender,age,ethnicity,admissionweight,dischargeweight,admissionheight,label_binary,new_labels
255888,3554,543281.0,3120,11.0,115.0,47.0,69.666664,68.0,62.0,18.0,...,other,-61,Female,70,Caucasian,72.7,70.0,165.1,0,0
255889,3554,543281.0,3125,15.0,115.0,48.0,70.333336,67.0,62.0,20.0,...,other,-61,Female,70,Caucasian,72.7,70.0,165.1,0,0
255890,3554,543281.0,3130,15.0,114.0,48.0,70.000000,66.0,62.0,18.0,...,other,-61,Female,70,Caucasian,72.7,70.0,165.1,0,0
255891,3554,543281.0,3135,15.0,122.0,57.0,78.666664,65.0,64.0,18.0,...,other,-61,Female,70,Caucasian,72.7,70.0,165.1,0,0
255892,3554,543281.0,3140,15.0,126.0,55.0,78.666664,71.0,65.0,27.0,...,other,-61,Female,70,Caucasian,72.7,70.0,165.1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255955,3554,543281.0,3455,15.0,108.0,53.0,71.333336,55.0,68.0,23.0,...,other,-61,Female,70,Caucasian,72.7,70.0,165.1,0,0
255956,3554,543281.0,3460,15.0,110.0,49.0,69.333336,61.0,65.0,18.0,...,other,-61,Female,70,Caucasian,72.7,70.0,165.1,0,0
255957,3554,543281.0,3465,15.0,100.0,47.0,64.666664,53.0,64.0,17.0,...,other,-61,Female,70,Caucasian,72.7,70.0,165.1,0,0
255958,3554,543281.0,3470,15.0,97.0,44.0,61.666668,53.0,63.0,15.0,...,other,-61,Female,70,Caucasian,72.7,70.0,165.1,0,0


In [14]:
# feature generation
# lagged feature geenration; here LaggedDiffFeatures is used to generate lagged features
# LaggedDiffFeatures calculates difference between current and previous values

periods = []

# using all periods
periods = list(range(2, SAMPLING_RATE_MINUTES * HOURS_FOR_INPUT_HR, 12))
# parallalize feature generation on grouped_by_patient

num_cores = multiprocessing.cpu_count()
print(f"Using {num_cores} cores")


def extract_features(df):
    # return feature_extractor.transform(df)

    return pd.concat(
        objs=[
            running_stats(df, features_vital, periods=periods),
            LaggedDiffFeatures(df, features_vital, periods=periods),
            lagged_fourier_transform(df, features_vital, periods=periods),
        ],
        axis=1,
    )


grouped_by_patient = input_df.sort_values(
    ["patientunitstayid"]
).groupby("groups")

features = Parallel(n_jobs=num_cores)(
    delayed(extract_features)(df) for _, df in tqdm(grouped_by_patient)
)
features_df = pd.concat(features, axis=0)
feature_names_generated = features_df.columns.tolist()

# combining features to labels_df
input_df = pd.concat([input_df, features_df], axis=1)

Using 64 cores


  0%|          | 0/4802 [00:00<?, ?it/s]

100%|██████████| 4802/4802 [00:16<00:00, 289.18it/s]


In [15]:
input_df[input_df['patientunitstayid'] == 543281]

Unnamed: 0,groups,patientunitstayid,observationoffset,gcs,systolicbp,diastolicbp,meanbp,pp,heartrate,respiration,...,feature_lagged_diff_respiration_26,feature_lagged_fourier_meanbp_2,feature_lagged_fourier_meanbp_14,feature_lagged_fourier_meanbp_26,feature_lagged_fourier_heartrate_2,feature_lagged_fourier_heartrate_14,feature_lagged_fourier_heartrate_26,feature_lagged_fourier_respiration_2,feature_lagged_fourier_respiration_14,feature_lagged_fourier_respiration_26
255888,3554,543281.0,3120,11.0,115.0,47.0,69.666664,68.0,62.0,18.0,...,-3.0,7.333336,27.565252,,1.0,11.625520,,4.0,48.658530,
255889,3554,543281.0,3125,15.0,115.0,48.0,70.333336,67.0,62.0,20.0,...,7.0,8.666664,41.547917,,3.0,7.829475,,9.0,30.586605,
255890,3554,543281.0,3130,15.0,114.0,48.0,70.000000,66.0,62.0,18.0,...,0.0,22.000000,20.199895,,14.0,1.312669,,14.0,18.615005,
255891,3554,543281.0,3135,15.0,122.0,57.0,78.666664,65.0,64.0,18.0,...,2.0,10.666664,40.652329,43.652711,6.0,8.756708,20.839385,1.0,31.644388,61.228074
255892,3554,543281.0,3140,15.0,126.0,55.0,78.666664,71.0,65.0,27.0,...,10.0,3.666664,4.574117,,3.0,4.790417,,5.0,5.648435,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255955,3554,543281.0,3455,15.0,108.0,53.0,71.333336,55.0,68.0,23.0,...,,2.999996,,,1.0,,,2.0,,
255956,3554,543281.0,3460,15.0,110.0,49.0,69.333336,61.0,65.0,18.0,...,,9.333332,,,4.0,,,6.0,,
255957,3554,543281.0,3465,15.0,100.0,47.0,64.666664,53.0,64.0,17.0,...,,10.000000,,,2.0,,,1.0,,
255958,3554,543281.0,3470,15.0,97.0,44.0,61.666668,53.0,63.0,15.0,...,,17.333332,,,6.0,,,7.0,,


In [16]:
filter_df = input_df.groupby("groups").tail(1).copy()

In [17]:
filter_df

Unnamed: 0,groups,patientunitstayid,observationoffset,gcs,systolicbp,diastolicbp,meanbp,pp,heartrate,respiration,...,feature_lagged_diff_respiration_26,feature_lagged_fourier_meanbp_2,feature_lagged_fourier_meanbp_14,feature_lagged_fourier_meanbp_26,feature_lagged_fourier_heartrate_2,feature_lagged_fourier_heartrate_14,feature_lagged_fourier_heartrate_26,feature_lagged_fourier_respiration_2,feature_lagged_fourier_respiration_14,feature_lagged_fourier_respiration_26
143,1,141515.0,2235,6.0,102.0,50.0,67.333336,52.0,78.0,22.0,...,,2.000000,,,0.0,,,0.0,,
215,2,141515.0,3165,7.0,104.0,56.0,72.000000,48.0,142.0,22.0,...,0.0,2.000000,21.410996,,0.0,29.807692,,0.0,0.000000,
287,3,155740.0,9275,11.0,98.0,50.0,66.000000,48.0,82.0,20.0,...,,3.333336,,,0.0,,,1.0,,
431,5,156331.0,22465,14.0,130.0,68.0,88.666664,62.0,106.0,24.0,...,,4.666672,,,4.0,,,6.0,,
791,10,157644.0,8185,15.0,98.0,50.0,66.000000,48.0,110.0,25.0,...,,3.333332,,,0.0,,,9.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564839,7844,3352445.0,3455,3.0,125.0,59.0,81.000000,66.0,101.0,20.0,...,0.0,1.666664,34.939583,78.383578,7.0,36.658139,82.067464,0.0,0.000000,0.000000
565055,7847,3352618.0,9230,11.0,144.0,72.0,96.000000,72.0,114.0,14.0,...,,4.666664,,,9.0,,,3.0,,
565199,7849,3353251.0,2470,3.0,120.0,60.0,80.000000,60.0,67.0,15.0,...,-3.0,2.000000,38.197352,23.494226,1.0,28.483292,35.853984,0.0,19.062421,82.743207
565271,7850,3353251.0,3895,6.0,137.0,51.0,79.666664,86.0,83.0,18.0,...,,,,,,,,,,


In [18]:
set(filter_df[filter_df['new_labels'] == 1]['patientunitstayid'].unique()).intersection(set(filter_df[filter_df['new_labels'] == 0]['patientunitstayid'].unique()))

set()

In [19]:
set(filter_df[filter_df['new_labels'] == 0]['patientunitstayid'].unique()).intersection(set(filter_df[filter_df['new_labels'] == 1]['patientunitstayid'].unique()))

set()

### Impute/Drop missing values

In [20]:
feature_names = features_vital + feature_names_generated

In [21]:
data = filter_df[feature_names]
missing = data.isnull().sum().sort_values(ascending = False)/data.shape[0]
missing[0:30]

feature_skew_heartrate_2                 1.000000
feature_kurt_meanbp_2                    1.000000
feature_kurt_respiration_2               1.000000
feature_skew_respiration_2               1.000000
feature_kurt_heartrate_2                 1.000000
feature_skew_meanbp_2                    1.000000
feature_lagged_fourier_respiration_26    0.696168
feature_lagged_fourier_meanbp_26         0.696168
feature_lagged_fourier_heartrate_26      0.696168
feature_lagged_fourier_heartrate_14      0.479592
feature_lagged_fourier_meanbp_14         0.479592
feature_lagged_fourier_respiration_14    0.479592
feature_lagged_diff_respiration_26       0.476885
feature_lagged_diff_heartrate_26         0.476885
feature_lagged_diff_meanbp_26            0.476885
feature_skew_heartrate_26                0.475010
feature_mean_heartrate_26                0.475010
feature_std_heartrate_26                 0.475010
feature_min_heartrate_26                 0.475010
feature_max_heartrate_26                 0.475010


In [22]:
del_cols = missing[missing>0.4].index.tolist()
filter_df.drop(del_cols, axis = 1, inplace = True)
feature_names = list(set(feature_names) - set(del_cols))
filter_df[feature_names].isnull().sum().sum()

36312

In [23]:
features_filter = filter_df[feature_names+['new_labels']].corr()['new_labels']
feature_names_select = features_filter[(features_filter.abs() >= 0.2) & (features_filter.abs() < 1)].index.tolist()

In [24]:
feature_names_select

['feature_min_heartrate_14',
 'feature_mean_heartrate_14',
 'feature_max_heartrate_2',
 'feature_min_heartrate_2',
 'feature_median_respiration_2',
 'feature_median_heartrate_14',
 'feature_mean_heartrate_2',
 'feature_mean_respiration_2',
 'feature_min_respiration_2',
 'feature_median_respiration_14',
 'feature_max_heartrate_14',
 'heartrate',
 'feature_mean_respiration_14',
 'feature_max_respiration_14',
 'feature_min_respiration_14',
 'feature_median_heartrate_2',
 'feature_max_respiration_2',
 'respiration']

In [25]:
import time
def model_results(model):
    model = get_pipeline(
        feature_names_select,
        model 
    )

    cv_scores = evaluate_model(
            model,
            X_train,
            y_train,
            X_test,
            y_test,
            feature_names_select
    )
    print_scores(cv_scores)

    start_time = time.time()
    model.fit(X_train[feature_names_select], y_train)
    end_time = time.time()
    execution_time = end_time - start_time
    print("Execution time:", execution_time, "seconds")

    # predict on test set
    y_pred = model.predict(X_test[feature_names_select])

    # save classification report as csv
    report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    print(report_df)

In [26]:
def print_scores(scores):
    print(
        "Train AUC: {:.3f} +/- {:.3f}".format(
            np.mean(scores["train_roc_auc"]), np.std(scores["train_roc_auc"])
        )
    )
    print(
        "Test AUC: {:.3f} +/- {:.3f}".format(
            np.mean(scores["test_roc_auc"]), np.std(scores["test_roc_auc"])
        )
    )
    print(
        "Train Precision: {:.3f} +/- {:.3f}".format(
            np.mean(scores["train_precision"]), np.std(scores["train_precision"])
        )
    )
    print(
        "Test Precision: {:.3f} +/- {:.3f}".format(
            np.mean(scores["test_precision"]), np.std(scores["test_precision"])
        )
    )
    print(
        "Train Recall: {:.3f} +/- {:.3f}".format(
            np.mean(scores["train_recall"]), np.std(scores["train_recall"])
        )
    )
    print(
        "Test Recall: {:.3f} +/- {:.3f}".format(
            np.mean(scores["test_recall"]), np.std(scores["test_recall"])
        )
    )
    print(
        "Train F1: {:.3f} +/- {:.3f}".format(
            np.mean(scores["train_f1"]), np.std(scores["train_f1"])
        )
    )
    print(
        "Test F1: {:.3f} +/- {:.3f}".format(
            np.mean(scores["test_f1"]), np.std(scores["test_f1"])
        )
    )


def check_splits(skf, X_train, y_train):
    for train_idx, test_idx in skf.split(
        X_train, y_train, X_train["patientunitstayid"]
    ):
        train_patients = X_train.iloc[train_idx]["patientunitstayid"].unique()
        test_patient = X_train.iloc[test_idx]["patientunitstayid"].unique()

        train_groups = X_train.iloc[train_idx]["groups"].unique()
        test_groups = X_train.iloc[test_idx]["groups"].unique()

        common_patients = set(train_patients).intersection(set(test_patient))
        print("Common patients: {}".format(len(common_patients)))

        common_groups = set(train_groups).intersection(set(test_groups))
        print("Common groups: {}".format(len(common_groups)))

        # ratio of labels in train and test
        print(
            "Train label ratio: {}".format(
                y_train.iloc[train_idx].value_counts(normalize=True)
            )
        )
        print(
            "Test label ratio: {}".format(
                y_train.iloc[test_idx].value_counts(normalize=True)
            )
        )

        print("-----------------------")


def split_based_on_group(
    filter_df, target_name, group_col, test_size=0.2, random_state=42, n_splits=1
):
    splitter = GroupShuffleSplit(
        test_size=test_size, n_splits=n_splits, random_state=random_state
    )
    split = splitter.split(
        filter_df, filter_df[target_name], groups=filter_df[group_col]
    )

    train_inds, test_inds = next(split)

    train = filter_df.iloc[train_inds]
    test = filter_df.iloc[test_inds]

    X_train, y_train, X_test, y_test = (
        train,
        train[target_name],
        test,
        test[target_name],
    )

    return X_train, y_train, X_test, y_test


In [30]:
X_train, y_train, X_test, y_test = split_based_on_group(
    filter_df, 'new_labels', "patientunitstayid"
)

In [23]:
def evaluate_model(
    model, X_train, y_train, X_test, y_test, feature_names_select
): #columns_to_encode

    skf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

    scoring = {
        "precision": "precision",
        "recall": "recall",
        "f1": "f1",
        "roc_auc": "roc_auc",
    }

    scores = cross_validate(
        model,
        X_train[feature_names_select],
        y_train,
        cv=skf.split(X_train, y_train, X_train["patientunitstayid"]),
        scoring=scoring,
        n_jobs=-1,
        return_train_score=True,
    )

    return scores


In [24]:
def get_pipeline(feature_names_numeric, model):#, feature_names_categorical):

    model = Pipeline(
        steps=[
            (
                "preprocessor",
                ColumnTransformer(
                    transformers=[
                        ("numeric", "passthrough", feature_names_numeric),  # no scaling
                        # (
                        #     "categorical",
                        #     OneHotEncoder(handle_unknown="ignore"),
                        #     feature_names_categorical,
                        # ),
                    ]
                ),
            ),
            (
                "classifier",
                model,
            ),
        ]
    )

    return model

In [25]:
model_baseline_gbdt = lgb.LGBMClassifier(
        objective="binary",
        metric="auc",
        boosting_type="gbdt",
        max_depth=4,
        reg_alpha=10,
        reg_lambda=10,
        unbalance=True,
        subsample=0.2,
        colsample_bytree=0.2,
        n_estimators=400,
        num_leaves=30,
        n_jobs=-1,
        random_state=42,
    )

In [26]:
model_results(model_baseline_gbdt)

Train AUC: 0.883 +/- 0.004
Test AUC: 0.835 +/- 0.020
Train Precision: 0.643 +/- 0.012
Test Precision: 0.596 +/- 0.037
Train Recall: 0.815 +/- 0.010
Test Recall: 0.758 +/- 0.042
Train F1: 0.718 +/- 0.006
Test F1: 0.667 +/- 0.037
Execution time: 0.5263066291809082 seconds
              precision    recall  f1-score     support
0              0.906303  0.800000  0.849840  665.000000
1              0.596970  0.781746  0.676976  252.000000
accuracy       0.794984  0.794984  0.794984    0.794984
macro avg      0.751636  0.790873  0.763408  917.000000
weighted avg   0.821296  0.794984  0.802336  917.000000


### Save the model

In [27]:
# model_baseline_gbdt.fit(X_train[feature_names_select], y_train)
# filename = f"/data/public/MLA/share/MLA_interns/pipeline/models/sepsis_classification_traintest_LGBM.sav"
# pickle.dump(model_baseline_gbdt, open(filename, 'wb'))

### Select patients according to test result

In [28]:
model_baseline_gbdt = pickle.load(open('/data/public/MLA/share/MLA_interns/pipeline/models/sepsis_classification_traintest_LGBM.sav', 'rb'))

In [31]:
y_septic_pred = model_baseline_gbdt.predict(X_test[feature_names_select])
X_test['y_pred'] = y_septic_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['y_pred'] = y_septic_pred


In [34]:
X_test['y_pred_prob'] = model_baseline_gbdt.predict_proba(X_test[feature_names_select])[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['y_pred_prob'] = model_baseline_gbdt.predict_proba(X_test[feature_names_select])[:,1]


In [35]:
X_test[(X_test['new_labels'] == X_test['y_pred'])&(X_test['y_pred'] == 1)][['patientunitstayid','groups','y_pred_prob']]

Unnamed: 0,patientunitstayid,groups,y_pred_prob
2015,171473.0,27,0.883938
2951,198697.0,40,0.542629
3023,206759.0,41,0.823471
4751,251926.0,65,0.824542
4823,251926.0,66,0.526681
...,...,...,...
194111,3326839.0,2695,0.880911
195695,3330465.0,2717,0.716060
196199,3333485.0,2724,0.705257
198503,3347760.0,2756,0.813649


In [36]:
X_test[(X_test['new_labels'] == X_test['y_pred'])&(X_test['y_pred'] == 0)][['patientunitstayid','groups','y_pred_prob']]

Unnamed: 0,patientunitstayid,groups,y_pred_prob
202679,164380.0,2814,0.209093
202823,165335.0,2816,0.429192
205199,182368.0,2849,0.208099
206999,195726.0,2874,0.486383
209807,212995.0,2913,0.476144
...,...,...,...
558575,3345238.0,7757,0.397967
562895,3351176.0,7817,0.398858
562967,3351211.0,7818,0.419577
564407,3352068.0,7838,0.449263


In [43]:
DATA_DIR = "/home/daisy/MLA_dataset/"
FILE_NAME = "sepsis_classification_trainDataset_9hrs.parquet.gzip"

# Load the data
tmp_df = pd.read_parquet(os.path.join(DATA_DIR, FILE_NAME))

In [59]:
X_test[X_test['groups'] == 7817]

Unnamed: 0,groups,patientunitstayid,observationoffset,gcs,systolicbp,diastolicbp,meanbp,pp,heartrate,respiration,...,feature_lagged_diff_meanbp_14,feature_lagged_diff_heartrate_2,feature_lagged_diff_heartrate_14,feature_lagged_diff_respiration_2,feature_lagged_diff_respiration_14,feature_lagged_fourier_meanbp_2,feature_lagged_fourier_heartrate_2,feature_lagged_fourier_respiration_2,y_pred,y_pred_prob
562895,7817,3351176.0,9370,14.0,168.0,58.0,94.666664,110.0,60.0,15.0,...,13.333328,0.0,0.0,1.0,2.0,0.333336,0.0,3.0,0,0.398858


In [61]:
tmp_df[tmp_df['groups'] == 7817]

Unnamed: 0,groups,patientunitstayid,observationoffset,gcs,systolicbp,diastolicbp,meanbp,pp,heartrate,respiration,...,category3,category4,label,hospitaladmitoffset,gender,age,ethnicity,admissionweight,dischargeweight,admissionheight
844236,7817,3351176.0,9015,14.0,137.0,47.0,77.000000,90.0,60.0,13.0,...,s/p CABG < 7 days,,other,-5909,Male,80,African American,103.0,103.3,160.0
844237,7817,3351176.0,9020,14.0,142.0,49.0,80.000000,93.0,60.0,13.0,...,s/p CABG < 7 days,,other,-5909,Male,80,African American,103.0,103.3,160.0
844238,7817,3351176.0,9025,14.0,147.0,52.0,83.666664,95.0,60.0,13.0,...,s/p CABG < 7 days,,other,-5909,Male,80,African American,103.0,103.3,160.0
844239,7817,3351176.0,9030,14.0,150.0,52.0,84.666664,98.0,60.0,13.0,...,s/p CABG < 7 days,,other,-5909,Male,80,African American,103.0,103.3,160.0
844240,7817,3351176.0,9035,14.0,149.0,53.0,85.000000,96.0,60.0,11.0,...,s/p CABG < 7 days,,other,-5909,Male,80,African American,103.0,103.3,160.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844339,7817,3351176.0,9530,14.0,167.0,57.0,93.666664,110.0,65.0,13.0,...,s/p CABG < 7 days,,other,-5909,Male,80,African American,103.0,103.3,160.0
844340,7817,3351176.0,9535,14.0,165.0,56.0,92.333336,109.0,65.0,13.0,...,s/p CABG < 7 days,,other,-5909,Male,80,African American,103.0,103.3,160.0
844341,7817,3351176.0,9540,14.0,170.0,58.0,95.333336,112.0,65.0,13.0,...,s/p CABG < 7 days,,other,-5909,Male,80,African American,103.0,103.3,160.0
844342,7817,3351176.0,9545,14.0,164.0,57.0,92.666664,107.0,66.0,14.0,...,s/p CABG < 7 days,,other,-5909,Male,80,African American,103.0,103.3,160.0


### Dropped

In [66]:
# DATA_DIR = "/data/public/MLA/share/MLA_interns/pipeline"
# FILE_NAME = "demo_data.csv" # get from notebook classification_trainDataset

# # Load the data
# demo_df = pd.read_csv(os.path.join(DATA_DIR, FILE_NAME)).reset_index(drop=True)
# #164380:2814 (20%), 171473:27(88%)
# new_data = pd.concat([tmp_df[tmp_df['groups'] == 7817].iloc[:72,:],tmp_df[tmp_df['groups'] == 27].iloc[72:,:]])
# new_data['patientunitstayid'] = [123456]*len(new_data)
# new_data['groups'] = [7817]*len(new_data)
# new_data['observationoffset'] = list(range(9015, 9555, 5))
# demo_df_3 = pd.concat([demo_df, new_data])
# demo_df_3.to_csv('/data/public/MLA/share/MLA_interns/pipeline/demo_data.csv', index = False)

In [67]:
# DATA_DIR = "/data/public/MLA/share/MLA_interns/pipeline"
# FILE_NAME = "demo_data.csv" # get from notebook classification_trainDataset

# # Load the data
# pd.read_csv(os.path.join(DATA_DIR, FILE_NAME)).reset_index(drop=True)

Unnamed: 0,groups,patientunitstayid,observationoffset,gcs,systolicbp,diastolicbp,meanbp,pp,heartrate,respiration,...,category3,category4,label,hospitaladmitoffset,gender,age,ethnicity,admissionweight,dischargeweight,admissionheight
0,29,269986.0,905,10.0,133.0,42.0,72.333336,91.0,70.0,16.0,...,septic shock,,septic,,,,,,,
1,29,269986.0,910,10.0,113.0,39.0,63.666668,74.0,69.0,16.0,...,septic shock,,septic,,,,,,,
2,29,269986.0,915,10.0,99.0,38.0,58.333332,61.0,69.0,15.0,...,septic shock,,septic,,,,,,,
3,29,269986.0,920,10.0,98.0,37.0,57.333332,61.0,68.0,16.0,...,septic shock,,septic,,,,,,,
4,29,269986.0,925,10.0,98.0,38.0,58.000000,60.0,68.0,16.0,...,septic shock,,septic,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,7817,123456.0,9530,11.0,123.0,67.0,85.666664,56.0,123.0,28.0,...,sepsis,,sepsis,-2.0,Female,58.0,African American,63.5,72.7,165.1
428,7817,123456.0,9535,11.0,124.0,66.0,85.333336,58.0,123.0,28.0,...,sepsis,,sepsis,-2.0,Female,58.0,African American,63.5,72.7,165.1
429,7817,123456.0,9540,11.0,150.0,81.0,104.000000,69.0,132.0,29.0,...,sepsis,,sepsis,-2.0,Female,58.0,African American,63.5,72.7,165.1
430,7817,123456.0,9545,11.0,128.0,71.0,90.000000,57.0,128.0,40.0,...,sepsis,,sepsis,-2.0,Female,58.0,African American,63.5,72.7,165.1
