In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 
from tqdm import tqdm
from IPython.display import display, HTML
import preprocess as p
from scipy.stats import ttest_ind
from scipy.stats import f
from pprint import pprint
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier
from math import isnan
import warnings
import optuna
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')
from sklearn.impute import KNNImputer
from impyute.imputation.cs import mice

In [None]:
def load_data(data_path, path_prefix='data/', train=True):
    dataset = 'train/' if train else 'test/'
    file_list = os.listdir(path_prefix + dataset)
    all_patients = []
    for i,file in tqdm(enumerate(file_list)):
        df_all = pd.read_csv(path_prefix + dataset + file, sep='|')
        all_patients.append(df_all)
    return all_patients

In [None]:
def save_model(model, model_name):
    import pickle
    with open(f"{model_name}.pkl", "wb") as f:
        pickle.dump(model, f)

In [None]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
def prepare_df(df, rows_to_include=1):
    
    def stats(df,rows_to_include, future=6):
        is_sick = bool(len(df[df.SepsisLabel == 1]))
        if is_sick:
            index = df[df.SepsisLabel == 1].index[0]
            df = df[:index + future]
        df = df.tail(rows_to_include + future).mean(axis=0)
        return df
    
    labels = df.groupby("level_0").apply(lambda x: len(set(x.SepsisLabel) & set([1]))).rename("level_1")
    df = df.merge(labels, on='level_0')
    df['label'] = df['level_1']
    df = df.drop("level_1", axis=1)
    df_last_row = df.groupby("level_0").apply(lambda x: stats(x, rows_to_include, future = 1)).drop(["level_0", 'SepsisLabel'], axis=1)
    return df_last_row

In [6]:
for num_of_rows in [1,5, 10]:
    
    print(f"imp_type = {imp_type}, num_of_rows = {num_of_rows}")
    train_last_row = prepare_df(train, rows_to_include=num_of_rows)
    test_last_row = prepare_df(test, rows_to_include=num_of_rows)
    print("Datasets Created")
    def objective(trial):
        """Define the objective function"""

        params = {
            'max_depth': trial.suggest_int('max_depth', 1, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
            'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
            'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
            'scale_pos_weight':trial.suggest_int('scale_pos_weight', 1, 10),
            'eval_metric': 'mlogloss',
            'use_label_encoder': False
        }

    #     train_df = pd.concat([train_last_row, test_last_row], axis=0)
        train_df = train_last_row
        test_df = test_last_row

        # Split the train dataset into features and labels
        X_train = train_df.drop("label", axis=1)
        y_train = train_df["label"]
        X_test = test_df.drop("label", axis=1)
        y_test = test_df["label"]

        # Fit the model
        optuna_model = XGBClassifier(**params)
        optuna_model.fit(X_train, y_train)

        # Make predictions
        y_pred = optuna_model.predict(X_test)

        # Evaluate predictions
        f1 = f1_score(y_test, y_pred)
        return f1


    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=300)



    train_last_row = prepare_df(train, fill_nulls=imp_type, rows_to_include=num_of_rows)
    test_last_row = prepare_df(test, fill_nulls=imp_type, rows_to_include=num_of_rows, train_flag=False)
    train_df = train_last_row
    test_df = test_last_row


    X_train = train_df.drop("label", axis=1)
    y_train = train_df["label"]
    X_test = test_df.drop("label", axis=1)
    y_test = test_df["label"]

    # Fit the model
    model = XGBClassifier(**study.best_params)
    model.fit(X_train, y_train)

    from xgboost import plot_importance
    plot_importance(model, max_num_features=10) # top 10 most important features
    plt.show()



NameError: name 'imp_type' is not defined

In [91]:
X_train = train_last_row.drop("label", axis=1)
y_train = train_last_row["label"]
X_test = test_last_row.drop("label", axis=1)
y_test = test_last_row["label"]


# Split data into training and validation sets


def objective(trial):
    # Define hyperparameters to be tuned
    n_estimators = trial.suggest_int("n_estimators", 50, 1050, step=100)
    max_depth = trial.suggest_int("max_depth", 2, 32, log=True)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"])

    class_weights = {0: 1, 1: 5}

    # Define random forest classifier with hyperparameters
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        class_weight=class_weights,
    )

    # Fit classifier to training data
    clf.fit(X_train, y_train)

    # Predict on validation data
    y_pred = clf.predict(X_test)

    # Calculate accuracy score
    f1 = f1_score(y_test, y_pred)

    # Return accuracy score as objective value
    return f1


# Define study object
study = optuna.create_study(direction="maximize")

# Run hyperparameter optimization
study.optimize(objective, n_trials=100)

# Print best hyperparameters and objective value
print("Best hyperparameters: ", study.best_params)
print("Best f1: ", study.best_value)

[32m[I 2023-05-07 15:51:06,647][0m A new study created in memory with name: no-name-3cc491bb-0e55-45cc-af90-7289ecafdc77[0m
[33m[W 2023-05-07 15:51:06,671][0m Trial 0 failed with parameters: {'n_estimators': 750, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_features': 'sqrt'} because of the following error: ValueError("Input contains NaN, infinity or a value too large for dtype('float32').").[0m
Traceback (most recent call last):
  File "C:\Users\ofekg\AppData\Local\Programs\Python\Python39\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ofekg\AppData\Local\Temp\ipykernel_25700\1431491153.py", line 38, in objective
    clf.fit(X_train, y_train)
  File "C:\Users\ofekg\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\ensemble\_forest.py", line 327, in fit
    X, y = self._validate_data(
  File "C:\Users\ofekg\AppData\Local\Programs\Python\Python39\lib\site-packages\sklear

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

# With 5 rows back, best features were:

In [93]:
one_row_back = {'max_depth': 5, 'learning_rate': 0.12637669071933097, 'n_estimators': 199, 'min_child_weight': 7, 'gamma': 8.788101397006915e-06, 'subsample': 0.34961859437458515, 'colsample_bytree': 0.7460989708274746, 'reg_alpha': 1.09797444756061e-06, 'reg_lambda': 3.0861484148146835e-06, 'scale_pos_weight': 2}
five_rows_back = {'max_depth': 6, 'learning_rate': 0.020021827800040026, 'n_estimators': 409, 'min_child_weight': 8, 'gamma': 0.002376046358768467, 'subsample': 0.3267777415133442, 'colsample_bytree': 0.9922434396349483, 'reg_alpha': 0.03345778334794334, 'reg_lambda': 0.00023611208593944376, 'scale_pos_weight': 2}
ten_rows_back = {'max_depth': 4, 'learning_rate': 0.036332387280166745, 'n_estimators': 500, 'min_child_weight': 4, 'gamma': 3.7763230108728382e-06, 'subsample': 0.7222965801713592, 'colsample_bytree': 0.8140034993675944, 'reg_alpha': 2.0147933854605774e-07, 'reg_lambda': 4.2754835795019617e-07, 'scale_pos_weight': 3}

In [13]:
one_row_back = {'max_depth': 5, 'learning_rate': 0.12637669071933097, 'n_estimators': 199, 'min_child_weight': 7, 'gamma': 8.788101397006915e-06, 'subsample': 0.34961859437458515, 'colsample_bytree': 0.7460989708274746, 'reg_alpha': 1.09797444756061e-06, 'reg_lambda': 3.0861484148146835e-06, 'scale_pos_weight': 2}
five_rows_back = {'max_depth': 6, 'learning_rate': 0.020021827800040026, 'n_estimators': 409, 'min_child_weight': 8, 'gamma': 0.002376046358768467, 'subsample': 0.3267777415133442, 'colsample_bytree': 0.9922434396349483, 'reg_alpha': 0.03345778334794334, 'reg_lambda': 0.00023611208593944376, 'scale_pos_weight': 2}
ten_rows_back = {'max_depth': 4, 'learning_rate': 0.036332387280166745, 'n_estimators': 500, 'min_child_weight': 4, 'gamma': 3.7763230108728382e-06, 'subsample': 0.7222965801713592, 'colsample_bytree': 0.8140034993675944, 'reg_alpha': 2.0147933854605774e-07, 'reg_lambda': 4.2754835795019617e-07, 'scale_pos_weight': 3}
for rows, params in [(1, one_row_back), (5, five_rows_back), (10, ten_rows_back)]:
    train_last_row = prepare_df(train, rows_to_include=rows)
    test_last_row = prepare_df(test, rows_to_include=rows)

    X_train = train_last_row.drop("label", axis=1)
    y_train = train_last_row["label"]
    X_test = test_last_row.drop("label", axis=1)
    y_test = test_last_row["label"]

    # Fit the model
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    save_model(model, f"xgboost_{rows}_rows_back")
#     from xgboost import plot_importance
#     plot_importance(model, max_num_features=10) # top 10 most important features
#     plt.show()

# RandomForest

In [15]:
import pandas as pd
from sklearn.impute import KNNImputer

def impute_null_values(train_df, test_df=None, method='mean', k=1):
    """
    Function that imputes null values of a pandas dataframe using mean/median or k-NN imputation.
    
    Parameters:
    train_df (pd.DataFrame): dataframe containing training data with null values to be imputed.
    test_df (pd.DataFrame): dataframe containing test data with null values to be imputed (default=None).
                             If None, only the train set will be imputed.
    method (str): 'mean' or 'median' for mean/median imputation or 'knn' for k-NN imputation (default='mean').
    k (int): number of neighbors to use for k-NN imputation (default=5).
    
    Returns:
    pd.DataFrame: dataframe with imputed null values.
    """
    # Make a copy of the original train dataframe to avoid modifying it.
    imputed_train_df = train_df.copy()
    if method is None:
        return train_df, test_df
    if method == 'mean':
        # Replace null values with the mean of each column from the train set.
        imputed_train_df.fillna(imputed_train_df.mean(), inplace=True)
        
    elif method == 'median':
        # Replace null values with the median of each column from the train set.
        imputed_train_df.fillna(imputed_train_df.median(), inplace=True)
        
    elif method == 'knn':
        # Replace null values with k-NN imputation using the values from the train set.
        imputer = KNNImputer(n_neighbors=k)
        imputed_array = imputer.fit_transform(imputed_train_df)
        imputed_train_df = pd.DataFrame(imputed_array, columns=imputed_train_df.columns)
        
    else:
        raise ValueError("Invalid imputation method. Choose either 'mean', 'median' or 'knn'.")
    
    if test_df is not None:
        # Make a copy of the original test dataframe to avoid modifying it.
        imputed_test_df = test_df.copy()
        
        if method == 'mean':
            # Replace null values with the mean of each column from the train set.
            imputed_test_df.fillna(imputed_train_df.mean(), inplace=True)
        
        elif method == 'median':
            # Replace null values with the median of each column from the train set.
            imputed_test_df.fillna(imputed_train_df.median(), inplace=True)
            
        elif method == 'knn':
            # Replace null values with k-NN imputation using the values from the train set.
            imputed_array = imputer.transform(imputed_test_df)
            imputed_test_df = pd.DataFrame(imputed_array, columns=imputed_test_df.columns)
            
        else:
            raise ValueError("Invalid imputation method. Choose either 'mean', 'median' or 'knn'.")
        
        return imputed_train_df, imputed_test_df
    
    else:
        return imputed_train_df


In [7]:
for imp in ['mean', 'median', 'knn']:
    print("Preparing dataset...")
    train_last_row = prepare_df(train, rows_to_include=5)
    print("Train...Done!")
    test_last_row = prepare_df(test, rows_to_include=5)
    print("Test...Done!")
    print("Imputing train and test...")
    train_imp, test_imp = impute_null_values(train_last_row, test_last_row, imp, k=1)
    print("Imputation Done!")

    X_train = train_imp.drop("label", axis=1)
    y_train = train_imp["label"]
    X_test = test_imp.drop("label", axis=1)
    y_test = test_imp["label"]

    def objective(trial):
        # Define hyperparameters to be tuned
        n_estimators = trial.suggest_int("n_estimators", 50, 1050, step=100)
        max_depth = trial.suggest_int("max_depth", 2, 32, log=True)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
        max_features = trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"])
        sick_weight = trial.suggest_int("sick_weight", 1, 10)
        class_weights = {0: 1, 1: sick_weight}

        # Define random forest classifier with hyperparameters
        clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=42,
            class_weight=class_weights,
        )

        # Fit classifier to training data
        clf.fit(X_train, y_train)

        # Predict on validation data
        y_pred = clf.predict(X_test)

        # Calculate accuracy score
        f1 = f1_score(y_test, y_pred)

        # Return accuracy score as objective value
        return f1


    # Define study object
    study = optuna.create_study(direction="maximize")

    # Run hyperparameter optimization
    study.optimize(objective, n_trials=30)
    
    class_weights = {0: 1, 1: study.best_params.pop('sick_weight')}
    params = {k:v for k,v in study.best_params.items() if k != 'sick_weight'}
    clf = RandomForestClassifier(**params, class_weight=class_weights)
    clf.fit(X_train, y_train)
    save_model(clf, imp+"_RF")
    # Print best hyperparameters and objective value
    print("Best hyperparameters: ", study.best_params)
    print("Best f1: ", study.best_value)

Preparing dataset...


KeyboardInterrupt: 

In [16]:
mean_imp_best = {'n_estimators': 850, 'max_depth': 22, 'min_samples_split': 6, 'min_samples_leaf': 8, 'max_features': 'log2', 'sick_weight': 8}
params = {k:v for k,v in mean_imp_best.items() if k != 'sick_weight'}
median_imp_best = {'n_estimators': 350, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 8, 'max_features': 'log2', 'sick_weight': 5}
params = {k:v for k,v in median_imp_best.items() if k != 'sick_weight'}
knn_imp_best = {'n_estimators': 550, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 7, 'max_features': 'log2', 'sick_weight': 10}
params = {k:v for k,v in knn_imp_best.items() if k != 'sick_weight'}

for imp, params in [('mean', mean_imp_best), ('median', median_imp_best), ('knn', knn_imp_best)]:
    class_weights = {0: 1, 1: params['sick_weight']}

    params = {k:v for k,v in params.items() if k != 'sick_weight'}
    
    print("Preparing dataset...")
    train_last_row = prepare_df(train, rows_to_include=5)
    print("Train...Done!")
    test_last_row = prepare_df(test, rows_to_include=5)
    print("Test...Done!")
    print("Imputing train and test...")
    train_imp, test_imp = impute_null_values(train_last_row, test_last_row, imp, k=1)
    print("Imputation Done!")

    X_train = train_imp.drop("label", axis=1)
    y_train = train_imp["label"]
    X_test = test_imp.drop("label", axis=1)
    y_test = test_imp["label"]
    clf = RandomForestClassifier(**params, class_weight=class_weights)
    clf.fit(X_train, y_train)
    save_model(clf, imp+"_RF")
    print(f"saved model {imp}")

Preparing dataset...
Train...Done!
Test...Done!
Imputing train and test...
Imputation Done!
saved model mean
Preparing dataset...
Train...Done!
Test...Done!
Imputing train and test...
Imputation Done!
saved model median
Preparing dataset...
Train...Done!
Test...Done!
Imputing train and test...
Imputation Done!
saved model knn


In [17]:
for rows in [1,5,10]:
    print("Preparing dataset...")
    train_last_row = prepare_df(train, rows_to_include=rows)
    print("Train...Done!")
    test_last_row = prepare_df(test, rows_to_include=rows)
    print("Test...Done!")
    print("Imputing train and test...")
    train_imp, test_imp = impute_null_values(train_last_row, test_last_row, 'mean', k=1)
    print("Imputation Done!")
    X_train = train_imp.drop(columns=["label"], axis=1)
    y_train = train_imp["label"]
    X_test = test_imp.drop(columns=["label"], axis=1)
    y_test = test_imp["label"]

    def objective(trial):
        # Define hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 50, 500)
        learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 1.0)
        algorithm = trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R'])
        random_state = trial.suggest_int('random_state', 0, 100)

        # Train Adaboost classifier with the given hyperparameters
        model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, 
                                   algorithm=algorithm, random_state=random_state)
        model.fit(X_train, y_train)

        # Evaluate the model on the testing set
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)

        return f1

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=15)
    
    model = AdaBoostClassifier(**study.best_params)
    model.fit(X_train, y_train)
    
    save_model(model, f"rows_{rows}_ADA")
    # Print the best hyperparameters and f1 score
    print("Best hyperparameters: ", study.best_params)
    print("Best f1 score: ", study.best_value)


Preparing dataset...
Train...Done!
Test...Done!
Imputing train and test...


[32m[I 2023-05-09 13:53:00,526][0m A new study created in memory with name: no-name-44fc3536-c6dc-411f-beb3-c69d6179f119[0m


Imputation Done!


[32m[I 2023-05-09 13:53:21,787][0m Trial 0 finished with value: 0.4911955514365152 and parameters: {'n_estimators': 465, 'learning_rate': 0.0011770125849433326, 'algorithm': 'SAMME', 'random_state': 47}. Best is trial 0 with value: 0.4911955514365152.[0m
[32m[I 2023-05-09 13:53:40,503][0m Trial 1 finished with value: 0.4911955514365152 and parameters: {'n_estimators': 376, 'learning_rate': 0.003179947024878677, 'algorithm': 'SAMME', 'random_state': 70}. Best is trial 0 with value: 0.4911955514365152.[0m
[32m[I 2023-05-09 13:54:01,737][0m Trial 2 finished with value: 0.4911955514365152 and parameters: {'n_estimators': 254, 'learning_rate': 0.014386850372288278, 'algorithm': 'SAMME', 'random_state': 44}. Best is trial 0 with value: 0.4911955514365152.[0m
[32m[I 2023-05-09 13:54:35,487][0m Trial 3 finished with value: 0.6570281124497992 and parameters: {'n_estimators': 410, 'learning_rate': 0.42219065420044877, 'algorithm': 'SAMME.R', 'random_state': 37}. Best is trial 3 with v

Best hyperparameters:  {'n_estimators': 410, 'learning_rate': 0.42219065420044877, 'algorithm': 'SAMME.R', 'random_state': 37}
Best f1 score:  0.6570281124497992
Preparing dataset...
Train...Done!
Test...Done!
Imputing train and test...
Imputation Done!


[32m[I 2023-05-09 14:00:38,206][0m A new study created in memory with name: no-name-0befeb8e-dbab-498b-a265-0a442207d524[0m
[32m[I 2023-05-09 14:00:49,160][0m Trial 0 finished with value: 0.4139387539598733 and parameters: {'n_estimators': 184, 'learning_rate': 0.012966626035152036, 'algorithm': 'SAMME', 'random_state': 52}. Best is trial 0 with value: 0.4139387539598733.[0m
[32m[I 2023-05-09 14:00:52,357][0m Trial 1 finished with value: 0.4911955514365152 and parameters: {'n_estimators': 53, 'learning_rate': 0.012930596039955283, 'algorithm': 'SAMME', 'random_state': 40}. Best is trial 1 with value: 0.4911955514365152.[0m
[32m[I 2023-05-09 14:01:05,073][0m Trial 2 finished with value: 0.4911955514365152 and parameters: {'n_estimators': 184, 'learning_rate': 0.0038293197155126924, 'algorithm': 'SAMME', 'random_state': 3}. Best is trial 1 with value: 0.4911955514365152.[0m
[32m[I 2023-05-09 14:01:42,384][0m Trial 3 finished with value: 0.6167097329888028 and parameters: {'

Best hyperparameters:  {'n_estimators': 357, 'learning_rate': 0.6618631252661278, 'algorithm': 'SAMME.R', 'random_state': 95}
Best f1 score:  0.6792452830188679
Preparing dataset...
Train...Done!
Test...Done!
Imputing train and test...
Imputation Done!


[32m[I 2023-05-09 14:07:41,130][0m A new study created in memory with name: no-name-a4919313-461a-4069-8f1c-52990150d443[0m
[32m[I 2023-05-09 14:08:08,046][0m Trial 0 finished with value: 0.4911955514365152 and parameters: {'n_estimators': 327, 'learning_rate': 0.0010055281423484227, 'algorithm': 'SAMME.R', 'random_state': 71}. Best is trial 0 with value: 0.4911955514365152.[0m
[32m[I 2023-05-09 14:08:37,080][0m Trial 1 finished with value: 0.6265664160401003 and parameters: {'n_estimators': 354, 'learning_rate': 0.4537209174164803, 'algorithm': 'SAMME', 'random_state': 42}. Best is trial 1 with value: 0.6265664160401003.[0m
[32m[I 2023-05-09 14:09:12,523][0m Trial 2 finished with value: 0.4975798644724105 and parameters: {'n_estimators': 434, 'learning_rate': 0.01869315268245823, 'algorithm': 'SAMME.R', 'random_state': 76}. Best is trial 1 with value: 0.6265664160401003.[0m
[32m[I 2023-05-09 14:09:19,754][0m Trial 3 finished with value: 0.6502866502866502 and parameters:

Best hyperparameters:  {'n_estimators': 174, 'learning_rate': 0.9761878568519904, 'algorithm': 'SAMME.R', 'random_state': 52}
Best f1 score:  0.6735015772870663
