In [None]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics

In [None]:
# Get the features of the training set
def get_features(filename):
    """Read a CSV from the current directory, print a preview, and return it.

    Args:
        filename: File name appended to './' to form the path that is read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    csv_path = './' + filename
    frame = pd.read_csv(csv_path)
    # Show the first 12 rows as a quick sanity check of the parsed file.
    preview = frame.head(12)
    print(preview)
    return frame

# Get the labels of the training set
def get_labels(filename):
    """Read a label CSV from the current directory, print a preview, and return it.

    Args:
        filename: File name appended to './' to form the path that is read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    label_frame = pd.read_csv('./' + filename)
    # Preview of the first 12 rows.
    print(label_frame.head(12))
    return label_frame

# Get the features of the testing set
def get_test_feature(filename):
    """Read a test-feature CSV from the current directory, print a preview, and return it.

    Args:
        filename: File name appended to './' to form the path that is read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    source = './' + filename
    loaded = pd.read_csv(source)
    print(loaded.head(12))  # first 12 rows only
    return loaded

# Load the three input tables from the working directory.
train_features = get_features('train_features.csv')
labels = get_labels('train_labels.csv')
test_features = get_test_feature('test_features.csv')

# Report (rows, columns) of each table, in the same order as they were loaded.
for loaded_frame in (train_features, labels, test_features):
    print(loaded_frame.shape)

In [None]:
# Remove the 'pid' and 'Time' columns from both feature tables.
# NOTE: this cell reassigns its own inputs, so running it a second time raises
# KeyError because the columns are already gone.
train_features = train_features.drop(['pid', 'Time'], axis=1)
test_features = test_features.drop(['pid', 'Time'], axis=1)

In [None]:
# Display the first 12 rows of the training features after the column drop.
train_features.head(12)

In [None]:
# Average multiple tests on one patient in the training set.
# Rows are grouped positionally into consecutive chunks of 12 ('pid' was dropped
# earlier), which assumes every patient contributes exactly 12 consecutive rows
# -- TODO confirm against the raw data.
average_train_features = train_features.groupby(np.arange(len(train_features))//12).mean()
print(average_train_features.shape)

# Check now how many are still NaN (one count printed per column).
# DataFrame.iteritems() was removed in pandas 2.0, so iterate over the column
# labels directly; this works on every pandas version.
for index in average_train_features.columns:
    print(average_train_features[index].isnull().sum())
average_train_features.head(12)

In [None]:
# Average multiple tests on one patient in the test set (no imputation happens
# in this cell). Rows are grouped positionally into consecutive chunks of 12,
# which assumes every patient contributes exactly 12 consecutive rows -- TODO confirm.
average_test_features = test_features.groupby(np.arange(len(test_features))//12).mean()
print(average_test_features.shape)

# Check now how many are still NaN (one count printed per column).
# DataFrame.iteritems() was removed in pandas 2.0, so iterate over the column
# labels directly; this works on every pandas version.
for index in average_test_features.columns:
    print(average_test_features[index].isnull().sum())
average_test_features.head(12)

In [None]:
'''
Since HistGradientBoosting can handle missing data natively and performs better, skip the imputation part and
jump directly to the cell without imputation, "# Not imputed".
'''

In [None]:
# Use KNN to impute data
from sklearn.impute import KNNImputer

In [None]:
# Impute the training features
# The imputer is fitted here and reused for the test set in the next cell.
imputer = KNNImputer(n_neighbors=1000)
imputed = imputer.fit_transform(average_train_features)

# fit_transform returns a bare array, so re-attach the original column labels.
imputed_train_features = pd.DataFrame(
    data=imputed,
    columns=average_train_features.columns,
)

imputed_train_features.head(12)

In [None]:
# Also impute the test features
# Only transform() is called: the neighbours come from the imputer already
# fitted on the training features.
imputed_test = imputer.transform(average_test_features)

# Re-attach the column labels to the returned array.
imputed_test_features = pd.DataFrame(
    data=imputed_test,
    columns=average_test_features.columns,
)

imputed_test_features.head(12)

In [None]:
# Number of NaN in train_features
# Prints the shape first, then the per-column count of missing values left
# after imputation.
print(imputed_train_features.shape)
print(imputed_train_features.isnull().sum())

In [None]:
# Number of NaN in the test_features
# Prints the shape first, then the per-column count of missing values left
# after imputation.
print(imputed_test_features.shape)
print(imputed_test_features.isnull().sum())

In [None]:
# Check the labels
# Shape of the label table, followed by a preview of its first 12 rows.
print(labels.shape)
labels.head(12)

In [None]:
# # Delete the columns with too many NaN
# for index, row in average_features.iteritems():
#     if average_features[index].isnull().sum() >= 0.7 * len(average_features[index]):
#         average_features = average_features.drop(columns=index)
#         average_test_features = average_test_features.drop(columns=index)

In [None]:
# Imputed
# Use the KNN-imputed frames as model input.
# NOTE: the "# Not imputed" cell further down reassigns X_train/X_test, so on a
# top-to-bottom run this selection is overridden.
X_train = imputed_train_features
X_test = imputed_test_features

In [None]:
'''
START FROM HERE WITHOUT IMPUTATION
'''

In [None]:
# Not imputed
# Use the per-patient averaged frames directly as model input; they may still
# contain NaN (see the NaN counts printed when they were built).
X_train = average_train_features
X_test = average_test_features

In [None]:
# Label columns fitted with a classifier (probability output) in the prediction loop.
medical_tests = [
    'LABEL_BaseExcess',
    'LABEL_Fibrinogen',
    'LABEL_AST',
    'LABEL_Alkalinephos',
    'LABEL_Bilirubin_total',
    'LABEL_Lactate',
    'LABEL_TroponinI',
    'LABEL_SaO2',
    'LABEL_Bilirubin_direct',
    'LABEL_EtCO2',
]
# Also fitted with a classifier; kept as its own group.
disease = ['LABEL_Sepsis']
# Label columns fitted with a regressor.
vital_signs = [
    'LABEL_RRate',
    'LABEL_ABPm',
    'LABEL_SpO2',
    'LABEL_Heartrate',
]

In [None]:
# Standardize the training data
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the training features and test features
# The scaler learns its statistics from the training features only; the same
# fitted scaler is then applied to both sets.
ss = StandardScaler().fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

In [None]:
# Import validation methods
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

In [None]:
# Histogram-Based Gradient Boosting Classifier and Regressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
# Fit one model per label column and collect its predictions for the test set.
# Columns of `labels` that appear in none of the three groups are skipped.
output_dict = {}
# The lab-test labels and the sepsis label are handled identically, so merge them.
classification_targets = set(medical_tests + disease)
# Iterate the column labels in file order; DataFrame.iteritems() was removed in
# pandas 2.0 and the column values were never used by the loop anyway.
for index in labels.columns:
    if index in classification_targets:
        hgbC = HistGradientBoostingClassifier()
        y = labels[index]
        fit = hgbC.fit(X_train, y)
        # Keep column 1 of predict_proba as the predicted probability.
        proba = fit.predict_proba(X_test)[:, 1]
        output_dict[index] = proba
    elif index in vital_signs:
        hgbR = HistGradientBoostingRegressor()
        y = labels[index]
        fit = hgbR.fit(X_train, y)
        values = fit.predict(X_test)
        output_dict[index] = values
    else:
        continue
    # Short progress line instead of dumping every prediction array collected so far.
    print('Fitted', index)
output = pd.DataFrame(output_dict)

In [None]:
# Write the predictions to CSV without the index, floats formatted with 3 decimals.
output.to_csv('prediction_9.csv', index=False, float_format='%.3f')

In [None]:
# Unfitted classifier with constructor defaults; passed to cross_validate in the
# classification scoring cell below.
hgbC = HistGradientBoostingClassifier()

In [None]:
# Unfitted regressor with constructor defaults; passed to cross_validate in the
# regression scoring cell below.
hgbR = HistGradientBoostingRegressor()

In [None]:
# 10-fold cross-validation of the classifier on every classification label.
for item in medical_tests + disease:
    y = labels[item]
    score = cross_validate(hgbC, X_train, y, cv=10, scoring=('roc_auc', 'accuracy'), return_train_score=True)
    # Each value in `score` holds one number per fold for that entry (timings and
    # metrics); reduce every entry to its mean over the folds.
    avgScore = {k: sum(v) / float(len(v)) for k, v in score.items()}
    print(avgScore)

In [None]:
# 10-fold cross-validation of the regressor on every vital-sign label.
for item in vital_signs:
    y = labels[item]
    # ('r2') in the original is just the string 'r2', not a tuple.
    regression_score = cross_validate(hgbR, X_train, y, cv=10, scoring='r2', return_train_score=True)
    # Each value holds one number per fold; reduce every entry to its mean.
    avgScore = {k: sum(v) / float(len(v)) for k, v in regression_score.items()}
    print(avgScore)

In [None]:
'''
Tried tuning HistGradientBoostingClassifier's hyperparameters and found that it made little difference.
'''
# # Grid Search Cross Validation
# parameters = {
#  'max_iter': [1000,1200,1500],
#  'learning_rate': [0.1],
#  'max_depth': [25, 50, 75],
#  'l2_regularization': [1.5],
#  'random_state': [2022],
#  }
# tuning = GridSearchCV(estimator =HistGradientBoostingClassifier(), 
#             param_grid = parameters, scoring='roc_auc',n_jobs=4, cv=5)
# tuning.fit(X_train,y)
# tuning.cv_results_, tuning.best_params_, tuning.best_score_

In [None]:
'''
The code below was produced during model selection and testing, including Logistic Regression, SGD classifier, SVM with kernels, and KNN.
These models were either too slow or performed badly, so I did not use them after all.
'''

In [None]:
# Import the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn import svm

In [None]:
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended. This cell also overwrites the X_train/X_test used
# by the HistGradientBoosting pipeline above.
# Select the label explicitly. Otherwise `y` is whatever an earlier cell left
# behind (after a top-to-bottom run: the last vital-sign column from the
# regression CV loop), not the LABEL_BaseExcess the SVM cell below is testing.
y = labels['LABEL_BaseExcess']
# Fixed seed so the hold-out split, and every metric derived from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train)
print(y_train)
print(X_test)
print(y_test)

In [None]:
# Standardize the training features and test features
# Statistics are learned from the training split only and applied to both splits.
ss = StandardScaler().fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

In [None]:
# Mean of the training-split labels (displayed as the cell output).
y_train.mean()

In [None]:
# Test for LABEL_BaseExcess
# Uses the X_train/X_test/y_train/y_test produced by the split and scaling cells above.
# random_state pins the shuffling SVC performs internally when probability=True,
# so repeated runs report the same probabilities and AUROC.
model = svm.SVC(C=0.8, kernel='rbf', gamma=0.04, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
# Hard class predictions on both splits.
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
# Select the LABEL_Fibrinogen column as the target and display it.
y = labels['LABEL_Fibrinogen']
y

In [None]:
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended.
# Fixed seed so the hold-out split, and every metric derived from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Standardize the training features and test features
# Statistics are learned from the training split only and applied to both splits.
ss = StandardScaler().fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

In [None]:
# Mean of the training-split labels (displayed as the cell output).
y_train.mean()

In [None]:
# Test for LABEL_Fibrinogen
# Uses the X_train/X_test/y_train/y_test produced by the split and scaling cells above.
# random_state pins the shuffling SVC performs internally when probability=True,
# so repeated runs report the same probabilities and AUROC.
model = svm.SVC(C=1, kernel='rbf', gamma=0.01, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
# Hard class predictions on both splits.
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
# Test for LABEL_AST
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended. This cell also overwrites the X_train/X_test used
# by the HistGradientBoosting pipeline above.
y = labels['LABEL_AST']
print(y)
# Fixed seed so the hold-out split, and the metrics printed below, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the training features and test features
# (statistics are learned from the training split only)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Mean of the training-split labels.
print(y_train.mean())

# random_state pins the shuffling SVC performs internally when probability=True.
model = svm.SVC(C=1, kernel='rbf', gamma=0.02, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
# Test for LABEL_Alkalinephos
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended. This cell also overwrites the X_train/X_test used
# by the HistGradientBoosting pipeline above.
y = labels['LABEL_Alkalinephos']
print(y)
# Fixed seed so the hold-out split, and the metrics printed below, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the training features and test features
# (statistics are learned from the training split only)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Mean of the training-split labels.
print(y_train.mean())

# random_state pins the shuffling SVC performs internally when probability=True.
model = svm.SVC(C=1, kernel='rbf', gamma=0.01, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
# Test for LABEL_Bilirubin_total
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended. This cell also overwrites the X_train/X_test used
# by the HistGradientBoosting pipeline above.
y = labels['LABEL_Bilirubin_total']
print(y)
# Fixed seed so the hold-out split, and the metrics printed below, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the training features and test features
# (statistics are learned from the training split only)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Mean of the training-split labels.
print(y_train.mean())

# random_state pins the shuffling SVC performs internally when probability=True.
model = svm.SVC(C=0.8, kernel='rbf', gamma=0.075, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
# Test for LABEL_Lactate
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended. This cell also overwrites the X_train/X_test used
# by the HistGradientBoosting pipeline above.
y = labels['LABEL_Lactate']
print(y)
# Fixed seed so the hold-out split, and the metrics printed below, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the training features and test features
# (statistics are learned from the training split only)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Mean of the training-split labels.
print(y_train.mean())

# random_state pins the shuffling SVC performs internally when probability=True.
model = svm.SVC(C=1, kernel='rbf', gamma=0.01, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
# Test for LABEL_TroponinI
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended. This cell also overwrites the X_train/X_test used
# by the HistGradientBoosting pipeline above.
y = labels['LABEL_TroponinI']
print(y)
# Fixed seed so the hold-out split, and the metrics printed below, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the training features and test features
# (statistics are learned from the training split only)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Mean of the training-split labels.
print(y_train.mean())

# random_state pins the shuffling SVC performs internally when probability=True.
model = svm.SVC(C=1, kernel='rbf', gamma=0.01, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
# Test for LABEL_SaO2
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended. This cell also overwrites the X_train/X_test used
# by the HistGradientBoosting pipeline above.
y = labels['LABEL_SaO2']
print(y)
# Fixed seed so the hold-out split, and the metrics printed below, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the training features and test features
# (statistics are learned from the training split only)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Mean of the training-split labels.
print(y_train.mean())

# random_state pins the shuffling SVC performs internally when probability=True.
model = svm.SVC(C=1, kernel='rbf', gamma=0.01, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
# Test for LABEL_Bilirubin_direct
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended. This cell also overwrites the X_train/X_test used
# by the HistGradientBoosting pipeline above.
y = labels['LABEL_Bilirubin_direct']
print(y)
# Fixed seed so the hold-out split, and the metrics printed below, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the training features and test features
# (statistics are learned from the training split only)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Mean of the training-split labels.
print(y_train.mean())

# random_state pins the shuffling SVC performs internally when probability=True.
model = svm.SVC(C=1, kernel='rbf', gamma=0.01, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
# Test for LABEL_EtCO2
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended. This cell also overwrites the X_train/X_test used
# by the HistGradientBoosting pipeline above.
y = labels['LABEL_EtCO2']
print(y)
# Fixed seed so the hold-out split, and the metrics printed below, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the training features and test features
# (statistics are learned from the training split only)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Mean of the training-split labels.
print(y_train.mean())

# random_state pins the shuffling SVC performs internally when probability=True.
model = svm.SVC(C=1, kernel='rbf', gamma=1, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
# Test for LABEL_Sepsis
# NOTE(review): `X` is not assigned in the cells visible here -- confirm which
# feature matrix is intended. This cell also overwrites the X_train/X_test used
# by the HistGradientBoosting pipeline above.
y = labels['LABEL_Sepsis']
print(y)
# Fixed seed so the hold-out split, and the metrics printed below, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the training features and test features
# (statistics are learned from the training split only)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Mean of the training-split labels.
print(y_train.mean())

# random_state pins the shuffling SVC performs internally when probability=True.
model = svm.SVC(C=1, kernel='rbf', gamma=1, class_weight='balanced', probability=True, random_state=42)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# model.score on both splits; AUROC from column 1 of predict_proba.
score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auroc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('y_pred_train mean: ', y_pred_train.mean())
print('y_test mean: ', y_test.mean())
print('y_pred_test mean: ', y_pred.mean())
print('train accuracy: ', score)
print('test accuracy: ', test_score)
print('auroc score: ', auroc_score)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

In [None]:
# NOTE(review): `X` is not assigned in the cells visible here; this prints
# whatever the kernel currently holds under that name.
print(X)

In [None]:
# Standardize the full feature matrix X in place of the original and display it.
# NOTE(review): `X` must already exist in the kernel; the unscaled X is
# overwritten by this cell.
ss = StandardScaler()
X = ss.fit_transform(X)
X

In [None]:
# 5-fold cross-validation (ROC AUC and accuracy) of one SVC configuration.
# NOTE(review): this runs on X_train/y_train, i.e. on whatever the most recently
# executed split cell left behind -- confirm that is the intended label.
model = svm.SVC(C=0.1, kernel='rbf', gamma=0.01, class_weight='balanced')
scores = cross_validate(model, X_train, y_train, cv=5, scoring=('roc_auc', 'accuracy'))
scores

In [None]:
# 5-fold cross-validation (ROC AUC and accuracy) of one SVC configuration on the
# full X and y.
# NOTE(review): uses whatever `X` and `y` currently hold in the kernel -- confirm
# the intended label.
model = svm.SVC(C=0.2, kernel='rbf', gamma=0.02, class_weight='balanced')
scores = cross_validate(model, X, y, cv=5, scoring=('roc_auc', 'accuracy'))
scores

In [None]:
# Use cross-validation to compare C and gamma values
def optimize_C_gamma(X, y, index):
    """Cross-validate an RBF-kernel SVC over a fixed grid of C and gamma values.

    Every (C, gamma) pair is scored with 5-fold cross-validation on ROC AUC and
    accuracy, with train scores included. Progress and results are printed as
    the search runs.

    Args:
        X: Feature matrix handed to ``cross_validate``.
        y: Target vector handed to ``cross_validate``.
        index: Label name; used in the printed messages and stored under
            'LABEL' in every result entry.

    Returns:
        A list with one dict per (C, gamma) pair, in iteration order. Each dict
        holds 'LABEL' plus the fold-mean of every score returned by
        ``cross_validate``; 'fit_time' and 'score_time' are left out. The C and
        gamma values themselves are not stored in the entries.
    """
    timing_keys = ('fit_time', 'score_time')
    print('Scores for label: ', index, 'Start! \n')
    scores = []
    for c in np.arange(0.1, 1.1, 0.3):
        for g in np.arange(0.01, 0.25, 0.04):
            print('Current C: ', c)
            print('Current Gamma: ', g)
            # Evaluate one SVM configuration.
            model = svm.SVC(C=c, kernel='rbf', gamma=g, class_weight='balanced')
            cv_result = cross_validate(model, X, y, cv=5, scoring=('roc_auc', 'accuracy'), return_train_score=True)
            averaged = {'LABEL': index}
            for metric, fold_values in cv_result.items():
                if metric in timing_keys:
                    continue
                # fold_values holds one number per fold for this metric.
                averaged[metric] = sum(fold_values) / float(len(fold_values))

            scores.append(averaged)
            print(averaged, '\n')
            print('Interate once!\n')
    print('Scores for label: ', index, 'Finished! \n')
    return scores

In [None]:
def gridsearch_C_gamma(X, y, index):
    """Grid-search C and gamma for an SVC and return the best cross-validated ROC AUC.

    Args:
        X: Feature matrix handed to ``GridSearchCV.fit``.
        y: Target vector handed to ``GridSearchCV.fit``.
        index: Label name; only used in the printed start message.

    Returns:
        ``best_score_`` of the fitted search: the mean cross-validated ROC AUC
        of the best (C, gamma) candidate.
    """
    print('Scores for label: ', index, 'Start! \n')
    params_grid = {'C': np.arange(0.1, 1, 0.3), 'gamma': np.arange(0.01, 0.2, 0.03)}
    model = svm.SVC()
    grid_search = GridSearchCV(model, param_grid=params_grid, scoring='roc_auc')
    grid_search.fit(X, y)
    # Report the held-out score of the winning candidate. Calling
    # grid_search.score(X, y) would evaluate the refit model on the very data it
    # was trained on, which overstates performance.
    return grid_search.best_score_

In [None]:
# Use KNN to classify
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# NOTE(review): `X = X` is a self-assignment: a no-op when X exists, a NameError
# when it does not.
X = X
# Select the LABEL_BaseExcess column as the target.
y = labels['LABEL_BaseExcess']

def search_knn(X, y, index, k_values=range(1, 20, 2)):
    """Cross-validate a KNN classifier for several neighbour counts.

    Each neighbour count is scored with 5-fold cross-validation on ROC AUC and
    accuracy, with train scores included. Progress and results are printed as
    the search runs.

    NOTE(review): a later cell defines another ``search_knn`` that shadows this
    one once it has been executed.

    Args:
        X: Feature matrix handed to ``cross_validate``.
        y: Target vector handed to ``cross_validate``.
        index: Label name; used in the printed messages and stored under
            'LABEL' in every result entry.
        k_values: Iterable of neighbour counts to try. Defaults to the odd
            numbers 1..19, the grid that was previously hard-coded.

    Returns:
        A list with one dict per neighbour count, in iteration order, holding
        'LABEL' plus the fold-mean of every score ('fit_time' and 'score_time'
        are left out). Previously the list was only printed and the function
        returned None.
    """
    print('Scores for label: ', index, 'Start! \n')
    scores = []
    for k in k_values:
        print('K: ', k)
        knn = KNeighborsClassifier(n_neighbors=k)
        score = cross_validate(knn, X, y, cv=5, scoring=['roc_auc', 'accuracy'], return_train_score=True)
        avgScore = {'LABEL': index}
        # Distinct loop names so the neighbour count `k` is not overwritten.
        for metric, fold_values in score.items():
            if metric in ('fit_time', 'score_time'):
                continue
            # fold_values holds one number per fold for this metric.
            avgScore[metric] = sum(fold_values) / float(len(fold_values))
        scores.append(avgScore)
        print(avgScore, '\n')
        print('Interate once!\n')
    print('Scores for label: ', index, 'Finished! \n')
    print(scores)
    return scores

In [None]:
def search_knn(X, y, index, k_values=range(201, 400, 4)):
    """Cross-validate a KNN classifier for several (large) neighbour counts.

    Each neighbour count is scored with 5-fold cross-validation on ROC AUC and
    accuracy, with train scores included. Progress and results are printed as
    the search runs.

    NOTE(review): this redefines the ``search_knn`` from the previous cell and
    shadows it once executed.

    Args:
        X: Feature matrix handed to ``cross_validate``.
        y: Target vector handed to ``cross_validate``.
        index: Label name; used in the printed messages and stored under
            'LABEL' in every result entry.
        k_values: Iterable of neighbour counts to try. Defaults to
            201, 205, ..., 397, the grid that was previously hard-coded.

    Returns:
        A list with one dict per neighbour count, in iteration order, holding
        'LABEL' plus the fold-mean of every score ('fit_time' and 'score_time'
        are left out). Previously the list was built and then discarded.
    """
    print('Scores for label: ', index, 'Start! \n')
    scores = []
    for k in k_values:
        print('K: ', k)
        knn = KNeighborsClassifier(n_neighbors=k)
        score = cross_validate(knn, X, y, cv=5, scoring=['roc_auc', 'accuracy'], return_train_score=True)
        avgScore = {'LABEL': index}
        # Distinct loop names so the neighbour count `k` is not overwritten.
        for metric, fold_values in score.items():
            if metric in ('fit_time', 'score_time'):
                continue
            # fold_values holds one number per fold for this metric.
            avgScore[metric] = sum(fold_values) / float(len(fold_values))
        scores.append(avgScore)
        print(avgScore, '\n')
        print('Interate once!\n')
    print('Scores for label: ', index, 'Finished! \n')
    return scores

In [None]:
# NOTE(review): `X = X` is a self-assignment: a no-op when X exists, a NameError
# when it does not.
X = X
# Select the LABEL_Sepsis column as the target and display its mean.
y = labels['LABEL_Sepsis']
y.mean()

In [None]:
# One-off 5-fold cross-validation of a KNN classifier with 10000 neighbours.
knn = KNeighborsClassifier(n_neighbors=10000)
score_test = cross_validate(knn, X, y, cv=5, scoring=['roc_auc', 'accuracy'], return_train_score=True)

# Start from the label tag, then add the fold-mean of every returned entry
# (the timing entries are kept here).
avgScore_test = {'LABEL': 'LABEL_Sepsis'}
avgScore_test.update(
    (k, sum(v) / float(len(v))) for k, v in score_test.items()
)
print(avgScore_test, '\n')

In [None]:
# Run the KNN neighbour-count search on the sepsis label, using whichever
# search_knn definition was executed last.
search_knn(X, labels['LABEL_Sepsis'], 'LABEL_Sepsis')

In [None]:
# Train models for each medical test
medical_tests = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct', 'LABEL_EtCO2']

# Visit the label columns in file order. DataFrame.iteritems() was removed in
# pandas 2.0 and the column values were never used by the loop, so iterate over
# the column labels instead.
for index in labels.columns:
    if index in medical_tests:
        scores_for_label = search_knn(X, labels[index], index)

In [None]:
# Pass the LABEL_BaseExcess column explicitly. The global `y` was last assigned
# to labels['LABEL_Sepsis'] in an earlier cell, so relying on it would tag sepsis
# scores as LABEL_BaseExcess.
scores = optimize_C_gamma(X, labels['LABEL_BaseExcess'], 'LABEL_BaseExcess')

In [None]:
# 5-fold cross-validation (ROC AUC and accuracy, train scores included) of one
# SVC configuration.
# NOTE(review): uses whatever `X` and `y` currently hold in the kernel -- confirm
# the intended label.
model = svm.SVC(C=0.8, kernel='rbf', gamma=0.04, class_weight='balanced')
scores = cross_validate(model, X, y, cv=5, scoring=('roc_auc', 'accuracy'), return_train_score=True)
scores

In [None]:
# Train models for each medical test
medical_tests = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct', 'LABEL_EtCO2']

# Iterate the medical tests in the column order of `labels`.
# DataFrame.iteritems() was removed in pandas 2.0 and the column values were
# never used by the loop, so iterate over the column labels instead.
for index in labels.columns:
    if index in medical_tests:
        scores_for_label = optimize_C_gamma(X, labels[index], index)

In [None]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier

In [None]:
# NOTE(review): `X = X` is a self-assignment: a no-op when X exists, a NameError
# when it does not.
X = X
# Select the LABEL_Sepsis column as the target.
y = labels['LABEL_Sepsis']

In [None]:
# SGD classifier with modified-Huber loss and balanced class weights; capped at
# 1000 iterations with a stopping tolerance of 1e-3.
sgd = SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3, class_weight='balanced')

In [None]:
# 5-fold cross-validation (ROC AUC and accuracy, train scores included) of the
# SGD classifier; prints the raw per-fold result dict.
scores = cross_validate(sgd, X, y, cv=5, scoring=('roc_auc', 'accuracy'), return_train_score=True)
print(scores)