In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K

In [3]:
train = pd.read_csv('data/trainSet.txt')
test = pd.read_csv('data/testSet.txt')

In [17]:
binary_vars = ['RespiratoryFailure', 'Sex', 'Cough', 'DifficultyInBreathing', 'CardiovascularDisease']

## Autoencoder should work!

In [4]:
def create_model():
    
    regularizer = keras.regularizers.l1_l2(l1=0.0, l2=0.0)
    
    inputs = keras.Input(shape=(16,))
    
    x = keras.layers.Dense(16, activation='relu')(inputs)
    # build a little auto-encoder
    x = keras.layers.Dense(160, activation='relu', kernel_regularizer=regularizer)(x)
    x = keras.layers.Dropout(0.25)(x)
    # bottleneck
    x = keras.layers.Dense(8, activation='relu', kernel_regularizer=regularizer)(x)
    x = keras.layers.Dropout(0.25)(x)
    x = keras.layers.Dense(160, activation='relu', kernel_regularizer=regularizer)(x)
    x = keras.layers.Dropout(0.25)(x)
    
    out1 = keras.layers.Dense(16, activation='sigmoid')(x)
    out2 = keras.layers.Dense(1, activation='sigmoid')(x)
    
    model = keras.Model(inputs, {'impute': out1, 'classify':out2})
    
    return model

In [5]:
# From https://github.com/keras-team/keras/issues/7065
def masked_mse(mask_value=-1):
    def f(y_true, y_pred):
        mask_true = K.cast(K.not_equal(y_true, mask_value), K.floatx())
        masked_squared_error = K.square(mask_true * (y_true - y_pred))
        masked_mse = K.sum(masked_squared_error, axis=-1) / K.sum(mask_true, axis=-1)
        return masked_mse * 100

    return f

In [6]:
def imputer_validation(valid_set):
    """
    Function to remove extra fields from data for validation.
    """
    fields_removed = valid_set.copy()
    imputer_mask = np.zeros(valid_set.shape)
    for n, row in enumerate(fields_removed):
        non_nan = np.where(row != -1)[0]
        set_nan = np.random.choice(non_nan)
        fields_removed[n, set_nan] = -1
        imputer_mask[n, set_nan] = True
    
    return fields_removed, imputer_mask

In [7]:
def calculate_score(x_true, x_pred, errorfun='mse', scaler=None):
    if scaler:
        x_true = scaler.inverse_transform(x_true.copy())
        x_pred = scaler.inverse_transform(x_pred.copy())
        x_true = np.clip(x_true, a_min=-1, a_max=None)
        x_pred = np.clip(x_pred, a_min=-1, a_max=None)
    if errorfun == 'mse':
        score = mean_squared_error(x_true, x_pred)
    elif errorfun == 'acc':
        score = accuracy_score(x_true, x_pred > 0.5)
    elif errorfun == "masked_mse":
        score = masked_mse(-1)(x_true, x_pred)
        score = np.mean(score)
    
    return score

In [20]:
# columns to remove:
cols_out = ['PatientID', 'ImageFile', 'Hospital', 'Prognosis']
impute_cols = [i for i in train.columns if i not in cols_out]
# split validation and prediction
train_clean = train[impute_cols]
train_y = train['Prognosis'] == 'MILD'

In [9]:
# Factorize hospitals, and use hospitals for cross-validation
train.Hospital, mapping_index = train.Hospital.factorize()
PF = PredefinedSplit(train['Hospital'])

In [10]:
early_stopping = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

In [11]:
accs, mses0, mses1, imps0, imps1, trues, preds = [], [], [], [], [], [], []

train_mask = np.isnan(train_clean).values

for n, (tr, vl) in enumerate(PF.split(train)):
    model = create_model()
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), 
                  loss={'impute': masked_mse(), 'classify': 'binary_crossentropy'},
                  metrics={'impute': 'mse', 'classify': 'accuracy'})
    MM = MinMaxScaler(clip=True)
    
    # Remove Nans
    x_train = train_clean.iloc[tr].copy().fillna(0)
    x_valid = train_clean.iloc[vl].copy().fillna(0)
    # Define y-values
    y_train = train_y.iloc[tr]
    y_valid = train_y.iloc[vl]
    # Scale data from 0 to 1
    x_train = MM.fit_transform(x_train)
    x_valid = MM.transform(x_valid)
    
    # Set former nan values to -1
    x_train[train_mask[tr]] = -1
    x_valid[train_mask[vl]] = -1
    
    # Fit Model
    model.fit(x_train, {'impute':x_train, 'classify': y_train}, verbose=0, callbacks=[early_stopping], validation_split=0.15,
             epochs=250)
    
    # Predict the things
    prediction = model.predict(x_valid)
    preds.append(MM.inverse_transform(prediction['impute']))

    pred_val, pred_y = prediction['impute'], prediction['classify']
    
    
    # Returns metrics, weirdly
    _, _, _, train_acc, train_mse = model.evaluate(x_train, 
                                                   {'impute':x_train, 
                                                    'classify': y_train}, 
                                                   verbose=0)
    
    # Set previous NAN to -1
    pred_val[train_mask[vl]] = -1
    x_valid[train_mask[vl]] = -1
    
    # imputer validation:
    x_impute, impute_mask = imputer_validation(x_valid)
    
    imputer_predict = model.predict(x_impute)['impute']
    imputer_predict[impute_mask==False] = -1
    
    validation_mask = x_valid.copy()
    validation_mask[impute_mask==False] = -1
    
    # imputer error
    imp0 = calculate_score(validation_mask, 
                            imputer_predict, errorfun='masked_mse')
    imp1 = calculate_score(validation_mask, 
                            imputer_predict, errorfun='masked_mse', scaler=MM)
    
    # Keepingt track
    imps0.append(imp0)
    imps1.append(imp1)
    mses0.append(calculate_score(x_valid, pred_val, scaler=MM, errorfun='masked_mse'))
    mses1.append(calculate_score(x_valid, pred_val, errorfun='masked_mse'))
    accs.append(calculate_score(y_valid, pred_y, errorfun='acc'))
    trues.append(MM.inverse_transform(x_valid))
    preds.append(MM.inverse_transform(prediction['impute']))
    print(f"Split {mapping_index[n]}, with validation n = {x_valid.shape[0]}: \nRescaled MSE - {mses0[-1]:4.2f} " +
          f"MSE - {mses1[-1]:4.2f} " +
          f"Acc - {accs[-1]:4.2f}\n" +
          f"Impute MSE - {imps0[-1]:4.2f} " +
          f"Impute MSE scaled - {imps1[-1]:4.2f}")
          
    print(f"Train - MSE {train_mse:4.3f}, ACC {train_acc:4.3f}")
    
    print("==============================================")

Split D, with validation n = 139: 
Rescaled MSE - 125896808.00 MSE - 1.97 Acc - 0.65
Impute MSE - 8.96 Impute MSE scaled - 30325944.00
Train - MSE 0.550, ACC 0.633
Split E, with validation n = 101: 
Rescaled MSE - 200913.61 MSE - 0.49 Acc - 0.64
Impute MSE - 10.23 Impute MSE scaled - 52554.90
Train - MSE 0.519, ACC 0.668
Split F, with validation n = 488: 
Rescaled MSE - 91947280.00 MSE - 2.01 Acc - 0.62
Impute MSE - 15.98 Impute MSE scaled - 236450928.00
Train - MSE 0.480, ACC 0.691
Split B, with validation n = 104: 
Rescaled MSE - 127791816.00 MSE - 0.94 Acc - 0.66
Impute MSE - 14.18 Impute MSE scaled - 1187535.88
Train - MSE 0.549, ACC 0.648
Split C, with validation n = 31: 
Rescaled MSE - 10188045.00 MSE - 0.66 Acc - 0.55
Impute MSE - 19.50 Impute MSE scaled - 94413816.00
Train - MSE 0.508, ACC 0.656


In [12]:
rescaled_df = pd.DataFrame(np.clip(np.vstack(trues), a_min=0, a_max=None), columns = train_clean.columns)
rescaled_preds = pd.DataFrame(np.clip(np.vstack(preds), a_min=0, a_max=None), columns = train_clean.columns)

# Final Training and Creating Test Set

In [18]:
model = create_model()
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), 
                  loss={'impute': masked_mse(), 'classify': 'binary_crossentropy'},
                  metrics={'impute': 'mape', 'classify': 'accuracy'})

MM = MinMaxScaler(clip=True)

# Transform Data:
x_train = train_clean.copy().fillna(0)
# Define y-values
y_train = train_y
# Scale data from 0 to 1
x_train = MM.fit_transform(x_train)
# Set former nan values to -1
x_train[train_mask] = -1


model.fit(x_train, {'impute':x_train, 'classify': y_train}, verbose=1, callbacks=[early_stopping], validation_split=0.15,
             epochs=250)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250


Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250


Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250


<tensorflow.python.keras.callbacks.History at 0x2bfaa5f5588>

In [23]:
test_clean = test[impute_cols] 
x_test = test_clean.copy().fillna(0)
x_test = MM.transform(x_test)
x_test[np.isnan(test_clean)] = -1

In [26]:
predictions = model.predict(x_test)

In [29]:
# Predidctions
imputations = predictions['impute']
classify = predictions['classify']

In [56]:
previous_values = test_clean.copy().values
imputed_thing = np.zeros(previous_values.shape)

imputed_thing[np.isnan(test_clean)] = MM.inverse_transform(imputations)[np.isnan(test_clean)]
imputed_thing = np.clip(imputed_thing, a_min=0, a_max=None)

In [57]:
imputed_thing[np.isnan(test_clean)==False] = previous_values[np.isnan(test_clean)==False]

In [68]:
test_out = pd.DataFrame(imputed_thing, columns=test_clean.columns)
test_out[binary_vars] = (test_out[binary_vars] > 0.5) * 1.0
test_out['Prognosis'] = classify > 0.5

In [69]:
test_out.loc[test_out['Prognosis'] == True, 'Prognosis'] = 'MILD'
test_out.loc[test_out['Prognosis'] == False, 'Prognosis'] = 'SEVERE'

In [72]:
test_out[['PatientID', 'ImageFile', 'Hospital']] = test[['PatientID', 'ImageFile', 'Hospital']]

In [74]:
test_out = test_out[test.columns]

In [77]:
test_out.describe()

Unnamed: 0,Age,Sex,Temp_C,Cough,DifficultyInBreathing,WBC,CRP,Fibrinogen,LDH,Ddimer,Ox_percentage,PaO2,SaO2,pH,CardiovascularDisease,RespiratoryFailure
count,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0
mean,60.484394,0.258333,37.925557,0.466667,0.366667,6.956501,12.117571,603.156421,319.731508,1073.452809,93.404331,71.080574,94.285997,7.495585,0.3,0.025
std,13.198359,0.439554,1.213976,0.500979,0.483915,2.483231,13.75764,41.623326,170.133271,1527.853532,4.826866,21.347838,4.354086,0.15769,0.460179,0.15678
min,27.0,0.0,36.0,0.0,0.0,2.3,0.14,527.183044,28.198984,1.055853,70.0,30.635891,77.0,6.91315,0.0,0.0
25%,51.0,0.0,37.0,0.0,0.0,5.1475,2.649895,573.586044,162.508957,99.221571,90.0,57.470881,91.7904,7.44,0.0,0.0
50%,61.034046,0.0,37.927573,0.0,0.0,6.916075,7.449948,599.353668,319.20047,391.812439,94.565804,69.0,96.0,7.48,0.0,0.0
75%,67.0,1.0,39.0,1.0,1.0,7.925,15.777942,622.236145,425.25,1171.129272,97.258677,80.390287,97.750393,7.5425,1.0,0.0
max,95.0,1.0,41.5,1.0,1.0,17.72,75.839714,761.324585,1159.0,7274.918457,100.0,175.0,100.0,7.915079,1.0,1.0
