In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K

## Classification and Imputation using Auto-Encoder

The idea is to train a dense neural network as an auto-encoder to fill in missing values.


## Outputs
Creates output files in the "predictions" directory.
Imputes as many NaNs as possible in both the train and test files and adds the predicted prognosis.

**IMPORTANT**: Predictions in the train set were created by the auto-encoder!


In [2]:
# Read the training and the test table from their CSV files
train, test = (pd.read_csv(path) for path in ('data/trainSet.txt', 'data/testSet.txt'))

In [3]:
# Define columns of interest.
# Identifier / target columns that are not fed to the imputation:
cols_out = ['PatientID', 'ImageFile', 'Hospital', 'Prognosis']
# Columns that are binarized again (threshold 0.5) during post-processing:
binary_vars = [
    'RespiratoryFailure',
    'Sex',
    'Cough',
    'DifficultyInBreathing',
    'CardiovascularDisease',
]
# Every other column of the training table is an imputation feature
impute_cols = [col for col in train.columns if col not in cols_out]

## Autoencoder should work!

In [4]:
def create_model(n_features=16):
    """
    Build the dense auto-encoder with an additional classification head.

    The network widens to 80 units, narrows to a 4-unit bottleneck and widens
    to 80 units again; each of these layers is L2-regularized and followed by
    50% dropout. Two sigmoid heads share the decoder output.

    Parameters
    ----------
    n_features : int, default 16
        Number of input features. It sets the input width, the width of the
        first dense layer and the width of the reconstruction head, which has
        to match the input. The default reproduces the previously hard-coded
        layout.

    Returns
    -------
    keras.Model
        Uncompiled model whose outputs are a dict with the keys ``'impute'``
        (reconstruction, ``n_features`` sigmoid units) and ``'classify'``
        (a single sigmoid unit).
    """
    regularizer = keras.regularizers.l1_l2(l1=0, l2=0.005)

    inputs = keras.Input(shape=(n_features,))
    x = keras.layers.Dense(n_features, activation='selu')(inputs)
    # build a little auto-encoder
    x = keras.layers.Dense(80, activation='selu', kernel_regularizer=regularizer)(x)
    x = keras.layers.Dropout(0.5)(x)
    # bottleneck
    x = keras.layers.Dense(4, activation='selu', kernel_regularizer=regularizer)(x)
    x = keras.layers.Dropout(0.5)(x)
    x = keras.layers.Dense(80, activation='selu', kernel_regularizer=regularizer)(x)
    x = keras.layers.Dropout(0.5)(x)

    # Both heads read the same decoder activations
    out1 = keras.layers.Dense(n_features, activation='sigmoid', name='imputing')(x)
    out2 = keras.layers.Dense(1, activation='sigmoid', name='classifying')(x)

    # The dict keys (not the layer names) are what the loss / metric dicts refer to
    model = keras.Model(inputs, {'impute': out1, 'classify': out2})

    return model

In [5]:
# From https://github.com/keras-team/keras/issues/7065
# Calculate MSE only on known values
def masked_mse(mask_value=-1):
    """
    Create a loss that computes the mean squared error over known values only.

    Parameters
    ----------
    mask_value : number, default -1
        Sentinel in ``y_true`` that marks an unknown entry; such entries do
        not contribute to the error.

    Returns
    -------
    callable
        ``f(y_true, y_pred)`` returning, per sample, the squared error summed
        over the last axis, divided by the number of known entries and
        multiplied by 100. A sample without any known entry yields 0.
    """
    def f(y_true, y_pred):
        mask_true = K.cast(K.not_equal(y_true, mask_value), K.floatx())
        masked_squared_error = K.square(mask_true * (y_true - y_pred))
        # A sample in which every entry is masked would give 0 / 0 = NaN and
        # poison the whole batch loss; its numerator is 0, so dividing by 1
        # makes it contribute exactly 0 instead.
        n_known = K.maximum(K.sum(mask_true, axis=-1), 1.0)
        masked_mse = K.sum(masked_squared_error, axis=-1) / n_known
        return masked_mse * 100

    return f


# Validate imputation by hiding one extra known value per row
def imputer_validation(valid_set, mask_value=-1):
    """
    Hide one additional known field per row so imputation can be validated.

    Parameters
    ----------
    valid_set : numpy.ndarray
        2-D array (rows are samples) in which unknown entries are marked with
        ``mask_value``. The array itself is not modified.
    mask_value : number, default -1
        Sentinel that marks an unknown entry.

    Returns
    -------
    fields_removed : numpy.ndarray
        Copy of ``valid_set`` in which one randomly chosen known entry of each
        row has been replaced by ``mask_value``.
    imputer_mask : numpy.ndarray
        Float array of the same shape, 1 at the entries that were hidden and
        0 everywhere else.

    Notes
    -----
    Rows without any known entry are left untouched (their mask row stays 0).
    The random choice uses NumPy's global random state.
    """
    fields_removed = valid_set.copy()
    imputer_mask = np.zeros(valid_set.shape)
    for n, row in enumerate(fields_removed):
        non_nan = np.where(row != mask_value)[0]
        if non_nan.size == 0:
            # np.random.choice raises on an empty candidate list
            continue
        set_nan = np.random.choice(non_nan)
        fields_removed[n, set_nan] = mask_value
        imputer_mask[n, set_nan] = True

    return fields_removed, imputer_mask


# helper to calculate scores
def calculate_score(x_true, x_pred, errorfun='mse', scaler=None):
    """
    Compute an error / accuracy score between true and predicted values.

    Parameters
    ----------
    x_true, x_pred : array-like
        True and predicted values. Entries equal to -1 are treated as the
        "unknown" sentinel when a scaler is given and by ``'masked_mse'``.
    errorfun : {'mse', 'acc', 'masked_mse'}, default 'mse'
        ``'mse'``: ``mean_squared_error``;
        ``'acc'``: ``accuracy_score`` of ``x_true`` against ``x_pred > 0.5``;
        ``'masked_mse'``: mean over samples of ``masked_mse(-1)``.
    scaler : object with ``inverse_transform``, optional
        If given, both arrays are transformed back to the original scale
        before scoring.

    Returns
    -------
    The score computed by the selected function.

    Raises
    ------
    ValueError
        If ``errorfun`` is not one of the supported names.
    """
    if scaler is not None:
        # Remember where the sentinel sits: inverse_transform maps -1 to a
        # feature-dependent value, so it would no longer be recognized as
        # "unknown" afterwards and would be scored as if it were known.
        true_missing = np.asarray(x_true) == -1
        pred_missing = np.asarray(x_pred) == -1
        x_true = scaler.inverse_transform(x_true.copy())
        x_pred = scaler.inverse_transform(x_pred.copy())
        x_true[true_missing] = -1
        x_pred[pred_missing] = -1

    if errorfun == 'mse':
        score = mean_squared_error(x_true, x_pred)
    elif errorfun == 'acc':
        score = accuracy_score(x_true, x_pred > 0.5)
    elif errorfun == "masked_mse":
        score = masked_mse(-1)(x_true, x_pred)
        score = np.mean(score)
    else:
        raise ValueError(f"Unknown errorfun {errorfun!r}; expected 'mse', 'acc' or 'masked_mse'")

    return score


def post_processing(original_df, predicted_values, classify, scaler=None, binary_columns=None):
    """
    Fill the missing values of a table with predictions and attach the class.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Numeric table with NaN at the missing positions.
    predicted_values : array-like
        Predictions with the same shape as ``original_df``.
    classify : array-like
        Predicted class probability per row (shape ``(n,)`` or ``(n, 1)``).
    scaler : object with ``inverse_transform``, optional
        If given, ``predicted_values`` are transformed back to the original
        scale with this scaler before they are inserted.
    binary_columns : list of str, optional
        Columns that are binarized with a 0.5 threshold. Defaults to the
        notebook-level ``binary_vars``.

    Returns
    -------
    pandas.DataFrame
        Table with the columns of ``original_df`` (default integer index) in
        which known values are kept, missing values are replaced by the
        predictions clipped at 0, binary columns hold 0.0 / 1.0, plus a
        boolean ``'Prognosis'`` column (``classify > 0.5``).
    """
    if binary_columns is None:
        binary_columns = binary_vars

    # Plain boolean ndarray so it can index the NumPy arrays below
    nan_mask = np.isnan(original_df).values
    previous_values = original_df.values
    imputed_thing = np.zeros(previous_values.shape)

    # Rescale values with the scaler that was passed in (not a global one)
    predicted_values = np.asarray(predicted_values)
    if scaler is not None:
        predicted_values = scaler.inverse_transform(predicted_values)

    imputed_thing[nan_mask] = predicted_values[nan_mask]
    # Clip negatives
    imputed_thing = np.clip(imputed_thing, a_min=0, a_max=None)
    # Add known values (after clipping, so they are never altered)
    imputed_thing[~nan_mask] = previous_values[~nan_mask]

    # Add classification and column names, binarize binary values
    df_out = pd.DataFrame(imputed_thing, columns=original_df.columns)
    df_out[binary_columns] = (df_out[binary_columns] > 0.5) * 1.0
    # Flatten so a (n, 1) model output becomes one value per row
    df_out['Prognosis'] = np.asarray(classify).ravel() > 0.5

    return df_out

In [6]:
# some global vars
# Stop after 10 epochs without improvement and roll back to the best weights
early_stopping = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias
OPTIMIZER = keras.optimizers.Adam(learning_rate=0.0005)
CALLBACKS = [early_stopping]
# Keys match the output dict of the model: masked reconstruction error for the
# imputation head, binary cross-entropy for the classification head
LOSS = {'impute': masked_mse(), 'classify': 'binary_crossentropy'}

In [7]:
# Prepare data for training:
# boolean target (True where the prognosis is 'MILD') ...
train_y = train['Prognosis'].eq('MILD')
# ... and the feature columns that are imputed
train_clean = train.loc[:, impute_cols]

In [None]:
accs, mses0, mses1, imps0, imps1, trues, preds, indices, hosp_val = [], [], [], [], [], [], [], [], []

# True where a feature value is missing in the training table
train_mask = np.isnan(train_clean).values

# Leave-one-hospital-out cross-validation: every hospital is the validation set once
for hosp in np.unique(train.Hospital):
    model = create_model()
    # Each fold gets its own optimizer built from the shared configuration, so
    # no optimizer state (step counter, moment estimates) leaks between folds.
    model.compile(optimizer=type(OPTIMIZER).from_config(OPTIMIZER.get_config()),
                  loss=LOSS,
                  metrics={'impute': 'mse', 'classify': 'accuracy'})
    MM = MinMaxScaler(clip=True)

    tr = np.where(train.Hospital != hosp)[0]
    vl = np.where(train.Hospital == hosp)[0]

    # Remove Nans (they are flagged with -1 again after scaling)
    x_train = train_clean.iloc[tr].copy().fillna(0)
    x_valid = train_clean.iloc[vl].copy().fillna(0)
    # Define y-values
    y_train = train_y.iloc[tr]
    y_valid = train_y.iloc[vl]
    # Scale data from 0 to 1 (scaler is fitted on the training part only)
    x_train = MM.fit_transform(x_train)
    x_valid = MM.transform(x_valid)

    # Set former nan values to -1
    x_train[train_mask[tr]] = -1
    x_valid[train_mask[vl]] = -1

    # Fit Model
    model.fit(x_train, {'impute':x_train, 'classify': y_train}, verbose=0,
              callbacks=CALLBACKS, validation_split=0.15,
             epochs=250)

    # Predict the things
    prediction = model.predict(x_valid)
    # Store the rescaled reconstruction exactly once per fold (before the
    # masking below), so `preds` stays aligned with `trues` and `indices`.
    preds.append(MM.inverse_transform(prediction['impute']))

    pred_val, pred_y = prediction['impute'], prediction['classify']

    # Returns metrics, weirdly
    # NOTE(review): the unpacking assumes the order
    # (loss, loss, loss, classify accuracy, impute mse) -- confirm for the Keras version in use
    _, _, _, train_acc, train_mse = model.evaluate(x_train,
                                                   {'impute':x_train,
                                                    'classify': y_train},
                                                   verbose=0)

    # Set previous NAN to -1 (x_valid already carries the -1 flags from above)
    pred_val[train_mask[vl]] = -1

    # imputer validation: hide one extra known value per row and predict it
    x_impute, impute_mask = imputer_validation(x_valid)

    imputer_predict = model.predict(x_impute)['impute']
    imputer_predict[impute_mask==False] = -1

    # Keep only the hidden entries as "known" targets
    validation_mask = x_valid.copy()
    validation_mask[impute_mask==False] = -1

    # imputer error
    imp0 = calculate_score(validation_mask,
                            imputer_predict, errorfun='masked_mse')
    imp1 = calculate_score(validation_mask,
                            imputer_predict, errorfun='masked_mse', scaler=MM)

    # Keeping track
    imps0.append(imp0)
    imps1.append(imp1)
    mses0.append(calculate_score(x_valid, pred_val, scaler=MM, errorfun='masked_mse'))
    mses1.append(calculate_score(x_valid, pred_val, errorfun='masked_mse'))
    accs.append(calculate_score(y_valid, pred_y, errorfun='acc'))
    trues.append(MM.inverse_transform(x_valid))
    print(f"Split {hosp}, with validation n = {x_valid.shape[0]}: \nRescaled MSE - {mses0[-1]:4.2f} " +
          f"MSE - {mses1[-1]:4.2f} " +
          f"Acc - {accs[-1]:4.2f}\n" +
          f"Impute MSE - {imps0[-1]:4.2f} " +
          f"Impute MSE scaled - {imps1[-1]:4.2f}")

    print(f"Train - MSE {train_mse:4.3f}, ACC {train_acc:4.3f}")

    indices.append(vl)
    hosp_val.append([hosp] * len(vl))
    print("==============================================")

# Final Training and Creating Test Set

In [None]:
model = create_model()
# Build a fresh optimizer from the shared configuration: the OPTIMIZER instance
# itself has already been used by other models and carries their state.
model.compile(optimizer=type(OPTIMIZER).from_config(OPTIMIZER.get_config()),
                  loss=LOSS,
                  metrics={'impute': 'mae', 'classify': 'accuracy'})

MM = MinMaxScaler(clip=True)

# Transform Data: fill NaNs with 0 so the scaler can be fitted
x_train = train_clean.copy().fillna(0)
# Define y-values
y_train = train_y
# Scale data from 0 to 1
x_train = MM.fit_transform(x_train)
# Set former nan values to -1 (the sentinel ignored by the masked loss)
x_train[train_mask] = -1


# The features are both input and reconstruction target of the 'impute' head
model.fit(x_train, {'impute':x_train, 'classify': y_train}, verbose=1, callbacks=CALLBACKS,
          validation_split=0.15, epochs=250)

In [None]:
# Run the final model on the training features and split its two outputs
train_predictions = model.predict(x_train)
train_imputations, train_classify = (
    train_predictions[key] for key in ('impute', 'classify')
)

# Fill the gaps of the training table and attach the predicted class
train_out = post_processing(
    train_clean,
    train_imputations,
    train_classify,
    MM,
)

In [None]:
# Add missing columns
train_out[['PatientID', 'ImageFile', 'Hospital']] = train[['PatientID', 'ImageFile', 'Hospital']]
# Change order to original
train_out = train_out[train.columns]

# `index` is documented as a boolean; use False explicitly (as for the test export)
# instead of relying on None being falsy
train_out.to_csv('predictions/simon_TRAIN_impute_classify.csv', index=False)

In [None]:
# Same preprocessing as for training: select the feature columns, zero-fill
# the gaps, scale with the already fitted scaler, then flag the gaps with -1
test_clean = test[impute_cols]
x_test = MM.transform(test_clean.fillna(0))
x_test[np.isnan(test_clean)] = -1

In [None]:
# Run the fitted model on the prepared test features
predictions = model.predict(x_test)

In [None]:
# Predictions: split the model output into reconstruction and class probability
imputations, classify = predictions['impute'], predictions['classify']

In [None]:
test_out = post_processing(test_clean, imputations, classify, MM)

# Translate the boolean prediction back into the original labels (True means
# 'MILD'). Only the 'Prognosis' column is rewritten: assigning through a
# row mask without a column selector would overwrite every column of the
# selected rows, including the imputed values.
test_out['Prognosis'] = np.where(test_out['Prognosis'], 'MILD', 'SEVERE')

In [None]:
# Re-attach the columns that were left out of the imputation
test_out = test_out.assign(
    **{col: test[col] for col in ('PatientID', 'ImageFile', 'Hospital')}
)
# Restore the column order of the original test table
test_out = test_out.loc[:, test.columns]

In [None]:
# Save the processed test table as CSV, without writing the row index
test_out.to_csv('predictions/simon_TEST_impute_classify.csv', index=False)