# Train models for COVID-19 prognosis

### Read libraries

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, auc, roc_curve, classification_report
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE

### Read dataset. Select relevant features.

In [13]:
df = pd.read_csv('datasets/processed/uck_prognosis.csv')
df = df[['LYT', 'HGB', 'PLT', 'WBC', 'Age', 'Sex', 'target']]
df['Sex'] = df['Sex'].replace({'K': 0, 'M': 1}).astype('int64')
df['target'] = df['target'].replace({0: 0, 1: 1, 2: 1, 3: 2})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   LYT     970 non-null    float64
 1   HGB     974 non-null    float64
 2   PLT     974 non-null    float64
 3   WBC     974 non-null    float64
 4   Age     974 non-null    int64  
 5   Sex     974 non-null    int64  
 6   target  974 non-null    float64
dtypes: float64(5), int64(2)
memory usage: 53.4 KB


### One-hot encode the target variable
Target variable has 3 classes, so for some models, it needs to be on-hot encoded.

In [14]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')

X = df.copy().drop('target', axis=1)
y = df['target']
# y = enc.fit_transform(np.expand_dims(y, 1)).todense()
y

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
969    0.0
970    0.0
971    0.0
972    2.0
973    2.0
Name: target, Length: 974, dtype: float64

### Impute missing values with K-nearest neighbours

In [15]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
X = imputer.fit_transform(X)

### Scale features to <-1, 1>

In [16]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
X = scalar.fit_transform(X)

### Define metrics helper functions

In [17]:
def get_confusion_matrix(y_true, y_pred, num_classes=3, classes=['0', '1', '2']):
    confusion_matrix = np.zeros((num_classes, num_classes))
    y_true = list(y_true)
    y_pred = list(y_pred)
    for yi_t, yi_p in zip(y_true, y_pred):
        confusion_matrix[yi_t, yi_p] += 1
    return confusion_matrix
    


In [18]:
def create_metrics_dict():
    metrics = {
        'accuracy': [],
        'f1': [],
        'class_report':{
            '0.0': {
                'precision': [],
                'recall': [],
                'f1-score': [],
                'support': []
            },
            '1.0': {
                'precision': [],
                'recall': [],
                'f1-score': [],
                'support': []
            },
            '2.0': {
                'precision': [],
                'recall': [],
                'f1-score': [],
                'support': []
            },
            'accuracy': [],
            'macro avg': {
                'precision': [],
                'recall': [],
                'f1-score': [],
                'support': []
            },
            'weighted avg': {
                'precision': [],
                'recall': [],
                'f1-score': [],
                'support': []
            }
        }
    }
    return metrics
    

In [19]:
def calc_metrics(y_true, y_pred, metrics):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    class_report = classification_report(y_true, y_pred, output_dict=True)
    
    metrics['accuracy'].append(acc)
    metrics['f1'].append(f1)
    for k1 in metrics['class_report'].keys():
        for k2 in metrics['class_report'][k1]:
            val = class_report[k1][k2]
            metrics['class_report'][k1][k2].append(val)
    return metrics
    
	

In [20]:
from statistics import mean

def get_avg_metrics(metrics):
    acc = round(mean(metrics['accuracy']), 2)
    f1 = round(mean(metrics['f1']), 2)

    class_report = {}
    print('     precision    recall    support')
    for k1 in metrics['class_report'].keys():
        class_report[k1] = {}
        for k2 in metrics['class_report'][k1]:
            val = mean(metrics['class_report'][k1][k2])*100
            val = round(val, 2)
            class_report[k1][k2] = val
    
    print(f'Accuracy: {acc}')
    print(f'F1: {f1}')
    print(f"classification report: {class_report}")


### Train XGBoost model
Results are evaluated with 5-fold cross-validation.

In [21]:
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

metrics = create_metrics_dict()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train, test in skf.split(X, y):
	X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
	oversample = SMOTE() #SMOTEENN(enn=EditedNearestNeighbours())
	X_train, y_train = oversample.fit_resample(X_train, y_train)
	xg = xgb.XGBClassifier(objective ='multi:softmax(num_class=3)', colsample_bytree = 0.5, learning_rate = 0.001,
                max_depth = 5, alpha = 5, n_estimators = 2) # UCK + zenodo (UCK)
	xg.fit(X_train, y_train)
	y_pred = xg.predict(X_test)
	# print(classification_report(y_test, y_pred, output_dict=True))
	
	metrics = calc_metrics(y_test, y_pred, metrics)
get_avg_metrics(metrics)

     precision    recall    support
Accuracy: 0.57
F1: 0.59
classification report: {'0.0': {'precision': 74.05, 'recall': 60.11, 'f1-score': 66.2, 'support': 10880.0}, '1.0': {'precision': 16.41, 'recall': 53.64, 'f1-score': 25.01, 'support': 1080.0}, '2.0': {'precision': 56.33, 'recall': 52.66, 'f1-score': 54.26, 'support': 7520.0}, 'accuracy': {}, 'macro avg': {'precision': 48.93, 'recall': 55.47, 'f1-score': 48.49, 'support': 19480.0}, 'weighted avg': {'precision': 64.02, 'recall': 56.88, 'f1-score': 59.31, 'support': 19480.0}}




### Train CatBoost model
Results are evaluated with 5-fold cross-validation.

In [22]:
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

CAT_FEATURES = [5]
metrics = create_metrics_dict()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train, test in skf.split(X, y):
	X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
	oversample = SMOTE() #SMOTEENN(enn=EditedNearestNeighbours())
	X_train, y_train = oversample.fit_resample(X_train, y_train)
	
	X_cont_test = X_test[:, 0:-1].astype(np.float32)
	X_cat_test = X_test[:, -1::].astype(np.int64)

	X_cont_train = X_train[:, 0:-1].astype(np.float32)
	X_cat_train = X_train[:, -1::].astype(np.int64)

	df_dict_train = {}
	for i, col in enumerate(['LYT', 'HGB', 'PLT', 'WBC', 'Age']):
		df_dict_train[col] = X_cont_train[:, i].reshape(-1)
	df_dict_train['Sex'] = X_cat_train.reshape(-1)

	df_dict_test = {}
	for i, col in enumerate(['LYT', 'HGB', 'PLT', 'WBC', 'Age']):
		df_dict_test[col] = X_cont_test[:, i].reshape(-1)
	df_dict_test['Sex'] = X_cat_test.reshape(-1)

	X_train_df = pd.DataFrame.from_dict(df_dict_train)
	X_test_df = pd.DataFrame.from_dict(df_dict_test)
	
	catboost_model = CatBoostClassifier(learning_rate=1e-3, loss_function='MultiClass')

	catboost_model.fit(X_train_df, y_train, 
						eval_set = (X_test_df, y_test),
						cat_features=CAT_FEATURES,
						use_best_model = True)
	y_pred = catboost_model.predict(X_test_df) #> 0.5

	metrics = calc_metrics(y_test, y_pred, metrics)
get_avg_metrics(metrics)

0:	learn: 1.0981637	test: 1.0982960	best: 1.0982960 (0)	total: 48.6ms	remaining: 48.5s
1:	learn: 1.0976894	test: 1.0979590	best: 1.0979590 (1)	total: 50.5ms	remaining: 25.2s
2:	learn: 1.0972552	test: 1.0976965	best: 1.0976965 (2)	total: 52.4ms	remaining: 17.4s
3:	learn: 1.0968883	test: 1.0974289	best: 1.0974289 (3)	total: 54.3ms	remaining: 13.5s
4:	learn: 1.0965619	test: 1.0972056	best: 1.0972056 (4)	total: 56.1ms	remaining: 11.2s
5:	learn: 1.0962075	test: 1.0969501	best: 1.0969501 (5)	total: 57.8ms	remaining: 9.57s
6:	learn: 1.0958270	test: 1.0967106	best: 1.0967106 (6)	total: 59.4ms	remaining: 8.43s
7:	learn: 1.0954592	test: 1.0965015	best: 1.0965015 (7)	total: 61.2ms	remaining: 7.59s
8:	learn: 1.0951203	test: 1.0963011	best: 1.0963011 (8)	total: 63ms	remaining: 6.94s
9:	learn: 1.0947127	test: 1.0960826	best: 1.0960826 (9)	total: 64.8ms	remaining: 6.42s
10:	learn: 1.0943518	test: 1.0958342	best: 1.0958342 (10)	total: 66.7ms	remaining: 6s
11:	learn: 1.0938648	test: 1.0955649	best: 1.0

### Train fully-connected ANN model
Results are evaluated with 5-fold cross-validation.

In [23]:
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf

input = tf.keras.layers.Input(shape=(6,))
x = tf.keras.layers.Dense(100, activation='relu', kernel_initializer='glorot_uniform')(input)
x = tf.keras.layers.Dropout(.5)(x)
x = tf.keras.layers.Dense(200, activation='relu', kernel_initializer='glorot_uniform')(input)
x = tf.keras.layers.Dropout(.5)(x)
x = tf.keras.layers.Dense(400, activation='relu', kernel_initializer='glorot_uniform')(input)
x = tf.keras.layers.Dropout(.5)(x)
x = tf.keras.layers.Dense(200, activation='relu', kernel_initializer='glorot_uniform')(x)
x = tf.keras.layers.Dropout(.5)(x)

x = tf.keras.layers.Dense(3, activation='softmax', kernel_initializer='glorot_uniform')(x)

metrics = create_metrics_dict()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train, test in skf.split(X, y):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    oversample = SMOTE() #SMOTEENN(enn=EditedNearestNeighbours())
    # X_train, y_train = oversample.fit_resample(X_train, y_train)
    model = tf.keras.Model(inputs=input, outputs=x)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), # UCK imb 1e-5
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)

    y_pred = np.argmax(model.predict(X_test), axis=-1)
    metrics = calc_metrics(y_test, y_pred, metrics)
get_avg_metrics(metrics)

2021-12-13 20:12:03.795780: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-13 20:12:03.800424: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-13 20:12:03.800662: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-13 20:12:03.801579: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

     precision    recall    support
Accuracy: 0.79
F1: 0.78
classification report: {'0.0': {'precision': 79.77, 'recall': 85.3, 'f1-score': 82.4, 'support': 10880.0}, '1.0': {'precision': 61.52, 'recall': 46.36, 'f1-score': 52.37, 'support': 1080.0}, '2.0': {'precision': 78.86, 'recall': 73.66, 'f1-score': 76.09, 'support': 7520.0}, 'accuracy': {}, 'macro avg': {'precision': 73.38, 'recall': 68.44, 'f1-score': 70.29, 'support': 19480.0}, 'weighted avg': {'precision': 78.42, 'recall': 78.65, 'f1-score': 78.31, 'support': 19480.0}}
