# Train models for SARS-CoV-2 detection

### Read libraries

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, auc, roc_curve
from sklearn.model_selection import train_test_split

### Read dataset
Select dataset by uncommenting the appropriate line.

In [69]:
# Baseline training - imbalanced dataset
# df = pd.read_csv('../datasets/processed/imbalanced_detection.csv')

# Zenodo dataset
# df = pd.read_csv('../datasets/processed/auxiliary/zenodo_detection.csv')

# UCK dataset
df = pd.read_csv('../datasets/processed/auxiliary/balanced_uck_detection.csv')

In [70]:
df = df.drop(['Unnamed: 0'], axis=1) # 'Unnamed: 0.1']
df = df[['BAT', 'EOT', 'LYT', 'MOT', 'HGB', 'MCHC', 'MCV', 'PLT', 'WBC',
       'Age', 'Sex', 'target']]

X = df.copy().drop('target', axis=1)
y = df['target']
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 926 entries, 0 to 925
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   BAT     923 non-null    float64
 1   EOT     923 non-null    float64
 2   LYT     923 non-null    float64
 3   MOT     923 non-null    float64
 4   HGB     926 non-null    float64
 5   MCHC    926 non-null    float64
 6   MCV     926 non-null    float64
 7   PLT     926 non-null    float64
 8   WBC     926 non-null    float64
 9   Age     926 non-null    int64  
 10  Sex     926 non-null    int64  
 11  target  926 non-null    int64  
dtypes: float64(9), int64(3)
memory usage: 86.9 KB


### Uncomment if you want to experiment with transfer learning

In [71]:
# source = 'zenodo'
# target = 'uck'

# df_uck = pd.read_csv('/home/basia/Desktop/cbc_covid/datasets/processed/auxiliary/balanced_uck_detection.csv')
# df_uck = df_uck[['BAT', 'EOT', 'LYT', 'MOT', 'HGB', 'MCHC', 'MCV', 'PLT', 'WBC',
#        'Age', 'Sex', 'target']]
# df_zenodo = pd.read_csv('/home/basia/Desktop/cbc_covid/datasets/processed/auxiliary/zenodo_detection.csv')
# df_zenodo = df_zenodo[['BAT', 'EOT', 'LYT', 'MOT', 'HGB', 'MCHC', 'MCV', 'PLT', 'WBC',
#        'Age', 'Sex', 'target']]


# if source == 'zenodo':
#     df1 = df_uck
#     df2 = df_zenodo
# else:
#     df1 = df_zenodo
#     df2 = df_uck
# X = df1.copy().drop('target', axis=1)
# y = df1['target']
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42)
# X2 = df2.copy().drop('target', axis=1)
# y2 = df2['target']


# X_train = np.concatenate((X_train, X2), axis=0) #.reshape(-1)
# y_train = np.concatenate((y_train, y2), axis=0).reshape(-1)


# print(f"Train data COV- {len(X_train)-y_train.sum()}")
# print(f"Train data COV+ {y_train.sum()}")
# print(f"Test data COV- {len(X_test) - y_test.sum()}")
# print(f"Train data COV+ {y_test.sum()}")

### Impute missing values with K-nearest neighbours

In [72]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
# No transfer learning
X = imputer.fit_transform(X)

# Transfer learning
# X_train = imputer.fit_transform(X_train)
# X_test = imputer.fit_transform(X_test)

### Scale values of features to <-1, 1>

In [73]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
# No transfer learning
X = scalar.fit_transform(X)

# Transfer learning
# X_train = scalar.fit_transform(X_train)
# X_test = scalar.fit_transform(X_test)

### Split dataset into train and test samples

In [74]:
from sklearn.model_selection import train_test_split

# No transfer learning
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
print(len(X_train))
print(y_train.sum())

740
376


### For tree-based algorithms use SMOTE to balance the classes

In [75]:
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
print(len(X_train))
y_train.sum()


752


376

### Convert features into continous and categorical columns

In [76]:
X_cont_test = X_test[:, 0:-1].astype(np.float32)
X_cat_test = X_test[:, -1::].astype(np.int64)

X_cont_train = X_train[:, 0:-1].astype(np.float32)
X_cat_train = X_train[:, -1::].astype(np.int64)

df_dict_train = {}
for i, col in enumerate(['BAT', 'EOT', 'LYT', 'MOT', 'HGB', 'MCHC', 'MCV', 'PLT', 'WBC', 'Age']):
    df_dict_train[col] = X_cont_train[:, i].reshape(-1)
df_dict_train['Sex'] = X_cat_train.reshape(-1)

df_dict_test = {}
for i, col in enumerate(['BAT', 'EOT', 'LYT', 'MOT', 'HGB', 'MCHC', 'MCV', 'PLT', 'WBC', 'Age']):
    df_dict_test[col] = X_cont_test[:, i].reshape(-1)
df_dict_test['Sex'] = X_cat_test.reshape(-1)

X_train_df = pd.DataFrame.from_dict(df_dict_train)
X_test_df = pd.DataFrame.from_dict(df_dict_test)
X_train_df

Unnamed: 0,BAT,EOT,LYT,MOT,HGB,MCHC,MCV,PLT,WBC,Age,Sex
0,-0.732260,-0.523441,-1.560120,-0.811280,0.318811,-0.623424,0.294449,1.678910,2.111205,0.843714,0
1,-1.040951,-0.668417,-1.191401,-1.741668,-0.633509,-0.623424,-0.251557,0.547091,-0.120083,1.046236,0
2,2.045966,-0.668417,0.125455,-0.743203,-2.311406,-0.555675,4.038493,-1.837506,-1.435251,-0.472679,1
3,-0.732260,-0.668417,-1.349423,-1.265128,-0.814903,-0.216929,3.586088,-0.662488,-0.614676,2.160106,1
4,0.502508,-0.185161,0.380047,0.323340,-0.860252,0.054068,0.216448,0.408853,-0.712096,0.742453,0
...,...,...,...,...,...,...,...,...,...,...,...
747,0.304867,0.385172,0.917280,0.364229,-1.313880,-2.168232,-2.613464,1.026075,-0.089024,-0.727837,0
748,1.218694,0.864542,1.039538,0.044631,0.790420,0.433487,-0.158549,0.139987,-0.214752,0.953033,1
749,1.119891,0.170646,1.525416,-0.159256,1.230482,-0.084941,-0.453672,0.411987,0.268896,-0.224773,1
750,0.525277,0.650633,0.062632,-0.014814,0.280153,0.258981,-0.054766,-0.291329,-0.699228,-0.078839,1


### Train with XGBoost

In [77]:

import xgboost as xgb

xg = xgb.XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.5, learning_rate = 0.001,
                max_depth = 5, alpha = 5, n_estimators = 2)
                
xg.fit(X_train, y_train)
y_pred = xg.predict(X_test) > 0.5
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1, drop_intermediate=False)
auc_val = auc(fpr, tpr)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'Accuracy: {acc}')
print(f'Specificity: {tn / (tn+fp)}')
print(f'Sensitivity: {tp / (tp+fn)}')
print(f'F1: {f1}')
print(f"AUC: {auc_val}") 

Accuracy: 0.6451612903225806
Specificity: 0.7575757575757576
Sensitivity: 0.5172413793103449
F1: 0.576923076923077
AUC: 0.6374085684430512




### Train with CatBoost

In [78]:
from catboost import CatBoostClassifier

CAT_FEATURES = [10]
print(list(X_train))
# set up the model
# 1 e-4 uck, zenodo
# 1e-3 uck balanced
catboost_model = CatBoostClassifier(loss_function='CrossEntropy', learning_rate=1e-3)
# fit model
catboost_model.fit(X_train_df, y_train, 
                   eval_set = (X_test_df, y_test),
                   cat_features=CAT_FEATURES,
                   use_best_model = True)
y_pred = catboost_model.predict(X_test_df) > 0.5
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1, drop_intermediate=False)
auc_val = auc(fpr, tpr)

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'Accuracy: {acc}')
print(f'Specificity: {tn / (tn+fp)}')
print(f'Sensitivity: {tp / (tp+fn)}')
print(f'F1: {f1}')
print(f"AUC: {auc_val}") 

[array([-0.73225957, -0.52344057, -1.56012005, -0.81127987,  0.31881123,
       -0.62342387,  0.29444923,  1.67891009,  2.11120497,  0.84371377,
       -0.93920915]), array([-1.04095135, -0.66841743, -1.19140053, -1.74166782, -0.63350877,
       -0.62342387, -0.25155715,  0.54709057, -0.12008339,  1.0462357 ,
       -0.93920915]), array([ 2.04596644, -0.66841743,  0.12545489, -0.7432027 , -2.3114059 ,
       -0.55567472,  4.03849301, -1.83750627, -1.43525084, -0.47267874,
        1.06472558]), array([-0.73225957, -0.66841743, -1.34942318, -1.26512765, -0.81490305,
       -0.21692897,  3.58608772, -0.66248753, -0.61467628,  2.16010629,
        1.06472558]), array([ 0.50250755, -0.18516123,  0.38004694,  0.32333959, -0.86025162,
        0.05406763,  0.21644832,  0.40885308, -0.71209609,  0.74245281,
       -0.93920915]), array([-1.04095135, -0.62009181,  0.03766453, -0.03973864,  0.95369123,
        0.46056252,  0.62205306, -0.56744925, -0.27370694,  0.89434426,
       -0.93920915]), arr

### Train with fully-connected neural network

In [79]:
import tensorflow as tf

input = tf.keras.layers.Input(shape=(11,))

x = tf.keras.layers.Dense(100, activation='relu', kernel_initializer='glorot_uniform')(input)
x = tf.keras.layers.Dropout(.5)(x)
x = tf.keras.layers.Dense(200, activation='relu', kernel_initializer='glorot_uniform')(input)
x = tf.keras.layers.Dropout(.5)(x)
x = tf.keras.layers.Dense(400, activation='relu', kernel_initializer='glorot_uniform')(input)
x = tf.keras.layers.Dropout(.5)(x)
x = tf.keras.layers.Dense(200, activation='relu', kernel_initializer='glorot_uniform')(x)
x = tf.keras.layers.Dropout(.5)(x)

x = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer='glorot_uniform')(x)
model = tf.keras.Model(inputs=input, outputs=x)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=100, batch_size=512)
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print('\nTest accuracy:', test_acc)

Epoch 1/100


  return dispatch_target(*args, **kwargs)


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [80]:
y_pred = (model.predict(X_test) > 0.5).astype(int)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1, drop_intermediate=False)
auc_val = auc(fpr, tpr)

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'Accuracy: {acc}')
print(f'Specificity: {tn / (tn+fp)}')
print(f'Sensitivity: {tp / (tp+fn)}')
print(f'F1: {f1}')
print(f"AUC: {auc_val}") 


Accuracy: 0.7311827956989247
Specificity: 0.7575757575757576
Sensitivity: 0.7011494252873564
F1: 0.7093023255813954
AUC: 0.7293625914315569
