# Preprocess UCK and Zenodo datasets for COVID-19 severity prognosis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

Read dataframes.

In [2]:
# Load the Zenodo and UCK detection tables (Zenodo is read first, then UCK).
df_zenodo, df_uck = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/processed/auxiliary/zenodo_detection.csv',
        '../datasets/processed/auxiliary/uck_detection1.csv',
    )
)

In [3]:
# Class balance per source: rows per `target` value, Zenodo first, then UCK.
print(
    df_zenodo['target'].value_counts(),
    df_uck['target'].value_counts(),
    sep='\n',
)

0    920
1    816
Name: target, dtype: int64
0    16989
1      463
Name: target, dtype: int64


Combine all rows from both dataframes (Zenodo first, then UCK) into a single joint, imbalanced dataframe.

In [4]:
# Stack every Zenodo row on top of every UCK row (no re-balancing).
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` yields the same
# frame, keeping each source's original index labels.
df_imbalanced = pd.concat([df_zenodo, df_uck])

# NOTE(review): the index is written as an extra unnamed column, and the frame
# already carries 'Unnamed: 0' / 'Unnamed: 0.1' columns (see the output below).
# Left unchanged here because downstream readers may rely on this layout.
df_imbalanced.to_csv('../datasets/processed/imbalanced_detection2.csv')
df_imbalanced

Unnamed: 0.1,Unnamed: 0,WBC,HGB,MCV,MCHC,PLT,LYT,MOT,EOT,BAT,Age,Sex,target
0,0,9.90,13.250000,100.850000,34.45,200.500000,1.20,0.8,0.3,0.0,82.0,1.0,0
1,1,9.20,14.900000,82.000000,34.90,337.000000,,,,,51.0,1.0,1
2,2,6.70,14.050000,84.300000,34.00,200.000000,0.75,0.5,0.0,0.0,58.0,1.0,0
3,3,9.70,14.500000,84.600000,34.30,209.500000,2.60,0.7,0.1,0.1,82.0,0.0,0
4,4,1.00,9.233333,98.433333,32.30,55.333333,0.50,0.1,0.1,0.0,79.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17447,30276,7.94,12.500000,90.000000,32.50,112.000000,24.70,8.4,0.9,0.1,73.0,1.0,0
17448,30163,4.66,9.100000,88.500000,35.80,133.000000,23.60,11.4,0.0,0.6,79.0,1.0,0
17449,30272,16.23,11.600000,86.900000,34.20,224.000000,14.10,6.7,0.1,0.2,29.0,0.0,0
17450,30266,12.24,7.300000,84.900000,34.10,116.000000,10.00,6.9,0.0,0.2,66.0,0.0,0


Add only as many complete (no missing values) Cov- patients as there are Cov+ patients in the UCK dataset. Add all rows from Zenodo.

In [5]:
# Number of UCK rows with target == 1; reused below as the negative sample size.
samples_count = int(df_uck['target'].eq(1).sum())
samples_count

463

In [6]:
# Build a class-balanced UCK subset: `samples_count` negatives drawn from rows
# without any missing value, plus every positive row (positives are NOT
# filtered for missing values).
df_uck_balanced_neg = df_uck.dropna()
df_uck_balanced_neg = df_uck_balanced_neg.loc[df_uck_balanced_neg['target'] == 0]

# Fixed seed so that Restart & Run All draws the same negatives every time;
# without it the saved balanced CSV changes on each execution.
# `sample` raises ValueError if fewer than `samples_count` complete negatives exist.
df_uck_balanced_neg = df_uck_balanced_neg.sample(n=samples_count, random_state=42)

df_uck_balanced_pos = df_uck.loc[df_uck['target'] == 1]

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent
# (negatives first, then positives, original index labels preserved).
df_uck_balanced = pd.concat([df_uck_balanced_neg, df_uck_balanced_pos])

In [7]:
# All Zenodo rows followed by the class-balanced UCK subset.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` yields the same
# frame, keeping each source's original index labels.
df_balanced = pd.concat([df_zenodo, df_uck_balanced])

# NOTE(review): the index is written as an extra unnamed column; left unchanged
# because downstream readers may rely on the current file layout.
df_balanced.to_csv('../datasets/processed/balanced_detection2.csv')
df_balanced

Unnamed: 0.1,Unnamed: 0,WBC,HGB,MCV,MCHC,PLT,LYT,MOT,EOT,BAT,Age,Sex,target
0,0,9.90,13.250000,100.850000,34.45,200.500000,1.20,0.8,0.3,0.0,82.0,1.0,0
1,1,9.20,14.900000,82.000000,34.90,337.000000,,,,,51.0,1.0,1
2,2,6.70,14.050000,84.300000,34.00,200.000000,0.75,0.5,0.0,0.0,58.0,1.0,0
3,3,9.70,14.500000,84.600000,34.30,209.500000,2.60,0.7,0.1,0.1,82.0,0.0,0
4,4,1.00,9.233333,98.433333,32.30,55.333333,0.50,0.1,0.1,0.0,79.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17416,30199,2.10,8.800000,83.400000,30.80,70.000000,19.00,11.0,0.0,1.0,63.0,0.0,1
17419,30203,10.08,8.600000,88.400000,34.30,556.000000,14.80,4.2,1.0,0.6,39.0,1.0,1
17424,30301,1.35,8.800000,84.500000,34.40,30.000000,19.30,11.9,0.0,0.0,73.0,1.0,1
17425,30140,8.36,9.900000,93.200000,33.00,240.000000,18.90,6.3,1.1,0.4,77.0,0.0,1


In [8]:
# Non-null value count of each feature, split by `target`, for the balanced set.
counted_features = [
    'BAT', 'EOT', 'LYT', 'MOT', 'HGB', 'MCHC', 'MCV', 'PLT', 'WBC',
    'Age', 'Sex',
]
df_balanced.pivot_table(index=['target'], values=counted_features, aggfunc='count')

Unnamed: 0_level_0,Age,BAT,EOT,HGB,LYT,MCHC,MCV,MOT,PLT,Sex,WBC
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1329,1078,1078,1328,1078,1328,1328,1078,1328,1383,1328
1,1279,1219,1219,1271,1219,1271,1271,1219,1271,1279,1271


In [9]:
# Non-null value count of each feature, split by `target`, for the imbalanced set.
counted_features = [
    'BAT', 'EOT', 'LYT', 'MOT', 'HGB', 'MCHC', 'MCV', 'PLT', 'WBC',
    'Age', 'Sex',
]
df_imbalanced.pivot_table(index=['target'], values=counted_features, aggfunc='count')

Unnamed: 0_level_0,Age,BAT,EOT,HGB,LYT,MCHC,MCV,MOT,PLT,Sex,WBC
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,17855,17519,17518,17854,17519,17853,17853,17518,17853,17909,17852
1,1279,1219,1219,1271,1219,1271,1271,1219,1271,1279,1271
