## Import libraries

In [1]:
## Import libraries
import pandas as pd
import numpy as np
import scipy.stats as st
from sklearn.preprocessing import StandardScaler, PowerTransformer
import sys
import re
import os

# Make the TeX executables discoverable by programs launched from this kernel.
# The directory is appended only if it is not already listed, so re-running
# this cell does not keep growing PATH.
tex_bin_dir = '/Library/TeX/texbin'
current_path = os.environ.get('PATH', '')
if tex_bin_dir not in current_path.split(os.pathsep):
    os.environ['PATH'] = current_path + os.pathsep + tex_bin_dir if current_path else tex_bin_dir

# Make the project's `Modules` package importable (needed by the import below).
# Guarded for the same reason: a re-run must not add duplicate entries.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from the notebook location or an environment variable.
repo_root = '/Users/riccardo/Documents/GitHub/COVID19Classification/'
if repo_root not in sys.path:
    sys.path.append(repo_root)

# Record the interpreter version in the cell output
print(sys.version)

from Modules import Parameters

3.6.12 |Anaconda, Inc.| (default, Sep  8 2020, 17:50:39) 
[GCC Clang 10.0.0 ]


## Import data

In [2]:
def _read_patient_table(path):
    """Read one patient spreadsheet with openpyxl and drop its 'Unnamed: 0' column."""
    return pd.read_excel(path, engine='openpyxl').drop(columns=['Unnamed: 0'])


# Hospitalized patients (inpatients)
file_name = 'DataInpatients_CCIMasked.xlsx'
path_import = Parameters.path_datasets + file_name
DataInpatients_0 = _read_patient_table(path_import)
print('Shape inpatients data:', DataInpatients_0.shape)

# Non-hospitalized patients (outpatients)
file_name = 'DataOutpatients.xlsx'
path_import = Parameters.path_datasets + file_name
DataOutpatients_0 = _read_patient_table(path_import)
print('Shape outpatients data:', DataOutpatients_0.shape)

Shape inpatients data: (826, 94)
Shape outpatients data: (462, 98)


## Preprocessing

### Format nans

In [3]:
def _nulls_to_nan(frame):
    """Return a copy of `frame` whose null entries are all np.nan.

    Every entry flagged by `notnull()` as missing is first overwritten with a
    sentinel (-1e100) and then every sentinel is replaced by np.nan. Any entry
    that was already equal to the sentinel therefore also ends up as np.nan.
    """
    sentinel = -1e100
    work = frame.copy()
    work = work.where(work.notnull().values, sentinel)
    return work.where(work.values != sentinel, np.nan)


# Normalize missing values in both raw tables; the *_0 frames are left untouched
DataInpatients_1 = _nulls_to_nan(DataInpatients_0)

DataOutpatients_1 = _nulls_to_nan(DataOutpatients_0)

### Neutro, Baso, Eosino

In [4]:
def _add_neutro_ba_eu(frame):
    """Insert the 'NeutroBaEu/uL' column into `frame`, in place.

    The column is 'WBC/uL' - ('Mono/uL' + 'Linfo/uL'), with negative results
    set to 0, and is placed immediately after 'WBC/uL'. If the column already
    exists the frame is left unchanged, so the cell can be re-run safely.
    """
    if 'NeutroBaEu/uL' in frame.columns:
        return
    remainder = frame['WBC/uL'].values - (frame['Mono/uL'].values + frame['Linfo/uL'].values)
    # NaN entries compare False here and therefore stay NaN
    remainder[remainder < 0] = 0
    insert_at = frame.columns.get_loc('WBC/uL') + 1
    frame.insert(loc=insert_at, column='NeutroBaEu/uL', value=remainder)


# Each table is checked on its own: DataFrame.insert raises if the column is
# already present, so a single guard on the inpatient table is not enough.
for _frame in (DataInpatients_1, DataOutpatients_1):
    _add_neutro_ba_eu(_frame)

### $\Delta t_{\mathrm{onset}}$

In [5]:
# Names of the inpatient columns that hold dates (those containing the 'date#' tag)
dates = [date for date in DataInpatients_1.columns if 'date#' in date]

# Build 'delta_onset' only if the inpatient table does not have it yet.
# NOTE(review): the guard looks at DataInpatients_1 only, although
# DataOutpatients_1 is modified below as well — confirm the two tables are
# always in the same state.
if 'delta_onset' not in DataInpatients_1.columns:
    
    # Find time elapsed since ref. timepoint for each date:
    # every date column is converted to a number of whole days since ref_date,
    # with null dates mapped to NaN.
    ref_date_str = '01-01-2020'
    ref_date = pd.Timestamp(ref_date_str)
    d_dates = {}
    for date in dates:
        d_dates[date] = np.array([(element-ref_date).days if pd.notnull(element) else np.nan for element in DataInpatients_1[date]])
    # One row per patient, labelled by the patient 'ID'
    d_dates_df = pd.DataFrame(d_dates, columns=dates, index=DataInpatients_1['ID'])

    # Add delta_onset column to DataInpatients_1
    # delta_onset = (day of flow-cytometry exam) - (day of onset); only values
    # inside [donset_min, donset_max] (both bounds inclusive) are kept.
    delta_onset_lowerbound = Parameters.donset_min
    delta_onset_upperbound = Parameters.donset_max
    delta_onset_df = pd.DataFrame(d_dates_df['date#flowcyt_exam'].values - d_dates_df['date#onset'].values, columns=['delta_onset'], index=d_dates_df.index)
    # v is the single-column 2-D array behind delta_onset_df
    v = delta_onset_df.values
    # NaN fails both comparisons, so the notnull term only makes that explicit
    mask = (v>=delta_onset_lowerbound) & (v<=delta_onset_upperbound) & (pd.notnull(v))
    delta_onset_df_masked = delta_onset_df.loc[mask, :].copy()
    # Outpatients get the column too, filled with NaN only
    DataOutpatients_1['delta_onset'] = np.nan
    DataInpatients_1['delta_onset'] = np.nan
    # Temporarily index inpatients by 'ID' so the in-range values can be
    # written back by ID label; rows outside the range keep NaN.
    # NOTE(review): label-based assignment presumably relies on 'ID' being
    # unique — verify, otherwise values may be written to the wrong rows.
    DataInpatients_1 = DataInpatients_1.set_index(DataInpatients_1['ID'], drop=False)
    DataInpatients_1.loc[delta_onset_df_masked.index, 'delta_onset'] = delta_onset_df_masked.values
    # Back to a plain positional index ('ID' is still available as a column)
    DataInpatients_1.reset_index(drop=True, inplace=True)
    print('N. delta_onset data:', sum(DataInpatients_1['delta_onset'].notna()))

N. delta_onset data: 718


### Outpatients categories: no covid, no hospitalization

In [6]:
# The outpatient subsets are restricted to the columns of the inpatient table
columns_covid_data = DataInpatients_1.columns

# Row selectors on the outpatient table
mask_noCovid = DataOutpatients_1['COVID '] == 0       # 'COVID ' flag equal to 0
mask_Covid = ~mask_noCovid                            # every other row
mask_noAdmission = DataOutpatients_1['Admission'] == 0

# Outpatients whose 'COVID ' flag is 0
Data_noCovid = DataOutpatients_1.loc[mask_noCovid, columns_covid_data].copy()
print('Data no covid shape:', Data_noCovid.shape)

# Remaining outpatients whose 'Admission' flag is 0
rows_noAdmission = mask_Covid & mask_noAdmission
Data_noAdmission = DataOutpatients_1.loc[rows_noAdmission, columns_covid_data].copy()
print('Data no admission shape:', Data_noAdmission.shape)

# Union of the two groups above
rows_control = mask_noCovid | mask_noAdmission
DataOutpatients = DataOutpatients_1.loc[rows_control, columns_covid_data].copy()
print('Data control shape:', DataOutpatients.shape)

Data no covid shape: (95, 96)
Data no admission shape: (367, 96)
Data control shape: (462, 96)


### Age masking

In [7]:
## Age range, read from the Parameters module
min_age = Parameters.age_min # int>=0
max_age = Parameters.age_max # int>lower_bound
# True when min_age is positive or max_age is below 150
age_masking = (min_age>0) | (max_age<150)
title = 'Age between %d and %d' % (min_age, max_age)

## Masking: keep rows whose rounded age is >= min_age and strictly < max_age
# Inpatients
age = np.round(DataInpatients_1['age'].values)
age_mask = (min_age <= age) & (max_age > age)
DataInpatients_2 = DataInpatients_1.loc[age_mask, :].copy()
# Outpatients (same rule, applied to the full outpatient table)
age = np.round(DataOutpatients_1['age'].values)
age_mask = (min_age <= age) & (max_age > age)
DataOutpatients_2 = DataOutpatients_1.loc[age_mask, :].copy()

# Report the age span that survives in the inpatient table
youngest = min(DataInpatients_2['age'])
oldest = max(DataInpatients_2['age'])
print('Min. age:', np.round(youngest))
print('Max. age:', np.round(oldest))
print('Title:', title)

Min. age: 30.0
Max. age: 98.0
Title: Age between 30 and 100


### $\Delta t_{\mathrm{onset}}$ masking 

In [18]:
## delta_onset range, read from the Parameters module
min_donset = Parameters.donset_min # int>=0
max_donset = Parameters.donset_max # int>lower_bound
# NOTE(review): with min_donset >= 0 this flag is always True; the analogous
# age flag uses strict comparisons — confirm which form is intended.
donset_masking = (min_donset>=0) | (max_donset<=30)

## Masking: keep inpatients whose rounded delta_onset lies in
## [min_donset, max_donset] (both inclusive); NaN rows are dropped because
## they fail both comparisons. Outpatients are passed through unchanged.
donset = np.round(DataInpatients_2['delta_onset'].values)
donset_mask = (min_donset <= donset) & (max_donset >= donset)
DataInpatients_3 = DataInpatients_2.loc[donset_mask, :].copy()
DataOutpatients_3 = DataOutpatients_2.copy()

# Report the surviving range and the resulting table sizes
shortest = min(DataInpatients_3['delta_onset'])
longest = max(DataInpatients_3['delta_onset'])
print('Min. donset:', np.round(shortest), '(>=%d)'%min_donset)
print('Max. donset:', np.round(longest), '(<=%d)'%max_donset)
print('Data inpatients shape:', DataInpatients_3.shape)
print('Data outpatients shape:', DataOutpatients_3.shape)

Min. donset: 1.0 (>=0)
Max. donset: 29.0 (<=30)
Data inpatients shape: (711, 96)
Data outpatients shape: (382, 100)


### Outliers removal

In [19]:
# Need to define variables of interest

DataInpatients_4 = DataInpatients_3.copy()