## Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PowerTransformer

## Custom modules
import sys
import os
# Extend PATH with the TeX binaries directory so child processes can find them
# (presumably needed for LaTeX-rendered figure text — TODO confirm)
os.environ['PATH'] = os.environ['PATH'] + ':/Library/TeX/texbin'
# Append the parent directory to the sys.path
# os.path.abspath('') resolves to the current working directory, so the
# project-local 'Modules' package is looked up one level above it
this_path = os.path.abspath('') 
parent_dir = os.path.dirname(this_path)  
sys.path.append(parent_dir)
# Project-local settings (dataset paths, thresholds, variable sets) used below
from Modules import Parameters

## Import data

In [None]:
# Load the two anonymized workbooks from the datasets folder: first the
# hospitalized patients, then the non-hospitalized ones. The shape of each
# table is printed right after it has been read.
raw_tables = []
for file_name, shape_label in (
    ('DataInpatients_anonymized.xlsx', 'Shape inpatients data:'),
    ('DataOutpatients_anonymized.xlsx', 'Shape outpatients data:'),
):
    path_import = Parameters.path_datasets + file_name
    table = pd.read_excel(path_import, engine='openpyxl')
    print(shape_label, table.shape)
    raw_tables.append(table)

DataInpatients_0, DataOutpatients_0 = raw_tables

## Preprocessing

### Format nans

In [None]:
def _nulls_to_nan(frame, sentinel=-1e100):
    """Return a copy of *frame* in which every null entry is np.nan.

    Null entries (as reported by ``notnull``) are first overwritten with
    *sentinel*, then every cell equal to *sentinel* is set to np.nan.
    A genuine value equal to *sentinel* is therefore turned into np.nan too.
    """
    filled = frame.copy()
    filled = filled.where(filled.notnull().values, sentinel)
    return filled.where(filled.values != sentinel, np.nan)


# Apply the same null normalization to both raw tables
DataInpatients_1 = _nulls_to_nan(DataInpatients_0)

DataOutpatients_1 = _nulls_to_nan(DataOutpatients_0)

### Granulocytes

In [None]:
# Add a 'Granulo/uL' column computed as WBC minus (monocytes + lymphocytes)
# and place it immediately after the 'WBC/uL' column.
# Each table is checked on its own: a table that already has the column is
# left untouched, while the other one still receives it. Checking only the
# inpatient table would either skip the outpatient table or try to insert a
# duplicate column into it.
for frame in (DataInpatients_1, DataOutpatients_1):
    if 'Granulo/uL' in frame.columns:
        continue
    new_idx = frame.columns.get_loc('WBC/uL') + 1
    v = frame['WBC/uL'].values - (frame['Mono/uL'].values + frame['Lymph/uL'].values)
    # Negative differences are clamped to zero; NaN entries compare False
    # against 0 and therefore stay NaN
    v[v<0] = 0
    frame.insert(loc=new_idx, column='Granulo/uL', value=v)

### $\Delta t_{\mathrm{ons}}$ 

In [None]:
# Names of all columns whose label contains the 'date#' tag
dates = [column for column in DataInpatients_1.columns if 'date#' in column]

if 'delta_onset' not in DataInpatients_1.columns:

    # Express every date as whole days elapsed since a common reference day;
    # null entries are kept as NaN
    ref_date_str = '01-01-2020'
    ref_date = pd.Timestamp(ref_date_str)
    d_dates = {
        column: np.array([
            (stamp - ref_date).days if pd.notnull(stamp) else np.nan
            for stamp in DataInpatients_1[column]
        ])
        for column in dates
    }
    d_dates_df = pd.DataFrame(d_dates, columns=dates, index=DataInpatients_1['ID'])

    # delta_onset = day of the flow-cytometry exam minus day of onset
    date_flowcyt_exam = d_dates_df['date#flowcyt_exam'].values
    date_onset = d_dates_df['date#onset'].values
    delta_onset_df = pd.DataFrame(
        {'delta_onset': date_flowcyt_exam - date_onset},
        index=d_dates_df.index,
    )
    v = delta_onset_df.values
    DataInpatients_1['delta_onset'] = v
    # The outpatient table gets the column too, filled with NaN only
    DataOutpatients_1['delta_onset'] = np.nan
    n_available = DataInpatients_1['delta_onset'].notna().sum()
    print('N. delta_onset data:', n_available)

### Outpatients categories: no covid, no hospitalization

In [None]:
# Every outpatient subset is restricted to the columns of the inpatient table
columns_covid_data = DataInpatients_1.columns

# Outpatients whose 'COVID ' flag equals 0
mask_noCovid = DataOutpatients_1['COVID '].eq(0)
Data_noCovid = DataOutpatients_1.loc[mask_noCovid, columns_covid_data].copy()
print('Data no covid shape:', Data_noCovid.shape)

# Remaining outpatients whose 'admission' flag equals 0
mask_Covid = ~mask_noCovid
mask_noAdmission = DataOutpatients_1['admission'].eq(0)
covid_not_admitted = mask_Covid & mask_noAdmission
Data_noAdmission = DataOutpatients_1.loc[covid_not_admitted, columns_covid_data].copy()
print('Data no admission shape:', Data_noAdmission.shape)

# NOTE(review): the mask below is built on DataOutpatients_1 but applied to
# Data_noAdmission, whose rows all satisfy mask_noAdmission. The selection
# therefore keeps every row of Data_noAdmission and none of Data_noCovid.
# Confirm whether DataOutpatients_1 was the intended source table.
covid_free_or_not_admitted = mask_noCovid | mask_noAdmission
DataOutpatients = Data_noAdmission.loc[covid_free_or_not_admitted, columns_covid_data].copy()
print('Data control shape:', DataOutpatients.shape)

### Age masking (inpatients only)

In [None]:
## Age range (read from the project parameters)
min_age, max_age = Parameters.age_min, Parameters.age_max  # min: int>=0, max: int>min
# Flag set when either bound departs from the 0-150 range
age_masking = (min_age > 0) | (max_age < 150)

## Masking
# Keep inpatients whose rounded age lies in [min_age, max_age); a missing age
# compares False on both sides and is therefore dropped
age = np.round(DataInpatients_1['age'].to_numpy())
age_mask = (min_age <= age) & (age < max_age)
DataInpatients_2 = DataInpatients_1[age_mask].copy()

kept_ages = DataInpatients_2['age']
print('Min. age:', np.round(min(kept_ages)))
print('Max. age:', np.round(max(kept_ages)))
print('Data inpatients shape:', DataInpatients_2.shape)
print('Data outpatients shape:', DataOutpatients.shape)

### $\Delta t_{\mathrm{ons}}$ masking (inpatients only)

In [None]:
## delta_onset range (read from the project parameters)
min_donset, max_donset = Parameters.donset_min, Parameters.donset_max  # min: int>=0, max: int>min
# Flag computed from the two bounds; it is not used by the masking below
donset_masking = (min_donset >= 0) | (max_donset <= 30)

## Masking
# Keep inpatients whose rounded delta_onset lies in [min_donset, max_donset];
# a missing delta_onset compares False on both sides and is therefore dropped
donset = np.round(DataInpatients_2['delta_onset'].to_numpy())
donset_mask = (min_donset <= donset) & (donset <= max_donset)
DataInpatients_3 = DataInpatients_2[donset_mask].copy()

kept_donset = DataInpatients_3['delta_onset']
print('Min. donset:', np.round(min(kept_donset)), '(>=%d)'%min_donset)
print('Max. donset:', np.round(max(kept_donset)), '(<=%d)'%max_donset)
print('Data inpatients shape:', DataInpatients_3.shape)
print('Data outpatients shape:', DataOutpatients.shape)

### Outliers removal

In [None]:
# Variables of interest
immunecells_set = Parameters.immunecells_set
cytokines_set = Parameters.cytokines_set
demographics_set = Parameters.demographics_set
scores_set = Parameters.scores_set
biomarkers_set = Parameters.biomarkers_set
output_set = Parameters.output_set
allinput_set = Parameters.allinput_set

# Final dataset
DataInpatients = DataInpatients_3.copy()

# For every input variable: standardize, power-transform, and blank out
# (set to NaN) the entries whose transformed magnitude exceeds the threshold.
std = StandardScaler()
ptr = PowerTransformer()
z_score_th = Parameters.z_score_th
for name in allinput_set:
    z = ptr.fit_transform(std.fit_transform(DataInpatients[name].values.reshape(-1, 1)))
    # Flatten the (n, 1) transformer output into one boolean flag per row;
    # NaN entries compare False and are never flagged
    is_outlier = np.asarray(np.abs(z) > z_score_th).ravel()
    n_outliers = int(is_outlier.sum())
    if n_outliers:
        # Assign the masked column back explicitly. An in-place `where` on
        # the `DataInpatients[name]` selection is a chained update that does
        # not reach the DataFrame under pandas Copy-on-Write.
        DataInpatients[name] = DataInpatients[name].where(~is_outlier)

## Export preprocessed data

In [None]:
# Write both preprocessed tables to the datasets folder, without the index
# column: inpatients first, then outpatients.
for table, file_name in (
    (DataInpatients, 'DataInpatients_preprocessed.xlsx'),
    (DataOutpatients, 'DataOutpatients_preprocessed.xlsx'),
):
    path = Parameters.path_datasets + file_name
    table.to_excel(path, index=False)