## <center> LOADING THE DATASET IN PYTHON </center>

## Loading the dataset in Python to preserve pandas datatypes

In [1]:
import pandas as pd

### Option 1 - Loading as HDF5 file

In [2]:
# Option 1 - Loading as HDF5 file
# Reading the HDF5 copy returns the frame with its pandas dtypes already set.
df = pd.read_hdf('Databases/ICH_database.hdf5')
df.head(4)

Unnamed: 0,patient,sex,hospital,follow_up,final_outcome,nfamily_medhist,tobacco,n_tobacco,drugs,alcohol,...,rdw,mchc,mpv,mch,inr,fibrinogen,maxfibrinogen,time_between_CT_bloodanalysis,age,survival_days
0,1,1,1.0,2.0,2.0,17,2.0,,0,0,...,17.0,33.0,9.0,28.1,4.21,344.0,618.0,0,74,4
1,2,2,2.0,0.0,1.0,4,0.0,0.0,0,0,...,14.7,32.5,10.7,29.8,,,1081.0,0,81,-1000
2,3,2,1.0,0.0,0.0,8,0.0,0.0,0,0,...,14.0,33.1,8.7,30.1,3.16,298.0,470.0,0,78,-1000
3,4,2,1.0,2.0,2.0,8,0.0,0.0,0,0,...,15.8,34.1,7.6,19.3,1.09,344.0,344.0,0,79,7


In [3]:
# Show the dtypes of the first four columns of the HDF5-loaded frame
df.dtypes.iloc[:4]

patient         int64
sex          category
hospital     category
follow_up    category
dtype: object

### Option 2 - Loading as CSV

In [4]:
# Option 2 - Loading as CSV

# Load the metadata and extract the row positions flagged as 'category'.
# NOTE(review): these positions are later used to index df2.columns, which
# assumes the metadata rows are in the same order as the CSV columns - confirm.
df2_metadata = pd.read_csv('Databases/ICH_database_metadata.csv')
categorical = df2_metadata.loc[df2_metadata['Pandas_Datatype'] == 'category'].index

# Load the data, then cast the flagged columns to the categorical dtype
df2 = pd.read_csv('Databases/ICH_database.csv', keep_default_na=True)
df2 = df2.astype({name: 'category' for name in df2.columns[categorical]})
df2.head()

Unnamed: 0,patient,sex,hospital,follow_up,final_outcome,nfamily_medhist,tobacco,n_tobacco,drugs,alcohol,...,rdw,mchc,mpv,mch,inr,fibrinogen,maxfibrinogen,time_between_CT_bloodanalysis,age,survival_days
0,1,1,1.0,2.0,2.0,17,2.0,,0,0,...,17.0,33.0,9.0,28.1,4.21,344.0,618.0,0,74,4
1,2,2,2.0,0.0,1.0,4,0.0,0.0,0,0,...,14.7,32.5,10.7,29.8,,,1081.0,0,81,-1000
2,3,2,1.0,0.0,0.0,8,0.0,0.0,0,0,...,14.0,33.1,8.7,30.1,3.16,298.0,470.0,0,78,-1000
3,4,2,1.0,2.0,2.0,8,0.0,0.0,0,0,...,15.8,34.1,7.6,19.3,1.09,344.0,344.0,0,79,7
4,5,1,2.0,0.0,1.0,4,0.0,0.0,0,1,...,13.4,33.5,7.8,30.7,1.01,,,0,86,1016


In [5]:
# Show the dtypes of the first four columns of the CSV-loaded frame
df2.dtypes.iloc[:4]

patient         int64
sex          category
hospital     category
follow_up    category
dtype: object

In [6]:
# All the variables containing NaN are converted to float64 when loaded.
# After converting some variables into categories, the following dtypes remain:

def countif(nan, dtype, frame=None):
    """Return the column names that match a missing-value condition and a dtype.

    Parameters
    ----------
    nan : bool
        True selects columns holding at least one missing value;
        False selects columns with no missing values.
    dtype : str or dtype
        Datatype the column must have; it is compared with ``==`` against the
        column's dtype (e.g. 'int64', 'float64', 'category').
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used,
        which is the behaviour of the original two-argument call.

    Returns
    -------
    list
        Matching column labels, in the frame's column order.
    """
    if frame is None:
        # Fall back to the notebook-level frame so existing calls keep working
        frame = df
    return [
        column
        for column in frame.columns
        if frame[column].isna().any() == nan and frame[column].dtype == dtype
    ]

In [7]:
# Each label joins the NaN condition ('nan' / 'nonan') with a column dtype
datatypes = [
    'nan_int64', 'nan_float64', 'nan_category',
    'nonan_int64', 'nonan_float64', 'nonan_category',
]

# Strip the 'nan_' / 'nonan_' prefix to recover the dtype handed to countif
counts_nan = [len(countif(True, label.split('_', 1)[1])) for label in datatypes[:3]]
counts_nonan = [len(countif(False, label.split('_', 1)[1])) for label in datatypes[3:]]

columns_dtypes = pd.DataFrame(
    data={'Datatype': datatypes, 'Count': counts_nan + counts_nonan},
    index=list(range(6)),
)
columns_dtypes

Unnamed: 0,Datatype,Count
0,nan_int64,0
1,nan_float64,31
2,nan_category,26
3,nonan_int64,16
4,nonan_float64,0
5,nonan_category,69


### Check that both loading options are equivalent

In [8]:
# Compare the HDF5-loaded frame (df) with the CSV-loaded one (df2);
# True means both loading options produced the same DataFrame.
df.equals(df2)

True