In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

In [3]:
database_type = "/full"

The `patient` table includes general information about the patient admissions (for example, demographics, admission and discharge details). 
See: http://eicu-crd.mit.edu/eicutables/patient/

#### Reading in lab dataset and keeping relevant columns to our study .

In [4]:
columns = ['patientunitstayid', 'admissionweight', 'age', 'gender']
patient = pd.read_csv('../../eICU' + database_type + '/patient.csv', usecols=columns)
patient

Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141764,Female,87,
1,141765,Female,87,46.5
2,143870,Male,76,77.5
3,144815,Female,34,60.3
4,145427,Male,61,91.7
...,...,...,...,...
2515,3351763,Female,62,134.5
2516,3352230,Male,41,127.0
2517,3352231,Male,41,127.0
2518,3352333,Male,72,68.3


#### Querying the dataframe to find all columns in the dataframe with has 'nan' or null values that fill an entire column

In [5]:
all_nan_cols = patient.columns[patient.isna().all()]
all_nan_cols

Index([], dtype='object')

> `all_nan_cols` were empty, which means that all of our columns have useful data.
This means that we must find nan values within the dataframe.

#### Querying the dataframe to find all columns in the dataframe which has 'nan' or null values that exist within useful data

In [6]:
all_nan_cols = patient.columns[patient.isna().any()]
all_nan_cols

Index(['gender', 'age', 'admissionweight'], dtype='object')

> The columns 
`'gender', 'age', 'ethnicity', 'apacheadmissiondx', 'admissionheight', 'hospitaladmitsource', 'hospitaldischargelocation', 'hospitaldischargestatus', 'unitadmitsource', 'admissionweight', 'dischargeweight', 'unitdischargelocation', 'unitdischargestatus'` 
were identified by the `df.isna().any()` function as columns that have nan/null values existing within them.

#### Filling null gender values with `Unknown`

In [7]:
patient['gender'].fillna('Unknown', inplace=True)

#### Replacing String with numerical value

In [8]:
patient['age'] = patient['age'].apply(lambda x : '89' if (x == '> 89') else x)
patient['age'] = patient['age'].apply(lambda x : '1' if (x == '0') else x)

patient.age.unique()

array(['87', '76', '34', '61', '55', '60', '28', '89', '59', '44', '66',
       '41', '63', '57', '52', '23', '73', '39', '20', '29', '71', '18',
       '83', '84', '47', '38', '46', '49', '81', '68', '51', '17', '85',
       '56', '80', '48', '74', '16', '64', '75', '67', '72', '82', '77',
       '78', '58', '88', '53', '54', '65', '69', '62', '50', '42', '32',
       '25', '40', '43', '79', '70', '86', '30', '37', '26', '19', '31',
       '21', '45', nan, '22', '27', '33', '35', '24', '36', '15'],
      dtype=object)

#### Finding out how much null ages there are

In [9]:
na = patient.age.isnull().sum(axis = 0)
na

4

we can drop these null values afterwards to test and play around with results

#### Convert to Float, fill null values with the mean age, convert to integer

In [10]:
patient = patient.astype({'age': float})

In [11]:
import math

patient['age'].fillna(math.floor(patient['age'].mean()), inplace=True)

In [12]:
patient = patient.astype({'age': int})

In [13]:
patient['age'].unique()

array([87, 76, 34, 61, 55, 60, 28, 89, 59, 44, 66, 41, 63, 57, 52, 23, 73,
       39, 20, 29, 71, 18, 83, 84, 47, 38, 46, 49, 81, 68, 51, 17, 85, 56,
       80, 48, 74, 16, 64, 75, 67, 72, 82, 77, 78, 58, 88, 53, 54, 65, 69,
       62, 50, 42, 32, 25, 40, 43, 79, 70, 86, 30, 37, 26, 19, 31, 21, 45,
       22, 27, 33, 35, 24, 36, 15], dtype=int64)

#### Using ages to fill in null admission weights

In [14]:
# set all admissionweight = 0 to null
patient['admissionweight'] = patient['admissionweight'].replace({0:np.nan})
na = patient.admissionweight.isnull().sum(axis = 0)
na

198

#### Females and associated weights:  
    '<20', 116,  
    '20-39', 167.6,  
    '40-59', 176.4,  
    '>60', 166.5  

#### Male and associated weights:  
    '<20', 116,  
    '20-39', 196.9,  
    '40-59', 200.9,  
    '>60', 194.7  

In [15]:
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

#### Filling for Females

In [16]:
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 167.6, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 176.4, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 166.5, patient['admissionweight'])

#### Filling for Unknown Genders

In [17]:
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

In [18]:
print('Null values in patient admission weight: ', patient.admissionweight.isnull().sum(axis = 0))
patient

Null values in patient admission weight:  0


Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141764,Female,87,166.5
1,141765,Female,87,46.5
2,143870,Male,76,77.5
3,144815,Female,34,60.3
4,145427,Male,61,91.7
...,...,...,...,...
2515,3351763,Female,62,134.5
2516,3352230,Male,41,127.0
2517,3352231,Male,41,127.0
2518,3352333,Male,72,68.3


#### Dropping irrelevant columns

In [19]:
patient = patient[['patientunitstayid', 'admissionweight']]
patient.drop_duplicates()
patient

Unnamed: 0,patientunitstayid,admissionweight
0,141764,166.5
1,141765,46.5
2,143870,77.5
3,144815,60.3
4,145427,91.7
...,...,...
2515,3351763,134.5
2516,3352230,127.0
2517,3352231,127.0
2518,3352333,68.3


#### Saving the Cleaned Patient DataFrame to a `csv` file

In [20]:
path = "../../eICU/training/"
patient.to_csv(path + "patient.csv", sep=',', index=False, encoding='utf-8')