In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import pandas as pd
import numpy as np

In [4]:
database_type = "/full"

The `patient` table includes general information about the patient admissions (for example, demographics, admission and discharge details). 
See: http://eicu-crd.mit.edu/eicutables/patient/

#### Reading in lab dataset and keeping relevant columns to our study .

In [5]:
columns = ['patientunitstayid', 'admissionweight', 'age', 'gender']
patient = pd.read_csv('../../eICU' + database_type + '/patient.csv', usecols=columns)
patient

Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141168,Female,70,84.3
1,141178,Female,52,54.4
2,141179,Female,52,
3,141194,Male,68,73.9
4,141196,Male,71,
...,...,...,...,...
200854,3353235,Male,50,90.0
200855,3353237,Female,79,78.4
200856,3353251,Male,73,102.0
200857,3353254,Male,81,83.9


#### Querying the dataframe to find all columns in the dataframe with has 'nan' or null values that fill an entire column

In [6]:
all_nan_cols = patient.columns[patient.isna().all()]
all_nan_cols

Index([], dtype='object')

> `all_nan_cols` were empty, which means that all of our columns have useful data.
This means that we must find nan values within the dataframe.

#### Querying the dataframe to find all columns in the dataframe which has 'nan' or null values that exist within useful data

In [7]:
all_nan_cols = patient.columns[patient.isna().any()]
all_nan_cols

Index(['gender', 'age', 'admissionweight'], dtype='object')

> The columns 
`'gender', 'age', 'ethnicity', 'apacheadmissiondx', 'admissionheight', 'hospitaladmitsource', 'hospitaldischargelocation', 'hospitaldischargestatus', 'unitadmitsource', 'admissionweight', 'dischargeweight', 'unitdischargelocation', 'unitdischargestatus'` 
were identified by the `df.isna().any()` function as columns that have nan/null values existing within them.

#### Filling null gender values with `Unknown`

In [8]:
patient['gender'].fillna('Unknown', inplace=True)

#### Replacing String with numerical value

In [9]:
patient['age'] = patient['age'].apply(lambda x : '89' if (x == '> 89') else x)
patient['age'] = patient['age'].apply(lambda x : '1' if (x == '0') else x)

patient.age.unique()

array(['70', '52', '68', '71', '77', '25', '82', '89', '81', '59', '43',
       '19', '67', '73', '63', '61', '45', '76', '50', '72', '80', '48',
       '65', '78', '30', '75', '46', '39', '79', '58', '87', '60', '85',
       '83', '53', '86', '64', '55', '88', '84', '62', '56', '18', '38',
       '41', '35', '54', '74', '49', '51', '47', '33', '66', '36', '69',
       '34', '44', '57', '22', '42', '27', '17', '37', '28', '40', '20',
       '32', '23', '24', '26', '29', '31', nan, '21', '15', '16', '1',
       '10', '14', '12', '13', '8', '11', '2', '5', '4', '7', '3', '6',
       '9'], dtype=object)

#### Finding out how much null ages there are

In [10]:
na = patient.age.isnull().sum(axis = 0)
na

95

we can drop these null values afterwards to test and play around with results

#### Convert to Float, fill null values with the mean age, convert to integer

In [11]:
patient = patient.astype({'age': float})

In [12]:
import math

patient['age'].fillna(math.floor(patient['age'].mean()), inplace=True)

In [13]:
patient = patient.astype({'age': int})

In [14]:
patient['age'].unique()

array([70, 52, 68, 71, 77, 25, 82, 89, 81, 59, 43, 19, 67, 73, 63, 61, 45,
       76, 50, 72, 80, 48, 65, 78, 30, 75, 46, 39, 79, 58, 87, 60, 85, 83,
       53, 86, 64, 55, 88, 84, 62, 56, 18, 38, 41, 35, 54, 74, 49, 51, 47,
       33, 66, 36, 69, 34, 44, 57, 22, 42, 27, 17, 37, 28, 40, 20, 32, 23,
       24, 26, 29, 31, 21, 15, 16,  1, 10, 14, 12, 13,  8, 11,  2,  5,  4,
        7,  3,  6,  9], dtype=int64)

#### Using ages to fill in null admission weights

In [15]:
# set all admissionweight = 0 to null
patient['admissionweight'] = patient['admissionweight'].replace({0:np.nan})
na = patient.admissionweight.isnull().sum(axis = 0)
na

16734

#### Females and associated weights:  
    '<20', 116,  
    '20-39', 167.6,  
    '40-59', 176.4,  
    '>60', 166.5  

#### Male and associated weights:  
    '<20', 116,  
    '20-39', 196.9,  
    '40-59', 200.9,  
    '>60', 194.7  

In [16]:
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

#### Filling for Females

In [17]:
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 167.6, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 176.4, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 166.5, patient['admissionweight'])

#### Filling for Unknown Genders

In [18]:
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

In [19]:
print('Null values in patient admission weight: ', patient.admissionweight.isnull().sum(axis = 0))
patient

Null values in patient admission weight:  0


Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141168,Female,70,84.3
1,141178,Female,52,54.4
2,141179,Female,52,176.4
3,141194,Male,68,73.9
4,141196,Male,71,194.7
...,...,...,...,...
200854,3353235,Male,50,90.0
200855,3353237,Female,79,78.4
200856,3353251,Male,73,102.0
200857,3353254,Male,81,83.9


#### Dropping irrelevant columns

In [20]:
patient = patient[['patientunitstayid', 'admissionweight']]
patient

Unnamed: 0,patientunitstayid,admissionweight
0,141168,84.3
1,141178,54.4
2,141179,176.4
3,141194,73.9
4,141196,194.7
...,...,...
200854,3353235,90.0
200855,3353237,78.4
200856,3353251,102.0
200857,3353254,83.9


#### Saving the Cleaned Patient DataFrame to a `csv` file

In [21]:
path = "../../eICU/training/"
patient.to_csv(path + "patient.csv", sep=',', index=False, encoding='utf-8')