In [1]:
import pandas as pd
import numpy as np

In [2]:
database_type = "/full"

The `patient` table includes general information about the patient admissions (for example, demographics, admission and discharge details). 
See: http://eicu-crd.mit.edu/eicutables/patient/

#### Reading in lab dataset and keeping relevant columns to our study .

In [3]:
columns = ['patientunitstayid', 'admissionweight', 'age', 'gender']
patient = pd.read_csv('../../eICU' + database_type + '/patient.csv', usecols=columns)
patient

Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141168,Female,70,84.3
1,141178,Female,52,54.4
2,141179,Female,52,
3,141194,Male,68,73.9
4,141196,Male,71,
...,...,...,...,...
200854,3353235,Male,50,90.0
200855,3353237,Female,79,78.4
200856,3353251,Male,73,102.0
200857,3353254,Male,81,83.9


In [4]:
patientIds = pd.read_csv('../../eICU' + '/training' + '/patientIds.csv')
patientIds = patientIds['patientunitstayid'].tolist()

In [5]:
patient = patient.loc[patient['patientunitstayid'].isin(patientIds)]
patient

Unnamed: 0,patientunitstayid,gender,age,admissionweight
8,141227,Male,82,82.20
19,141288,Female,61,
20,141289,Female,61,
22,141297,Male,63,
23,141304,Male,70,
...,...,...,...,...
200839,3353145,Male,51,42.20
200844,3353194,Female,51,63.05
200846,3353197,Female,66,71.50
200848,3353199,Female,66,71.50


#### Querying the dataframe to find all columns in the dataframe with has 'nan' or null values that fill an entire column

In [6]:
all_nan_cols = patient.columns[patient.isna().all()]
all_nan_cols

Index([], dtype='object')

> `all_nan_cols` were empty, which means that all of our columns have useful data.
This means that we must find nan values within the dataframe.

#### Querying the dataframe to find all columns in the dataframe which has 'nan' or null values that exist within useful data

In [7]:
all_nan_cols = patient.columns[patient.isna().any()]
all_nan_cols

Index(['gender', 'age', 'admissionweight'], dtype='object')

> The columns 
`'gender', 'age', 'ethnicity', 'apacheadmissiondx', 'admissionheight', 'hospitaladmitsource', 'hospitaldischargelocation', 'hospitaldischargestatus', 'unitadmitsource', 'admissionweight', 'dischargeweight', 'unitdischargelocation', 'unitdischargestatus'` 
were identified by the `df.isna().any()` function as columns that have nan/null values existing within them.

#### Filling null gender values with `Unknown`

In [8]:
patient['gender'].fillna('Unknown', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


#### Replacing String with numerical value

In [9]:
patient['age'] = patient['age'].apply(lambda x : '89' if (x == '> 89') else x)
patient['age'] = patient['age'].apply(lambda x : '1' if (x == '0') else x)

patient.age.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


array(['82', '61', '63', '70', '78', '75', '80', '58', '79', '60', '73',
       '84', '62', '81', '72', '85', '67', '64', '55', '46', '74', '71',
       '65', '83', '57', '86', '51', '66', '27', '49', '77', '38', '20',
       '25', '59', '41', '30', '87', '89', '44', '54', '53', '26', '28',
       '56', '40', '76', '31', '68', '48', '69', '52', '88', '43', '33',
       '42', '50', '47', '29', '45', '36', '37', '23', '21', '22', '19',
       '32', '24', '34', '35', '16', '39', '18', nan, '17', '13', '2',
       '15', '14', '3', '1', '12'], dtype=object)

#### Finding out how much null ages there are

In [10]:
na = patient.age.isnull().sum(axis = 0)
na

2

we can drop these null values afterwards to test and play around with results

#### Convert to Float, fill null values with the mean age, convert to integer

In [11]:
patient = patient.astype({'age': float})

In [12]:
import math

patient['age'].fillna(math.floor(patient['age'].mean()), inplace=True)

In [13]:
patient = patient.astype({'age': int})

In [14]:
patient['age'].unique()

array([82, 61, 63, 70, 78, 75, 80, 58, 79, 60, 73, 84, 62, 81, 72, 85, 67,
       64, 55, 46, 74, 71, 65, 83, 57, 86, 51, 66, 27, 49, 77, 38, 20, 25,
       59, 41, 30, 87, 89, 44, 54, 53, 26, 28, 56, 40, 76, 31, 68, 48, 69,
       52, 88, 43, 33, 42, 50, 47, 29, 45, 36, 37, 23, 21, 22, 19, 32, 24,
       34, 35, 16, 39, 18, 17, 13,  2, 15, 14,  3,  1, 12], dtype=int64)

#### Using ages to fill in null admission weights

In [15]:
# set all admissionweight = 0 to null
patient['admissionweight'] = patient['admissionweight'].replace({0:np.nan})
na = patient.admissionweight.isnull().sum(axis = 0)
na

1555

#### Females and associated weights:  
    '<20', 116,  
    '20-39', 167.6,  
    '40-59', 176.4,  
    '>60', 166.5  

#### Male and associated weights:  
    '<20', 116,  
    '20-39', 196.9,  
    '40-59', 200.9,  
    '>60', 194.7  

In [16]:
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

#### Filling for Females

In [17]:
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 167.6, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 176.4, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 166.5, patient['admissionweight'])

#### Filling for Unknown Genders

In [18]:
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

In [19]:
print('Null values in patient admission weight: ', patient.admissionweight.isnull().sum(axis = 0))
patient

Null values in patient admission weight:  0


Unnamed: 0,patientunitstayid,gender,age,admissionweight
8,141227,Male,82,82.20
19,141288,Female,61,166.50
20,141289,Female,61,166.50
22,141297,Male,63,194.70
23,141304,Male,70,194.70
...,...,...,...,...
200839,3353145,Male,51,42.20
200844,3353194,Female,51,63.05
200846,3353197,Female,66,71.50
200848,3353199,Female,66,71.50


#### Dropping irrelevant columns

In [20]:
patient = patient[['patientunitstayid', 'admissionweight']]
patient = patient.drop_duplicates()
patient

Unnamed: 0,patientunitstayid,admissionweight
8,141227,82.20
19,141288,166.50
20,141289,166.50
22,141297,194.70
23,141304,194.70
...,...,...
200839,3353145,42.20
200844,3353194,63.05
200846,3353197,71.50
200848,3353199,71.50


#### Saving the Cleaned Patient DataFrame to a `csv` file

In [21]:
path = "../../eICU/training/"
patient.to_csv(path + "patient.csv", sep=',', index=False, encoding='utf-8')
