In [1]:
import pandas as pd
import numpy as np

The `patient` table includes general information about the patient admissions (for example, demographics, admission and discharge details). 
See: http://eicu-crd.mit.edu/eicutables/patient/

In [2]:
databasePath = "../../eICU/full/"
exportPath = "../../eICU/training/"

#### Reading in lab dataset and keeping relevant columns to our study .

In [3]:
columns = ['patientunitstayid', 'admissionweight', 'age', 'gender']
patient = pd.read_csv(databasePath + '/patient.csv', usecols=columns)
patient

Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141168,Female,70,84.3
1,141178,Female,52,54.4
2,141179,Female,52,
3,141194,Male,68,73.9
4,141196,Male,71,
...,...,...,...,...
200854,3353235,Male,50,90.0
200855,3353237,Female,79,78.4
200856,3353251,Male,73,102.0
200857,3353254,Male,81,83.9


#### Reading in patient ids to keep

In [4]:
patientIds = pd.read_csv(exportPath + '/patientIds.csv')
patientIds = patientIds['patientunitstayid'].tolist()

#### Keeping patient ids from lab table that exist in patient id file

In [5]:
patient = patient.loc[patient['patientunitstayid'].isin(patientIds)]
patient

Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141168,Female,70,84.3
6,141203,Female,77,70.2
8,141227,Male,82,82.2
9,141229,Female,> 89,89.8
16,141266,Male,73,120.4
...,...,...,...,...
200854,3353235,Male,50,90.0
200855,3353237,Female,79,78.4
200856,3353251,Male,73,102.0
200857,3353254,Male,81,83.9


#### Updating patient ids with patient ids that remain from dataset

In [6]:
newPatientIds = pd.DataFrame(patient['patientunitstayid'].unique())
newPatientIds.rename(columns={0: 'patientunitstayid'}, inplace=True)
newPatientIds

Unnamed: 0,patientunitstayid
0,141168
1,141203
2,141227
3,141229
4,141266
...,...
173104,3353235
173105,3353237
173106,3353251
173107,3353254


In [7]:
newPatientIds.to_csv(exportPath + "patientIds.csv", sep=',', index=False, encoding='utf-8')

#### Querying the dataframe to find all columns in the dataframe with has 'nan' or null values that fill an entire column

In [8]:
all_nan_cols = patient.columns[patient.isna().all()]
all_nan_cols

Index([], dtype='object')

> `all_nan_cols` were empty, which means that all of our columns have useful data.
This means that we must find nan values within the dataframe.

#### Querying the dataframe to find all columns in the dataframe which has 'nan' or null values that exist within useful data

In [9]:
all_nan_cols = patient.columns[patient.isna().any()]
all_nan_cols

Index(['gender', 'age', 'admissionweight'], dtype='object')

> The columns 
`'gender', 'age', 'ethnicity', 'apacheadmissiondx', 'admissionheight', 'hospitaladmitsource', 'hospitaldischargelocation', 'hospitaldischargestatus', 'unitadmitsource', 'admissionweight', 'dischargeweight', 'unitdischargelocation', 'unitdischargestatus'` 
were identified by the `df.isna().any()` function as columns that have nan/null values existing within them.

#### Filling null gender values with `Unknown`

In [10]:
patient['gender'].fillna('Unknown', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


#### Replacing String with numerical value

In [11]:
patient['age'] = patient['age'].apply(lambda x : '89' if (x == '> 89') else x)
patient['age'] = patient['age'].apply(lambda x : '1' if (x == '0') else x)

patient.age.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


array(['70', '77', '82', '89', '73', '63', '61', '45', '76', '72', '80',
       '48', '65', '78', '30', '75', '46', '79', '58', '60', '68', '53',
       '71', '64', '55', '67', '88', '84', '62', '81', '56', '85', '38',
       '41', '54', '74', '59', '52', '51', '50', '36', '69', '44', '86',
       '22', '83', '57', '66', '42', '27', '49', '18', '39', '28', '40',
       '20', '35', '47', '25', '43', '23', '33', '87', '32', '26', '37',
       '31', '34', '29', '19', '21', '24', '17', '16', '15', '14', nan,
       '1', '12', '13', '8', '11', '2', '5', '10', '4', '7', '3', '6',
       '9'], dtype=object)

#### Finding out how much null ages there are

In [12]:
na = patient.age.isnull().sum(axis = 0)
na

14

we can drop these null values afterwards to test and play around with results

#### Convert to Float, fill null values with the mean age, convert to integer

In [13]:
patient = patient.astype({'age': float})

In [14]:
import math

patient['age'].fillna(math.floor(patient['age'].mean()), inplace=True)

In [15]:
patient = patient.astype({'age': int})

In [16]:
patient['age'].unique()

array([70, 77, 82, 89, 73, 63, 61, 45, 76, 72, 80, 48, 65, 78, 30, 75, 46,
       79, 58, 60, 68, 53, 71, 64, 55, 67, 88, 84, 62, 81, 56, 85, 38, 41,
       54, 74, 59, 52, 51, 50, 36, 69, 44, 86, 22, 83, 57, 66, 42, 27, 49,
       18, 39, 28, 40, 20, 35, 47, 25, 43, 23, 33, 87, 32, 26, 37, 31, 34,
       29, 19, 21, 24, 17, 16, 15, 14,  1, 12, 13,  8, 11,  2,  5, 10,  4,
        7,  3,  6,  9])

#### Using ages to fill in null admission weights

In [17]:
# set all admissionweight = 0 to null
patient['admissionweight'] = patient['admissionweight'].replace({0:np.nan})
na = patient.admissionweight.isnull().sum(axis = 0)
na

6093

#### Females and associated weights:  
    '<20', 116,  
    '20-39', 167.6,  
    '40-59', 176.4,  
    '>60', 166.5  

#### Male and associated weights:  
    '<20', 116,  
    '20-39', 196.9,  
    '40-59', 200.9,  
    '>60', 194.7  

#### Filling for Male

In [18]:
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

#### Filling for Females

In [19]:
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 167.6, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 176.4, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 166.5, patient['admissionweight'])

#### Filling for Unknown Genders

In [20]:
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

In [21]:
print('Null values in patient admission weight: ', patient.admissionweight.isnull().sum(axis = 0))
patient

Null values in patient admission weight:  0


Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141168,Female,70,84.3
6,141203,Female,77,70.2
8,141227,Male,82,82.2
9,141229,Female,89,89.8
16,141266,Male,73,120.4
...,...,...,...,...
200854,3353235,Male,50,90.0
200855,3353237,Female,79,78.4
200856,3353251,Male,73,102.0
200857,3353254,Male,81,83.9


#### Dropping irrelevant columns

In [22]:
patient = patient[['patientunitstayid', 'admissionweight']]
patient = patient.drop_duplicates()
patient

Unnamed: 0,patientunitstayid,admissionweight
0,141168,84.3
6,141203,70.2
8,141227,82.2
9,141229,89.8
16,141266,120.4
...,...,...
200854,3353235,90.0
200855,3353237,78.4
200856,3353251,102.0
200857,3353254,83.9


#### Saving the Cleaned Patient DataFrame to a `csv` file

In [23]:
patient.to_csv(exportPath + "patient.csv", sep=',', index=False, encoding='utf-8')
