In [1]:
import pandas as pd
import numpy as np

The `patient` table includes general information about the patient admissions (for example, demographics, admission and discharge details). 
See: http://eicu-crd.mit.edu/eicutables/patient/

In [None]:
databasePath = "../../eICU/full/"
exportPath = "../../eICU/training/"

#### Reading in lab dataset and keeping relevant columns to our study .

In [3]:
columns = ['patientunitstayid', 'admissionweight', 'age', 'gender']
patient = pd.read_csv(databasePath + '/patient.csv', usecols=columns)
patient

Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141168,Female,70,84.30
1,141178,Female,52,54.40
2,141179,Female,52,
3,141194,Male,68,73.90
4,141196,Male,71,
5,141197,Male,71,102.10
6,141203,Female,77,70.20
7,141208,Female,25,95.30
8,141227,Male,82,82.20
9,141229,Female,> 89,89.80


#### Reading in patient ids to keep

In [4]:
patientIds = pd.read_csv(exportPath + '/patientIds.csv')
patientIds = patientIds['patientunitstayid'].tolist()

#### Keeping patient ids from lab table that exist in patient id file

In [5]:
patient = patient.loc[patient['patientunitstayid'].isin(patientIds)]
patient

Unnamed: 0,patientunitstayid,gender,age,admissionweight
8,141227,Male,82,82.2
19,141288,Female,61,
20,141289,Female,61,
22,141297,Male,63,
23,141304,Male,70,
25,141314,Male,45,
31,141362,Male,65,
34,141392,Female,78,
36,141432,Male,75,131.5
41,141454,Female,79,82.2


#### Updating patient ids with patient ids that remain from dataset

In [6]:
newPatientIds = pd.DataFrame(patient['patientunitstayid'].unique())
newPatientIds.rename(columns={0: 'patientunitstayid'}, inplace=True)
newPatientIds

Unnamed: 0,patientunitstayid
0,141227
1,141288
2,141289
3,141297
4,141304
5,141314
6,141362
7,141392
8,141432
9,141454


In [7]:
newPatientIds.to_csv(exportPath + "patientIds.csv", sep=',', index=False, encoding='utf-8')

#### Querying the dataframe to find all columns in the dataframe with has 'nan' or null values that fill an entire column

In [8]:
all_nan_cols = patient.columns[patient.isna().all()]
all_nan_cols

Index([], dtype='object')

> `all_nan_cols` were empty, which means that all of our columns have useful data.
This means that we must find nan values within the dataframe.

#### Querying the dataframe to find all columns in the dataframe which has 'nan' or null values that exist within useful data

In [9]:
all_nan_cols = patient.columns[patient.isna().any()]
all_nan_cols

Index(['gender', 'admissionweight'], dtype='object')

> The columns 
`'gender', 'age', 'ethnicity', 'apacheadmissiondx', 'admissionheight', 'hospitaladmitsource', 'hospitaldischargelocation', 'hospitaldischargestatus', 'unitadmitsource', 'admissionweight', 'dischargeweight', 'unitdischargelocation', 'unitdischargestatus'` 
were identified by the `df.isna().any()` function as columns that have nan/null values existing within them.

#### Filling null gender values with `Unknown`

In [10]:
patient['gender'].fillna('Unknown', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


#### Replacing String with numerical value

In [11]:
patient['age'] = patient['age'].apply(lambda x : '89' if (x == '> 89') else x)
patient['age'] = patient['age'].apply(lambda x : '1' if (x == '0') else x)

patient.age.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


array(['82', '61', '63', '70', '45', '65', '78', '75', '79', '80', '58',
       '30', '53', '60', '55', '84', '81', '72', '67', '73', '64', '36',
       '71', '74', '83', '56', '86', '89', '49', '51', '77', '57', '46',
       '66', '38', '62', '20', '59', '87', '54', '28', '40', '44', '52',
       '69', '68', '76', '19', '85', '50', '48', '33', '88', '37', '42',
       '26', '32', '47', '41', '43', '29', '31', '23', '25', '24', '35',
       '18', '34', '22', '16', '27', '21', '39', '17', '14', '15', '1',
       '13', '4', '5', '9', '12'], dtype=object)

#### Finding out how much null ages there are

In [12]:
na = patient.age.isnull().sum(axis = 0)
na

0

we can drop these null values afterwards to test and play around with results

#### Convert to Float, fill null values with the mean age, convert to integer

In [13]:
patient = patient.astype({'age': float})

In [14]:
import math

patient['age'].fillna(math.floor(patient['age'].mean()), inplace=True)

In [15]:
patient = patient.astype({'age': int})

In [16]:
patient['age'].unique()

array([82, 61, 63, 70, 45, 65, 78, 75, 79, 80, 58, 30, 53, 60, 55, 84, 81,
       72, 67, 73, 64, 36, 71, 74, 83, 56, 86, 89, 49, 51, 77, 57, 46, 66,
       38, 62, 20, 59, 87, 54, 28, 40, 44, 52, 69, 68, 76, 19, 85, 50, 48,
       33, 88, 37, 42, 26, 32, 47, 41, 43, 29, 31, 23, 25, 24, 35, 18, 34,
       22, 16, 27, 21, 39, 17, 14, 15,  1, 13,  4,  5,  9, 12])

#### Using ages to fill in null admission weights

In [17]:
# set all admissionweight = 0 to null
patient['admissionweight'] = patient['admissionweight'].replace({0:np.nan})
na = patient.admissionweight.isnull().sum(axis = 0)
na

1542

#### Females and associated weights:  
    '<20', 116,  
    '20-39', 167.6,  
    '40-59', 176.4,  
    '>60', 166.5  

#### Male and associated weights:  
    '<20', 116,  
    '20-39', 196.9,  
    '40-59', 200.9,  
    '>60', 194.7  

#### Filling for Male

In [18]:
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

#### Filling for Females

In [19]:
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 167.6, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 176.4, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 166.5, patient['admissionweight'])

#### Filling for Unknown Genders

In [20]:
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

In [21]:
print('Null values in patient admission weight: ', patient.admissionweight.isnull().sum(axis = 0))
patient

Null values in patient admission weight:  0


Unnamed: 0,patientunitstayid,gender,age,admissionweight
8,141227,Male,82,82.2
19,141288,Female,61,166.5
20,141289,Female,61,166.5
22,141297,Male,63,194.7
23,141304,Male,70,194.7
25,141314,Male,45,200.9
31,141362,Male,65,194.7
34,141392,Female,78,166.5
36,141432,Male,75,131.5
41,141454,Female,79,82.2


#### Dropping irrelevant columns

In [22]:
patient = patient[['patientunitstayid', 'admissionweight']]
patient = patient.drop_duplicates()
patient

Unnamed: 0,patientunitstayid,admissionweight
8,141227,82.2
19,141288,166.5
20,141289,166.5
22,141297,194.7
23,141304,194.7
25,141314,200.9
31,141362,194.7
34,141392,166.5
36,141432,131.5
41,141454,82.2


#### Saving the Cleaned Patient DataFrame to a `csv` file

In [23]:
patient.to_csv(exportPath + "patient.csv", sep=',', index=False, encoding='utf-8')
