In [1]:
import pandas as pd
import numpy as np

In [2]:
database_type = "/full"

The `patient` table includes general information about the patient admissions (for example, demographics, admission and discharge details). 
See: http://eicu-crd.mit.edu/eicutables/patient/

#### Reading in lab dataset and keeping relevant columns to our study .

In [3]:
columns = ['patientunitstayid', 'admissionweight', 'age', 'gender']
patient = pd.read_csv('../../eICU' + database_type + '/patient.csv', usecols=columns)
patient

Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141168,Female,70,84.30
1,141178,Female,52,54.40
2,141179,Female,52,
3,141194,Male,68,73.90
4,141196,Male,71,
5,141197,Male,71,102.10
6,141203,Female,77,70.20
7,141208,Female,25,95.30
8,141227,Male,82,82.20
9,141229,Female,> 89,89.80


In [4]:
columns = ['patientunitstayid', 'diagnosis']
diagnosis = pd.read_csv('../../eICU' + '/training' + '/diagnosis.csv', usecols=columns)
diagnosis

Unnamed: 0,patientunitstayid,diagnosis
0,141168,0
1,141168,0
2,141203,0
3,141203,0
4,141227,1
5,141229,0
6,141229,0
7,141266,0
8,141284,0
9,141284,0


In [5]:
df_sepsis = diagnosis[diagnosis.diagnosis==1]

In [6]:
len(df_sepsis.patientunitstayid.unique())

23479

In [7]:
df_not_sepsis = diagnosis[diagnosis.diagnosis==0]

In [8]:
len(df_not_sepsis.patientunitstayid.unique())

171797

In [9]:
df_sepsis = pd.DataFrame(df_sepsis.patientunitstayid.unique())

In [10]:
df_not_sepsis = pd.DataFrame(df_not_sepsis.patientunitstayid.unique()).sample(n=24000)

In [11]:
df_sepsis = df_sepsis.append(df_not_sepsis)

In [12]:
patient = pd.merge(df_sepsis, patient,  how='left', left_on=[0], right_on = ['patientunitstayid'])
patient

Unnamed: 0,0,patientunitstayid,gender,age,admissionweight
0,141227,141227,Male,82,82.20
1,141288,141288,Female,61,
2,141289,141289,Female,61,
3,141297,141297,Male,63,
4,141304,141304,Male,70,
5,141392,141392,Female,78,
6,141432,141432,Male,75,131.50
7,141462,141462,Male,80,103.30
8,141470,141470,Female,58,
9,141751,141751,Female,60,


In [13]:
patient = patient.drop(0, axis=1).copy()

In [15]:
type(patient)

pandas.core.frame.DataFrame

#### Querying the dataframe to find all columns in the dataframe with has 'nan' or null values that fill an entire column

In [16]:
all_nan_cols = patient.columns[patient.isna().all()]
all_nan_cols

Index([], dtype='object')

> `all_nan_cols` were empty, which means that all of our columns have useful data.
This means that we must find nan values within the dataframe.

#### Querying the dataframe to find all columns in the dataframe which has 'nan' or null values that exist within useful data

In [17]:
all_nan_cols = patient.columns[patient.isna().any()]
all_nan_cols

Index(['gender', 'age', 'admissionweight'], dtype='object')

> The columns 
`'gender', 'age', 'ethnicity', 'apacheadmissiondx', 'admissionheight', 'hospitaladmitsource', 'hospitaldischargelocation', 'hospitaldischargestatus', 'unitadmitsource', 'admissionweight', 'dischargeweight', 'unitdischargelocation', 'unitdischargestatus'` 
were identified by the `df.isna().any()` function as columns that have nan/null values existing within them.

#### Filling null gender values with `Unknown`

In [18]:
patient['gender'].fillna('Unknown', inplace=True)

#### Replacing String with numerical value

In [19]:
patient['age'] = patient['age'].apply(lambda x : '89' if (x == '> 89') else x)
patient['age'] = patient['age'].apply(lambda x : '1' if (x == '0') else x)

patient.age.unique()

array(['82', '61', '63', '70', '78', '75', '80', '58', '60', '81', '72',
       '73', '67', '74', '86', '65', '49', '77', '57', '46', '66', '38',
       '62', '20', '59', '89', '87', '54', '53', '28', '56', '71', '44',
       '30', '84', '68', '64', '85', '69', '51', '88', '33', '55', '26',
       '41', '43', '79', '40', '83', '29', '45', '31', '37', '23', '50',
       '42', '19', '32', '25', '24', '34', '22', '76', '35', '16', '52',
       '27', '47', '48', '39', '21', '36', '18', '17', '11', '7', '2',
       '14', '13', '6', nan, '15', '1'], dtype=object)

#### Finding out how much null ages there are

In [20]:
na = patient.age.isnull().sum(axis = 0)
na

3

we can drop these null values afterwards to test and play around with results

#### Convert to Float, fill null values with the mean age, convert to integer

In [21]:
patient = patient.astype({'age': float})

In [22]:
import math

patient['age'].fillna(math.floor(patient['age'].mean()), inplace=True)

In [23]:
patient = patient.astype({'age': int})

In [24]:
patient['age'].unique()

array([82, 61, 63, 70, 78, 75, 80, 58, 60, 81, 72, 73, 67, 74, 86, 65, 49,
       77, 57, 46, 66, 38, 62, 20, 59, 89, 87, 54, 53, 28, 56, 71, 44, 30,
       84, 68, 64, 85, 69, 51, 88, 33, 55, 26, 41, 43, 79, 40, 83, 29, 45,
       31, 37, 23, 50, 42, 19, 32, 25, 24, 34, 22, 76, 35, 16, 52, 27, 47,
       48, 39, 21, 36, 18, 17, 11,  7,  2, 14, 13,  6, 15,  1])

#### Using ages to fill in null admission weights

In [25]:
# set all admissionweight = 0 to null
patient['admissionweight'] = patient['admissionweight'].replace({0:np.nan})
na = patient.admissionweight.isnull().sum(axis = 0)
na

1670

#### Females and associated weights:  
    '<20', 116,  
    '20-39', 167.6,  
    '40-59', 176.4,  
    '>60', 166.5  

#### Male and associated weights:  
    '<20', 116,  
    '20-39', 196.9,  
    '40-59', 200.9,  
    '>60', 194.7  

In [26]:
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Male') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

#### Filling for Females

In [27]:
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 167.6, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 176.4, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Female') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 166.5, patient['admissionweight'])

#### Filling for Unknown Genders

In [28]:
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] < 20) & (patient['admissionweight'].isnull()), 116, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 20) & (patient['age'] <= 39) & (patient['admissionweight'].isnull()), 196.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 40) & (patient['age'] <= 59) & (patient['admissionweight'].isnull()), 200.9, patient['admissionweight'])
patient['admissionweight'] = np.where((patient['gender'] == 'Unknown') & (patient['age'] >= 60) & (patient['admissionweight'].isnull()), 194.7, patient['admissionweight'])

In [29]:
print('Null values in patient admission weight: ', patient.admissionweight.isnull().sum(axis = 0))
patient

Null values in patient admission weight:  0


Unnamed: 0,patientunitstayid,gender,age,admissionweight
0,141227,Male,82,82.20
1,141288,Female,61,166.50
2,141289,Female,61,166.50
3,141297,Male,63,194.70
4,141304,Male,70,194.70
5,141392,Female,78,166.50
6,141432,Male,75,131.50
7,141462,Male,80,103.30
8,141470,Female,58,176.40
9,141751,Female,60,166.50


#### Dropping irrelevant columns

In [30]:
patient = patient[['patientunitstayid', 'admissionweight']]
patient = patient.drop_duplicates()
patient

Unnamed: 0,patientunitstayid,admissionweight
0,141227,82.20
1,141288,166.50
2,141289,166.50
3,141297,194.70
4,141304,194.70
5,141392,166.50
6,141432,131.50
7,141462,103.30
8,141470,176.40
9,141751,166.50


#### Saving the Cleaned Patient DataFrame to a `csv` file

In [31]:
path = "../../eICU/training/"
patient.to_csv(path + "patient.csv", sep=',', index=False, encoding='utf-8')