In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [2]:
databasePath = "../eICU/training/"
exportPath = "../eICU/training/"

#### Interpolating Data across patient timeseries

In [3]:
finalMerge = pd.read_csv(databasePath + 'finalMerge.csv')
finalMerge = finalMerge.sort_values(by=['patientunitstayid', 'observationoffset'])
finalMerge

Unnamed: 0,patientunitstayid,observationoffset,temperature,heartrate,respiration,systemicsystolic,creatinine,wbcx1000,lactate,urineoutputbyweight,diagnosis
135258203,141168,72.0,,,,,,,,,0.0
135258204,141168,118.0,,,,,,,,,0.0
15,141168,119.0,,140.0,,,,,,,
201,141168,124.0,,140.0,,,,,,,
213,141168,129.0,,140.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
131613215,3353263,899.0,,87.0,13.0,,,,,,
131613211,3353263,904.0,,85.0,13.0,,,,,,
131613231,3353263,909.0,,91.0,23.0,,,,,,
131613239,3353263,914.0,,89.0,19.0,,,,,,


In [4]:
print("temperature: ", len(finalMerge['temperature'].unique()))
print("heartrate: ", len(finalMerge['heartrate'].unique()))
print("creatinine: ", len(finalMerge['creatinine'].unique()))
print("wbcx1000: ", len(finalMerge['wbcx1000'].unique()))
print("urineoutputbyweight: ", len(finalMerge['urineoutputbyweight'].unique()))
print("diagnosis: ", len(finalMerge['diagnosis'].unique()))

temperature:  2338
heartrate:  287
creatinine:  6087
wbcx1000:  5465
urineoutputbyweight:  207883
diagnosis:  3


In [5]:
finalMerge.diagnosis.value_counts()

0.0    454377
1.0     82935
Name: diagnosis, dtype: int64

In [6]:
idList = sorted(set(list(finalMerge['patientunitstayid'])))
print("Number of ids: ", len(idList))

Number of ids:  156191


In [7]:
idList1 = idList[0:30000]
idList2 = idList[30000:60000]
idList3 = idList[60000:90000]
idList4 = idList[90000:120000]
idList5 = idList[120000:]

In [8]:
print(len(idList1)+len(idList2)+len(idList3)+len(idList4)+len(idList5))

156191


In [9]:
cols = ['temperature', 'heartrate', 'respiration', 'systemicsystolic', 'creatinine', 'wbcx1000', 'lactate', 'urineoutputbyweight']

In [10]:
def data_interpolation(idList):
    x = 0
    
    interpolatedData = pd.DataFrame()

    for id in idList:
        df = finalMerge[finalMerge['patientunitstayid'] == id]
        countNull = df[cols].isnull().sum().sum()
        percentOfNull = (countNull / (len(df.index) * 8)) * 100
        # print(percentOfNull)

        if x%5000 == 0:
            print(x)
        x+=1

        if percentOfNull < 80:
            # Interpolating Patient Data based on closet value
            df.loc[:, 'temperature'] = df['temperature'].interpolate(method='linear', limit_direction='both')
            df.loc[:, 'heartrate'] = df['heartrate'].interpolate(method='linear', limit_direction='both')
            df.loc[:, 'respiration'] = df['respiration'].interpolate(method='linear', limit_direction='both')
            df.loc[:, 'systemicsystolic'] = df['systemicsystolic'].interpolate(method='linear', limit_direction='both')
            df.loc[:, 'creatinine'] = df['creatinine'].interpolate(method='linear', limit_direction='both')
            df.loc[:, 'wbcx1000'] = df['wbcx1000'].interpolate(method='linear', limit_direction='both')
            df.loc[:, 'lactate'] = df['lactate'].interpolate(method='linear', limit_direction='both')
            df.loc[:, 'urineoutputbyweight'] = df['urineoutputbyweight'].interpolate(method='linear', limit_direction='both')
            interpolatedData = interpolatedData.append(df)

    return interpolatedData

### 1

In [19]:
%%time

interpolatedData1 = data_interpolation(idList1)
interpolatedData1

0
5000
10000
15000
20000
25000
CPU times: user 1h 28min 51s, sys: 1min 59s, total: 1h 30min 51s
Wall time: 1h 30min 51s


Unnamed: 0,patientunitstayid,observationoffset,temperature,heartrate,respiration,systemicsystolic,creatinine,wbcx1000,lactate,urineoutputbyweight,diagnosis
131613374,141203,-1580.0,,112.0,25.0,,0.330000,10.200000,3.5,,
131613373,141203,-473.0,,112.0,25.0,,0.390000,8.500000,3.5,,
131613371,141203,1.0,,112.0,25.0,,0.560000,12.700000,3.5,,
492,141203,5.0,,112.0,25.0,,0.560682,12.748182,3.5,,
654,141203,10.0,,114.0,25.0,,0.561364,12.796364,3.5,,
...,...,...,...,...,...,...,...,...,...,...,...
23340386,755638,1381.0,,60.0,20.0,,0.600000,10.600000,,49.056604,
23340434,755638,1386.0,,63.0,20.0,,0.600000,10.600000,,49.056604,
23340410,755638,1391.0,,63.0,19.0,,0.600000,10.600000,,49.056604,
23340388,755638,1396.0,,63.0,21.0,,0.600000,10.600000,,49.056604,


In [20]:
interpolatedData1.to_csv(exportPath + "interpolatedData1.csv", sep=',', index=False, encoding='utf-8')

### 2

In [11]:
%%time

interpolatedData2 = data_interpolation(idList2)
interpolatedData2

0
5000
10000
15000
20000
25000
CPU times: user 1h 34min 2s, sys: 3min 50s, total: 1h 37min 53s
Wall time: 1h 37min 53s


Unnamed: 0,patientunitstayid,observationoffset,temperature,heartrate,respiration,systemicsystolic,creatinine,wbcx1000,lactate,urineoutputbyweight,diagnosis
131775700,755666,-2042.0,,111.0,20.0,,0.660000,30.200000,,10.457886,
131775710,755666,-425.0,,111.0,20.0,,0.740000,30.100000,,10.457886,
132986433,755666,-20.0,,111.0,20.0,,0.739340,30.053807,,10.457886,
23340686,755666,8.0,,111.0,20.0,,0.738680,30.007614,,9.681895,
23340778,755666,13.0,,102.0,18.0,,0.738020,29.961421,,8.905905,
...,...,...,...,...,...,...,...,...,...,...,...
48440741,1257494,2001.0,,80.0,18.0,,0.999333,4.777800,,10.651629,
48440641,1257494,2006.0,,79.0,16.0,,0.999667,4.773900,,10.651629,
131977918,1257494,2291.0,,79.0,16.0,,1.000000,4.770000,,10.651629,
131977920,1257494,3432.0,,79.0,16.0,,1.200000,5.920000,,10.651629,


In [12]:
interpolatedData2.to_csv(exportPath + "interpolatedData2.csv", sep=',', index=False, encoding='utf-8')

### 3

In [11]:
%%time

interpolatedData3 = data_interpolation(idList3)
interpolatedData3

0
5000
10000
15000
20000
25000
CPU times: user 1h 26min 14s, sys: 3min 27s, total: 1h 29min 41s
Wall time: 1h 29min 42s


Unnamed: 0,patientunitstayid,observationoffset,temperature,heartrate,respiration,systemicsystolic,creatinine,wbcx1000,lactate,urineoutputbyweight,diagnosis
131977942,1257551,-183.0,,68.0,20.0,,2.300000,10.800000,,0.081433,
48444122,1257551,19.0,,68.0,20.0,,2.294340,10.759434,,0.081433,
135435887,1257551,20.0,,66.0,10.5,,2.288679,10.718868,,0.081433,1.0
48443990,1257551,24.0,,64.0,1.0,,2.283019,10.678302,,0.081433,
48444152,1257551,29.0,,63.0,24.0,,2.277358,10.637736,,0.081433,
...,...,...,...,...,...,...,...,...,...,...,...
74467614,1804307,1380.0,,74.0,32.0,,1.380000,4.300000,,3.726245,
74467620,1804307,1385.0,,74.0,28.0,,1.380000,4.300000,,3.726245,
74467632,1804307,1390.0,,74.0,33.0,,1.380000,4.300000,,3.726245,
74467644,1804307,1395.0,,80.0,34.0,,1.380000,4.300000,,3.726245,


In [12]:
interpolatedData3.to_csv(exportPath + "interpolatedData3.csv", sep=',', index=False, encoding='utf-8')

### 4

In [11]:
%%time

interpolatedData4 = data_interpolation(idList4)
interpolatedData4

0
5000
10000
15000
20000
25000
CPU times: user 1h 34min 19s, sys: 329 ms, total: 1h 34min 20s
Wall time: 1h 34min 20s


Unnamed: 0,patientunitstayid,observationoffset,temperature,heartrate,respiration,systemicsystolic,creatinine,wbcx1000,lactate,urineoutputbyweight,diagnosis
132172719,1804353,-287.0,,72.0,29.0,,1.280000,13.100000,1.0,12.096774,
133916235,1804353,-111.0,,72.0,29.0,,1.280429,13.065714,1.0,12.096774,
74467708,1804353,6.0,,72.0,29.0,,1.280857,13.031429,1.0,12.096774,
74467744,1804353,11.0,,74.0,30.0,,1.281286,12.997143,1.0,12.096774,
74467831,1804353,16.0,,72.0,31.0,,1.281714,12.962857,1.0,12.096774,
...,...,...,...,...,...,...,...,...,...,...,...
99154735,2769904,1168.0,,62.0,19.0,,1.140000,10.400000,,1.307190,
99154672,2769904,1173.0,,62.0,21.0,,1.140000,10.400000,,1.960784,
99154744,2769904,1178.0,,63.0,19.0,,1.140000,10.400000,,2.614379,
99154627,2769904,1183.0,,60.0,12.0,,1.140000,10.400000,,3.267974,


In [12]:
interpolatedData4.to_csv(exportPath + "interpolatedData4.csv", sep=',', index=False, encoding='utf-8')

### 5

In [11]:
%%time

interpolatedData5 = data_interpolation(idList5)
interpolatedData5

0
5000
10000
15000
20000
25000
30000
35000
CPU times: user 2h 12min 48s, sys: 12min 51s, total: 2h 25min 39s
Wall time: 2h 25min 40s


Unnamed: 0,patientunitstayid,observationoffset,temperature,heartrate,respiration,systemicsystolic,creatinine,wbcx1000,lactate,urineoutputbyweight,diagnosis
132349650,2769907,-742.0,,100.0,23.0,,4.530000,13.200000,1.5,8.016032,
132349655,2769907,-619.0,,100.0,23.0,,4.610000,13.272152,1.5,8.016032,
132349648,2769907,-209.0,,100.0,23.0,,4.930000,13.344304,1.5,8.016032,
99155452,2769907,7.0,,100.0,23.0,,4.903117,13.416456,1.5,8.016032,
99155424,2769907,12.0,,100.0,23.0,,4.876234,13.488608,1.5,8.016032,
...,...,...,...,...,...,...,...,...,...,...,...
131613215,3353263,899.0,,87.0,13.0,,1.060000,6.400000,,,
131613211,3353263,904.0,,85.0,13.0,,1.060000,6.400000,,,
131613231,3353263,909.0,,91.0,23.0,,1.060000,6.400000,,,
131613239,3353263,914.0,,89.0,19.0,,1.060000,6.400000,,,


In [12]:
interpolatedData5.to_csv(exportPath + "interpolatedData5.csv", sep=',', index=False, encoding='utf-8')

In [None]:
del finalMerge

In [None]:
interpolatedData[cols].isnull().sum().sum()

In [None]:
interpolatedData['diagnosis'] = interpolatedData['diagnosis'].fillna(0)
interpolatedData.diagnosis.value_counts()

In [None]:
sepsisOnly = interpolatedData[interpolatedData['diagnosis'] == 1]
sepsisOnly

In [None]:
sepsisOnly['temperature'] = sepsisOnly['temperature'].fillna(sepsisOnly['temperature'].mean())
sepsisOnly['heartrate'] = sepsisOnly['heartrate'].fillna(sepsisOnly['heartrate'].mean())
sepsisOnly['respiration'] = sepsisOnly['respiration'].fillna(sepsisOnly['respiration'].mean())
sepsisOnly['systemicsystolic'] = sepsisOnly['systemicsystolic'].fillna(sepsisOnly['systemicsystolic'].mean())
sepsisOnly['creatinine'] = sepsisOnly['creatinine'].fillna(sepsisOnly['creatinine'].mean())
sepsisOnly['wbcx1000'] = sepsisOnly['wbcx1000'].fillna(sepsisOnly['wbcx1000'].mean())
sepsisOnly['lactate'] = sepsisOnly['lactate'].fillna(sepsisOnly['lactate'].mean())
sepsisOnly['urineoutputbyweight'] = sepsisOnly['urineoutputbyweight'].fillna(sepsisOnly['urineoutputbyweight'].mean())

print(sepsisOnly[cols].isnull().sum().sum())

In [None]:
notSepsis = interpolatedData[interpolatedData['diagnosis'] == 0]
notSepsis

In [None]:
notSepsis['temperature'] = notSepsis['temperature'].fillna(notSepsis['temperature'].mean())
notSepsis['heartrate'] = notSepsis['heartrate'].fillna(notSepsis['heartrate'].mean())
notSepsis['respiration'] = notSepsis['respiration'].fillna(notSepsis['respiration'].mean())
notSepsis['systemicsystolic'] = notSepsis['systemicsystolic'].fillna(notSepsis['systemicsystolic'].mean())
notSepsis['creatinine'] = notSepsis['creatinine'].fillna(notSepsis['creatinine'].mean())
notSepsis['wbcx1000'] = notSepsis['wbcx1000'].fillna(notSepsis['wbcx1000'].mean())
notSepsis['lactate'] = notSepsis['lactate'].fillna(notSepsis['lactate'].mean())
notSepsis['urineoutputbyweight'] = notSepsis['urineoutputbyweight'].fillna(notSepsis['urineoutputbyweight'].mean())

print(notSepsis[cols].isnull().sum().sum())

In [None]:
finalData = notSepsis.append(sepsisOnly)
del sepsisOnly, notSepsis

finalData = finalData.astype({'diagnosis': int})
finalData = finalData.sort_values(['patientunitstayid','observationoffset'])
finalData

In [None]:
finalData.to_csv(exportPath + "finalData.csv", sep=',', index=False, encoding='utf-8')