In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def impute_missing_vals(df, attributes):
    """Return a copy of ``df`` with the missing values of ``attributes`` filled in.

    Each listed column is handled on its own, depending on how much of it is
    missing:

    * nothing missing    -> left untouched (no interpolation is attempted);
    * everything missing -> filled with 0;
    * one observed value -> that value is propagated to every row;
    * otherwise          -> ``interpolate(method='nearest')`` followed by a
      forward and backward fill for values that are still missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    attributes : iterable of column labels
        Columns of ``df`` to impute.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns imputed. Columns that are not
        listed are returned unchanged.
    """
    # NOTE(review): each column is filled over the whole frame, not per
    # patient, so a gap can be filled from a neighbouring patient's rows --
    # confirm this is intended.
    df_clean = df.copy()
    n_rows = len(df_clean)
    for att in attributes:
        column = df_clean[att]
        n_missing = column.isnull().sum()  # counted once per column
        if n_missing == 0:
            # Complete column: nothing to fill, so skip the interpolation
            # machinery entirely (also keeps non-numeric columns safe).
            continue
        if n_missing == n_rows:
            df_clean[att] = column.fillna(0)
        elif n_missing == n_rows - 1:
            # A single observation cannot be interpolated; spread it instead.
            df_clean[att] = column.ffill().bfill()
        else:
            nearest = column.interpolate(method='nearest', limit_direction='both')
            # Fill whatever interpolation left empty (e.g. leading/trailing gaps).
            df_clean[att] = nearest.ffill().bfill()

    return df_clean

In [3]:
# Root folder of the project data. Set the SEPSIS_PROJECT_DIR environment
# variable to run the notebook on another machine; the default is the folder
# the notebook was originally written against.
PROJECT_DIR = os.environ.get('SEPSIS_PROJECT_DIR', 'D:/RK/Marwadi University/Sem-7/Project')

# Locations of the three raw splits (read below as pipe-separated CSV files).
train_data_path = PROJECT_DIR + '/raw/train/train_data.csv'
test_data_path = PROJECT_DIR + '/raw/test/test_data.csv'
val_data_path = PROJECT_DIR + '/raw/val/val_data.csv'

In [4]:
import pickle

# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
# The context manager closes the handle even when pickle.load raises.
with open('D:/RK/Marwadi University/Sem-7/Project/raw_data.pickle', 'rb') as file:
    raw_data = pickle.load(file)

In [5]:
# Load the three pipe-delimited splits, in train / test / validation order.
train_data, test_data, val_data = (
    pd.read_csv(csv_path, sep='|')
    for csv_path in (train_data_path, test_data_path, val_data_path)
)

In [6]:
# Keep only the rows whose ICULOS value is at most 24 in every dataset.
train_data = train_data.loc[train_data['ICULOS'].le(24)]
test_data = test_data.loc[test_data['ICULOS'].le(24)]
val_data = val_data.loc[val_data['ICULOS'].le(24)]
raw_data = raw_data.loc[raw_data['ICULOS'].le(24)]

In [7]:
# Distinct patient identifiers present in each dataset.
patient_id_train = pd.unique(train_data['patient_id'])
patient_id_test = pd.unique(test_data['patient_id'])
patient_id_val = pd.unique(val_data['patient_id'])
patient_id_raw = pd.unique(raw_data['patient_id'])

In [8]:
# Keep only patients whose SepsisLabel values do not sum to zero, i.e. drop
# every patient without a positive label. transform('sum') puts each patient's
# total on that patient's rows, so a single pass replaces the per-patient scan
# of the whole frame; the positional mask also avoids dropping by index label.
train_data = train_data[
    train_data.groupby('patient_id')['SepsisLabel'].transform('sum').ne(0).to_numpy()
]

In [9]:
# Keep only patients whose SepsisLabel values do not sum to zero, i.e. drop
# every patient without a positive label. transform('sum') puts each patient's
# total on that patient's rows, so a single pass replaces the per-patient scan
# of the whole frame; the positional mask also avoids dropping by index label.
test_data = test_data[
    test_data.groupby('patient_id')['SepsisLabel'].transform('sum').ne(0).to_numpy()
]

In [10]:
# Keep only patients whose SepsisLabel values do not sum to zero, i.e. drop
# every patient without a positive label. transform('sum') puts each patient's
# total on that patient's rows, so a single pass replaces the per-patient scan
# of the whole frame; the positional mask also avoids dropping by index label.
val_data = val_data[
    val_data.groupby('patient_id')['SepsisLabel'].transform('sum').ne(0).to_numpy()
]

In [11]:
# Keep only patients whose SepsisLabel values do not sum to zero, i.e. drop
# every patient without a positive label. transform('sum') puts each patient's
# total on that patient's rows, so a single pass replaces the per-patient scan
# of the whole frame; the positional mask also avoids dropping by index label.
raw_data = raw_data[
    raw_data.groupby('patient_id')['SepsisLabel'].transform('sum').ne(0).to_numpy()
]

In [12]:
# Rows per SepsisLabel value in the combined data after the filtering above.
raw_data.loc[:, 'SepsisLabel'].value_counts()

1    11519
0     9031
Name: SepsisLabel, dtype: int64

In [13]:
# Rows per SepsisLabel value in the training split after the filtering above.
train_data.loc[:, 'SepsisLabel'].value_counts()

1    8054
0    6496
Name: SepsisLabel, dtype: int64

In [14]:
# Rows per SepsisLabel value in the test split after the filtering above.
test_data.loc[:, 'SepsisLabel'].value_counts()

1    1651
0    1149
Name: SepsisLabel, dtype: int64

In [15]:
# Rows per SepsisLabel value in the validation split after the filtering above.
val_data.loc[:, 'SepsisLabel'].value_counts()

1    1814
0    1386
Name: SepsisLabel, dtype: int64

In [16]:
# Columns to impute: every column of train_data except the last one. The same
# list is reused for the raw, test and validation frames below.
# NOTE(review): assumes all four frames share train_data's column layout, with
# the label in the final column -- confirm.
attributes = train_data.columns[:-1]

In [17]:
# Impute the combined data, then discard the Unit1 / Unit2 columns.
raw_clean = impute_missing_vals(raw_data, attributes).drop(columns=['Unit1', 'Unit2'])

In [18]:
# Impute the training split, then discard the Unit1 / Unit2 columns.
train_clean = impute_missing_vals(train_data, attributes).drop(columns=['Unit1', 'Unit2'])

In [19]:
# Impute the test split, then discard the Unit1 / Unit2 columns.
test_clean = impute_missing_vals(test_data, attributes).drop(columns=['Unit1', 'Unit2'])

In [20]:
# Impute the validation split, then discard the Unit1 / Unit2 columns.
val_clean = impute_missing_vals(val_data, attributes).drop(columns=['Unit1', 'Unit2'])

In [31]:
 # Sanity check: number of missing values left in each column after imputation.
 raw_clean.isnull().sum()

HR                  0
O2Sat               0
Temp                0
SBP                 0
MAP                 0
DBP                 0
Resp                0
EtCO2               0
BaseExcess          0
HCO3                0
FiO2                0
pH                  0
PaCO2               0
SaO2                0
AST                 0
BUN                 0
Alkalinephos        0
Calcium             0
Chloride            0
Creatinine          0
Bilirubin_direct    0
Glucose             0
Lactate             0
Magnesium           0
Phosphate           0
Potassium           0
Bilirubin_total     0
TroponinI           0
Hct                 0
Hgb                 0
PTT                 0
WBC                 0
Fibrinogen          0
Platelets           0
Age                 0
Gender              0
HospAdmTime         0
ICULOS              0
patient_id          0
SepsisLabel         0
dtype: int64

In [22]:
# Persist every cleaned frame as a pipe-separated CSV without the index column.
for cleaned_frame, csv_target in (
    (train_clean, "D:/RK/Marwadi University/Sem-7/Project/base/train_data.csv"),
    (test_clean, "D:/RK/Marwadi University/Sem-7/Project/base/test_data.csv"),
    (val_clean, "D:/RK/Marwadi University/Sem-7/Project/base/val_data.csv"),
    (raw_clean, "D:/RK/Marwadi University/Sem-7/Project/base/raw_data.csv"),
):
    cleaned_frame.to_csv(csv_target, sep='|', index=False)

In [30]:
# Show the cleaned training frame; the notebook renders a truncated preview
# of its first and last rows.
train_clean

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,PTT,WBC,Fibrinogen,Platelets,Age,Gender,HospAdmTime,ICULOS,patient_id,SepsisLabel
34,85.0,100.0,36.10,117.0,90.0,74.0,11.0,50.0,-7.0,18.0,...,72.1,12.6,102.0,127.0,58.54,0,-405.34,1,15,0
35,85.0,100.0,36.10,117.0,90.0,74.0,11.0,50.0,-7.0,18.0,...,72.1,12.6,102.0,127.0,58.54,0,-405.34,2,15,0
36,89.5,100.0,36.55,122.5,93.0,75.5,9.5,50.0,-7.0,18.0,...,72.1,12.6,102.0,127.0,58.54,0,-405.34,3,15,0
37,97.0,100.0,36.70,127.0,97.0,79.0,12.0,50.0,-8.0,18.0,...,37.0,12.6,102.0,127.0,58.54,0,-405.34,4,15,0
38,90.0,100.0,37.00,110.0,85.0,70.0,14.0,50.0,-8.0,17.0,...,37.0,17.7,102.0,102.0,58.54,0,-405.34,5,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121604,99.5,100.0,36.10,153.0,112.0,83.0,20.0,33.0,2.0,25.1,...,30.5,7.1,290.0,363.0,69.00,1,-339.52,8,119888,1
121605,107.0,100.0,36.10,133.0,103.0,82.0,18.0,33.0,2.0,25.1,...,30.5,7.1,290.0,363.0,69.00,1,-339.52,9,119888,1
121606,121.0,100.0,36.30,132.0,107.0,89.0,18.0,33.0,2.0,25.1,...,30.5,7.1,290.0,363.0,69.00,1,-339.52,10,119888,1
121607,120.0,100.0,36.30,144.0,113.0,98.5,14.0,33.0,2.0,25.1,...,30.5,7.1,290.0,363.0,69.00,1,-339.52,11,119888,1
