## Import Libraries

In [None]:
import pandas as pd
import numpy as np

## Read Datasets
Read the datasets and do some basic exploratory analysis.

In [None]:
apsvar = pd.read_csv('../data/apacheApsVar.csv')

In [None]:
apsvar.head()

In [None]:
apsvar.columns

In [None]:
apsvar.shape

#### Read the aperiodic vitals dataset to get the systolic and diastolic blood pressures

In [None]:
vitsaperiod = pd.read_csv('../data/vitalAperiodic.csv')

In [None]:
vitsaperiod.head()

In [None]:
vitsaperiod.columns

In [None]:
vitsaperiod.shape

#### Join the diastolic and systolic blood pressures to the apsvar dataset

In [None]:
# Attach every aperiodic blood-pressure reading of a stay to that stay's
# APACHE variables. DataFrame.join(..., on=col) looks `col` up in the *index*
# of the other frame - here just vitsaperiod's row numbers - so the stay id
# must be carried along and matched column-to-column with merge instead.
# The result holds one row per (stay, reading) pair; stays without readings
# keep NaN blood pressures, as with the original left join.
bp_columns = ['patientunitstayid', 'observationoffset', 'noninvasivesystolic', 'noninvasivediastolic']
df = apsvar.merge(vitsaperiod[bp_columns], on='patientunitstayid', how='left')

In [None]:
df.isna().sum()

In [None]:
df.shape

In [None]:
df.dropna(inplace=True)

In [None]:
df.reset_index(inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.head()

In [None]:
df.columns

## Section: Calculation of SOFA scores

### Select only the required columns
Select only those columns that contribute to the calculation for the SOFA scores.

In [None]:
dfmain = df[['urine', 'wbc', 'creatinine', 'pao2', 'fio2', 'bilirubin', 'observationoffset', 'patientunitstayid']]

In [None]:
dfmain.isna().sum()

#### Calculate Mean Arterial Pressure (MAP)

In [None]:
def calcMAP(listOfSBP, listofDBP):
    """Return the Mean Arterial Pressure for each systolic/diastolic pair.

    Each value is (systolic + 2 * diastolic) / 3. One result is produced per
    entry of listOfSBP; listofDBP is read at the same position.
    """
    return [
        (systolic + listofDBP[pos] * 2) / 3.0
        for pos, systolic in enumerate(listOfSBP)
    ]

In [None]:
mapvals = calcMAP(df['noninvasivesystolic'].tolist(), df['noninvasivediastolic'].tolist())

In [None]:
mapvals = pd.DataFrame(mapvals, columns=['MAP'])

In [None]:
mapvals.head()

#### Calculate Glasgow Coma Score (GCS)

In [None]:
def calcGCS(eyeslist, motorlist, verballist):
    """Return the Glasgow Coma Scale total for each eyes/motor/verbal triple.

    The total is the sum of the three component scores. A negative component
    is treated as a missing measurement, and the whole total is then reported
    as None. Checking only the sum would let a single negative component hide
    behind the other two (for example -1 + 6 + 5 = 10 would pass as valid).

    One result is produced per entry of eyeslist; the other two lists are
    read at the same position.
    """
    totals = []
    for pos, eyes in enumerate(eyeslist):
        motor = motorlist[pos]
        verbal = verballist[pos]
        if eyes < 0 or motor < 0 or verbal < 0:
            totals.append(None)
        else:
            totals.append(eyes + motor + verbal)

    return totals

In [None]:
gcsvals = calcGCS(df['eyes'].tolist(), df['motor'].tolist(), df['verbal'].tolist())

In [None]:
gcsvals = pd.DataFrame(gcsvals, columns=['GCS'])

In [None]:
gcsvals.head()

In [None]:
gcsvals.isna().sum()

In [None]:
len(gcsvals)

In [None]:
dfmain.head()

In [None]:
#Concat the datasets
dfmain = pd.concat([dfmain, mapvals, gcsvals], axis=1)

In [None]:
dfmain.isna().sum()

In [None]:
dfmain.dropna(inplace=True)

In [None]:
dfmain.reset_index(inplace=True)

In [None]:
dfmain.head()

In [None]:
dfmain.shape

#### Calculate the SOFA score

In [None]:
# Score bands per component: (exclusive upper bound, points), checked in
# ascending order. The matching *_TOP constant is the score for values at or
# above the last bound.
# NOTE(review): the 20/50/100/150 cut-offs are applied to the `wbc` column but
# look like platelet-count thresholds - confirm the intended input.
_SOFA_WBC_BANDS = ((20, 4), (50, 3), (100, 2), (150, 1))
_SOFA_WBC_TOP = 0
_SOFA_CREATININE_BANDS = ((1.2, 0), (2.0, 1), (3.5, 2), (5.0, 3))
_SOFA_CREATININE_TOP = 4
# NOTE(review): raw pao2 is banded here; no fio2 ratio is taken - confirm.
_SOFA_PAO2_BANDS = ((100, 4), (200, 3), (300, 2), (400, 1))
_SOFA_PAO2_TOP = 0
_SOFA_BILIRUBIN_BANDS = ((1.2, 0), (2.0, 1), (6.0, 2), (12.0, 3))
_SOFA_BILIRUBIN_TOP = 4
_SOFA_MAP_BANDS = ((70, 1),)
_SOFA_MAP_TOP = 0
_SOFA_GCS_BANDS = ((6, 4), (10, 3), (13, 2), (15, 1))
_SOFA_GCS_TOP = 0


def _sofaBandScore(value, bands, topScore):
    """Return the points of the first band whose upper bound exceeds value."""
    # `not value >= 0` is true for negative sentinels and for NaN alike; both
    # mean "no measurement" and contribute nothing to the total.
    if not value >= 0:
        return 0
    for upperBound, points in bands:
        if value < upperBound:
            return points
    return topScore


def calcSOFA(wbclist, creatininelist, pao2list, bilirubinlist, maplist, gcslist):
    """Return one summed SOFA-style score per row of the six input lists.

    Each row's score is the sum of six component scores (wbc, creatinine,
    pao2, bilirubin, MAP and GCS), each taken from the band tables above.

    Any negative or NaN input is treated as missing and scores 0. Comparing
    against one exact sentinel value is unreliable: the check used to be
    -0.1 for wbc and creatinine but -1.0 for the rest, so a wbc of -1 fell
    through to the "< 20" band and scored the maximum of 4.

    One result is produced per entry of creatininelist; the other lists are
    read at the same position.
    """
    scores = []
    for pos, creatinine in enumerate(creatininelist):
        total = (
            _sofaBandScore(wbclist[pos], _SOFA_WBC_BANDS, _SOFA_WBC_TOP)
            + _sofaBandScore(creatinine, _SOFA_CREATININE_BANDS, _SOFA_CREATININE_TOP)
            + _sofaBandScore(pao2list[pos], _SOFA_PAO2_BANDS, _SOFA_PAO2_TOP)
            + _sofaBandScore(bilirubinlist[pos], _SOFA_BILIRUBIN_BANDS, _SOFA_BILIRUBIN_TOP)
            + _sofaBandScore(maplist[pos], _SOFA_MAP_BANDS, _SOFA_MAP_TOP)
            + _sofaBandScore(gcslist[pos], _SOFA_GCS_BANDS, _SOFA_GCS_TOP)
        )
        scores.append(total)

    return scores


In [None]:
sofaslist = calcSOFA(dfmain['wbc'].tolist(), dfmain['creatinine'].tolist(), 
                     dfmain['pao2'].tolist(), dfmain['bilirubin'].tolist(),
                     dfmain['MAP'].tolist(), dfmain['GCS'].tolist())

In [None]:
sofaslist = pd.DataFrame(sofaslist, columns=['SOFA'])

In [None]:
# Concat the SOFA column
dfmain = pd.concat([dfmain, sofaslist], axis=1)

In [None]:
dfmain.head()

In [None]:
dfmain['SOFA'].value_counts()

In [None]:
vitsaperiod.columns

In [None]:
vitsaperiod.shape

### Medication datasets exploration
Medication datasets basic exploration for the calculation of tSuspicion.

In [None]:
medication = pd.read_csv('../data/medication.csv')

In [None]:
medication.head()

In [None]:
medication.columns

#### Get the drug (antibiotics) start and stop time

In [None]:
dfmeds = medication[['medicationid', 'patientunitstayid', 'drugstartoffset', 'drugstopoffset']]

In [None]:
dfmeds.head()

In [None]:
dfmeds.isna().sum()

In [None]:
diagnosis = pd.read_csv('../data/diagnosis.csv')

In [None]:
diagnosis.head()

In [None]:
diagnosis.columns

#### Get the diagnosis string for each patient

In [None]:
dfdiag = diagnosis[['patientunitstayid', 'diagnosisstring']]

In [None]:
dfdiag.head()

In [None]:
dfdiag.isna().sum()

#### Filter out patients who have sepsis in their diagnosis

In [None]:
# Collect the index labels of every diagnosis row that does NOT mention
# sepsis (case-insensitive substring match). One vectorised string test
# replaces a per-row .iloc lookup, and it yields labels, which is what
# DataFrame.drop expects. Rows with a missing diagnosis string are treated
# as non-sepsis instead of raising.
mentions_sepsis = dfdiag['diagnosisstring'].str.lower().str.contains('sepsis', regex=False, na=False)
droplist = dfdiag.index[~mentions_sepsis].tolist()

In [None]:
len(droplist)

In [None]:
dfdiag.drop(droplist, inplace=True)

In [None]:
dfdiag.reset_index(inplace=True)

In [None]:
dfdiag.drop('index', axis=1, inplace=True)

In [None]:
dfdiag.head()

#### Join the dataframes of diagnosis and medications

In [None]:
# Pair every sepsis diagnosis row with the medication orders of the same stay.
# join(..., on='patientunitstayid') would look the stay id up in dfmeds' row
# index rather than in its patientunitstayid column, so the key is kept in
# dfmeds and matched column-to-column. Diagnoses without any medication keep
# NaN medication fields (left join).
dfmedsmain = dfdiag.merge(dfmeds, on='patientunitstayid', how='left')

In [None]:
dfmeds.shape

In [None]:
dfmedsmain.head()

In [None]:
dfmedsmain.columns

In [None]:
dfmedsmain.shape

In [None]:
dfmedsmain.isna().sum()

In [None]:
# Collect the index labels of medication courses shorter than 72 offset units
# (stop offset minus start offset). Rows where either offset is missing
# compare as False and are therefore not collected here.
# NOTE(review): observation offsets are treated as minutes elsewhere in this
# notebook (1440 is used for 24 hours); if the drug offsets share that unit,
# 72 may have been meant as 72 hours (4320) - confirm before changing.
course_length = dfmedsmain['drugstopoffset'] - dfmedsmain['drugstartoffset']
droplist = dfmedsmain.index[course_length < 72].tolist()

In [None]:
dfmedsmain.drop(droplist, inplace=True)

In [None]:
dfmedsmain.reset_index(inplace=True)

In [None]:
dfmedsmain.drop('index', axis=1, inplace=True)

In [None]:
dfmedsmain.head()

In [None]:
patientslistwithsepsis = dfmedsmain['patientunitstayid'].tolist()

### Join the main datasets

In [None]:
df00 = pd.merge(dfmedsmain, dfmain, on='patientunitstayid', how='left')

In [None]:
df00.head()

In [None]:
df00 = pd.concat([df00, pd.DataFrame(df00['drugstartoffset'].tolist(), columns=['tDrugAdminister'])], axis=1)

In [None]:
df00.head()

In [None]:
# Big dataset import. Instead call the already saved dataset from data.
lab = pd.read_csv('../data/lab.csv')

In [None]:
lab = lab[['patientunitstayid', 'labresultoffset']]

In [None]:
df00 = pd.merge(df00, lab, on='patientunitstayid', how='left')

In [None]:
df00.head()

In [None]:
df00.to_csv('../data/df00.csv', index=False)

In [None]:
# Import dataset from data.
df00 = pd.read_csv('../data/df00.csv')

In [None]:
df00.head()

In [None]:
df00.isna().sum()

In [None]:
df00.dropna(inplace=True)

In [None]:
df00.reset_index(inplace=True, drop=True)

In [None]:
df00.head()

### Calculate tSuspicion

In [None]:
def calcTSuspicion(tDrugAdList, labResOffsetList):
    """Return the time of suspicion (tSuspicion) for each row.

    For every position the drug administration offset is kept when it is less
    than or equal to the lab result offset; otherwise the lab result offset is
    kept. One result is produced per entry of tDrugAdList.
    """
    return [
        drugOffset if drugOffset <= labResOffsetList[pos] else labResOffsetList[pos]
        for pos, drugOffset in enumerate(tDrugAdList)
    ]

In [None]:
tsuspList = calcTSuspicion(df00['tDrugAdminister'].tolist(), df00['labresultoffset'].tolist())

In [None]:
tsuspList = pd.DataFrame(tsuspList, columns=['tSusp'])

In [None]:
df00 = pd.concat([df00, tsuspList], axis=1)

In [None]:
df00.head()

In [None]:
df00.isna().sum()

In [None]:
df00.to_csv('../data/df01.csv', index=False)

In [None]:
df00.shape

In [None]:
# NOTE(review): this reloads df00.csv, i.e. the frame as it was saved before
# the dropna and before the tSusp column was added; the frame that includes
# tSusp was written to df01.csv. Confirm df00.csv is the intended file.
df00 = pd.read_csv('../data/df00.csv')

### Considering the periodic vitals dataset 
The periodic vitals table is very large, so it is read in three parts. The combined result can also be loaded directly from ../data/vitals.csv.

In [None]:
vitals1 = pd.read_csv('../data/vitalPeriodic.csv', nrows=48890547)

In [None]:
vitals1.head()

In [None]:
vitals1.columns

In [None]:
vitals1 = vitals1[['vitalperiodicid', 'patientunitstayid', 'observationoffset', 'systemicsystolic', 'systemicdiastolic']]

In [None]:
vitals1.isna().sum()

In [None]:
vitals1.dropna(inplace=True)

In [None]:
vitals1.reset_index(inplace=True, drop=True)

In [None]:
vitals1.head()

In [None]:
vitals1.shape

In [None]:
vitalsnames = ['vitalperiodicid', 'patientunitstayid', 'observationoffset', 'temperature', 'sao2', 'heartrate', 'respiration', 'cvp', 'etco2', 'systemicsystolic', 'systemicdiastolic', 'systemicmean', 'pasystolic', 'padiastolic', 'pamean', 'st1', 'st2', 'st3', 'icp']

In [None]:
# Second part of the file. skiprows counts physical lines, header included,
# so skipping 48890547 lines would start on the last data row already held
# by vitals1 and duplicate it. Skip one more line and read one row fewer, so
# this part starts right after vitals1 and ends right before the rows read
# with skiprows=97781094 into vitals3.
vitals2 = pd.read_csv('../data/vitalPeriodic.csv', skiprows=48890548, nrows=48890546,
                      names=vitalsnames)

In [None]:
vitals2.head()

In [None]:
vitals2 = vitals2[['vitalperiodicid', 'patientunitstayid', 'observationoffset', 'systemicsystolic', 'systemicdiastolic']]

In [None]:
vitals2.isna().sum()

In [None]:
vitals2.dropna(inplace=True)
vitals2.reset_index(inplace=True, drop=True)

In [None]:
vitals2.head()

In [None]:
vitals2.shape

In [None]:
vitals3 = pd.read_csv('../data/vitalPeriodic.csv', skiprows=97781094, nrows=48890547, 
                      names=vitalsnames)

In [None]:
vitals3.head()

In [None]:
vitals3 = vitals3[['vitalperiodicid', 'patientunitstayid', 'observationoffset', 'systemicsystolic', 'systemicdiastolic']]

In [None]:
vitals3.isna().sum()

In [None]:
vitals3.dropna(inplace=True)
vitals3.reset_index(inplace=True, drop=True)

In [None]:
vitals3.head()

In [None]:
vitals3.shape

In [None]:
vitals3.isna().sum()

In [None]:
vitals = pd.concat([vitals1, vitals2, vitals3])

In [None]:
vitals.head()

In [None]:
vitals.shape

In [None]:
vitals.isna().sum()

In [None]:
vitals.reset_index(inplace=True, drop=True)

In [None]:
vitals.head()

#### Save the combined vitals dataset (it can later be loaded from ../data/vitals.csv)

In [None]:
vitals.to_csv('../data/vitals.csv', index=False)

In [None]:
df00.head()

In [None]:
vitals.head()

In [None]:
dfmain.head()

In [None]:
dfmain.shape

In [None]:
dfmain.drop('index', axis=1, inplace=True)

## Final calculations for labeling the patient as Sepsis or non-sepsis

In [None]:
patientsId = df00['patientunitstayid']

In [None]:
patientsId

### Calculate MAP values for every entry in the vitals periodic dataframe. <br>
MAP is the only indicator considered here because it is the only one that could be calculated from the periodic vitals dataset.

In [None]:
vitalsmap = calcMAP(vitals['systemicsystolic'].tolist(), vitals['systemicdiastolic'].tolist())

In [None]:
vitalsmap = pd.DataFrame(vitalsmap, columns=['MAP'])

In [None]:
vitals = pd.concat([vitals, vitalsmap], axis=1)

In [None]:
vitals.head()

In [None]:
df11 = pd.DataFrame(df00['observationoffset'].tolist(), columns=['observationoffsetinit'])

In [None]:
df11 = pd.concat([df00['patientunitstayid'], df00['MAP'], df11], axis=1)

In [None]:
df11.isna().sum()

In [None]:
df11.dropna(inplace=True)

In [None]:
df11.drop_duplicates(inplace=True)
df11.reset_index(inplace=True, drop=True)

In [None]:
df11.head()

In [None]:
df11.shape

#### You can also load the ivitals dataset

In [None]:
ivitals = vitals.merge(df11, how='outer', on='patientunitstayid', suffixes=('_vitals', '_df'))

In [None]:
ivitals.shape

In [None]:
ivitals.dropna(inplace=True)
ivitals.reset_index(inplace=True, drop=True)

In [None]:
ivitals.shape

In [None]:
ivitals

### Check if the data entries are only separated by 24 hours.

In [None]:
def checkifvalid(obsofflist, obsoffinitlist, df):
    """Drop, in place, the rows of df whose offset gap is at most 1440.

    For every position the gap is obsofflist[i] - obsoffinitlist[i]. Rows
    with a gap <= 1440 (negative gaps included) are removed from df, so only
    rows more than 1440 offset units after the reference remain.

    The lists are expected to be aligned row-for-row with df. Positions are
    translated to index labels before dropping, so df does not need a default
    0..n-1 index. Returns None; df is modified in place.

    NOTE(review): the removal keeps only rows MORE than 1440 apart - confirm
    this direction is intended.
    """
    stalePositions = [
        pos
        for pos, offset in enumerate(obsofflist)
        if offset - obsoffinitlist[pos] <= 1440
    ]

    # DataFrame.drop works on labels, not positions; map one onto the other.
    if stalePositions:
        df.drop(df.index[stalePositions], inplace=True)

In [None]:
checkifvalid(ivitals['observationoffset'].tolist(), ivitals['observationoffsetinit'].tolist(), ivitals)

In [None]:
ivitals.shape

In [None]:
ivitals.drop('vitalperiodicid', axis=1, inplace=True)
ivitals.reset_index(inplace=True, drop=True)

In [None]:
ivitals

In [None]:
ivitals.to_csv('../data/ivitals.csv' , index=False)

##### You can also load the dataset from data/

In [None]:
ivitals = pd.read_csv('../data/ivitals.csv')

In [None]:
def getPatientsWithSepsis(MAPvitalslist, MAPinitlist, obsofflist, obsoffinitlist, patientidlist):
    """Flag rows whose MAP crosses 70 and report the MAP difference per row.

    A row's patient id is collected when either of these holds:
      * the reference MAP is below 70, the periodic MAP is at least 70, and
        the periodic offset is not earlier than the reference offset;
      * the reference MAP is above 70, the periodic MAP is at most 70, and
        the periodic offset is earlier than the reference offset.

    Returns a two-element list: the collected patient ids (one per matching
    row, duplicates kept) and, for every row, periodic MAP minus reference
    MAP.
    """
    flaggedIds = []
    mapDeltas = []

    for pos, periodicMap in enumerate(MAPvitalslist):
        referenceMap = MAPinitlist[pos]

        # Reference below the threshold, periodic reading at/after it in time.
        crossedAfter = (
            referenceMap < 70
            and periodicMap >= 70
            and obsofflist[pos] >= obsoffinitlist[pos]
        )
        # Reference above the threshold, periodic reading before it in time.
        crossedBefore = (
            referenceMap > 70
            and periodicMap <= 70
            and obsofflist[pos] < obsoffinitlist[pos]
        )
        if crossedAfter or crossedBefore:
            flaggedIds.append(patientidlist[pos])

        mapDeltas.append(periodicMap - referenceMap)

    return [flaggedIds, mapDeltas]

In [None]:
l = getPatientsWithSepsis(ivitals['MAP_vitals'].tolist(), ivitals['MAP_df'].tolist(), 
                      ivitals['observationoffset'].tolist(), ivitals['observationoffsetinit'].tolist(),
                      ivitals['patientunitstayid'].tolist())

In [None]:
diffinMAP = l[1]

In [None]:
diffinMAP = pd.DataFrame(diffinMAP, columns=['MAPdiff'])

In [None]:
ivitals = pd.concat([ivitals, diffinMAP], axis=1)

In [None]:
ivitals.head()

In [None]:
patientsWithSepsis = pd.Series(l[0]).unique()

In [None]:
patientsDf = pd.read_csv('../data/patient.csv')

In [None]:
patientsDf.head()

In [None]:
patientslist = patientsDf['patientunitstayid']

In [None]:
# Label every stay: 1 when its id was flagged above, 0 otherwise.
# The flagged ids are put in a set once so each membership test is constant
# time; testing against the id array directly rescans it for every patient.
sepsis_ids = set(patientsWithSepsis)
labels = [1 if stay_id in sepsis_ids else 0 for stay_id in patientslist]

In [None]:
len(labels)

In [None]:
len(patientslist)

In [None]:
patientslist = pd.DataFrame(patientslist.tolist(), columns=['patientunitstayid'])
labels = pd.DataFrame(labels, columns=['SepsisLabel'])

In [None]:
finalPatientsList = pd.concat([patientslist, labels], axis=1)

### Final list with Sepsis Labels

In [None]:
finalPatientsList.head(10)

In [None]:
finalPatientsList['SepsisLabel'].unique()

##### Get the number of patients with  SepsisLabels as 1 or 0

In [None]:
from collections import Counter

In [None]:
Counter(finalPatientsList['SepsisLabel'].tolist()).values()