# Imports

In [31]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import os
import glob
from sklearn.cluster import KMeans
import re
import copy

In [32]:
#from cleanUp import cleanUp
#from fillDf import fillDf
#from fixYearStamp import fixYearStamp

# Functions

## Cleanup Function

In [33]:
# cleanUp reads sensor data in .txt format into DataFrames while removing null timestamps and
# correcting for user-specified time errors in hours.
# cutoff: str, formatted according to pandas datetime standards. All data before this time is cut off
# timeRectifyingParams: dictionary in {sensor name: hours to subtract} format, with {str:int} datatypes
# filePaths: iterable with the file paths of the sensor .txt files to read
# columns: list of column positions to keep, or a list containing 'all' to keep every column
# badTimes: list of raw timestamp strings that are known to be invalid and must be removed

def autoFix(file,df,start = 0):
    """Drop every row whose 'Date_Time' value cannot be parsed as a timestamp.

    The rows from position ``start`` onward are scanned once. Each distinct
    unparsable value is reported on stdout together with the position of its
    first occurrence, and all offending rows are removed from ``df`` in place.

    Parameters
    ----------
    file : label printed in the error report (the path of the file being read).
    df : DataFrame with a 'Date_Time' column; it is modified in place.
    start : position of the first row to check.

    Returns
    -------
    The same ``df`` object. Its index labels are left untouched, so the
    caller has to reset the index if it needs consecutive labels.
    """
    badLabels = []
    reported = set()
    candidates = df['Date_Time'].iloc[start:]
    for position, (label, value) in enumerate(candidates.items(), start=start):
        try:
            pd.Timestamp(value)
        except (ValueError, TypeError, OverflowError):
            badLabels.append(label)
            # Report a bad value only once, at the first position it is seen.
            if repr(value) not in reported:
                reported.add(repr(value))
                print('Error encountered when parsing: ',file)
                print('first index',position,'  ', 'time value \''+str(value)+'\'')
    if badLabels:
        df.drop(badLabels, inplace = True)
    return df




def cleanUp(cutoff,timeRectifyingParams,filePaths,columns,badTimes):
    """Read raw sensor logs, repair their timestamps and trim them to a start time.

    Parameters
    ----------
    cutoff : str accepted by ``pd.Timestamp``; rows earlier than it are dropped.
    timeRectifyingParams : dict of {sensor name: hours}; the hours are
        subtracted from every timestamp of that sensor.
    filePaths : iterable of paths to the sensor files.
    columns : column selection forwarded to ``read_csv`` as ``usecols``;
        when it contains 'all' every column is read.
    badTimes : iterable of raw 'Date_Time' strings to discard before parsing.

    Returns
    -------
    dict of {sensor name: DataFrame}, each with a fresh 0..n-1 index.
    If comparing the timestamps with the cutoff raises TypeError, the
    function stops early and returns the tuple ``(file, df)`` instead.
    """
    fData = {}
    mod = {}
    cleaningCutOffTime = pd.Timestamp(cutoff)
    for file in filePaths:

        # The second line of the file is the header. Columns 0 and 1 are merged
        # by pandas into a single 'Date_Time' column.
        # NOTE(review): nested parse_dates lists are reported as deprecated by
        # recent pandas releases -- confirm against the installed version.
        readOptions = {'header': 1, 'parse_dates': [[0,1]]}
        if 'all' not in columns:
            readOptions['usecols'] = columns
        df = pd.read_csv(file, **readOptions).dropna(how='all')

        # This takes annoying spaces out of the column names
        df.columns = df.columns.str.replace(' ', '')

        # The sensor name is the file name without directory and extension,
        # whichever path separator was used (./Data/S-01.txt or ./Data\\S-01.txt).
        name = os.path.splitext(re.split(r'[\\/]', file)[-1])[0]

        # If read_csv could not parse the dates the column is not datetime typed.
        # Checking the dtype (rather than the row labelled 0) also works when
        # the first row was dropped or the file holds no data rows.
        if not pd.api.types.is_datetime64_any_dtype(df['Date_Time']):
            knownBad = df['Date_Time'].isin(list(badTimes))
            df.drop(df[knownBad].index, inplace = True)
            try:
                df['Date_Time'] = pd.to_datetime(df['Date_Time'])
            except (ValueError, TypeError, OverflowError):
                # A bad timestamp that is not listed in badTimes: drop it row by row.
                df = autoFix(file,df)
                df['Date_Time'] = pd.to_datetime(df['Date_Time'])

        # Shift the clock of the sensors that have a correction configured.
        if name in timeRectifyingParams:
            mod[name] = 'yes'
            df['Date_Time'] = df['Date_Time']-pd.Timedelta(hours = timeRectifyingParams[name])
        else:
            mod[name] = 'no'

        try:
            df.drop(df[df['Date_Time'] < cleaningCutOffTime].index, inplace = True)
        except TypeError:
            # The Date_Time column could not be compared with a timestamp;
            # hand the offending file and frame back for inspection.
            print('TypeError: ')
            return file,df

        fData[name] = df.reset_index(drop=True)

    # ends by printing out the new start and stop times of the data sets
    for label in fData:
        try:
            print(label,'   ',fData[label]['Date_Time'].iloc[0],'    ',fData[label]['Date_Time'].iloc[-1],'     ','mod:',mod[label])
        except IndexError:
            print(label,' NO DATA PRESENT    NO DATA PRESENT')
    return fData


## fillDf function

In [34]:
def fillDf(df, freq, start, end, cutoff):
    """Resample a sensor frame onto a regular time grid.

    The first column of ``df`` holds the timestamps and the remaining columns
    hold the measurements. One output row is produced per grid slot:

    * a sample that advances the grid by one slot is copied (floored),
    * a sample that advances it by several slots is linearly interpolated,
      starting from the previous sample's values,
    * when the advance is longer than ``cutoff`` seconds the slots are filled
      with zeros instead.

    Samples that do not advance the grid (more frequent than ``freq``) and
    samples lying beyond the end of the grid are skipped.

    Parameters
    ----------
    df : DataFrame, timestamps in column 0.
    freq : pandas frequency string used for the grid (e.g. '10s').
    start : grid start; falsy means "use the first timestamp of df".
    end : grid end; falsy means "use the last timestamp of df plus one step".
    cutoff : gap length in seconds above which slots are zero padded.

    Returns
    -------
    (newDF, accuracy) where ``accuracy`` is a list of three report strings,
    or the string 'NO DATA' when no row was produced.

    Raises
    ------
    IndexError if ``df`` is empty and ``start`` or ``end`` is falsy.
    """
    # Materialise the rows once; df.values rebuilds the whole array on every access.
    rows = df.values

    if start:
        startTime = pd.Timestamp(start)
    else:
        startTime = rows[0][0]

    if end:
        endTime = pd.Timestamp(end)
    else:
        # freq is a frequency string (it is handed to date_range below), so it
        # is converted with to_offset; Timedelta(seconds=freq) rejects strings.
        endTime = rows[-1][0] + pd.tseries.frequencies.to_offset(freq)

    volatility = 0
    padding = 0
    nochange = 0

    threshold = pd.Timedelta(seconds=cutoff)

    index = pd.date_range(startTime, endTime, freq=freq)
    columns = df.columns

    count = 0

    overall = []

    for idx, i in enumerate(rows):
        oldCount = count

        # Move the grid cursor past every slot at or before this sample.
        try:
            while i[0] >= index[count]:
                count += 1
        except IndexError:
            # The sample lies beyond the last grid slot.
            continue

        val = count - oldCount

        # if sensor measurements are more frequent than the sampling rate we just skip them
        if not val:
            continue

        # Values of the previous sample (the first sample stands in for itself).
        previous = rows[max(idx - 1, 0)][1:]

        if threshold < (index[count] - index[oldCount]):
            # if the time gap is over the threshold entries are 0 padded instead of interpolated
            for ovrwrt in range(oldCount, count):
                padding += 1
                tempdata = np.concatenate(
                    (np.array([index[ovrwrt]]),
                     np.floor(np.array(previous * 0))), 0
                )
                overall.append(tempdata)

        elif val > 1:
            # time gaps < threshold will be linearly interpolated
            inc = (i[1:] - previous) / val
            for step, ovrwrt in enumerate(range(oldCount, count)):
                volatility += 1
                tempdata = np.concatenate(
                    (np.array([index[ovrwrt]]),
                     np.floor(np.array(previous + inc * step))), 0
                )
                overall.append(tempdata)

        else:
            # Exactly one slot advanced: keep the measured values.
            nochange += 1
            tempdata = np.concatenate(
                (np.array([index[oldCount]]), np.floor(np.array(i[1:]))), 0
            )
            overall.append(tempdata)

    total = len(overall)

    if total:
        accuracy = ["% of values from interpolation : " + str(np.round(volatility/total*100, 3)),
                    "% of values from 0-padding : " +
                    str(np.round(padding/total*100, 3)),
                    "% of values not changed : " + str(np.round(nochange/total*100, 3))]
    else:
        accuracy = 'NO DATA'

    newDF = pd.DataFrame(overall, columns=columns)

    return newDF, accuracy


## fixYearStamp function

In [35]:
def fixYearStamp(filePath,incorrectString,date,charTimeStart,charTimeEnd,offset):
    """Rewrite, in place, the lines of a file that carry a wrong date stamp.

    Every line matching ``incorrectString`` gets a new timestamp: ``date`` is
    joined with the characters ``charTimeStart:charTimeEnd`` of the line,
    ``offset`` hours are subtracted, and the result replaces everything up to
    ``charTimeEnd``. All other lines are written back unchanged.

    Parameters
    ----------
    filePath : path of the text file to fix.
    incorrectString : regular expression identifying the lines to fix.
    date : date text that is prepended to the time taken from the line.
    charTimeStart, charTimeEnd : character positions of the time in a line.
    offset : hours to subtract from the rebuilt timestamp.

    Raises
    ------
    Whatever ``pd.Timestamp`` raises for an unparsable date/time. The file is
    left untouched in that case.
    """
    with open(filePath,'rt') as fin:
        content = fin.readlines()

    # Build every output line before the file is reopened for writing, so that
    # a parsing error cannot leave a truncated file behind.
    pattern = re.compile(incorrectString)
    shift = pd.Timedelta(hours=offset)
    fixed = []
    for i in content:
        if pattern.search(i):
            stamp = pd.Timestamp(date+i[charTimeStart:charTimeEnd])-shift
            fixed.append(stamp.strftime(' %Y/%m/%d, %H:%M:%S') + i[charTimeEnd:])
        else:
            fixed.append(i)

    with open(filePath,'wt') as fout:
        fout.writelines(fixed)
    return

### fixYearStamp takes a file path, the incorrect date string to search for, the correct date in any format pandas can parse,
### the start and end character positions of the time within a line, and the number of hours (an integer) to subtract from the timestamp

# Data Cleaning

Pass the sensor data through the cleanUp function to fix the timestamps and delete null timestamps.

In [36]:
# Raw sensor logs to process.
all_csv_files = glob.glob("./Data/*.txt")

# Time window: data before cutOffTime is discarded, endTime closes the window.
cutOffTime = '4/20/2021 9:30'
endTime = '2021-04-20 14:00'

# Clock corrections in hours, keyed by sensor name. Use {'':0} for "no correction".
sensorConditions = dict.fromkeys(
    ('S-01', 'S-02', 'S-03', 'S-04', 'S-05', 'S-06', 'S-15', 'S-19'),
    7,
)

# Columns to keep: the two timestamp columns plus all of the dP columns.
columns = [0, 1, *range(6, 12)]

# Enable data checking.
DataChecking = False

# Observed timestamps that need to be removed from the data.
badTimes = [
    '     0/0/0      0:0:0',
    '2165/165/165 165:165:85',
]

# Zones are either created automatically (k-means clusters) or set manually.
ZoneAutomation = False
# Number of manual zones and number of automatic (clustered) zones.
numberOfZones = 4
numAutoZones = 2

# Sensors to exclude from the zones.
outdoorSensors = ['S-15', 'S-16', 'S-18', 'S-19']

# Tens of seconds before nebulization to include in the experiment csv files.
preCursorFactor = 6

# Which particle size to analyze.
particle = 'Dp>0.3'

In [37]:
# Start time of every trial, grouped by experiment.
# Some start times carry an extra offset in seconds; the lowercase 's' unit is
# used because the uppercase alias is deprecated by recent pandas releases.
expTRange = {

    'ICU Room 1 Door Partially Open':
    [pd.Timestamp('2021-04-20 9:45:15'),
    pd.Timestamp('2021-04-20 10:02:40'),
    pd.Timestamp('2021-04-20 10:19:40')],
    'ICU Room 1 Door Open':
    [pd.Timestamp('2021-04-20 10:35:05'),
    pd.Timestamp('2021-04-20 10:51:15'),
    pd.Timestamp('2021-04-20 11:06:30')],
    'ICU Room 1 Negative Pressure':
    [pd.Timestamp('2021-04-20 11:25:00'),
    pd.Timestamp('2021-04-20 11:37:50'),
    pd.Timestamp('2021-04-20 11:47:55')],
    'ICU Room 2 Door Partially Open':
    [pd.Timestamp('2021-04-20 12:13:35'),
    pd.Timestamp('2021-04-20 12:23:30')+pd.Timedelta(140,'s'),
    pd.Timestamp('2021-04-20 12:38:30')+pd.Timedelta(190,'s'),
    pd.Timestamp('2021-04-20 12:49:45')+pd.Timedelta(190,'s')],
    'ICU Room 2 Door Open':
    [pd.Timestamp('2021-04-20 13:00:30')+pd.Timedelta(190,'s'),
    pd.Timestamp('2021-04-20 13:13:30'),
    pd.Timestamp('2021-04-20 13:23:30'),
    pd.Timestamp('2021-04-20 13:33:00')],
}

# Enter the experiment length as seconds/10
expTLen = {
    'ICU Room 1 Door Partially Open' : 15*6,
    'ICU Room 1 Door Open':15*6,
    'ICU Room 1 Negative Pressure':10*6,
    'ICU Room 2 Door Partially Open':10*6,
    'ICU Room 2 Door Open':10*6
}

# Manual zone set-up. 'S-11' is left out because its data file is currently missing.
zoneList = {
    'Zone 1' : ['S-01','S-04'],
    'Zone 2' : ['S-02','S-03','S-05','S-06'],
    'Zone 3' : ['S-07','S-08','S-09','S-10', 'S-12', 'S-13','S-14'],
    'Zone 4' : ['S-15','S-18'],
    'Zone 5' : ['S-16','S-19']
}

# With manual zones the zone count follows the manual list.
if not ZoneAutomation:
    numberOfZones = len(zoneList)
    
#all_csv_files

In [38]:
# Clean every raw sensor file; the result maps each sensor name to its frame.
data = cleanUp(
    cutoff=cutOffTime,
    timeRectifyingParams=sensorConditions,
    filePaths=all_csv_files,
    columns=columns,
    badTimes=badTimes,
)

S-01     2021-04-20 09:30:00      2021-04-20 13:45:59       mod: yes
S-02     2021-04-20 09:30:00      2021-04-20 13:46:00       mod: yes
S-03     2021-04-20 09:30:00      2021-04-20 13:45:19       mod: yes
S-04     2021-04-20 09:30:09      2021-04-20 13:49:09       mod: yes
S-05     2021-04-20 09:30:00      2021-04-20 13:43:50       mod: yes
S-06     2021-04-20 09:30:00      2021-04-20 13:44:09       mod: yes
S-07     2021-04-20 09:30:07      2021-04-20 13:45:27       mod: no
S-08     2021-04-20 09:30:09      2021-04-20 13:47:03       mod: no
S-09     2021-04-20 09:30:19      2021-04-20 13:44:35       mod: no
S-10     2021-04-20 09:30:07      2021-04-20 13:44:23       mod: no
S-13     2021-04-20 09:30:08      2021-04-20 13:44:38       mod: no
S-14     2021-04-20 09:30:07      2021-04-20 13:43:57       mod: no
S-15     2021-04-20 09:30:03      2021-04-20 13:48:44       mod: yes
S-16     2021-04-20 09:31:17      2021-04-20 13:49:17       mod: no
S-18     2021-04-20 09:30:08      2021-04

### Exporting Data
Here we can export the organized data frames as csv files

In [39]:
# Write every cleaned sensor frame to <directory>/<sensor name>.csv.
directory = './proccessedData'
for x, temp in data.items():
    # The output folder is created on demand, when a frame is about to be written.
    if not os.path.exists(directory):
        os.makedirs(directory)
    location = os.path.join(directory, x + '.csv')
    temp.to_csv(location, index=False)

In [40]:
# Resample every sensor onto the common time grid and log how many of the
# resulting values were interpolated, zero padded or left unchanged.
directory = './dataInfo'
if not os.path.exists(directory):
    os.makedirs(directory)
interpDF = {}

# Gaps longer than `cutoff` seconds are zero padded; `freq` is the grid step.
cutoff = 40
freq = '10s'

# The with-block closes the log even if a sensor fails unexpectedly.
with open('./dataInfo/interpolation_Effect_Log.txt','wt') as fout:
    for x in data:
        df = data[x]
        try:
            filled,accuracy = fillDf(df,freq,cutOffTime,endTime,cutoff)
        except IndexError:
            #print(x,'NO DATA')
            fout.write(x+'NO DATA'+'\n')
            continue

        # fillDf reports an empty result with the plain string 'NO DATA';
        # indexing that string would log single characters, and the empty
        # frame would shrink the merged data to zero rows.
        if isinstance(accuracy, str):
            fout.write(x+'NO DATA'+'\n')
            continue

        interpDF[x] = filled
        #print(x,' ',accuracy)
        fout.write(x+' '+ '\n' + accuracy[0]+ '\n'+ accuracy[1]+ '\n'+ accuracy[2] +'\n\n')

### Merge the DataFrames

In [41]:
# The shortest interpolated frame provides the time base of the merged frame.
length = [len(interpDF[x]) for x in interpDF]
index = min(length)
lowIDX = length.index(index)   # first sensor that has the minimum length
lowValue = index

columns = list(interpDF.keys())
mergedData = pd.DataFrame({'Date_Time':interpDF[columns[lowIDX]]['Date_Time']})

# One column per sensor, holding the selected particle size.
for column in columns:
    mergedData[column] = interpDF[column][particle]

# Row-wise mean and variance over the sensors of zones 1 to 3.
zoneSensors = zoneList['Zone 1']+zoneList['Zone 2']+zoneList['Zone 3']
Average = np.mean(mergedData[zoneSensors],axis=1)
Variance = np.var(mergedData[zoneSensors],axis=1)
mergedData['Average'] = Average
mergedData['Variance'] = Variance
mergedData

Unnamed: 0,Date_Time,S-01,S-02,S-03,S-04,S-05,S-06,S-07,S-08,S-09,S-10,S-13,S-14,S-15,S-16,S-18,S-19,Average,Variance
0,2021-04-20 09:30:00,69,39,54,36,27,18,219,120,1302,21,36,63,33,0,42,27,167.000000,120012.500000
1,2021-04-20 09:30:10,48,93,54,84,18,81,147,81,1302,66,72,63,75,0,30,27,175.750000,116169.687500
2,2021-04-20 09:30:20,39,81,48,72,48,99,90,81,750,66,36,45,54,0,57,57,121.250000,36341.187500
3,2021-04-20 09:30:30,9,48,69,39,48,36,126,205,750,69,75,54,21,0,36,57,127.333333,37617.055556
4,2021-04-20 09:30:40,27,39,27,48,42,18,81,591,427,84,75,54,39,0,30,27,126.083333,30864.576389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1519,2021-04-20 13:43:10,21,9,9,0,36,0,18,9,0,57,0,0,18,9,0,0,13.250000,287.187500
1520,2021-04-20 13:43:20,9,9,0,0,9,0,9,0,28,28,18,4,9,0,0,0,9.500000,95.750000
1521,2021-04-20 13:43:30,18,0,0,0,9,0,0,0,66,0,9,0,0,0,0,21,8.500000,331.250000
1522,2021-04-20 13:43:40,9,0,9,0,18,0,9,0,66,0,9,0,0,18,126,30,10.000000,317.000000


## Increase Resolution on Merged Data and Export

In [42]:
# Increase the resolution ten-fold: every pair of consecutive rows is replaced
# by ten linearly interpolated rows; the last row is kept as it is.
# The computation does not depend on the column, so it is done once instead
# of once per column of mergedData.
tempFrame = mergedData.values
tempList = []
for idx,x in enumerate(tempFrame):
    if idx + 1 < len(tempFrame):
        increment = (tempFrame[idx+1] - x)/10
        for count in range(10):
            tempList.append(x+increment*count)
    else:
        tempList.append(x)
hiResMergedDF = pd.DataFrame(tempList, columns = mergedData.keys())

directory = './mergedData/'
if not os.path.exists(directory):
    os.makedirs(directory)

# directory already ends with a separator, so this is ./mergedData/mergedFrame.csv
location = os.path.join(directory, 'mergedFrame.csv')
hiResMergedDF.to_csv(location,index=False)

In [43]:
# Reload the merged frame and locate each trial in it: for every trial start
# time, store the position of the first row whose timestamp is at or after it.
mergedData = pd.read_csv('./mergedData/mergedFrame.csv',parse_dates=[0])
time = mergedData['Date_Time']
expIndexes = {}
for i, trialTimes in expTRange.items():
    located = []
    for x in trialTimes:
        start = next((position for position, n in enumerate(time) if n >= x), None)
        # A trial that starts after the last timestamp is simply not recorded.
        if start is not None:
            located.append(start)
    expIndexes[i] = located

## Determining Zones
Here we first create the 'averagedFrame's. This is a dictionary that holds, for each 'label' (the name of an experiment), a pandas DataFrame containing the results of all of the trials in that experiment summed and then divided by the total number of trials.
Any time you adjust the zones, everything below here must be re-run, because the values of many of these DataFrames are mutated.

In [44]:
# preCursorFactor is defined at the start
# For every experiment, cut one frame per trial out of mergedData (from
# preCursorFactor rows before the trial start to expTLen rows after it,
# without the Date_Time column) and average the trial frames row by row.
# NOTE(review): mergedData is re-read from the ten-fold interpolated file, so
# one row may no longer span 10 s -- confirm the units of preCursorFactor and
# expTLen against that file.
averagedFrame = {}
expirementFrame = {}

for label in expIndexes:
    trialStarts = expIndexes[label]
    if not trialStarts:
        # Without this check the failure would surface as an unexplained IndexError.
        raise ValueError('no trial start index was found for experiment: ' + label)

    runSumFrames = 0
    for idx,trialStart in enumerate(trialStarts):
        start = trialStart - preCursorFactor
        end = trialStart + expTLen[label]
        key = label+' Exp '+str(idx+1)
        expirementFrame[key] = mergedData.iloc[ start : end , 1: ].reset_index(drop = True)
        runSumFrames += expirementFrame[key]

    averagedFrame[label] = runSumFrames/len(trialStarts)

In [45]:
# numAutoZones is defined at the start
# Automatic zones: every non-outdoor sensor is described by the log of its
# peak value and the row at which the peak occurs, then clustered by k-means.
AutoZoneAssignments = {}
for frame in averagedFrame:
    # at this point averagedFrame should just be the averaged sum of the experimentFrame trials.
    # Last two columns are overall average and variance so they should be ignored.
    avgFrm = averagedFrame[frame]
    # outdoorSensors must have its spelling exactly match
    columns = sorted(set(avgFrm.keys()[:-2]) - set(outdoorSensors))

    X = []
    for column in columns:
        # max() over (value, index) pairs: on ties the largest index wins
        value,index = max([(value,index) for index,value in enumerate(avgFrm[column])])
        X.append(np.array([np.log(value+.01),index]))

    kmeans = KMeans(n_clusters=numAutoZones,random_state=0).fit(X)

    # Renumber the clusters by the sum of their centre coordinates:
    # lut[cluster label] is the rank of that cluster (0 = smallest sum).
    idx = np.argsort(kmeans.cluster_centers_.sum(axis=1))
    lut = np.zeros_like(idx)
    lut[idx] = np.arange(numAutoZones)
    #lut = lut[::-1]
    # Each sensor gets the rank of its own cluster. The previous element-wise
    # construction applied the inverse mapping, which only coincides with this
    # one when the permutation is its own inverse (e.g. with two clusters).
    AutoZoneAssignments[frame] = lut[kmeans.labels_]

z = numAutoZones
ZDfAuto = pd.DataFrame(AutoZoneAssignments)
if len(outdoorSensors):
    # One extra row per outdoor sensor, all placed in an additional zone `z`.
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    outdoorRows = pd.DataFrame(
        [[z]*len(AutoZoneAssignments)]*len(outdoorSensors),
        columns = list(AutoZoneAssignments.keys()),
    )
    ZDfAuto = pd.concat([ZDfAuto, outdoorRows], ignore_index=True)
AutoZoneAssignments = ZDfAuto
if len(outdoorSensors):
    # NOTE: this changes the setting itself, so re-running the cell without
    # re-running the configuration cell keeps increasing the zone count.
    numAutoZones += 1

if not ZoneAutomation:
    # Manual zones: every sensor gets the position of its zone in zoneList.
    ZoneAssignments = {}
    for frame in averagedFrame:
        # Last two columns are overall average and variance so they should be ignored.
        avgFrm = averagedFrame[frame]
        columns = sorted(set(avgFrm.keys()[:-2]))
        # Sensors that appear in no zone keep the default value 0.
        ZoneAssignments[frame] = [0]*len(columns)
        for value,zone in enumerate(zoneList):
            for sensor in zoneList[zone]:
                ZoneAssignments[frame][columns.index(sensor)] = value
    ZDf = pd.DataFrame(ZoneAssignments)

In [46]:
# Save the zone assignments and take independent copies of the experiment
# frames for the automatic-zone branch.
directory = './dataInfo'
if not os.path.exists(directory):
    os.makedirs(directory)

# ZDf is only built when the manual zones are in use (ZoneAutomation is False).
if not ZoneAutomation:
    location = os.path.join(directory,'ZoneAssignments.csv')
    ZDf.to_csv(location,index=False)

location = os.path.join(directory,'AutoZoneAssignments.csv')
ZDfAuto.to_csv(location,index=False)

expirementFrameAuto = copy.deepcopy(expirementFrame)
averagedFrameAuto = copy.deepcopy(averagedFrame)

In [47]:
def _stretchFrames(frames, columnSource):
    """Return ten-fold linearly interpolated copies of the frames in a dict.

    Between every pair of consecutive rows ten rows are generated (the first
    of them is the earlier row itself); the last row is kept unchanged.
    The column labels are taken from the first frame of ``columnSource``.
    """
    stretched = {}
    for label in frames:
        rows = frames[label].values
        hiRes = []
        for position, row in enumerate(rows):
            if position + 1 < len(rows):
                increment = (rows[position+1] - row)/10
                for count in range(10):
                    hiRes.append(row+increment*count)
            else:
                hiRes.append(row)
        stretched[label] = pd.DataFrame(hiRes, columns = columnSource[list(columnSource.keys())[0]].columns)
    return stretched


# Averaged and per-trial frames for the manual zones ...
stretchedDF = _stretchFrames(averagedFrame, expirementFrame)
stretchExpDf = _stretchFrames(expirementFrame, expirementFrame)
# ... and the same for the automatic zones.
stretchedDFAuto = _stretchFrames(averagedFrameAuto, expirementFrameAuto)
stretchExpDfAuto = _stretchFrames(expirementFrameAuto, expirementFrameAuto)

In [48]:
# Export every group of stretched frames to its own folder, one csv per frame.
for directory, frames in (
    ('./stretchedAvgData', stretchedDF),
    ('./stretchedExpirementData', stretchExpDf),
    ('./stretchedAvgDataAuto', stretchedDFAuto),
    ('./stretchedExpirementDataAuto', stretchExpDfAuto),
):
    if not os.path.exists(directory):
        os.makedirs(directory)
    for x in frames:
        temp = frames[x]
        location = os.path.join(directory, x + '.csv')
        temp.to_csv(location, index=False)