## General Imports


In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import os
#Change working directory to directory working in
os.chdir(r"C:\Users\kaitl\Desktop\HospitalAerosolTesting\Data\UWMedApril20")
import glob
from cleanUp import cleanUp
from fillDf import fillDf
from fixYearStamp import fixYearStamp

## Data Cleaning

Passing the sensor data through the cleanUp function to get fix timestamps and delete null timestamps

In [5]:
all_csv_files = glob.glob("./Data/*.txt")
# insert the desired start time
cutOffTime = '4/20/2021 9:30'
# insert the time rectifying offsets. default of for nothing {'':0}
sensorConditions = {'S-15':7,'S-19':7}
#This indicates which columns to keep. Here we're taking all of the dP info and the timestamps
columns = [0,1,6,7,8,9,10,11]

In [6]:
all_csv_files

['./Data\\S-01.txt',
 './Data\\S-02.txt',
 './Data\\S-03.txt',
 './Data\\S-04.txt',
 './Data\\S-05.txt',
 './Data\\S-06.txt',
 './Data\\S-07.txt',
 './Data\\S-08.txt',
 './Data\\S-10.txt',
 './Data\\S-12.txt',
 './Data\\S-13.txt',
 './Data\\S-14.txt',
 './Data\\S-15.txt',
 './Data\\S-16.txt',
 './Data\\S-18.txt',
 './Data\\S-19.txt']

Changed this to markdown so it won't run twice, had to fix the timestamps on S-12 filePath = all_csv_files[11] incorrectString = '21/3/22' date = '3/22/2021' charTimeStart = 11 charTimeEnd = 21 offset = 0 fixYearStamp(filePath,incorrectString,date,charTimeStart,charTimeEnd,offset)

In [7]:
data = cleanUp(cutOffTime,sensorConditions,all_csv_files,columns)

S-01     2021-04-20 09:30:00      2021-04-20 13:45:59
S-02     2021-04-20 09:30:00      2021-04-20 13:46:00
S-03     2021-04-20 09:30:00      2021-04-20 13:45:19
S-04     2021-04-20 09:30:09      2021-04-20 13:49:09
S-05     2021-04-20 09:30:00      2021-04-20 13:43:50
S-06     2021-04-20 09:30:00      2021-04-20 13:44:09
S-07     2021-04-20 09:30:07      2021-04-20 13:45:27
S-08     2021-04-20 09:30:09      2021-04-20 13:47:03
S-10     2021-04-20 09:30:07      2021-04-20 13:44:23
S-12     2021-04-20 09:30:14      2021-04-20 13:45:15
S-13     2021-04-20 09:30:08      2021-04-20 13:44:38
S-14     2021-04-20 09:30:07      2021-04-20 13:43:57
S-15     2021-04-20 09:30:03      2021-04-20 13:48:44
S-16     2021-04-20 09:31:17      2021-04-20 13:49:17
S-18     2021-04-20 09:30:08      2021-04-20 13:48:39
S-19     2021-04-20 09:30:02      2021-04-20 13:49:12


## Exporting Data

Here we can export the organized data frames as csv files

In [8]:
directory = './proccessedData'
for x in data:
    temp=data[x]
    if not os.path.exists(directory):
        os.makedirs(directory)
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)

## Checking Data

Here we scan through the data for irregularities in data recording.

In [9]:
data = data
fout = open('./dataInfo/time_Frequency_Error_Log.txt','wt')
errors = {}
errorCount = {}
# Enter the expected interval here
interval = 10
for x in data:
    # errors keeps track of length of each time interval error that occurs
    errors[x] = set(())
    # errorCount keeps track of how many times each time interval error occured
    errorCount[x] = {}
    # counter keeps track of the total time interval errors per sensor
    counter = 0
    #shows the total
    temp = data[x]
    for idx,i in enumerate(temp['Date_Time']):
        try:
            if not ((temp['Date_Time'][idx+1] - i) == pd.Timedelta(seconds=interval)):
                timeErr = temp['Date_Time'][idx+1] - i
                if str(timeErr.seconds) in errorCount[x]:
                    errorCount[x][str(timeErr.seconds)] +=1
                else:
                    errorCount[x][str(timeErr.seconds)] = 1

                errors[x].add(timeErr)


                counter += 1
        except:
                        
            continue

    print(str(round(counter/len(temp)*100,2)),'% potential error in ', x)
    fout.write('potential error in '+ x +'\n' + str(round(counter/len(temp)*100,2))+'%'+'\n')

    # display the different types of errors
    lst = [i.seconds for i in errors[x]]
    frmt = "{:>4}"*len(lst)
    print(frmt.format(*lst))
    fout.write("Time Errors" + frmt.format(*lst)+ '\n')
        # display the quantity of each type of error
    lst = [errorCount[x][str(i.seconds)] for i in errors[x]]
    frmt = "{:>4}"*len(lst)
    print(frmt.format(*lst))
    fout.write("# Observed " + frmt.format(*lst)+ '\n')

    print()
    fout.write('\n')


fout.close()

0.59 % potential error in  S-01
   9 43085990
   1   4   4

0.52 % potential error in  S-02
 43085990
   4   4

0.65 % potential error in  S-03
   9  20 43085990
   1   1   4   4

0.84 % potential error in  S-04
   9 43085990  81  15   5  20
   1   4   4   1   1   1   1

1.39 % potential error in  S-05
  73   9 43085990 611 810  17  81  20
   1   2   4   4   1   1   1   1   4

0.66 % potential error in  S-06
   9 43085990  20
   1   4   4   1

19.95 % potential error in  S-07
  20
 255

50.0 % potential error in  S-08
  21  17  11 126  19  20
   3   1   1   1   3 502

50.0 % potential error in  S-10
  17  44  19  26  20
   1   1   1   1 504

99.81 % potential error in  S-12
  20  80 400  211820  60 660 260  61 160 620  19  40 820
 494   1   1   5   1   3   1   1   1   1   1   5   4   1

19.94 % potential error in  S-13
  20
 254

50.76 % potential error in  S-14
  20 100 420  30 460  40 340
 461   1   1   1   1   3   1

0.06 % potential error in  S-15
  11
   1

0.0 % potential error i

Notice there are quite a few repeating errors here in our data set. We can either choose to interpolate the data inbetween or pad it with 0s. For gaps <40s i will interpolate, but for gaps >40 i will 0 pad.

In [10]:
fout = open('./dataInfo/interpolation_Effect_Log.txt','wt')
interpDF = {}

for x in data:
    df = data[x]
    cutoff = 40
    freq = '10S'
    try:
        interpDF[x],accuracy = fillDf(df,freq,'2021-04-20 9:30','2021-04-20 14:00',cutoff)
        print(x,' ',accuracy)
        fout.write(x+' '+ '\n' + accuracy[0]+ '\n'+ accuracy[1]+ '\n'+ accuracy[2] +'\n\n')
    except IndexError:
        print(x,'NO DATA')
        fout.write(x+'NO DATA'+'\n')
fout.close()


S-01   ['% of values from interpolation : 0.0', '% of values from 0-padding : 8.073', '% of values not changed : 91.927']
S-02   ['% of values from interpolation : 0.0', '% of values from 0-padding : 8.068', '% of values not changed : 91.932']
S-03   ['% of values from interpolation : 0.0', '% of values from 0-padding : 8.094', '% of values not changed : 91.906']
S-04   ['% of values from interpolation : 0.257', '% of values from 0-padding : 8.553', '% of values not changed : 91.19']
S-05   ['% of values from interpolation : 0.656', '% of values from 0-padding : 18.57', '% of values not changed : 80.774']
S-06   ['% of values from interpolation : 0.131', '% of values from 0-padding : 8.131', '% of values not changed : 91.738']
S-07   ['% of values from interpolation : 33.268', '% of values from 0-padding : 0.0', '% of values not changed : 66.732']
S-08   ['% of values from interpolation : 65.911', '% of values from 0-padding : 0.778', '% of values not changed : 33.312']
S-10   ['% of v

## Export Data

export the newly interpolated data

In [11]:
directory = './interpolatedData'
for x in interpDF:
    temp=interpDF[x]
    if not os.path.exists(directory):
        os.makedirs(directory)
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)

## Merge the DataFrames

Also remove 'S-02' from the dictionary as it has no real data and find the least common index

In [12]:
# interpDF.pop('S-02',None)
# interpDF.pop('S-BU2',None)
# interpDF.pop('S-BU1',None)
#interpDF.pop('S19',None)
#interpDF.pop('S15',None)
length = []
for x in interpDF:
    length.append(len(interpDF[x]))
index = min(length)
print(index)

1524


In [13]:
tempList = temp[15:19]
tempList

Unnamed: 0,Date_Time,Dp>0.3,Dp>0.5,Dp>1.0,Dp>2.5,Dp>5.0,Dp>10.0
15,2021-04-20 09:32:30,60,13,0,0,0,0
16,2021-04-20 09:32:40,81,20,0,0,0,0
17,2021-04-20 09:32:50,39,13,0,0,0,0
18,2021-04-20 09:33:00,39,10,0,0,0,0


In [14]:
for count,key in enumerate(list(interpDF.keys())):
    print(count+1,key)

1 S-01
2 S-02
3 S-03
4 S-04
5 S-05
6 S-06
7 S-07
8 S-08
9 S-10
10 S-12
11 S-13
12 S-14
13 S-15
14 S-16
15 S-18
16 S-19


In [15]:
interpDF

{'S-01':                Date_Time  Dp>0.3  Dp>0.5  Dp>1.0  Dp>2.5  Dp>5.0  Dp>10.0
 0    2021-04-20 09:30:00      69      20       0       0       0        0
 1    2021-04-20 09:30:10      48      13       0       0       0        0
 2    2021-04-20 09:30:20      39      13       0       0       0        0
 3    2021-04-20 09:30:30       9       3       0       0       0        0
 4    2021-04-20 09:30:40      27       9       0       0       0        0
 ...                  ...     ...     ...     ...     ...     ...      ...
 1531 2021-04-20 13:45:10       0       0       0       0       0        0
 1532 2021-04-20 13:45:20       0       0       0       0       0        0
 1533 2021-04-20 13:45:30       0       0       0       0       0        0
 1534 2021-04-20 13:45:40       9       3       0       0       0        0
 1535 2021-04-20 13:45:50       9       3       0       0       0        0
 
 [1536 rows x 7 columns],
 'S-02':                Date_Time  Dp>0.3  Dp>0.5  Dp>1.0  Dp>2.

In [16]:
dfMerged = []
columns = list(interpDF.keys())
columns.extend(['Average',
'Variance',
'Zone 1',
'Var Z1',
'Zone 2',
'Var Z2',
'Zone 3',
'Var Z3'])
# 'Zone 4',
# 'Var Z4'])

for idx,i in enumerate(interpDF[columns[0]].values[:index]):
    temp = []
    temp.append(i[0])
    for x in interpDF:
        temp.append(interpDF[x].values[idx][1])
    #So we now have a list with the timestamp and then sensors
    
    #here we add the overall average and variance columns
    temp.append(np.average(temp[1:16]))
    temp.append(np.std(temp[1:16]))

    #here we're segregating the zones in the file giving their variance and avg

    #Zone 1 the 2 sensors right on top of the nebulizer
    lst = temp[1:7]
    temp.append(np.average(lst))
    temp.append(np.std(lst))
    # #Zone 2 the perimeter of the bed
    # lst = [temp[2],temp[3],temp[5],temp[6]]
    # temp.append(np.average(lst))
    # temp.append(np.std(lst))
    #Zone 3 the perimeter of the room
    lst = temp[7:16]
    temp.append(np.average(lst))
    temp.append(np.std(lst))
    #Zone 4 is just the outside sensor
    lst = temp[16:19]
    temp.append(np.average(lst))
    temp.append(np.std(lst))
    dfMerged.append(temp)
columns.insert(0,'Date_Time')

In [17]:
interpDF.keys()

dict_keys(['S-01', 'S-02', 'S-03', 'S-04', 'S-05', 'S-06', 'S-07', 'S-08', 'S-10', 'S-12', 'S-13', 'S-14', 'S-15', 'S-16', 'S-18', 'S-19'])

In [18]:
mergedData = pd.DataFrame(dfMerged,columns = columns)

In [19]:
mergedData

Unnamed: 0,Date_Time,S-01,S-02,S-03,S-04,S-05,S-06,S-07,S-08,S-10,...,S-18,S-19,Average,Variance,Zone 1,Var Z1,Zone 2,Var Z2,Zone 3,Var Z3
0,2021-04-20 09:30:00,69,39,54,36,27,18,219,120,21,...,42,27,54.400000,51.379373,40.5,16.859715,63.666667,63.210407,44.259791,12.266658
1,2021-04-20 09:30:10,48,93,54,84,18,81,147,81,66,...,30,27,63.400000,33.903392,63.0,25.806976,63.666667,38.360860,41.434464,15.785601
2,2021-04-20 09:30:20,39,81,48,72,48,99,90,81,66,...,57,57,57.000000,24.445859,64.5,21.266170,52.000000,25.139610,46.148620,15.346169
3,2021-04-20 09:30:30,9,48,69,39,48,36,126,205,69,...,36,57,60.466667,48.681094,41.5,17.951323,73.111111,57.751276,55.382587,4.945500
4,2021-04-20 09:30:40,27,39,27,48,42,18,81,591,84,...,30,27,84.000000,138.182488,33.5,10.307764,117.666667,170.057507,83.060829,45.394919
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1519,2021-04-20 13:43:10,21,9,9,0,36,0,18,9,57,...,0,0,12.666667,15.421486,12.5,12.658989,12.777778,17.014881,9.362717,6.715287
1520,2021-04-20 13:43:20,9,9,0,0,9,0,9,0,28,...,0,0,6.333333,7.862711,4.5,4.500000,7.555556,9.262962,4.732015,3.403794
1521,2021-04-20 13:43:30,18,0,0,0,9,0,0,0,0,...,0,21,2.666667,5.133766,4.5,6.873864,1.444444,2.948111,9.600144,8.123594
1522,2021-04-20 13:43:40,9,0,9,0,18,0,9,0,0,...,126,30,13.800000,30.629398,6.0,6.708204,19.000000,38.288379,24.809799,7.789343


## Increase Resolution on Merged Data

In [20]:
for i in mergedData:
    tempFrame = mergedData.values
    tempList = []
    for idx,x in enumerate(tempFrame):
        try:
            increment = (tempFrame[idx+1] - x)/10
            for count in range(10):
                tempList.append(x+increment*count)
        except IndexError:
            tempList.append(x)
            continue
    hiResMergedDF = pd.DataFrame(tempList, columns = mergedData.keys())

## Export Merged Frames

In [21]:
directory = './mergedData/'
if not os.path.exists(directory):

    os.makedirs(directory)

location = os.path.join(directory+'mergedFrame.csv')
hiResMergedDF.to_csv(location,index=False)

## Create csv File for each Animation

We have 3 expirements in each that we want to average across the range

In [22]:
expTRange = {

    'ICU Room 1 Door Partially Open':
    [pd.Timestamp('2021-04-20 9:45:15'),
    pd.Timestamp('2021-04-20 10:02:40'),
    pd.Timestamp('2021-04-20 10:19:40')],
    'ICU Room 1 Door Open':
    [pd.Timestamp('2021-04-20 10:35:05'),
    pd.Timestamp('2021-04-20 10:51:15'),
    pd.Timestamp('2021-04-20 11:06:30')],
    'ICU Room 1 Negative Pressure':
    [pd.Timestamp('2021-04-20 11:25:00'),
    pd.Timestamp('2021-04-20 11:37:50'),
    pd.Timestamp('2021-04-20 11:47:55')],
    'ICU Room 2 Door Partially Open':
    [pd.Timestamp('2021-04-20 12:13:35'),
    pd.Timestamp('2021-04-20 12:23:30'),
    pd.Timestamp('2021-04-20 12:38:30'),
    pd.Timestamp('2021-04-20 12:49:45')],
    'ICU Room 2 Door Open':
    [pd.Timestamp('2021-04-20 13:00:30'),
    pd.Timestamp('2021-04-20 13:13:30'),
    pd.Timestamp('2021-04-20 13:23:30'),
    pd.Timestamp('2021-04-20 13:33:00')],
}

#enter in the expirement length as seconds/10
expTLen = {
    'ICU Room 1 Door Partially Open' : 15*6,
    'ICU Room 1 Door Open':15*6,
    'ICU Room 1 Negative Pressure':10*6,
    'ICU Room 2 Door Partially Open':15*6,
    'ICU Room 2 Door Open':10*6   
}

In [23]:
#mergedData = pd.read_csv('./mergedData/mergedFrame.csv',parse_dates=[0])

In [24]:
time = mergedData['Date_Time']
expIndexes = {}
for i in expTRange:
    expIndexes[i] = []
    for x in expTRange[i]:
        for start,n in enumerate(time):
           if n >= x:
               expIndexes[i].append(start)
               break

In [25]:
os.getcwd()

'C:\\Users\\kaitl\\Desktop\\HospitalAerosolTesting\\Data\\UWMedApril20'

In [26]:
time

0      2021-04-20 09:30:00
1      2021-04-20 09:30:10
2      2021-04-20 09:30:20
3      2021-04-20 09:30:30
4      2021-04-20 09:30:40
               ...        
1519   2021-04-20 13:43:10
1520   2021-04-20 13:43:20
1521   2021-04-20 13:43:30
1522   2021-04-20 13:43:40
1523   2021-04-20 13:43:50
Name: Date_Time, Length: 1524, dtype: datetime64[ns]

In [27]:
expIndexes#expTLen[label]

{'ICU Room 1 Door Partially Open': [92, 196, 298],
 'ICU Room 1 Door Open': [391, 488, 579],
 'ICU Room 1 Negative Pressure': [690, 767, 828],
 'ICU Room 2 Door Partially Open': [982, 1041, 1131, 1199],
 'ICU Room 2 Door Open': [1263, 1341, 1401, 1458]}

In [28]:
# controls how many seconds of data before each experiment to include
preCursorFactor = 0
averagedFrame = {}
expirementFrame = {}

for label in expIndexes:

    df1Index1 = expIndexes[label][0] - preCursorFactor
    df1Index2 = expIndexes[label][0] + expTLen[label]
    df1 = mergedData.iloc[df1Index1 : df1Index2 , 1: ].reset_index(drop = True)

    df2Index1 = expIndexes[label][1] - preCursorFactor
    df2Index2 = expIndexes[label][1] + expTLen[label]
    df2 = mergedData.iloc[df2Index1 : df2Index2 , 1: ].reset_index(drop = True)

    df3Index1 = expIndexes[label][2] - preCursorFactor
    df3Index2 = expIndexes[label][2] + expTLen[label]
    df3 = mergedData.iloc[df3Index1 : df3Index2 , 1: ].reset_index(drop = True)

    averagedFrame[label] = (df1 + df2 + df3)/3

    expirementFrame[label+' Exp1'] = df1
    expirementFrame[label+' Exp2'] = df2
    expirementFrame[label+' Exp3'] = df3
    
#assuming there were 3 expirements for each one

In [29]:
directory = './averagedData'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in averagedFrame:
    temp=averagedFrame[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)

In [30]:
directory = './expirementData'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in expirementFrame:
    temp=expirementFrame[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)

## Increase the Resolution

pad out the dataframes to have values for every second.

In [31]:
stretchedDF = {}
for i in averagedFrame:
    tempFrame = averagedFrame[i].values
    tempList = []
    for idx,x in enumerate(tempFrame):
        try:
            increment = (tempFrame[idx+1] - x)/10
            for count in range(10):
                tempList.append(x+increment*count)
        except IndexError:
            tempList.append(x)
            continue
    stretchedDF[i] = pd.DataFrame(tempList, columns = expirementFrame[list(expirementFrame.keys())[0]].columns)

In [32]:
stretchExpDf = {}
for i in expirementFrame:
    tempFrame = expirementFrame[i].values
    tempList = []
    for idx,x in enumerate(tempFrame):
        try:
            increment = (tempFrame[idx+1] - x)/10
            for count in range(10):
                tempList.append(x+increment*count)
        except IndexError:
            tempList.append(x)
            continue
    stretchExpDf[i] = pd.DataFrame(tempList, columns = expirementFrame[list(expirementFrame.keys())[0]].columns)

In [33]:
if not os.path.exists(directory):
    os.makedirs(directory)
for x in stretchedDF:
    temp=stretchedDF[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)

In [34]:
directory = './stretchedExpirementData'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in stretchExpDf:
    temp=stretchExpDf[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)