# Clean up data, convert to dataframes, and save to csv

The BAC file is pipe-separated, with the following fields:
LogLevel [TimeStamp]:[LogVisibility][LogSeverity][entryType][entrySubType][eventType][EventType dependent strings]

The fields at the end of each entry depend on its EventType. The supported event types, and the additional fields each one carries, are listed below.

GeneralMessage - [string message]

LevelChangedEvent - [load ID][loadName][roomName][rampTime][rampBaseValue][rampFinalValue]

ButtonChangedEvent - [keypad ID][keypadName][roomName][buttonNum][buttonState]

RemoteSystemEvent - [signalID][signalName][roomName][RemoteSystemEvent string]

TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent - [ID][name][roomName][message]

ConnectionStatus - [device ID][Name][room Name][connection status][Load 1 Room Name:Load 1 Name]|[Load 2 Room Name:Load 2 Name]

    NOTE: DeviceConnectionStatusWithOptions is the same format as ConnectionStatus. 
    
SignalChangedEvent - [device ID][Device Name][room Name][signal event ID][signal Value] - Signal event ID differs by device and signal value is either bool or int based on the eventID.

SignalChangedEventWithStrings - [device ID][Device Name][Signal Name][Signal Value string][Signal direction][message]


In [24]:
import os 
import pandas as pd

# Names of the fixed fields that begin every log entry, in file order.
labels = [
    'LogLevel', 'TimeStamp', 'LogVisibility', 'LogSeverity',
    'entryType', 'entrySubType', 'eventType',
]
# Position of the eventType field within `labels`.
EVENT_TYPE_INDEX = labels.index('eventType')

# Character positions used to slice a raw line of a .bac file.
LOG_LEVEL_START, LOG_LEVEL_END = 0, 3
TIMESTAMP_START, TIMESTAMP_END = 6, 14
PIPE_SEPARATED_DATA_START = 17

# Event-type dependent field names, one group per event type, concatenated in
# a fixed order.  Several names repeat across groups ("roomName", "device ID",
# "message", ...), so a field is identified by its position, not its name.
event_type_labels = (
    # GeneralMessage
    ["string message"]
    # LevelChangedEvent
    + ["load ID", "loadName", "roomName", "rampTime", "rampBaseValue",
       "rampFinalValue"]
    # ButtonChangedEvent
    + ["keypad ID", "keypadName", "roomName", "buttonNum", "buttonState"]
    # RemoteSystemEvent
    + ["signalID", "signalName", "roomName", "RemoteSystemEvent string"]
    # TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent
    + ["ID", "name", "roomName", "message"]
    # ConnectionStatus/DeviceConnectionStatusWithOptions
    + ["device ID", "Name", "room Name", "connection status",
       "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name"]
    # SignalChangedEvent
    + ["device ID", "Device Name", "room Name", "signal event ID",
       "signal Value"]
    # SignalChangedEventWithStrings
    + ["device ID", "Device Name", "Signal Name", "Signal Value string",
       "Signal direction", "message"]
)

# Offsets (within event_type_labels) of the three "device ID" columns, in
# order: ConnectionStatus, SignalChangedEvent, SignalChangedEventWithStrings.
device_id_offsets = [i for i, x in enumerate(event_type_labels) if x == "device ID"]

# Per-eventType column layout, built once instead of being re-derived for
# every log line:
#   (offset of the event's first column within event_type_labels,
#    minimum number of event-type dependent fields,
#    maximum number of event-type dependent fields)
_SCENE_LIKE_LAYOUT = (event_type_labels.index('ID'), 4, 4)
_CONNECTION_LAYOUT = (device_id_offsets[0], 4, 6)
EVENT_LAYOUTS = {
    'GeneralMessage': (event_type_labels.index('string message'), 1, 1),
    'LevelChangedEvent': (event_type_labels.index('load ID'), 6, 6),
    'ButtonChangedEvent': (event_type_labels.index('keypad ID'), 5, 5),
    'RemoteSystemEvent': (event_type_labels.index('signalID'), 4, 4),
    'TimeClockChangedEvent': _SCENE_LIKE_LAYOUT,
    'OccupancyChangedEvent': _SCENE_LIKE_LAYOUT,
    'SceneChangedEvent': _SCENE_LIKE_LAYOUT,
    'ConnectionStatus': _CONNECTION_LAYOUT,
    'DeviceConnectionStatusWithOptions': _CONNECTION_LAYOUT,
    'SignalChangedEvent': (device_id_offsets[1], 5, 5),
    # Empty fields are dropped for this event (see _parse_bac_line), so only
    # an upper bound applies: it may not spill past the last column.
    'SignalChangedEventWithStrings': (
        device_id_offsets[2], 0, len(event_type_labels) - device_id_offsets[2]),
}


def _parse_bac_line(line):
    """Convert one non-empty .bac log line into a full-width row.

    The row holds the len(labels) fixed fields followed by one slot per entry
    of event_type_labels.  Slots that do not belong to the line's event type
    are filled with 0.

    Args:
        line: A single log line with its trailing newline already removed.

    Returns:
        A list of length len(labels) + len(event_type_labels).

    Raises:
        ValueError: If the line has fewer fixed fields than `labels`, names an
            unsupported event type, or carries an unexpected number of
            event-type dependent fields.
    """
    # A single trailing pipe terminates the entry and carries no field.
    if line.endswith('|'):
        line = line[:-1]

    all_data = ([line[LOG_LEVEL_START:LOG_LEVEL_END], line[TIMESTAMP_START:TIMESTAMP_END]]
                + line[PIPE_SEPARATED_DATA_START:].split('|'))
    if len(all_data) < len(labels):
        raise ValueError("Malformed log line: {!r}".format(line))

    fixed_fields = all_data[:len(labels)]
    event_fields = all_data[len(labels):]

    event_type = fixed_fields[EVENT_TYPE_INDEX]
    if event_type not in EVENT_LAYOUTS:
        raise ValueError("Wrong event type: {}".format(event_type))
    offset, min_fields, max_fields = EVENT_LAYOUTS[event_type]

    if event_type == 'GeneralMessage':
        # account for pipes in the message string
        event_fields = ['|'.join(event_fields)]
    elif event_type == 'SignalChangedEventWithStrings':
        # to correct for the double pipe in "Basement Mudroom"
        event_fields = [field for field in event_fields if field]

    # Explicit check rather than `assert`, which is stripped under `python -O`
    # and would let a misaligned row through.
    if not min_fields <= len(event_fields) <= max_fields:
        raise ValueError(
            "Expected between {} and {} fields for {}, got {}: {}".format(
                min_fields, max_fields, event_type, len(event_fields), event_fields))

    row = fixed_fields + [0] * len(event_type_labels)
    start = len(labels) + offset
    row[start:start + len(event_fields)] = event_fields
    return row


clean_lines = []

# os.listdir() returns names in arbitrary order; sort so the row order is
# reproducible from run to run.
for log in sorted(os.listdir('data')):
    with open(os.path.join('data', log)) as logfile:
        for line in logfile:
            line = line.rstrip('\n')
            # A blank line (e.g. at the end of a file) holds no entry.
            if not line:
                continue
            clean_lines.append(_parse_bac_line(line))

In [25]:
# Column order: the fixed log fields first, then every event-specific field.
column_names = labels + event_type_labels
df = pd.DataFrame(data=clean_lines, columns=column_names)

In [26]:
# Preview the first five parsed rows.
df.head(n=5)

Unnamed: 0,LogLevel,TimeStamp,LogVisibility,LogSeverity,entryType,entrySubType,eventType,string message,load ID,loadName,...,Device Name,room Name,signal event ID,signal Value,device ID,Device Name.1,Signal Name,Signal Value string,Signal direction,message
0,L:0,00:00:05,NotUserVisible,Information,System,,GeneralMessage,uptime:The system has been running for 13 days...,0,0,...,0,0,0,0,0,0,0,0,0,0
1,L:0,00:00:05,UserVisible,Information,Lights,Scene,SceneChangedEvent,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,L:0,00:00:05,UserVisible,Information,System,Event,TimeClockChangedEvent,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,L:0,00:00:05,UserVisible,Information,DoorLock,Device,SignalChangedEventWithStrings,0,0,0,...,0,0,0,0,208,Basement Mudroom Door,Basement Mudroom,Lock,ToDevice,0
4,L:0,00:00:05,UserVisible,Information,DoorLock,Scene,SceneChangedEvent,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Show every row from position 10 onward, with all columns.
df.iloc[10:]

Unnamed: 0,LogLevel,TimeStamp,LogVisibility,LogSeverity,entryType,entrySubType,eventType,string message,load ID,loadName,...,Device Name,room Name,signal event ID,signal Value,device ID,Device Name.1,Signal Name,Signal Value string,Signal direction,message
10,L:0,00:00:06,NotUserVisible,Information,System,,GeneralMessage,netstat:Proto Local Address Foreign ...,0,0,...,0,0,0,0,0,0,0,0,0,0
11,L:0,00:00:06,NotUserVisible,Information,System,,GeneralMessage,listenstat:!Port Number |Max Listeners |Cu...,0,0,...,0,0,0,0,0,0,0,0,0,0
12,L:0,00:00:06,NotUserVisible,Information,System,,GeneralMessage,threadpoolinfo:Maximum number of threads: 105!...,0,0,...,0,0,0,0,0,0,0,0,0,0
13,L:0,00:00:05,UserVisible,Information,Lights,Load,LevelChangedEvent,0,118,Sconces,...,0,0,0,0,0,0,0,0,0,0
14,L:0,00:00:07,NotUserVisible,Information,Lights,Load,LevelChangedEvent,0,122,Chandelier,...,0,0,0,0,0,0,0,0,0,0
15,L:0,00:00:07,NotUserVisible,Information,Lights,Load,LevelChangedEvent,0,120,Overhead,...,0,0,0,0,0,0,0,0,0,0
16,L:0,00:00:07,NotUserVisible,Information,Lights,Load,LevelChangedEvent,0,118,Sconces,...,0,0,0,0,0,0,0,0,0,0
17,L:0,00:00:07,UserVisible,Information,Lights,Load,LevelChangedEvent,0,118,Sconces,...,0,0,0,0,0,0,0,0,0,0
18,L:0,00:00:07,UserVisible,Information,Lights,Device,SignalChangedEvent,0,0,0,...,Sconces,Upstairs Landing,13,1,0,0,0,0,0,0
19,L:0,00:06:13,UserVisible,Information,Climate,Device,SignalChangedEventWithStrings,0,0,0,...,0,0,0,0,194,Master Bed,Master Bed,Cool Stage 1,Inactive,FromDevice


In [21]:
# Render the first five rows as one plain-text string.
df.head(n=5).to_string()

'  LogLevel TimeStamp   LogVisibility  LogSeverity entryType entrySubType                      eventType                                     string message load ID loadName roomName rampTime rampBaseValue rampFinalValue keypad ID keypadName roomName buttonNum buttonState signalID signalName roomName RemoteSystemEvent string  ID                        name          roomName         message device ID Name room Name connection status Load 1 Room Name:Load 1 Name Load 2 Room Name:Load 2 Name device ID Device Name room Name signal event ID signal Value device ID            Device Name       Signal Name Signal Value string Signal direction message\n0      L:0  00:00:05  NotUserVisible  Information    System           NA                 GeneralMessage  uptime:The system has been running for 13 days...       0        0        0        0             0              0         0          0        0         0           0        0          0        0                        0   0                     