# Clean up data, convert to dataframes, and save to csv

The BAC file is pipe separated with the following fields:
LogLevel [TimeStamp]:[LogVisibility][LogSeverity][entryType][entrySubType][eventType][EventType dependent strings]

The trailing fields of each entry depend on its eventType. The supported event types, and the additional fields each one carries, are listed below.

GeneralMessage - [string message]

LevelChangedEvent - [load ID][loadName][roomName][rampTime][rampBaseValue][rampFinalValue]

ButtonChangedEvent - [keypad ID][keypadName][roomName][buttonNum][buttonState]

RemoteSystemEvent - [signalID][signalName][roomName][RemoteSystemEvent string]
TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent - [ID][name][roomName][message]

ConnectionStatus - [device ID][Name][room Name][connection status][Load 1 Room Name:Load 1 Name]|[Load 2 Room Name:Load 2 Name]

    NOTE: DeviceConnectionStatusWithOptions is the same format as ConnectionStatus. 
    
SignalChangedEvent - [device ID][Device Name][room Name][signal event ID][signal Value] - Signal event ID differs by device and signal value is either bool or int based on the eventID.

SignalChangedEventWithStrings - [device ID][Device Name][Signal Name][Signal Value string][Signal direction][message]


In [1]:
import os 
import pandas as pd
import numpy as np

In [2]:
# Names of the fixed fields that start every log entry, in file order.
labels = ['LogLevel',
          'TimeStamp',
          'LogVisibility',
          'LogSeverity',
          'entryType',
          'entrySubType',
          'eventType']
# from label list
EVENT_TYPE_INDEX = labels.index('eventType')

# from line in .bac file (character offsets into the raw line)
LOG_LEVEL_START = 0
LOG_LEVEL_END = 3
TIMESTAMP_START = 6
TIMESTAMP_END = 14
PIPE_SEPARATED_DATA_START = 17

# Event-specific column names, laid out back to back in one wide row.  Names
# that repeat across event types carry a numeric suffix to stay unique.
event_type_labels = [
    "string message", # GeneralMessage
    "load ID", "loadName", "roomName1", "rampTime", "rampBaseValue", "rampFinalValue", # LevelChangedEvent
    "keypad ID", "keypadName", "roomName2", "buttonNum", "buttonState", # ButtonChangedEvent
    "signalID", "signalName", "roomName3", "RemoteSystemEvent string", # RemoteSystemEvent
    "ID", "name", "roomName4", "message1", # TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent
    "device ID1", "Name", "roomName5", "connection status", "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name", # ConnectionStatus/DeviceConnectionStatusWithOptions
    "device ID2", "Device Name1", "roomName6", "signal event ID", "signal Value", # SignalChangedEvent
    "device ID3", "Device Name2", "Signal Name", "Signal Value string", "Signal direction", "message2", # SignalChangedEventWithStrings
]

# Event type -> (label of the first column it fills, accepted field counts).
EVENT_TYPE_LAYOUTS = {
    'GeneralMessage': ('string message', (1,)),
    'LevelChangedEvent': ('load ID', (6,)),
    'ButtonChangedEvent': ('keypad ID', (5,)),
    'RemoteSystemEvent': ('signalID', (4,)),
    'TimeClockChangedEvent': ('ID', (4,)),
    'OccupancyChangedEvent': ('ID', (4,)),
    'SceneChangedEvent': ('ID', (4,)),
    # the two trailing "room:load" fields are optional
    'ConnectionStatus': ('device ID1', (4, 5, 6)),
    'DeviceConnectionStatusWithOptions': ('device ID1', (4, 5, 6)),
    'SignalChangedEvent': ('device ID2', (5,)),
    # Any count up to the 6 available columns is accepted; more than that
    # would grow the row past the column list and break DataFrame creation.
    'SignalChangedEventWithStrings': ('device ID3', tuple(range(7))),
}

# Resolve each first-column label to an absolute row offset once, instead of
# searching event_type_labels again for every log line.
EVENT_TYPE_OFFSETS = {
    event_type: (len(labels) + event_type_labels.index(first_label), field_counts)
    for event_type, (first_label, field_counts) in EVENT_TYPE_LAYOUTS.items()
}

# One list per log line: the fixed fields followed by every event-specific
# column, with 0 in the columns that do not apply to the line's event type.
clean_lines = []

# NOTE: os.listdir returns entries in arbitrary order, so the row order across
# log files is not guaranteed to be chronological.
for log in os.listdir('data'):
    with open(os.path.join('data', log)) as logfile:
        for line in logfile:
            line = line.rstrip('\n')
            if not line:
                # Blank line: nothing to parse (indexing line[-1] would raise).
                continue
            if line.endswith('|'):
                line = line[:-1]
            all_data = ([line[LOG_LEVEL_START:LOG_LEVEL_END], line[TIMESTAMP_START:TIMESTAMP_END]]
                        + line[PIPE_SEPARATED_DATA_START:].split('|'))

            clean_line = all_data[:len(labels)]
            event_type_dependent_strings = all_data[len(labels):]

            event_type = clean_line[EVENT_TYPE_INDEX]
            if event_type not in EVENT_TYPE_OFFSETS:
                raise ValueError("Wrong event type: {}".format(event_type))
            start_index, field_counts = EVENT_TYPE_OFFSETS[event_type]

            if event_type == 'GeneralMessage':
                # account for pipes in the message string
                event_type_dependent_strings = ['|'.join(event_type_dependent_strings)]
            elif event_type == 'SignalChangedEventWithStrings':
                # to correct for the double pipe in "Basement Mudroom"
                event_type_dependent_strings = [i for i in event_type_dependent_strings if i]

            if len(event_type_dependent_strings) not in field_counts:
                # Raised explicitly (not asserted) so the check survives `python -O`.
                raise ValueError(
                    "Unexpected number of fields for {}: got {}, expected one of {}: {}".format(
                        event_type,
                        len(event_type_dependent_strings),
                        list(field_counts),
                        event_type_dependent_strings))

            # Pad with 0 for every event-specific column, then overwrite the
            # slice that belongs to this line's event type.
            clean_line = clean_line + [0]*len(event_type_labels)
            clean_line[start_index:start_index + len(event_type_dependent_strings)] = event_type_dependent_strings
            clean_lines.append(clean_line)

In [3]:
# One row per parsed log line: the fixed columns followed by every event-specific column.
df = pd.DataFrame(clean_lines, columns=labels + event_type_labels)

In [5]:
# Preview the first 50 rows.
df.head(50)

Unnamed: 0,LogLevel,TimeStamp,LogVisibility,LogSeverity,entryType,entrySubType,eventType,string message,load ID,loadName,...,Device Name1,roomName6,signal event ID,signal Value,device ID3,Device Name2,Signal Name,Signal Value string,Signal direction,message2
0,L:0,00:00:05,NotUserVisible,Information,System,,GeneralMessage,uptime:The system has been running for 13 days...,0,0,...,0,0,0,0,0,0,0,0,0,0
1,L:0,00:00:05,UserVisible,Information,Lights,Scene,SceneChangedEvent,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,L:0,00:00:05,UserVisible,Information,System,Event,TimeClockChangedEvent,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,L:0,00:00:05,UserVisible,Information,DoorLock,Device,SignalChangedEventWithStrings,0,0,0,...,0,0,0,0,208,Basement Mudroom Door,Basement Mudroom,Lock,ToDevice,0
4,L:0,00:00:05,UserVisible,Information,DoorLock,Scene,SceneChangedEvent,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,L:0,00:00:06,UserVisible,Information,System,Event,TimeClockChangedEvent,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,L:0,00:00:06,NotUserVisible,Information,System,Cloud,GeneralMessage,Started nightlyFwCheckTimer with dueTime 1550000,0,0,...,0,0,0,0,0,0,0,0,0,0
7,L:0,00:00:06,NotUserVisible,Information,System,,GeneralMessage,ramfree:37 percent of memory in use!203366400 ...,0,0,...,0,0,0,0,0,0,0,0,0,0
8,L:0,00:00:06,NotUserVisible,Information,System,,GeneralMessage,ssptasks:!Methods ...,0,0,...,0,0,0,0,0,0,0,0,0,0
9,L:0,00:00:06,NotUserVisible,Information,System,,GeneralMessage,taskstat:App Name ...,0,0,...,0,0,0,0,0,0,0,0,0,0


# One hot encoding

In [6]:
# Print each column name with the number of distinct values it holds.
for column in df.columns:
    distinct_values = df[column].unique()
    print(column, len(distinct_values))

LogLevel 1
TimeStamp 76448
LogVisibility 2
LogSeverity 3
entryType 10
entrySubType 11
eventType 11
string message 2677
load ID 81
loadName 57
roomName1 30
rampTime 1529
rampBaseValue 7354
rampFinalValue 747
keypad ID 32
keypadName 89
roomName2 19
buttonNum 75
buttonState 7
signalID 90
signalName 87
roomName3 2
RemoteSystemEvent string 5
ID 107
name 52
roomName4 33
message1 7
device ID1 124
Name 22
roomName5 4
connection status 3
Load 1 Room Name:Load 1 Name 121
Load 2 Room Name:Load 2 Name 84
device ID2 108
Device Name1 80
roomName6 32
signal event ID 15
signal Value 359
device ID3 6
Device Name2 6
Signal Name 6
Signal Value string 49
Signal direction 66
message2 7


Delete the LogLevel column: it has only one unique value, so it carries no information.



In [7]:
# Remove the LogLevel column from the frame.
df = df.drop("LogLevel",axis=1)

In [8]:
def to_categorical(df, column):
    """Return a copy of *df* with an extra ``<column>_cat`` column.

    The new column holds the integer category codes obtained by casting
    ``df[column]`` to the pandas ``category`` dtype.  The input frame is
    not modified.
    """
    category_codes = df[column].astype('category').cat.codes
    return df.assign(**{column + '_cat': category_codes})

In [9]:
# Columns to encode as category codes; only the first entry is converted here.
columns_to_change = ['LogVisibility']
new_df = to_categorical(df, columns_to_change[0])

In [19]:
# Inspect the row at position 25 of the frame returned by to_categorical.
new_df.iloc[25]

TimeStamp                                            00:19:06
LogVisibility                                     UserVisible
LogSeverity                                       Information
entryType                                             Climate
entrySubType                                           Device
eventType                       SignalChangedEventWithStrings
string message                                              0
load ID                                                     0
loadName                                                    0
roomName1                                                   0
rampTime                                                    0
rampBaseValue                                               0
rampFinalValue                                              0
keypad ID                                                   0
keypadName                                                  0
roomName2                                                   0
buttonNu

In [11]:
def merge_columns(row, *args):
    """Collapse several mutually exclusive columns of *row* into one value.

    ``args`` are the column names to inspect.  A value of 0 means "not
    set".  Returns the single non-zero value among the named columns, or 0
    when all of them are 0.

    Raises:
        AssertionError: if two or more of the named columns are non-zero.
    """
    populated = [row[column_name] for column_name in args if row[column_name] != 0]
    assert len(populated) < 2
    return populated[0] if populated else 0

In [12]:
# Collapse the three per-event device ID columns into one column, then drop the originals.
# merge_columns asserts that at most one of them is non-zero in any row.
df['device_id_merged'] = df.apply(merge_columns, args=("device ID1", "device ID2", "device ID3"), axis=1)
df = df.drop(["device ID1", "device ID2", "device ID3"], axis=1)

In [13]:
# Collapse the six per-event room name columns into one column, then drop the originals.
df['room_name_merged'] = df.apply(merge_columns, args=("roomName1","roomName2","roomName3","roomName4","roomName5","roomName6"), axis=1)
df = df.drop(["roomName1","roomName2","roomName3","roomName4","roomName5","roomName6"], axis=1)

In [14]:
# Collapse the two per-event device name columns into one column, then drop the originals.
df['device_name_merged'] = df.apply(merge_columns, args=("Device Name1", "Device Name2"), axis=1)
df = df.drop(["Device Name1", "Device Name2"], axis=1)

In [15]:
# Collapse the two per-event message columns into one column, then drop the originals.
df['message_merged'] = df.apply(merge_columns, args=("message1", "message2"), axis=1)
df = df.drop(["message1", "message2"], axis=1)

In [None]:
# df['name_merged'] = df.apply(merge_columns, args=("name", "Name"), axis=1)

In [None]:
# contains the same information:
# load name, load id
# device_name_merged, device_id_merged

# df = df.drop(["load ID"], axis=1)

In [16]:
# Preview the first rows after the column merges.
df.head()

Unnamed: 0,TimeStamp,LogVisibility,LogSeverity,entryType,entrySubType,eventType,string message,load ID,loadName,rampTime,...,Load 2 Room Name:Load 2 Name,signal event ID,signal Value,Signal Name,Signal Value string,Signal direction,device_id_merged,room_name_merged,device_name_merged,message_merged
0,00:00:05,NotUserVisible,Information,System,,GeneralMessage,uptime:The system has been running for 13 days...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,00:00:05,UserVisible,Information,Lights,Scene,SceneChangedEvent,0,0,0,0,...,0,0,0,0,0,0,0,Upstairs Landing,0,SceneRecalled
2,00:00:05,UserVisible,Information,System,Event,TimeClockChangedEvent,0,0,0,0,...,0,0,0,0,0,0,0,Unassigned,0,EventTriggered
3,00:00:05,UserVisible,Information,DoorLock,Device,SignalChangedEventWithStrings,0,0,0,0,...,0,0,0,Basement Mudroom,Lock,ToDevice,208,0,Basement Mudroom Door,0
4,00:00:05,UserVisible,Information,DoorLock,Scene,SceneChangedEvent,0,0,0,0,...,0,0,0,0,0,0,0,Basement Mudroom,0,SceneRecalled


In [17]:
# List the columns that remain after the merges.
df.columns

Index(['TimeStamp', 'LogVisibility', 'LogSeverity', 'entryType',
       'entrySubType', 'eventType', 'string message', 'load ID', 'loadName',
       'rampTime', 'rampBaseValue', 'rampFinalValue', 'keypad ID',
       'keypadName', 'buttonNum', 'buttonState', 'signalID', 'signalName',
       'RemoteSystemEvent string', 'ID', 'name', 'Name', 'connection status',
       'Load 1 Room Name:Load 1 Name', 'Load 2 Room Name:Load 2 Name',
       'signal event ID', 'signal Value', 'Signal Name', 'Signal Value string',
       'Signal direction', 'device_id_merged', 'room_name_merged',
       'device_name_merged', 'message_merged'],
      dtype='object')

In [21]:
# Inspect the row at position 25 of the merged frame.
df.iloc[25]

TimeStamp                                            00:19:06
LogVisibility                                     UserVisible
LogSeverity                                       Information
entryType                                             Climate
entrySubType                                           Device
eventType                       SignalChangedEventWithStrings
string message                                              0
load ID                                                     0
loadName                                                    0
rampTime                                                    0
rampBaseValue                                               0
rampFinalValue                                              0
keypad ID                                                   0
keypadName                                                  0
buttonNum                                                   0
buttonState                                                 0
signalID

In [None]:
# load id and load name refer to the same thing
# signal name and device name refer to the same thing

In [24]:
# Write the first 2000 rows to crestron_data.csv without the index column.
# NOTE(review): only a 2000-row sample is saved, not the full frame - confirm this is intended.
df.head(2000).to_csv('crestron_data.csv', index=False)

In [25]:
def duplicate_columns(frame):
    """Return the names of columns whose values repeat in a later column.

    Columns are only compared with other columns of the same dtype.  A
    column is reported when its list of values equals that of some column
    positioned after it within its dtype group; the last copy of a set of
    identical columns is therefore not reported.

    Adapted from https://github.com/pandas-dev/pandas/issues/11250
    """
    columns_by_dtype = frame.columns.to_series().groupby(frame.dtypes).groups
    repeated = []

    for names in columns_by_dtype.values():
        same_dtype = frame[names]
        width = len(same_dtype.columns)

        for left in range(width):
            left_values = same_dtype.iloc[:, left].tolist()
            # any() stops at the first later column that matches
            has_later_copy = any(
                left_values == same_dtype.iloc[:, right].tolist()
                for right in range(left + 1, width)
            )
            if has_later_copy:
                repeated.append(same_dtype.columns[left])

    return repeated

In [26]:
# Check the merged frame for columns that duplicate another column of the same dtype.
duplicate_columns(df)

[]

In [27]:
# Preview the first rows again.
df.head()

Unnamed: 0,TimeStamp,LogVisibility,LogSeverity,entryType,entrySubType,eventType,string message,load ID,loadName,rampTime,...,Load 2 Room Name:Load 2 Name,signal event ID,signal Value,Signal Name,Signal Value string,Signal direction,device_id_merged,room_name_merged,device_name_merged,message_merged
0,00:00:05,NotUserVisible,Information,System,,GeneralMessage,uptime:The system has been running for 13 days...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,00:00:05,UserVisible,Information,Lights,Scene,SceneChangedEvent,0,0,0,0,...,0,0,0,0,0,0,0,Upstairs Landing,0,SceneRecalled
2,00:00:05,UserVisible,Information,System,Event,TimeClockChangedEvent,0,0,0,0,...,0,0,0,0,0,0,0,Unassigned,0,EventTriggered
3,00:00:05,UserVisible,Information,DoorLock,Device,SignalChangedEventWithStrings,0,0,0,0,...,0,0,0,Basement Mudroom,Lock,ToDevice,208,0,Basement Mudroom Door,0
4,00:00:05,UserVisible,Information,DoorLock,Scene,SceneChangedEvent,0,0,0,0,...,0,0,0,0,0,0,0,Basement Mudroom,0,SceneRecalled


In [29]:
# Element-wise comparison of the keypadName and buttonNum columns.
s = df['keypadName'] == df['buttonNum']

1106153

In [32]:
# Number of entries in the comparison result.
len(s)

1222633

In [38]:
# View the RemoteSystemEvent string column cast to the pandas category dtype.
df['RemoteSystemEvent string'].astype('category')

0                      0
1                      0
2                      0
3                      0
4                      0
5                      0
6                      0
7                      0
8                      0
9                      0
10                     0
11                     0
12                     0
13                     0
14                     0
15                     0
16                     0
17                     0
18                     0
19                     0
20                     0
21                     0
22                     0
23                     0
24                     0
25                     0
26                     0
27                     0
28                     0
29                     0
               ...      
1222603                0
1222604      set to True
1222605                0
1222606                0
1222607      set to True
1222608                0
1222609                0
1222610     set to False
1222611                0


In [None]:
def convert_remote_system_event_string():
    """Placeholder: not implemented yet; takes no arguments and returns None."""
    pass