# Clean up data, convert to dataframes, and save to csv

The BAC file is pipe separated with the following fields:
LogLevel [TimeStamp]:[LogVisibility][LogSeverity][entryType][entrySubType][eventType][EventType dependent strings]

The information changes at the end of each entry based on its EventType.  Here is a list of the supported event types and the subsequent additional information for each listed below.

GeneralMessage - [string message]

LevelChangedEvent - [load ID][loadName][roomName][rampTime][rampBaseValue][rampFinalValue]

ButtonChangedEvent - [keypad ID][keypadName][roomName][buttonNum][buttonState]

RemoteSystemEvent - [signalID][signalName][roomName][RemoteSystemEvent string]
TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent - [ID][name][roomName][message]

ConnectionStatus - [device ID][Name][room Name][connection status][Load 1 Room Name:Load 1 Name]|[Load 2 Room Name:Load 2 Name]

    NOTE: DeviceConnectionStatusWithOptions is the same format as ConnectionStatus. 
    
SignalChangedEvent - [device ID][Device Name][room Name][signal event ID][signal Value] - Signal event ID differs by device and signal value is either bool or int based on the eventID.

SignalChangedEventWithStrings - [device ID][Device Name][Signal Name][Signal Value string][Signal direction][message]


In [35]:
import os 
import pandas as pd
import numpy as np
import h5py
from numpy import nan
pd.set_option('display.max_columns', 500)

  from ._conv import register_converters as _register_converters


In [2]:
labels = ['LogLevel',
          'TimeStamp',
          'LogVisibility',
          'LogSeverity',
          'entryType',
          'entrySubType',
          'eventType']
# from label list
EVENT_TYPE_INDEX = labels.index('eventType')

# from line in .bac file
LOG_LEVEL_START = 0
LOG_LEVEL_END = 3
TIMESTAMP_START = 6
TIMESTAMP_END = 14
PIPE_SEPARATED_DATA_START = 17

event_type_labels = [
    "string message", # GeneralMessage
    "load ID", "loadName", "roomName1", "rampTime", "rampBaseValue", "rampFinalValue", # LevelChangedEvent
    "keypad ID", "keypadName", "roomName2", "buttonNum", "buttonState", # ButtonChangedEvent
    "signalID", "signalName", "roomName3", "RemoteSystemEvent string", # RemoteSystemEvent
    "ID", "name", "roomName4", "message1", # TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent
    "device ID1", "Name", "roomName5", "connection status", "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name", # ConnectionStatus/DeviceConnectionStatusWithOptions
    "device ID2", "Device Name1", "roomName6", "signal event ID", "signal Value", # SignalChangedEvent
    "device ID3", "Device Name2", "Signal Name", "Signal Value string", "Signal direction", "message2", # SignalChangedEventWithStrings
]

def from_bac():
    # device_id_offsets = [i for i, x in enumerate(event_type_labels) if x == "device ID"]
    clean_lines = []

    for log in os.listdir('data'):
        with open('data/{}'.format(log)) as logfile:
            for line in logfile:
                line = line.rstrip('\n')
                if line[-1] == '|':
                    line = line[:-1]
                all_data = ([line[LOG_LEVEL_START:LOG_LEVEL_END], line[TIMESTAMP_START:TIMESTAMP_END]]
                            + line[PIPE_SEPARATED_DATA_START:].split('|'))

                event_type_dependent_strings = all_data[len(labels):]
                clean_line = all_data[:len(labels)]

                START_INDEX = 7
                if clean_line[EVENT_TYPE_INDEX] == 'GeneralMessage':
                    START_INDEX += event_type_labels.index('string message')
                    # account for pipes in the message string
                    event_type_dependent_strings = ['|'.join(event_type_dependent_strings)]
                    assert START_INDEX == 7
                    assert len(event_type_dependent_strings) == 1, event_type_dependent_strings

                elif clean_line[EVENT_TYPE_INDEX] == 'LevelChangedEvent':
                    START_INDEX += event_type_labels.index('load ID')
                    assert START_INDEX == 8
                    assert len(event_type_dependent_strings) == 6

                elif clean_line[EVENT_TYPE_INDEX] == 'ButtonChangedEvent':
                    START_INDEX += event_type_labels.index('keypad ID')
                    assert START_INDEX == 14
                    assert len(event_type_dependent_strings) == 5

                elif clean_line[EVENT_TYPE_INDEX] == 'RemoteSystemEvent':
                    START_INDEX += event_type_labels.index('signalID')
                    assert START_INDEX == 19
                    assert len(event_type_dependent_strings) == 4

                elif (clean_line[EVENT_TYPE_INDEX] == 'TimeClockChangedEvent' or
                      clean_line[EVENT_TYPE_INDEX] == 'OccupancyChangedEvent' or
                      clean_line[EVENT_TYPE_INDEX] == 'SceneChangedEvent'):
                    START_INDEX += event_type_labels.index('ID')
                    assert START_INDEX == 23
                    assert len(event_type_dependent_strings) == 4

                elif (clean_line[EVENT_TYPE_INDEX] == 'ConnectionStatus' or
                      clean_line[EVENT_TYPE_INDEX] == 'DeviceConnectionStatusWithOptions'):
                    START_INDEX += event_type_labels.index('device ID1')
                    assert START_INDEX == 27
                    assert (len(event_type_dependent_strings) == 4 or
                            len(event_type_dependent_strings) == 5 or
                            len(event_type_dependent_strings) == 6)

                elif clean_line[EVENT_TYPE_INDEX] == 'SignalChangedEvent':
                    START_INDEX += event_type_labels.index('device ID2')
                    assert START_INDEX == 33
                    assert len(event_type_dependent_strings) == 5

                elif clean_line[EVENT_TYPE_INDEX] == 'SignalChangedEventWithStrings':
                    START_INDEX += event_type_labels.index('device ID3')
                    assert START_INDEX == 38
                    # to correct for the double pipe in "Basement Mudroom"
                    event_type_dependent_strings = [i for i in event_type_dependent_strings if i]
                else:
                    raise ValueError("Wrong event type: {}".format(clean_line[EVENT_TYPE_INDEX]))

                clean_line = clean_line + [np.nan]*len(event_type_labels)
                clean_line[START_INDEX:START_INDEX + len(event_type_dependent_strings)] = event_type_dependent_strings
                clean_lines.append(clean_line)
    df = pd.DataFrame(clean_lines, columns=labels + event_type_labels)
    df.to_hdf('./h5_files/data_0.h5')
    return df

In [36]:
%pwd

'/mnt/c/Users/miperel/Desktop/crestron'

In [4]:
df.head(50)

Unnamed: 0,LogLevel,TimeStamp,LogVisibility,LogSeverity,entryType,entrySubType,eventType,string message,load ID,loadName,roomName1,rampTime,rampBaseValue,rampFinalValue,keypad ID,keypadName,roomName2,buttonNum,buttonState,signalID,signalName,roomName3,RemoteSystemEvent string,ID,name,roomName4,message1,device ID1,Name,roomName5,connection status,Load 1 Room Name:Load 1 Name,Load 2 Room Name:Load 2 Name,device ID2,Device Name1,roomName6,signal event ID,signal Value,device ID3,Device Name2,Signal Name,Signal Value string,Signal direction,message2
0,L:0,00:00:05,NotUserVisible,Information,System,,GeneralMessage,uptime:The system has been running for 13 days...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,L:0,00:00:05,UserVisible,Information,Lights,Scene,SceneChangedEvent,,,,,,,,,,,,,,,,,74.0,Night,Upstairs Landing,SceneRecalled,,,,,,,,,,,,,,,,,
2,L:0,00:00:05,UserVisible,Information,System,Event,TimeClockChangedEvent,,,,,,,,,,,,,,,,,4.0,Upstairs Landing Off,Unassigned,EventTriggered,,,,,,,,,,,,,,,,,
3,L:0,00:00:05,UserVisible,Information,DoorLock,Device,SignalChangedEventWithStrings,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,208.0,Basement Mudroom Door,Basement Mudroom,Lock,ToDevice,
4,L:0,00:00:05,UserVisible,Information,DoorLock,Scene,SceneChangedEvent,,,,,,,,,,,,,,,,,77.0,Lock Basement Mudroom Door,Basement Mudroom,SceneRecalled,,,,,,,,,,,,,,,,,
5,L:0,00:00:06,UserVisible,Information,System,Event,TimeClockChangedEvent,,,,,,,,,,,,,,,,,23.0,Lock Basement Door,Unassigned,EventTriggered,,,,,,,,,,,,,,,,,
6,L:0,00:00:06,NotUserVisible,Information,System,Cloud,GeneralMessage,Started nightlyFwCheckTimer with dueTime 1550000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,L:0,00:00:06,NotUserVisible,Information,System,,GeneralMessage,ramfree:37 percent of memory in use!203366400 ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,L:0,00:00:06,NotUserVisible,Information,System,,GeneralMessage,ssptasks:!Methods ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,L:0,00:00:06,NotUserVisible,Information,System,,GeneralMessage,taskstat:App Name ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# One hot encoding

In [5]:
for column in df:
    print(column, len(df[column].unique().tolist()))

LogLevel 1
TimeStamp 76448
LogVisibility 2
LogSeverity 3
entryType 10
entrySubType 11
eventType 11
string message 2677
load ID 81
loadName 57
roomName1 30
rampTime 1529
rampBaseValue 7354
rampFinalValue 747
keypad ID 32
keypadName 89
roomName2 19
buttonNum 75
buttonState 7
signalID 90
signalName 87
roomName3 2
RemoteSystemEvent string 5
ID 107
name 52
roomName4 33
message1 7
device ID1 124
Name 22
roomName5 4
connection status 3
Load 1 Room Name:Load 1 Name 121
Load 2 Room Name:Load 2 Name 84
device ID2 108
Device Name1 80
roomName6 32
signal event ID 15
signal Value 359
device ID3 6
Device Name2 6
Signal Name 6
Signal Value string 49
Signal direction 66
message2 7


delete the logLevel as there is only 1 unique val so no info



In [6]:
df = df.drop("LogLevel",axis=1)

In [7]:
# replace redundant values in buttonNum with NaN when keypadName contains the same value
df['buttonNum'] = df['buttonNum'].replace(df['keypadName'].unique(), np.nan)

In [8]:
df['buttonNum'].unique()

array([nan, '0', '1', '3', '2', '5', '6', '4', '7', '9', '8'],
      dtype=object)

In [9]:
def merge_columns(row, *args):
#     import pdb; pdb.set_trace()
    count = 0 
    for column_name in args:
        count += 1 if row[column_name] is not np.nan else 0
    # checks if more than one value in the merged rows
    assert count < 2
    for column_name in args:
        if row[column_name] is not np.nan:
            return row[column_name]
    return 0

In [10]:
df['device_id'] = df.apply(merge_columns, args=('load ID', 'keypad ID', 'ID', 'device ID1', 'device ID2', 'device ID3'), axis=1)
df = df.drop(['load ID', 'keypad ID', 'ID', 'device ID1', 'device ID2', 'device ID3'], axis=1)

In [11]:
df['device_name'] = df.apply(merge_columns, args=("loadName", "keypadName", "signalName", "name", "Name", "Device Name1", "Device Name2"), axis=1)
df = df.drop(["loadName", "keypadName", "signalName", "name", "Name", "Device Name1", "Device Name2"], axis=1)

In [12]:
df['device_id_name'] = df['device_id'].astype(str) + df['device_name'].astype(str)

In [13]:
df[df['device_id_name'] == '0Backyard and Sideyard Lights Off']

Unnamed: 0,TimeStamp,LogVisibility,LogSeverity,entryType,entrySubType,eventType,string message,roomName1,rampTime,rampBaseValue,rampFinalValue,roomName2,buttonNum,buttonState,signalID,roomName3,RemoteSystemEvent string,roomName4,message1,roomName5,connection status,Load 1 Room Name:Load 1 Name,Load 2 Room Name:Load 2 Name,roomName6,signal event ID,signal Value,Signal Name,Signal Value string,Signal direction,message2,device_id,device_name,device_id_name
11068,03:59:10,UserVisible,Information,Keypad,Device,ButtonChangedEvent,,,,,,Rack Room,,Pressed,,,,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11069,03:59:10,UserVisible,Information,Remote,TriggerPyng,RemoteSystemEvent,,,,,,,,,"00:10:7f:2a:5b:b0,1,Basement Lights,Backyard a...",Rack Room,set to True,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11071,03:59:10,UserVisible,Information,Keypad,Device,ButtonChangedEvent,,,,,,Rack Room,,Tapped,,,,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11072,03:59:10,UserVisible,Information,Keypad,Device,ButtonChangedEvent,,,,,,Rack Room,,Released,,,,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11073,03:59:10,UserVisible,Information,Remote,TriggerPyng,RemoteSystemEvent,,,,,,,,,"00:10:7f:2a:5b:b0,1,Basement Lights,Backyard a...",Rack Room,set to False,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11110,04:00:05,UserVisible,Information,Keypad,Device,ButtonChangedEvent,,,,,,Rack Room,,Pressed,,,,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11111,04:00:05,UserVisible,Information,Remote,TriggerPyng,RemoteSystemEvent,,,,,,,,,"00:10:7f:2a:5b:b0,1,Basement Lights,Backyard a...",Rack Room,set to True,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11113,04:00:06,UserVisible,Information,Keypad,Device,ButtonChangedEvent,,,,,,Rack Room,,Tapped,,,,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11114,04:00:06,UserVisible,Information,Keypad,Device,ButtonChangedEvent,,,,,,Rack Room,,Released,,,,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11115,04:00:06,UserVisible,Information,Remote,TriggerPyng,RemoteSystemEvent,,,,,,,,,"00:10:7f:2a:5b:b0,1,Basement Lights,Backyard a...",Rack Room,set to False,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off


In [14]:
df[11065:11080]

Unnamed: 0,TimeStamp,LogVisibility,LogSeverity,entryType,entrySubType,eventType,string message,roomName1,rampTime,rampBaseValue,rampFinalValue,roomName2,buttonNum,buttonState,signalID,roomName3,RemoteSystemEvent string,roomName4,message1,roomName5,connection status,Load 1 Room Name:Load 1 Name,Load 2 Room Name:Load 2 Name,roomName6,signal event ID,signal Value,Signal Name,Signal Value string,Signal direction,message2,device_id,device_name,device_id_name
11065,03:59:10,UserVisible,Information,Lights,Load,LevelChangedEvent,,First Floor Bathroom,100.0,57944.0,0.0,,,,,,,,,,,,,,,,,,,,45,Shower,45Shower
11066,03:59:10,NotUserVisible,Information,Lights,Device,SignalChangedEvent,,,,,,,,,,,,,,,,,,First Floor Bathroom,14.0,0.0,,,,,43,Overhead,43Overhead
11067,03:59:10,NotUserVisible,Information,Lights,Device,SignalChangedEvent,,,,,,,,,,,,,,,,,,First Floor Bathroom,14.0,0.0,,,,,45,Shower,45Shower
11068,03:59:10,UserVisible,Information,Keypad,Device,ButtonChangedEvent,,,,,,Rack Room,,Pressed,,,,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11069,03:59:10,UserVisible,Information,Remote,TriggerPyng,RemoteSystemEvent,,,,,,,,,"00:10:7f:2a:5b:b0,1,Basement Lights,Backyard a...",Rack Room,set to True,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11070,03:59:10,UserVisible,Information,Lights,Scene,SceneChangedEvent,,,,,,,,,,,,Whole House,SceneRecalled,,,,,,,,,,,,72,Backyard Off,72Backyard Off
11071,03:59:10,UserVisible,Information,Keypad,Device,ButtonChangedEvent,,,,,,Rack Room,,Tapped,,,,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11072,03:59:10,UserVisible,Information,Keypad,Device,ButtonChangedEvent,,,,,,Rack Room,,Released,,,,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11073,03:59:10,UserVisible,Information,Remote,TriggerPyng,RemoteSystemEvent,,,,,,,,,"00:10:7f:2a:5b:b0,1,Basement Lights,Backyard a...",Rack Room,set to False,,,,,,,,,,,,,,0,Backyard and Sideyard Lights Off,0Backyard and Sideyard Lights Off
11074,03:59:10,NotUserVisible,Information,Lights,Load,LevelChangedEvent,,Back Yard,-1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,146,Patio Door (upper),146Patio Door (upper)


In [15]:
df = df.drop("LogVisibility", axis=1)

In [16]:
len(df)

1222633

In [17]:
df = df.drop_duplicates()

In [18]:
len(df)

1007145

In [19]:
df.iloc[26592]

TimeStamp                                 22:42:36
LogSeverity                            Information
entryType                                   Lights
entrySubType                                Device
eventType                       SignalChangedEvent
string message                                 NaN
roomName1                                      NaN
rampTime                                       NaN
rampBaseValue                                  NaN
rampFinalValue                                 NaN
roomName2                                      NaN
buttonNum                                      NaN
buttonState                                    NaN
signalID                                       NaN
roomName3                                      NaN
RemoteSystemEvent string                       NaN
roomName4                                      NaN
message1                                       NaN
roomName5                                      NaN
connection status              

In [20]:
# df['room_name_merged'] = df.apply(merge_columns, args=("roomName1","roomName2","roomName3","roomName4","roomName5","roomName6"), axis=1)
# df = df.drop(["roomName1","roomName2","roomName3","roomName4","roomName5","roomName6"], axis=1)

In [21]:
# df['device_name_merged'] = df.apply(merge_columns, args=("Device Name1", "Device Name2"), axis=1)
# df = df.drop(["Device Name1", "Device Name2"], axis=1)

In [22]:
# df['message_merged'] = df.apply(merge_columns, args=("message1", "message2"), axis=1)
# df = df.drop(["message1", "message2"], axis=1)

In [23]:
# df['name_merged'] = df.apply(merged_columns, args=("name", "Name"), axis=1)

In [24]:
# contains the same information:
# load name, load id
# device_name_merged, device_id_megered

# df = df.drop(["load ID"], axis=1)

In [25]:
# df.head()

In [26]:
# df.columns

In [27]:
# df.iloc[25]

In [28]:
# load id and load name refer to the same thing
# signal name and device name refer to the same thing

In [29]:
# df.head(2000).to_csv('crestron_data.csv', index=False)

In [30]:
def duplicate_columns(frame):
    # https://github.com/pandas-dev/pandas/issues/11250
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for t, v in groups.items():

        cs = frame[v].columns
        vs = frame[v]
        lcs = len(cs)

        for i in range(lcs):
            iv = vs.iloc[:,i].tolist()
            for j in range(i+1, lcs):
                jv = vs.iloc[:,j].tolist()
                if iv == jv:
                    dups.append(cs[i])
                    break

    return dups

In [31]:
# duplicate_columns(df)

In [32]:
# df.head()

In [33]:
# s = df['keypadName'] == df['buttonNum']

In [34]:
len(s)

NameError: name 's' is not defined

In [None]:
# df['RemoteSystemEvent string'].astype('category')

In [None]:
# def convert_remote_system_event_string():
#     pass

In [None]:
# df['load ID'].unique()