# Clean up data, convert to dataframes, and save to csv

The BAC file is pipe separated with the following fields:
LogLevel [TimeStamp]:[LogVisibility][LogSeverity][entryType][entrySubType][eventType][EventType dependent strings]

The trailing fields of each entry depend on its eventType. The supported event types, and the additional fields each one carries, are listed below.

GeneralMessage - [string message]

LevelChangedEvent - [load ID][loadName][roomName][rampTime][rampBaseValue][rampFinalValue]

ButtonChangedEvent - [keypad ID][keypadName][roomName][buttonNum][buttonState]

RemoteSystemEvent - [signalID][signalName][roomName][RemoteSystemEvent string]

TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent - [ID][name][roomName][message]

ConnectionStatus - [device ID][Name][room Name][connection status][Load 1 Room Name:Load 1 Name]|[Load 2 Room Name:Load 2 Name]

    NOTE: DeviceConnectionStatusWithOptions is the same format as ConnectionStatus. 
    
SignalChangedEvent - [device ID][Device Name][room Name][signal event ID][signal Value] - Signal event ID differs by device and signal value is either bool or int based on the eventID.

SignalChangedEventWithStrings - [device ID][Device Name][Signal Name][Signal Value string][Signal direction][message]


In [1]:
import os 
import pandas as pd
import numpy as np
import h5py
from numpy import nan
from datetime import datetime, timedelta
from time import mktime
import os.path
pd.set_option('display.max_columns', 500)

  from ._conv import register_converters as _register_converters


In [2]:
def load_from_checkpoint(df, checkpoint_name):
    """Return the checkpointed frame, creating the checkpoint on first use.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist when no checkpoint file exists yet.
    checkpoint_name : str
        Path of the CSV checkpoint file.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when the checkpoint had to be written, otherwise the
        frame read back from ``checkpoint_name``.

    Notes
    -----
    The previous version rebound a local name and returned nothing, so the
    frame loaded from disk never reached the caller. Use it as
    ``df = load_from_checkpoint(df, path)``.
    """
    if os.path.isfile(checkpoint_name):
        return pd.read_csv(checkpoint_name)

    # Make sure the target directory exists before writing the first checkpoint.
    parent_dir = os.path.dirname(checkpoint_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(checkpoint_name, index=False)
    return df

In [3]:
# Column names for the leading fields of every entry: the two fixed-width
# fields (log level, time stamp) followed by the first five pipe-separated ones.
labels = [
    'LogLevel', 'TimeStamp',
    'LogVisibility', 'LogSeverity',
    'entryType', 'entrySubType', 'eventType',
]
# Position of the event type inside `labels`.
EVENT_TYPE_INDEX = labels.index('eventType')

# Character offsets into a raw .bac line.
LOG_LEVEL_START, LOG_LEVEL_END = 0, 3
TIMESTAMP_START, TIMESTAMP_END = 6, 14
PIPE_SEPARATED_DATA_START = 17

# One column per event-type dependent field. Names that repeat across event
# types carry a numeric suffix so every column name is unique.
event_type_labels = (
    # GeneralMessage
    ["string message"]
    # LevelChangedEvent
    + ["load ID", "loadName", "roomName1", "rampTime", "rampBaseValue", "rampFinalValue"]
    # ButtonChangedEvent
    + ["keypad ID", "keypadName", "roomName2", "buttonNum", "buttonState"]
    # RemoteSystemEvent
    + ["signalID", "signalName", "roomName3", "RemoteSystemEvent string"]
    # TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent
    + ["ID", "name", "roomName4", "message1"]
    # ConnectionStatus/DeviceConnectionStatusWithOptions
    + ["device ID1", "Name", "roomName5", "connection status",
       "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name"]
    # SignalChangedEvent
    + ["device ID2", "Device Name1", "roomName6", "signal event ID", "signal Value"]
    # SignalChangedEventWithStrings
    + ["device ID3", "Device Name2", "Signal Name", "Signal Value string",
       "Signal direction", "message2"]
)

# Per event type: the first column of its field group in `event_type_labels`
# and the numbers of event-dependent fields accepted for that group.
_EVENT_FIELD_LAYOUT = {
    'GeneralMessage': ('string message', (1,)),
    'LevelChangedEvent': ('load ID', (6,)),
    'ButtonChangedEvent': ('keypad ID', (5,)),
    'RemoteSystemEvent': ('signalID', (4,)),
    'TimeClockChangedEvent': ('ID', (4,)),
    'OccupancyChangedEvent': ('ID', (4,)),
    'SceneChangedEvent': ('ID', (4,)),
    # The two trailing "load" fields are optional.
    'ConnectionStatus': ('device ID1', (4, 5, 6)),
    'DeviceConnectionStatusWithOptions': ('device ID1', (4, 5, 6)),
    'SignalChangedEvent': ('device ID2', (5,)),
    # Empty fields are removed before counting, so anything up to the six
    # columns reserved for this group is accepted.
    'SignalChangedEventWithStrings': ('device ID3', (0, 1, 2, 3, 4, 5, 6)),
}


def from_bac(data_dir='data'):
    """Parse every log file in ``data_dir`` into one wide DataFrame.

    Each non-blank line becomes one row. The first ``len(labels)`` columns hold
    the fields common to all entries; the event-dependent fields are written
    into the column group belonging to the entry's event type and every other
    event column is left as NaN. A final ``date`` column holds ``log[-14:-4]``
    of the file name (presumably a ``YYYY-MM-DD`` stamp before a 4-character
    extension -- confirm against the actual file names).

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the log files. Defaults to ``'data'``, the
        location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns are ``labels + event_type_labels + ["date"]``.

    Raises
    ------
    ValueError
        If a line has fewer fields than ``labels`` or an unknown event type.
    AssertionError
        If an entry carries an unexpected number of event-dependent fields.
    """
    n_common = len(labels)
    clean_lines = []

    # Sorted so the row order does not depend on the directory listing order.
    for log in sorted(os.listdir(data_dir)):
        log_date = log[-14:-4]
        with open(os.path.join(data_dir, log)) as logfile:
            for line_number, line in enumerate(logfile, start=1):
                line = line.rstrip('\n')
                if not line.strip():
                    # Blank lines hold no entry; indexing line[-1] on an empty
                    # line previously raised IndexError.
                    continue
                if line.endswith('|'):
                    line = line[:-1]

                all_data = ([line[LOG_LEVEL_START:LOG_LEVEL_END], line[TIMESTAMP_START:TIMESTAMP_END]]
                            + line[PIPE_SEPARATED_DATA_START:].split('|'))
                if len(all_data) < n_common:
                    raise ValueError("Malformed entry at {}:{}: {!r}".format(log, line_number, line))

                clean_line = all_data[:n_common]
                event_type_dependent_strings = all_data[n_common:]
                event_type = clean_line[EVENT_TYPE_INDEX]

                layout = _EVENT_FIELD_LAYOUT.get(event_type)
                if layout is None:
                    raise ValueError("Wrong event type: {}".format(event_type))
                first_label, allowed_counts = layout

                if event_type == 'GeneralMessage':
                    # account for pipes in the message string
                    event_type_dependent_strings = ['|'.join(event_type_dependent_strings)]
                elif event_type == 'SignalChangedEventWithStrings':
                    # to correct for the double pipe in "Basement Mudroom"
                    event_type_dependent_strings = [i for i in event_type_dependent_strings if i]

                # Also guards the slice assignment below: too many fields would
                # otherwise spill into the next group or overwrite "date".
                assert len(event_type_dependent_strings) in allowed_counts, (
                    log, line_number, event_type_dependent_strings)

                start_index = n_common + event_type_labels.index(first_label)
                clean_line = clean_line + [np.nan] * len(event_type_labels) + [log_date]
                clean_line[start_index:start_index + len(event_type_dependent_strings)] = (
                    event_type_dependent_strings)
                clean_lines.append(clean_line)

    return pd.DataFrame(clean_lines, columns=labels + event_type_labels + ["date"])

## First Checkpoint: Load data from BAC files

In [4]:
# Reuse the cached parse when it exists; otherwise parse the raw .bac logs.
# NOTE(review): no visible cell writes ./checkpoints/data_0.csv -- confirm
# where this checkpoint is produced.
df = (pd.read_csv('./checkpoints/data_0.csv')
      if os.path.isfile('./checkpoints/data_0.csv')
      else from_bac())

### Combine date and time

In [5]:
def to_datetime(row):
    """Turn a row's ``date`` and ``TimeStamp`` strings into epoch seconds.

    The two strings are joined with a space, parsed as
    ``'%Y-%m-%d %H:%M:%S'``, shifted forward by four hours and passed through
    ``time.mktime``, which returns a float.

    NOTE(review): ``mktime`` interprets the shifted time as *local* time, so
    the result depends on the timezone of the machine running the notebook.
    The fixed +4h looks like an EDT-to-UTC offset -- confirm the intent.
    """
    stamp = ' '.join((row['date'], row['TimeStamp']))
    shifted = datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S') + timedelta(hours=4)
    return mktime(shifted.timetuple())

In [6]:
# Add the per-row epoch seconds, then discard the two text columns they were built from.
df = (df.assign(unix_time=df.apply(to_datetime, axis=1))
        .drop(["TimeStamp", "date"], axis=1))

### Drop useless columns

In [7]:
# Drop, in one pass, every column that is not used further on. Columns that
# are already absent are skipped (errors='ignore').
df = df.drop(
    [
        # Useless columns
        "LogLevel", "LogVisibility", "LogSeverity",
        # useless after deleting General Message
        "string message",
        # useless after deleting Button Change Event
        "keypad ID", "keypadName", "roomName2", "buttonNum", "buttonState",
        # useless after deleting Connection Status
        "device ID1", "Name", "roomName5", "connection status",
        "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name",
        # useless after deleting Remote System Event
        "signalID", "signalName", "roomName3", "RemoteSystemEvent string",
    ],
    axis=1,
    errors='ignore',
)

### Remove useless event types

In [8]:
# Keep only the rows whose event type is not one of the discarded kinds.
df = df[~df['eventType'].isin([
    'ButtonChangedEvent',
    'GeneralMessage',
    'RemoteSystemEvent',
    'TimeClockChangedEvent',
    'ConnectionStatus',
    'DeviceConnectionStatusWithOptions',
])]

### Merge Ids and Names

In [9]:
def merge_columns(row, *args):
    """Collapse several mutually exclusive columns of ``row`` into one value.

    ``args`` are the column names to inspect. Returns the single non-null
    value found among them, or NaN when all of them are null.

    Raises AssertionError when more than one of the columns holds a value.
    """
    present = [row[column_name] for column_name in args if pd.notna(row[column_name])]
    # checks if more than one value in the merged rows
    assert len(present) < 2
    return present[0] if present else np.nan

In [10]:
# Fold the per-event-type room name columns into a single `room_name_merged`
# column, then drop the originals.
df = (
    df.assign(room_name_merged=df.apply(merge_columns,
                                        args=("roomName1", "roomName4", "roomName6"),
                                        axis=1))
      .drop(["roomName1", "roomName4", "roomName6"], axis=1)
)

In [11]:
# Fold the per-event-type name columns into a single `device_name` column,
# then drop the originals.
df = (
    df.assign(device_name=df.apply(merge_columns,
                                   args=("loadName", "name", "Device Name1", "Device Name2"),
                                   axis=1))
      .drop(["loadName", "name", "Device Name1", "Device Name2"], axis=1)
)

In [12]:
# Fold the per-event-type id columns into a single `device_id` column,
# then drop the originals.
df = (
    df.assign(device_id=df.apply(merge_columns,
                                 args=('load ID', 'ID', 'device ID2', 'device ID3'),
                                 axis=1))
      .drop(['load ID', 'ID', 'device ID2', 'device ID3'], axis=1)
)

In [13]:
# Fold the two message columns into a single `message_merged` column,
# then drop the originals.
df = (
    df.assign(message_merged=df.apply(merge_columns,
                                      args=("message1", "message2"),
                                      axis=1))
      .drop(["message1", "message2"], axis=1)
)

In [14]:
# Display the frame to inspect the result of the column merges above.
df

Unnamed: 0,entryType,entrySubType,eventType,rampTime,rampBaseValue,rampFinalValue,signal event ID,signal Value,Signal Name,Signal Value string,Signal direction,unix_time,room_name_merged,device_name,device_id,message_merged
3,Lights,Scene,SceneChangedEvent,,,,,,,,,1.507435e+09,Upstairs Landing,Night,74,SceneRecalled
5,DoorLock,Device,SignalChangedEventWithStrings,,,,,,Basement Mudroom,Lock,ToDevice,1.507435e+09,,Basement Mudroom Door,208,
6,DoorLock,Scene,SceneChangedEvent,,,,,,,,,1.507435e+09,Basement Mudroom,Lock Basement Mudroom Door,77,SceneRecalled
13,Lights,Load,LevelChangedEvent,0,13621,19275,,,,,,1.507435e+09,Upstairs Landing,Sconces,118,
14,Lights,Load,LevelChangedEvent,-1,0,0,,,,,,1.507435e+09,Upstairs Landing,Chandelier,122,
15,Lights,Load,LevelChangedEvent,-1,0,0,,,,,,1.507435e+09,Upstairs Landing,Overhead,120,
16,Lights,Load,LevelChangedEvent,-1,19275,19275,,,,,,1.507435e+09,Upstairs Landing,Sconces,118,
17,Lights,Load,LevelChangedEvent,-1,19275,19275,,,,,,1.507435e+09,Upstairs Landing,Sconces,118,
18,Lights,Load,LevelChangedEvent,-1,19275,19275,,,,,,1.507435e+09,Upstairs Landing,Sconces,118,
20,Lights,Scene,SceneChangedEvent,,,,,,,,,1.507435e+09,Master Bath,All Off,46,SceneRecalled


In [15]:
# Checkpoint 
# load_from_checkpoint(df, './checkpoints/data_1.csv')    

### Combine device name and id to form truly unique ids

In [16]:
# Concatenate the stringified id and name into one key column, then drop both sources.
df = (
    df.assign(device_id_name=df['device_id'].astype(str) + df['device_name'].astype(str))
      .drop(["device_id", "device_name"], axis=1)
)

In [17]:
# Preview the first rows to check the new device_id_name column.
df.head()

Unnamed: 0,entryType,entrySubType,eventType,rampTime,rampBaseValue,rampFinalValue,signal event ID,signal Value,Signal Name,Signal Value string,Signal direction,unix_time,room_name_merged,message_merged,device_id_name
3,Lights,Scene,SceneChangedEvent,,,,,,,,,1507435000.0,Upstairs Landing,SceneRecalled,74Night
5,DoorLock,Device,SignalChangedEventWithStrings,,,,,,Basement Mudroom,Lock,ToDevice,1507435000.0,,,208Basement Mudroom Door
6,DoorLock,Scene,SceneChangedEvent,,,,,,,,,1507435000.0,Basement Mudroom,SceneRecalled,77Lock Basement Mudroom Door
13,Lights,Load,LevelChangedEvent,0.0,13621.0,19275.0,,,,,,1507435000.0,Upstairs Landing,,118Sconces
14,Lights,Load,LevelChangedEvent,-1.0,0.0,0.0,,,,,,1507435000.0,Upstairs Landing,,122Chandelier


In [18]:
# Remove rows that are identical in every remaining column (the first occurrence is kept).
df = df.drop_duplicates()

In [19]:
# Persist the cleaned frame. The index is written too, so reading the file
# back with a plain pd.read_csv yields an extra unnamed first column.
df.to_csv('marg_df.csv')    
# df = pd.read_csv('marg_df.csv')

In [20]:
# Replace every missing value, in all columns, with the literal string 'nan'.
# NOTE(review): presumably done so the label encoding below sees plain strings -- confirm.
df = df.fillna('nan')

### Encode Columns

Convert Categorical to Numerical Data. 

In [None]:
from sklearn import preprocessing
# Categorical columns that get replaced by integer codes.
columns_to_update = [
    "entryType",
    "entrySubType",
    "device_id_name",
    "eventType",
    "Signal Name",
    "Signal Value string",
    "Signal direction",
    "room_name_merged",
    "message_merged",
]

def encode_columns(data_frame, column_names):
    """Replace the given columns of ``data_frame`` with integer label codes.

    For each name in ``column_names`` a ``LabelEncoder`` is fitted on the
    column's unique values and the column is overwritten with the encoded
    values. ``data_frame`` is modified in place.

    Returns the same ``data_frame`` together with a dict that maps each
    column name to its fitted encoder.
    """
    label_encoders = {}
    for col in column_names:
        encoder = preprocessing.LabelEncoder().fit(data_frame[col].unique())
        label_encoders[col] = encoder
        data_frame[col] = encoder.transform(data_frame[col])
    return data_frame, label_encoders

# Encode a copy so the string-valued `df` stays available.
categorical_df, label_encoders = encode_columns(df.copy(), columns_to_update)

In [22]:
# Display the encoded frame; the columns listed in columns_to_update now hold integer codes.
categorical_df

Unnamed: 0,entryType,entrySubType,eventType,rampTime,rampBaseValue,rampFinalValue,signal event ID,signal Value,Signal Name,Signal Value string,Signal direction,unix_time,room_name_merged,message_merged,device_id_name
3,3,3,2,,,,,,5,48,65,1.507435e+09,30,8,176
5,2,0,4,,,,,,0,30,64,1.507435e+09,32,11,88
6,2,3,2,,,,,,5,48,65,1.507435e+09,3,8,179
13,3,1,0,0,13621,19275,,,5,48,65,1.507435e+09,30,11,20
14,3,1,0,-1,0,0,,,5,48,65,1.507435e+09,30,11,25
15,3,1,0,-1,0,0,,,5,48,65,1.507435e+09,30,11,24
16,3,1,0,-1,19275,19275,,,5,48,65,1.507435e+09,30,11,20
17,3,1,0,-1,19275,19275,,,5,48,65,1.507435e+09,30,11,20
20,3,3,2,,,,,,5,48,65,1.507435e+09,21,8,133
21,3,1,0,100,65535,0,,,5,48,65,1.507435e+09,21,11,0
