# Clean up data, convert to dataframes, and save to csv

The BAC file is pipe separated with the following fields:
LogLevel [TimeStamp]:[LogVisibility][LogSeverity][entryType][entrySubType][eventType][EventType dependent strings]

The fields at the end of each entry depend on its EventType. The supported event types, and the additional fields each one carries, are listed below.

GeneralMessage - [string message]

LevelChangedEvent - [load ID][loadName][roomName][rampTime][rampBaseValue][rampFinalValue]

ButtonChangedEvent - [keypad ID][keypadName][roomName][buttonNum][buttonState]

RemoteSystemEvent - [signalID][signalName][roomName][RemoteSystemEvent string]

TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent - [ID][name][roomName][message]

ConnectionStatus - [device ID][Name][room Name][connection status][Load 1 Room Name:Load 1 Name]|[Load 2 Room Name:Load 2 Name]

    NOTE: DeviceConnectionStatusWithOptions is the same format as ConnectionStatus. 
    
SignalChangedEvent - [device ID][Device Name][room Name][signal event ID][signal Value] - Signal event ID differs by device and signal value is either bool or int based on the eventID.

SignalChangedEventWithStrings - [device ID][Device Name][Signal Name][Signal Value string][Signal direction][message]


In [1]:
# FOR DATA ANALYSIS PUT HOUR, DAY, WEEK, MONTH, DATE, WEATHER, SUNRISE/SUNUP

In [2]:
import os 
import pandas as pd
import numpy as np
import h5py
from numpy import nan
from datetime import datetime, timedelta
from time import mktime
import os.path
pd.set_option('display.max_columns', 500)

  from ._conv import register_converters as _register_converters


In [3]:
def save_or_load_from_checkpoint(checkpoint_name, data_frame=None):
    """Load a checkpointed DataFrame, or create the checkpoint if it is missing.

    When ``checkpoint_name`` already exists its contents are read back and
    returned; the in-memory frame is not consulted in that case, so an old
    checkpoint file takes precedence over freshly computed data. Delete the
    file to force a re-save.

    Parameters
    ----------
    checkpoint_name : str
        Path of the HDF5 checkpoint file.
    data_frame : pandas.DataFrame, optional
        Frame to write when the checkpoint does not exist yet. Defaults to the
        notebook-level ``df`` so existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The frame read from the checkpoint, or the frame that was just written.
    """
    if os.path.isfile(checkpoint_name):
        return pd.read_hdf(checkpoint_name, key='table')

    if data_frame is None:
        # Backward-compatible fallback to the notebook's global frame.
        data_frame = df

    # Make sure the target directory exists before writing the file.
    checkpoint_dir = os.path.dirname(checkpoint_name)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    data_frame.to_hdf(checkpoint_name, key='table', mode='w', append=True,
                      complevel=9, complib='zlib', index=False)
    return data_frame

In [4]:
# Fixed fields that open every log entry, in the order they appear.
labels = ['LogLevel',
          'TimeStamp',
          'LogVisibility',
          'LogSeverity',
          'entryType',
          'entrySubType',
          'eventType']
# from label list
EVENT_TYPE_INDEX = labels.index('eventType')

# from line in .bac file (character offsets into the raw line)
LOG_LEVEL_START = 0
LOG_LEVEL_END = 3
TIMESTAMP_START = 6
TIMESTAMP_END = 14
PIPE_SEPARATED_DATA_START = 17

# One dedicated group of columns per event type; numeric suffixes keep the
# repeated field names (roomName, device ID, message, ...) unique.
event_type_labels = [
    "string message", # GeneralMessage
    "load ID", "loadName", "roomName1", "rampTime", "rampBaseValue", "rampFinalValue", # LevelChangedEvent
    "keypad ID", "keypadName", "roomName2", "buttonNum", "buttonState", # ButtonChangedEvent
    "signalID", "signalName", "roomName3", "RemoteSystemEvent string", # RemoteSystemEvent
    "ID", "name", "roomName4", "message1", # TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent
    "device ID1", "Name", "roomName5", "connection status", "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name", # ConnectionStatus/DeviceConnectionStatusWithOptions
    "device ID2", "Device Name1", "roomName6", "signal event ID", "signal Value", # SignalChangedEvent
    "device ID3", "Device Name2", "Signal Name", "Signal Value string", "Signal direction", "message2", # SignalChangedEventWithStrings
]

# eventType -> (label of the first column reserved for it,
#               accepted numbers of event-dependent fields)
_EVENT_LAYOUTS = {
    'GeneralMessage': ('string message', (1,)),
    'LevelChangedEvent': ('load ID', (6,)),
    'ButtonChangedEvent': ('keypad ID', (5,)),
    'RemoteSystemEvent': ('signalID', (4,)),
    'TimeClockChangedEvent': ('ID', (4,)),
    'OccupancyChangedEvent': ('ID', (4,)),
    'SceneChangedEvent': ('ID', (4,)),
    'ConnectionStatus': ('device ID1', (4, 5, 6)),
    'DeviceConnectionStatusWithOptions': ('device ID1', (4, 5, 6)),
    'SignalChangedEvent': ('device ID2', (5,)),
    # Field count varies; it is capped at the six reserved columns so a long
    # entry can never spill into the trailing 'date' cell.
    'SignalChangedEventWithStrings': ('device ID3', range(7)),
}


def _parse_bac_line(line, date):
    """Convert one non-blank log line into a row for ``labels + event_type_labels + ['date']``."""
    if line.endswith('|'):
        line = line[:-1]
    all_data = ([line[LOG_LEVEL_START:LOG_LEVEL_END], line[TIMESTAMP_START:TIMESTAMP_END]]
                + line[PIPE_SEPARATED_DATA_START:].split('|'))

    head = all_data[:len(labels)]
    fields = all_data[len(labels):]
    if len(head) < len(labels):
        raise ValueError("Malformed log line: {!r}".format(line))

    event_type = head[EVENT_TYPE_INDEX]
    if event_type not in _EVENT_LAYOUTS:
        raise ValueError("Wrong event type: {}".format(event_type))
    first_label, allowed_counts = _EVENT_LAYOUTS[event_type]

    if event_type == 'GeneralMessage':
        # account for pipes in the message string
        fields = ['|'.join(fields)]
    elif event_type == 'SignalChangedEventWithStrings':
        # to correct for the double pipe in "Basement Mudroom"
        fields = [field for field in fields if field]
    assert len(fields) in allowed_counts, (event_type, fields)

    # Every row starts with all event-specific cells empty; only the slice
    # belonging to this event type is filled in.
    start = len(labels) + event_type_labels.index(first_label)
    row = head + [np.nan] * len(event_type_labels) + [date]
    row[start:start + len(fields)] = fields
    return row


def from_bac(data_dir='data'):
    """Parse every log file in ``data_dir`` into one wide DataFrame.

    Each log line becomes one row holding the fixed fields (``labels``), the
    event-specific fields placed in the columns reserved for that event type
    (all other event columns are NaN), and a ``date`` column taken from the
    file name.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the log files (default ``'data'``). Files are
        read in sorted name order so the row order is reproducible;
        sub-directories are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns are ``labels + event_type_labels + ['date']``; all parsed
        values are strings.

    Raises
    ------
    ValueError
        If a line has too few fixed fields or an unknown event type.
    AssertionError
        If an entry carries an unexpected number of event-dependent fields.
    """
    clean_lines = []

    for log in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, log)
        if not os.path.isfile(path):
            continue
        # The 10 characters before a 4-character extension; assumes names end
        # in "YYYY-MM-DD.bac" -- TODO confirm against the real file names.
        date = log[-14:-4]
        with open(path) as logfile:
            for line in logfile:
                line = line.rstrip('\n')
                if not line.strip():
                    # Blank lines carry no entry.
                    continue
                clean_lines.append(_parse_bac_line(line, date))
    df = pd.DataFrame(clean_lines, columns=labels + event_type_labels + ["date"])
    return df

## Load data from BAC files

In [5]:
# Parse the log files under data/ into one wide DataFrame, one row per log line.
df = from_bac()

Exception: cannot find the correct atom type -> [dtype->object,items->Index(['LogLevel', 'TimeStamp', 'LogVisibility', 'LogSeverity', 'entryType',
       'entrySubType', 'eventType', 'string message', 'load ID', 'loadName',
       'roomName1', 'rampTime', 'rampBaseValue', 'rampFinalValue', 'keypad ID',
       'keypadName', 'roomName2', 'buttonNum', 'buttonState', 'signalID',
       'signalName', 'roomName3', 'RemoteSystemEvent string', 'ID', 'name',
       'roomName4', 'message1', 'device ID1', 'Name', 'roomName5',
       'connection status', 'Load 1 Room Name:Load 1 Name',
       'Load 2 Room Name:Load 2 Name', 'device ID2', 'Device Name1',
       'roomName6', 'signal event ID', 'signal Value', 'device ID3',
       'Device Name2', 'Signal Name', 'Signal Value string',
       'Signal direction', 'message2', 'date'],
      dtype='object')] 

### Combine date and time

In [6]:
def to_datetime(row, hour_shift=4):
    """Turn a row's ``date`` and ``TimeStamp`` strings into Unix seconds.

    Parameters
    ----------
    row : mapping
        Must provide ``row['date']`` (``YYYY-MM-DD``) and ``row['TimeStamp']``
        (``HH:MM:SS``).
    hour_shift : int or float, optional
        Hours added to the parsed time before conversion. Defaults to 4, the
        value that was previously hard-coded.
        NOTE(review): presumably a timezone correction -- confirm.

    Returns
    -------
    float
        Seconds since the epoch as computed by ``time.mktime``, which
        interprets the shifted time as local time; the result therefore
        depends on the timezone of the machine running the notebook.
    """
    new_date = row['date'] + ' ' + row['TimeStamp']
    parsed = datetime.strptime(new_date, '%Y-%m-%d %H:%M:%S')
    shifted = parsed + timedelta(hours=hour_shift)
    return mktime(shifted.timetuple())

In [7]:
# Add the Unix timestamp, then discard the two text columns it was built from.
unix_seconds = df.apply(to_datetime, axis=1)
df = df.assign(unix_time=unix_seconds).drop(columns=["TimeStamp", "date"])

### Drop useless columns

In [8]:
# Columns that are not needed for the analysis, grouped by origin.
columns_to_discard = (
    # logging metadata
    ["LogLevel", "LogVisibility", "LogSeverity"]
    # only filled for GeneralMessage entries
    + ["string message"]
    # only filled for ButtonChangedEvent entries
    + ["keypad ID", "keypadName", "roomName2", "buttonNum", "buttonState"]
    # only filled for ConnectionStatus entries
    + ["device ID1", "Name", "roomName5", "connection status",
       "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name"]
    # only filled for RemoteSystemEvent entries
    + ["signalID", "signalName", "roomName3", "RemoteSystemEvent string"]
)
# errors='ignore' keeps the cell re-runnable once the columns are gone.
df = df.drop(columns=columns_to_discard, errors='ignore')

### Remove useless event types

In [9]:
# Event types that are excluded from the analysis.
event_types_to_remove = [
    'ButtonChangedEvent',
    'GeneralMessage',
    'RemoteSystemEvent',
    'TimeClockChangedEvent',
    'ConnectionStatus',
    'DeviceConnectionStatusWithOptions',
]
df = df[~df['eventType'].isin(event_types_to_remove)]

### Merge Ids and Names

In [10]:
def merge_columns(row, *args):
    """Collapse several mutually exclusive columns of ``row`` into one value.

    ``args`` are the column names to merge. Returns the single non-missing
    value among them, or NaN when all of them are missing. Raises
    AssertionError when more than one of the columns holds a value.
    """
    present = [row[column_name] for column_name in args if pd.notna(row[column_name])]
    # at most one of the merged columns may be populated
    assert len(present) < 2
    return present[0] if present else np.nan

In [11]:
# Room name: each row has it in at most one of the per-event-type columns.
room_name_columns = ("roomName1", "roomName4", "roomName6")
df['room_name_merged'] = df.apply(merge_columns, args=room_name_columns, axis=1)
df = df.drop(columns=list(room_name_columns))

In [12]:
# Device name: each row has it in at most one of the per-event-type columns.
device_name_columns = ("loadName", "name", "Device Name1", "Device Name2")
df['device_name'] = df.apply(merge_columns, args=device_name_columns, axis=1)
df = df.drop(columns=list(device_name_columns))

In [13]:
# Device id: each row has it in at most one of the per-event-type columns.
device_id_columns = ('load ID', 'ID', 'device ID2', 'device ID3')
df['device_id'] = df.apply(merge_columns, args=device_id_columns, axis=1)
df = df.drop(columns=list(device_id_columns))

In [14]:
# Message: each row has it in at most one of the per-event-type columns.
message_columns = ("message1", "message2")
df['message_merged'] = df.apply(merge_columns, args=message_columns, axis=1)
df = df.drop(columns=list(message_columns))

### Combine device name and id to form truly unique ids

In [15]:
# Concatenate id and name as text (e.g. "118" + "Sconces" -> "118Sconces"),
# then drop the two source columns.
device_id_text = df['device_id'].astype(str)
device_name_text = df['device_name'].astype(str)
df = (
    df.assign(device_id_name=device_id_text + device_name_text)
      .drop(columns=["device_id", "device_name"])
)

In [16]:
# Remove rows that are identical in every remaining column.
df = df.drop_duplicates()

In [17]:
# Replace missing values with the literal string 'nan'; later cells test for that string.
df = df.fillna('nan')

In [18]:
# Checkpoint: reuse ./checkpoints/data_1.h5 when it exists, otherwise write df to it.
# NOTE: an existing file wins over the frame computed above.
df = save_or_load_from_checkpoint('./checkpoints/data_1.h5')  

### Normalize Column Values

In [19]:
# dropping more useless data: Auxiliary entries logged as SignalChangedEventWithStrings.
# Both conditions are combined into one mask over the full frame; the previous
# chained form applied a full-length mask to an already filtered frame.
is_auxiliary_signal = (
    (df["entryType"] == "Auxiliary")
    & (df["eventType"] == "SignalChangedEventWithStrings")
)
df = df[~is_auxiliary_signal]

  


In [20]:
# Pull numeric readings out of the free-text 'Signal direction' field into
# dedicated columns (temperature, heat_setpoint, cool_setpoint, auto_setpoint).
for index, row in df.iterrows():
    signal_direction = str(row['Signal direction'])
    # Text that mentions Fahrenheit and starts with a digit: every digit
    # character in the text is joined into a single integer.
    # NOTE(review): digits anywhere in the text are concatenated, so a value
    # with a decimal point or a second number would be misread -- confirm the format.
    if 'Fahrenheit' in signal_direction and signal_direction[0].isdigit():
        temp = int(''.join(x for x in signal_direction if x.isdigit()))
        df.at[index, 'temperature'] = temp
        
    # Text longer than 20 characters is treated as three comma-separated parts
    # (heat, cool, auto). NOTE(review): length is the only test applied here; a
    # long value with fewer than three parts, or a part without a number, raises IndexError.
    if len(signal_direction) > 20:
        heat_cool_auto = signal_direction.split(',')
        # In each part, take the tokens that start with a digit and drop the
        # token's last character before converting to int.
        heat_temp = [int(s[:-1]) for s in heat_cool_auto[0].split() if s[0].isdigit()]
        cool_temp = [int(s[:-1]) for s in heat_cool_auto[1].split() if s[0].isdigit()]
        auto_temp = [int(s[:-1]) for s in heat_cool_auto[2].split() if s[0].isdigit()]
        # Only the first number found in each part is stored.
        df.at[index, 'heat_setpoint'] = heat_temp[0]
        df.at[index, 'cool_setpoint'] = cool_temp[0]
        df.at[index, 'auto_setpoint'] = auto_temp[0]

In [54]:
# Checkpoint: reuse ./checkpoints/data_2.h5 when it exists, otherwise write df to it.
df = save_or_load_from_checkpoint('./checkpoints/data_2.h5')  

In [22]:
# NOTE(review): same checkpoint file as the previous cell, so test_df is a
# second copy of that data read back from disk.
test_df = save_or_load_from_checkpoint('./checkpoints/data_2.h5')

In [24]:
# Show missing values as the string 'nan' and preview the first rows.
test_df = test_df.fillna('nan')
test_df.head()

Unnamed: 0,entryType,entrySubType,eventType,rampTime,rampBaseValue,rampFinalValue,signal event ID,signal Value,Signal Name,Signal Value string,Signal direction,unix_time,room_name_merged,message_merged,device_id_name,temperature,heat_setpoint,cool_setpoint,auto_setpoint
1,Lights,Scene,SceneChangedEvent,,,,,,,,,1502438000.0,Upstairs Landing,SceneRecalled,74Night,,,,
3,DoorLock,Device,SignalChangedEventWithStrings,,,,,,Basement Mudroom,Lock,ToDevice,1502438000.0,,,208Basement Mudroom Door,,,,
4,DoorLock,Scene,SceneChangedEvent,,,,,,,,,1502438000.0,Basement Mudroom,SceneRecalled,77Lock Basement Mudroom Door,,,,
13,Lights,Load,LevelChangedEvent,0.0,0.0,19275.0,,,,,,1502438000.0,Upstairs Landing,,118Sconces,,,,
14,Lights,Load,LevelChangedEvent,-1.0,0.0,0.0,,,,,,1502438000.0,Upstairs Landing,,122Chandelier,,,,


In [60]:
# Inspect the rows whose 'Signal Value string' is 'Floor Warming'.
df.loc[df["Signal Value string"] == 'Floor Warming']

Unnamed: 0,entryType,entrySubType,eventType,rampTime,rampBaseValue,rampFinalValue,signal event ID,signal Value,Signal Name,Signal Value string,Signal direction,unix_time,room_name_merged,message_merged,device_id_name,temperature,heat_setpoint,cool_setpoint,auto_setpoint,locked,fan_state
274013,Climate,Device,SignalChangedEventWithStrings,,,,,,Unassigned,Floor Warming,Heat:False,1.515318e+09,,FromDevice,192Kitchen,,,,,,
274014,Climate,Device,SignalChangedEventWithStrings,,,,,,Unassigned,Floor Warming,Off:False,1.515318e+09,,FromDevice,192Kitchen,,,,,,
274031,Climate,Device,SignalChangedEventWithStrings,,,,,,Unassigned,Floor Warming,Heat:False,1.515318e+09,,FromDevice,194Master Bed,,,,,,
274032,Climate,Device,SignalChangedEventWithStrings,,,,,,Unassigned,Floor Warming,Off:False,1.515318e+09,,FromDevice,194Master Bed,,,,,,
274049,Climate,Device,SignalChangedEventWithStrings,,,,,,Unassigned,Floor Warming,Heat:False,1.515318e+09,,FromDevice,195Great Room,,,,,,
274050,Climate,Device,SignalChangedEventWithStrings,,,,,,Unassigned,Floor Warming,Off:False,1.515318e+09,,FromDevice,195Great Room,,,,,,
274789,Climate,Device,SignalChangedEventWithStrings,,,,,,Great Room,Floor Warming,Off:True,1.515318e+09,,FromDevice,195Great Room,,,,,,
274793,Climate,Device,SignalChangedEventWithStrings,,,,,,Master Bed,Floor Warming,Off:True,1.515318e+09,,FromDevice,194Master Bed,,,,,,
274794,Climate,Device,SignalChangedEventWithStrings,,,,,,Kitchen,Floor Warming,Off:True,1.515318e+09,,FromDevice,192Kitchen,,,,,,
284990,Climate,Device,SignalChangedEventWithStrings,,,,,,Unassigned,Floor Warming,Heat:False,1.520819e+09,,FromDevice,192Kitchen,,,,,,


In [34]:
# Signal values that can be ignored: Temperature, Get Information, "Leave" Event,
# "Return" Event, "Sleep" Event, "Wake Weekend" Event, "Wake" Event,
# "Weekend Wake" Event, Fan, Auto Deadband.

# Also ignore the 2-degree temperature reading, since it comes from Auto Deadband.

# List every distinct 'Signal Value string' in sorted order.
sorted(df["Signal Value string"].unique())

['"Leave" Event',
 '"Return" Event',
 '"Sleep" Event',
 '"Wake Weekend" Event',
 '"Wake" Event',
 '"Weekend Wake" Event',
 'Active Fan Level',
 'Auto Deadband',
 'Auto Mode',
 'Auto Setpoint',
 'Cool Mode',
 'Cool Setpoint',
 'Cool Stage 1',
 'Fan',
 'Floor Warming',
 'Get Information',
 'Heat Mode',
 'Heat Setpoint',
 'Heat Stage 1',
 'Hold',
 'Humidifier Enable',
 'Humidifier Off',
 'Humidity Mode',
 'Humidity Setpoint',
 'Humidity View',
 'Lock',
 'Mode',
 'Scheduled Setpoints',
 'Single Setpoint Mode',
 'Slab 1',
 'Slab 2',
 'Slab 3',
 'Slab 4A',
 'Slab 4B',
 'Slab 5A',
 'Slab 5B',
 'Slab Setpoint',
 'Slab Temperature',
 'Temperature',
 'Unlock',
 'nan']

In [27]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,entryType,entrySubType,eventType,rampTime,rampBaseValue,rampFinalValue,signal event ID,signal Value,Signal Name,Signal Value string,unix_time,room_name_merged,message_merged,device_id_name,temperature,heat_setpoint,cool_setpoint,auto_setpoint
1,Lights,Scene,SceneChangedEvent,,,,,,,,1502438000.0,Upstairs Landing,SceneRecalled,74Night,,,,
3,DoorLock,Device,SignalChangedEventWithStrings,,,,,,Basement Mudroom,Lock,1502438000.0,,,208Basement Mudroom Door,,,,
4,DoorLock,Scene,SceneChangedEvent,,,,,,,,1502438000.0,Basement Mudroom,SceneRecalled,77Lock Basement Mudroom Door,,,,
13,Lights,Load,LevelChangedEvent,0.0,0.0,19275.0,,,,,1502438000.0,Upstairs Landing,,118Sconces,,,,
14,Lights,Load,LevelChangedEvent,-1.0,0.0,0.0,,,,,1502438000.0,Upstairs Landing,,122Chandelier,,,,


In [55]:
def is_locked(row):
    """Map a row's 'Signal Value string' to a lock flag.

    Returns 1 for 'Lock', 0 for 'Unlock', and the string 'nan' for anything else.
    """
    signal_value = str(row['Signal Value string'])
    return {'Lock': 1, 'Unlock': 0}.get(signal_value, 'nan')

In [56]:
# Seed the 'binary' column from Lock/Unlock signals: 1, 0, or 'nan' for every other row.
df['binary'] = df.apply(is_locked, axis=1)

In [57]:
def is_binary(row, *args):
    """Fold one on/off signal into the row's existing 'binary' flag.

    Parameters
    ----------
    row : mapping
        Must provide 'Signal Value string', 'Signal direction' and 'binary'.
    *args
        ``(signal_value, on_direction, off_direction)``: the signal to match
        and the 'Signal direction' texts that mean on and off.

    Returns
    -------
    1 or 0 when the row is the requested signal with a recognised direction;
    otherwise the row's existing 'binary' value unchanged, or the string 'nan'
    when that value is missing.

    Rows that match the signal but carry an unrecognised direction used to
    return None; they now keep their existing flag. Existing flags are
    returned as-is rather than converted to text, so the column does not end
    up mixing 1 with '1'.
    """
    signal_value, on_direction, off_direction = args[0], args[1], args[2]

    if str(row['Signal Value string']) == signal_value:
        direction = str(row['Signal direction'])
        if direction == on_direction:
            return 1
        if direction == off_direction:
            return 0

    existing = row['binary']
    if str(existing) != 'nan':
        return existing
    return 'nan'

In [58]:
# Fold the fan state into 'binary': 'Active Fan Level' rows become 1 for 'High'
# and 0 for 'Off'. The previous call named `is_fan_on`, which is not defined in
# this notebook; `is_binary` takes exactly these three arguments.
df['binary'] = df.apply(is_binary, args=('Active Fan Level', 'High', 'Off'), axis=1)

In [59]:
# Preview the first rows after adding the 'binary' column.
df.head()

Unnamed: 0,entryType,entrySubType,eventType,rampTime,rampBaseValue,rampFinalValue,signal event ID,signal Value,Signal Name,Signal Value string,Signal direction,unix_time,room_name_merged,message_merged,device_id_name,temperature,heat_setpoint,cool_setpoint,auto_setpoint,locked,fan_state
1,Lights,Scene,SceneChangedEvent,,,,,,,,,1502438000.0,Upstairs Landing,SceneRecalled,74Night,,,,,,
3,DoorLock,Device,SignalChangedEventWithStrings,,,,,,Basement Mudroom,Lock,ToDevice,1502438000.0,,,208Basement Mudroom Door,,,,,1.0,
4,DoorLock,Scene,SceneChangedEvent,,,,,,,,,1502438000.0,Basement Mudroom,SceneRecalled,77Lock Basement Mudroom Door,,,,,,
13,Lights,Load,LevelChangedEvent,0.0,0.0,19275.0,,,,,,1502438000.0,Upstairs Landing,,118Sconces,,,,,,
14,Lights,Load,LevelChangedEvent,-1.0,0.0,0.0,,,,,,1502438000.0,Upstairs Landing,,122Chandelier,,,,,,


### Encode Columns

Convert Categorical to Numerical Data. 

In [None]:
from sklearn import preprocessing
# Text columns that are replaced by integer codes.
columns_to_update = [
    "entryType",
    "entrySubType",
    "device_id_name",
    "eventType",
    "Signal Name",
    "Signal Value string",
    "Signal direction",
    "room_name_merged",
    "message_merged",
]

def encode_columns(data_frame, column_names):
    """Replace each listed column with integer codes from a LabelEncoder.

    One encoder is fitted per column on that column's distinct values. The
    frame is modified in place and also returned.

    Returns
    -------
    (data_frame, label_encoders)
        The updated frame and a dict mapping column name to its fitted
        encoder, so codes can be mapped back to the original values.
    """
    label_encoders = {}
    for col in column_names:
        encoder = preprocessing.LabelEncoder()
        encoder.fit(data_frame[col].unique())
        data_frame[col] = encoder.transform(data_frame[col])
        label_encoders[col] = encoder
    return data_frame, label_encoders

# Encode a copy so the original text values in df stay available.
categorical_df, label_encoders = encode_columns(df.copy(), columns_to_update)