# Clean up data, convert to dataframes, and save to csv

The BAC file is pipe separated with the following fields:
LogLevel [TimeStamp]:[LogVisibility][LogSeverity][entryType][entrySubType][eventType][EventType dependent strings]

The information changes at the end of each entry based on its EventType.  Here is a list of the supported event types and the subsequent additional information for each listed below.

GeneralMessage - [string message]

LevelChangedEvent - [load ID][loadName][roomName][rampTime][rampBaseValue][rampFinalValue]

ButtonChangedEvent - [keypad ID][keypadName][roomName][buttonNum][buttonState]

RemoteSystemEvent - [signalID][signalName][roomName][RemoteSystemEvent string]
TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent - [ID][name][roomName][message]

ConnectionStatus - [device ID][Name][room Name][connection status][Load 1 Room Name:Load 1 Name]|[Load 2 Room Name:Load 2 Name]

    NOTE: DeviceConnectionStatusWithOptions is the same format as ConnectionStatus. 
    
SignalChangedEvent - [device ID][Device Name][room Name][signal event ID][signal Value] - Signal event ID differs by device and signal value is either bool or int based on the eventID.

SignalChangedEventWithStrings - [device ID][Device Name][Signal Name][Signal Value string][Signal direction][message]


In [None]:
import os 
import pandas as pd
import numpy as np
import h5py
import json
from numpy import nan
from datetime import datetime, timedelta, date
from fractions import Fraction
from time import mktime
import time
import requests
import os.path
pd.set_option('display.max_columns', 500)

In [2]:
def save_or_load_from_checkpoint(checkpoint_name):
    if os.path.isfile(checkpoint_name):
        return pd.read_hdf(checkpoint_name,'table')
    df.to_hdf(checkpoint_name, 'table', mode='w', append=True, complevel=9, complib='zlib', index=False)
    return df

In [3]:
labels = ['LogLevel',
          'TimeStamp',
          'LogVisibility',
          'LogSeverity',
          'entryType',
          'entrySubType',
          'eventType']
# from label list
EVENT_TYPE_INDEX = labels.index('eventType')

# from line in .bac file
LOG_LEVEL_START = 0
LOG_LEVEL_END = 3
TIMESTAMP_START = 6
TIMESTAMP_END = 14
PIPE_SEPARATED_DATA_START = 17

event_type_labels = [
    "string message", # GeneralMessage
    "load ID", "loadName", "roomName1", "rampTime", "rampBaseValue", "rampFinalValue", # LevelChangedEvent
    "keypad ID", "keypadName", "roomName2", "buttonNum", "buttonState", # ButtonChangedEvent
    "signalID", "signalName", "roomName3", "RemoteSystemEvent string", # RemoteSystemEvent
    "ID", "name", "roomName4", "message1", # TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent
    "device ID1", "Name", "roomName5", "connection status", "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name", # ConnectionStatus/DeviceConnectionStatusWithOptions
    "device ID2", "Device Name1", "roomName6", "signal event ID", "signal Value", # SignalChangedEvent
    "device ID3", "Device Name2", "Signal Name", "Signal Value string", "Signal direction", "message2", # SignalChangedEventWithStrings
]

def from_bac():
    # device_id_offsets = [i for i, x in enumerate(event_type_labels) if x == "device ID"]
    clean_lines = []

    for log in os.listdir('data'):
        with open('data/{}'.format(log)) as logfile:
            for line in logfile:
                line = line.rstrip('\n')
                if line[-1] == '|':
                    line = line[:-1]
                all_data = ([line[LOG_LEVEL_START:LOG_LEVEL_END], line[TIMESTAMP_START:TIMESTAMP_END]]
                            + line[PIPE_SEPARATED_DATA_START:].split('|'))

                event_type_dependent_strings = all_data[len(labels):]
                clean_line = all_data[:len(labels)]

                START_INDEX = 7
                if clean_line[EVENT_TYPE_INDEX] == 'GeneralMessage':
                    START_INDEX += event_type_labels.index('string message')
                    # account for pipes in the message string
                    event_type_dependent_strings = ['|'.join(event_type_dependent_strings)]
                    assert START_INDEX == 7
                    assert len(event_type_dependent_strings) == 1, event_type_dependent_strings

                elif clean_line[EVENT_TYPE_INDEX] == 'LevelChangedEvent':
                    START_INDEX += event_type_labels.index('load ID')
                    assert START_INDEX == 8
                    assert len(event_type_dependent_strings) == 6

                elif clean_line[EVENT_TYPE_INDEX] == 'ButtonChangedEvent':
                    START_INDEX += event_type_labels.index('keypad ID')
                    assert START_INDEX == 14
                    assert len(event_type_dependent_strings) == 5

                elif clean_line[EVENT_TYPE_INDEX] == 'RemoteSystemEvent':
                    START_INDEX += event_type_labels.index('signalID')
                    assert START_INDEX == 19
                    assert len(event_type_dependent_strings) == 4

                elif (clean_line[EVENT_TYPE_INDEX] == 'TimeClockChangedEvent' or
                      clean_line[EVENT_TYPE_INDEX] == 'OccupancyChangedEvent' or
                      clean_line[EVENT_TYPE_INDEX] == 'SceneChangedEvent'):
                    START_INDEX += event_type_labels.index('ID')
                    assert START_INDEX == 23
                    assert len(event_type_dependent_strings) == 4

                elif (clean_line[EVENT_TYPE_INDEX] == 'ConnectionStatus' or
                      clean_line[EVENT_TYPE_INDEX] == 'DeviceConnectionStatusWithOptions'):
                    START_INDEX += event_type_labels.index('device ID1')
                    assert START_INDEX == 27
                    assert (len(event_type_dependent_strings) == 4 or
                            len(event_type_dependent_strings) == 5 or
                            len(event_type_dependent_strings) == 6)

                elif clean_line[EVENT_TYPE_INDEX] == 'SignalChangedEvent':
                    START_INDEX += event_type_labels.index('device ID2')
                    assert START_INDEX == 33
                    assert len(event_type_dependent_strings) == 5

                elif clean_line[EVENT_TYPE_INDEX] == 'SignalChangedEventWithStrings':
                    START_INDEX += event_type_labels.index('device ID3')
                    assert START_INDEX == 38
                    # to correct for the double pipe in "Basement Mudroom"
                    event_type_dependent_strings = [i for i in event_type_dependent_strings if i]
                else:
                    raise ValueError("Wrong event type: {}".format(clean_line[EVENT_TYPE_INDEX]))

                clean_line = clean_line + [np.nan]*len(event_type_labels) + [log[-14:-4]]
                clean_line[START_INDEX:START_INDEX + len(event_type_dependent_strings)] = event_type_dependent_strings
                clean_lines.append(clean_line)
    df = pd.DataFrame(clean_lines, columns=labels + event_type_labels + ["date"])
    return df

## Load data from BAC files

In [4]:
df = from_bac()

### Combine date and time

In [5]:
def to_datetime(row):
    new_date = row['date'] + ' ' + row['TimeStamp']
    dt = datetime.strptime(new_date, '%Y-%m-%d %H:%M:%S')
    dt = dt + timedelta(hours=4)
    unix_secs = mktime(dt.timetuple())
    return unix_secs

In [6]:
df['unix_time'] = df.apply(to_datetime, axis=1)
df = df.drop(["TimeStamp","date"], axis=1)

### Drop useless columns

In [7]:
# Useless columns
df = df.drop(["LogLevel", "LogVisibility", "LogSeverity"],axis=1, errors='ignore')

# useless after deleting General Message
df = df.drop("string message", axis=1, errors='ignore')

# useless after deleting Button Change Event
df = df.drop(["keypad ID", "keypadName", "roomName2", "buttonNum", "buttonState"], axis=1, errors='ignore')

# useless after deleting Connection Status
df = df.drop(["device ID1", "Name", "roomName5", "connection status", "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name"], axis=1, errors='ignore')

# useless after deleting Remote System Event
df = df.drop(["signalID", "signalName", "roomName3", "RemoteSystemEvent string"], axis=1, errors='ignore')

### Remove useless event types

In [8]:
df = df[df['eventType'] != 'ButtonChangedEvent']
df = df[df['eventType'] != 'GeneralMessage']
df = df[df['eventType'] != 'RemoteSystemEvent']
df = df[df['eventType'] != 'TimeClockChangedEvent']
df = df[df['eventType'] != 'ConnectionStatus']
df = df[df['eventType'] != 'DeviceConnectionStatusWithOptions']

### Merge Ids and Names

In [9]:
# Merge room names
df['room_name_merged'] = df['roomName1'].fillna('') + df['roomName4'].fillna('') + df['roomName6'].fillna('')
df = df.drop(["roomName1","roomName4","roomName6"], axis=1)

In [10]:
# Merge device name
df['device_name'] = df['loadName'].fillna('') + df['name'].fillna('') + df['Device Name1'].fillna('') + df['Device Name2'].fillna('')
df = df.drop(["loadName", "name", "Device Name1", "Device Name2"], axis=1)

In [11]:
# Merge IDs 
df['device_id'] = df['load ID'].fillna('') + df['ID'].fillna('') + df['device ID2'].fillna('') + df['device ID3'].fillna('')
df = df.drop(['load ID', 'ID', 'device ID2', 'device ID3'], axis=1)

In [12]:
# Merge messages 
df['message_merged'] = df['message1'].fillna('') + df['message2'].fillna('')
df = df.drop(["message1", "message2"], axis=1)

In [13]:
df = df.replace('', np.NaN)

### Combine device name and id to for truly unique ids

In [14]:
df['device_id_name'] = df['device_id'].astype(str) + df['device_name'].astype(str)
# df = df.drop(["device_id", "device_name"], axis=1)

In [15]:
df = df.drop_duplicates()

In [16]:
# Checkpoint 
df = save_or_load_from_checkpoint('./checkpoints/data_0.h5')  

### Normalize Column Values

In [17]:
# dropping more useless data
index_to_drop = df.loc[df["entryType"] == "Auxiliary"][df['eventType'] == "SignalChangedEventWithStrings"].index.tolist()
df = df.drop(index_to_drop)

  


In [18]:
def get_setpoints(row):
    signal_direction = str(row['Signal direction'])
    temp = np.nan
    if 'Fahrenheit' in signal_direction and signal_direction[0].isdigit():
        temp = int(''.join(x for x in signal_direction if x.isdigit()))
    return temp
    

In [19]:
df['temperature'] = df.apply(get_setpoints, axis=1)

In [5]:
df = save_or_load_from_checkpoint('./checkpoints/data_1.h5')  

## Get Binary Data

In [7]:
def check_signal_direction(df, string_val):
    return df[df["Signal Value string"] == string_val]['Signal direction'].unique().astype(str)

In [8]:
d = {}
for i in sorted(df["Signal Value string"].astype(str).unique()):
    d[i] = check_signal_direction(df, i)
for k,v in d.items():
    print(k, v)

"Leave" Event ['Heat Setpoint 68° Fahrenheit, Cool Setpoint 74° Fahrenheit, and Auto Setpoint 72° Fahrenheit']
"Return" Event ['Heat Setpoint 71° Fahrenheit, Cool Setpoint 74° Fahrenheit, and Auto Setpoint 72° Fahrenheit']
"Sleep" Event ['Heat Setpoint 67° Fahrenheit, Cool Setpoint 77° Fahrenheit, and Auto Setpoint 72° Fahrenheit'
 'Heat Setpoint 67° Fahrenheit, Cool Setpoint 72° Fahrenheit, and Auto Setpoint 72° Fahrenheit'
 'Heat Setpoint 67° Fahrenheit, Cool Setpoint 71° Fahrenheit, and Auto Setpoint 73° Fahrenheit'
 'Heat Setpoint 67° Fahrenheit, Cool Setpoint 73° Fahrenheit, and Auto Setpoint 73° Fahrenheit']
"Wake Weekend" Event ['Heat Setpoint 71° Fahrenheit, Cool Setpoint 74° Fahrenheit, and Auto Setpoint 70° Fahrenheit']
"Wake" Event ['Heat Setpoint 71° Fahrenheit, Cool Setpoint 75° Fahrenheit, and Auto Setpoint 72° Fahrenheit'
 'Heat Setpoint 71° Fahrenheit, Cool Setpoint 74° Fahrenheit, and Auto Setpoint 72° Fahrenheit']
"Weekend Wake" Event ['Heat Setpoint 71° Fahrenheit, C

In [9]:
d = {'Active Fan Level' : ['High','Off'],
'Auto Mode': ['Enabled', 'Disabled'],
'Cool Stage 1' : ['Active', 'Inactive'],
'Fan' : ['On', 'Auto'],
'Floor Warming' : ['Heat:False','Off:False', 'Off:True'],
'Heat Stage 1' : ['Active', 'Inactive'],
'Hold' : ['On', 'Off'],
'Humidifier Off' : ['Inactive', 'Active'],
'Mode' : ['Heat' 'Cool' 'Off']}

In [10]:
def is_binary(row, **kwargs):
    """kwargs = [Signal Value, values_dict]"""
    if str(row['eventType']) == 'SignalChangedEventWithStrings':
#         import pdb; pdb.set_trace()
        for key, values in kwargs.items():
            if str(row['Signal Value string']) == key:
                for val in values:
                    if str(row['Signal direction']) == val:
                        return key + val
    return np.nan

In [11]:
df['event'] = df.apply(is_binary, **d, axis=1)

In [12]:
def is_locked_or_occupied(row):
    if str(row['eventType']) == 'OccupancyChangedEvent':
        return row['message_merged']
    elif str(row['Signal Value string']) == 'Lock' or str(row['Signal Value string']) == 'Unlock':
        return row['entryType'] + row['Signal Value string']
    elif pd.notna(row['event']):
        return str(row['event'])
    return np.nan

In [13]:
df['event'] = df.apply(is_locked_or_occupied, axis=1)

## Get Regression Data

In [14]:
def get_value(row):
    value = np.nan
    if pd.notna(row['rampFinalValue']):
        value = int(row['rampFinalValue'])
    elif pd.notna(row['temperature']):
        value = int(row['temperature'])
    elif str(row['entryType']) == 'Shades' and str(row['eventType']) == 'SceneChangedEvent':
        base_str = str(row['device_id_name'])
        split_str = base_str.split(' ')
        first_str = split_str[0][-3:]
        is_closed = True if split_str[1] == 'Closed' else False
        if first_str[-1].isdigit():
            frac = float(Fraction(first_str))
            if is_closed:
                frac = 1 - frac
            value = frac
        elif is_closed:
            value = 0
        else:
            value = 1
    return value

In [15]:
df['value'] = df.apply(get_value, axis=1)

In [16]:
def get_event_type(row): 
    event_type = np.nan
    if pd.notna(row['value']):
        event_type = str(row['entryType'])
    return event_type

In [17]:
df['regression_value_type'] = df.apply(get_event_type, axis=1)

In [4]:
df = save_or_load_from_checkpoint('./checkpoints/data_2.h5')  

## Extract final dataframe

In [5]:
df = df[['device_id_name', 'event', 'regression_value_type', 'value', 'unix_time']]

### Add week, day, month, hour

In [6]:
def add_dates(data_frame):
    date = pd.to_datetime(data_frame['unix_time'], unit='s')
    data_frame['week'] = date.dt.week
    data_frame['day'] = date.dt.day
    data_frame['month'] = date.dt.month
    data_frame['hour'] = date.dt.hour
    data_frame['minute'] = date.dt.minute
    data_frame['second'] = date.dt.second
    return data_frame

df = add_dates(df)
df = df.drop('unix_time', axis=1)

In [16]:
json_data = None
with open('temperature_data.json', 'r') as data_file:
    json_data = json.load(data_file)

In [25]:
json_data['1503547200']

{'currently': {'apparentTemperature': 63.61,
  'cloudCover': 0,
  'dewPoint': 59.75,
  'humidity': 0.88,
  'icon': 'clear-night',
  'precipIntensity': 0,
  'precipProbability': 0,
  'pressure': 1011.92,
  'summary': 'Clear',
  'temperature': 63.44,
  'time': 1503547200,
  'visibility': 10,
  'windBearing': 298,
  'windGust': 0.32,
  'windSpeed': 0.28},
 'daily': {'data': [{'apparentTemperatureHigh': 80.25,
    'apparentTemperatureHighTime': 1503601200,
    'apparentTemperatureLow': 60.67,
    'apparentTemperatureLowTime': 1503658800,
    'apparentTemperatureMax': 80.25,
    'apparentTemperatureMaxTime': 1503601200,
    'apparentTemperatureMin': 59.91,
    'apparentTemperatureMinTime': 1503568800,
    'cloudCover': 0.09,
    'dewPoint': 57.28,
    'humidity': 0.69,
    'icon': 'partly-cloudy-day',
    'moonPhase': 0.1,
    'precipIntensity': 0.0001,
    'precipIntensityMax': 0.0007,
    'precipIntensityMaxTime': 1503576000,
    'precipProbability': 0.09,
    'precipType': 'rain',
    'p

In [None]:
temp_time_dict = {}

    if not current_timestamp in temp_time_dict.keys():
        r = requests.get(base_http + str(current_timestamp))
        temp_json = r.json()
        temp_time_dict[current_timestamp] = temp_json

### Add temperature, sunrise/sunset data

In [17]:
# base_http = 'https://api.darksky.net/forecast/d13a0a435a413b360106227fd68a8e0e/39.833851,-74.871826,'
# temp_time_dict = {}

def create_weather_sunset_columns(row):
    #hard-coded year for now. 
    date_obj = date(2017, row['month'], row['day'])
    #format for API
    current_timestamp = int(time.mktime(date_obj.timetuple())) + 4 * 60 * 60

#     if not current_timestamp in temp_time_dict.keys():
#         r = requests.get(base_http + str(current_timestamp))
#         temp_json = r.json()
#         temp_time_dict[current_timestamp] = temp_json

    
    temperature_info = json_data[current_timestamp]['hourly']['data'][row['hour']]['temperature']

    sunrise = json_data[current_timestamp]['daily']['data'][0]['sunriseTime'] #in GMT 
    sunset = json_data[current_timestamp]['daily']['data'][0]['sunsetTime']
    current_timestamp_with_seconds = current_timestamp + (row['hour'] * 60 * 60) + (row['minute'] * 60)

    is_sun_up = 1 if sunrise <= current_timestamp_with_seconds <= sunset else 0
    return is_sun_up, temperature_info

In [18]:
df['sun'], df['outside_temperature'] = zip(*df.apply(create_weather_sunset_columns, axis=1))

KeyError: (1502438400, 'occurred at index 1')

In [None]:
df = save_or_load_from_checkpoint('./checkpoints/data_3.h5')  

### Encode Columns

Convert Categorical to Numerical Data. 


In [None]:
from sklearn import preprocessing
columns_to_update = ["entryType", "entrySubType", "device_id_name", "eventType", "Signal Name", "Signal Value string", "Signal direction", "room_name_merged", "message_merged"]

def encode_columns(data_frame, column_names):
    label_encoders = {}
    for col in column_names:
        values = data_frame[col].unique()
        le = preprocessing.LabelEncoder()
        le.fit(values)
        label_encoders[col] = le
        numerical_values = le.transform(data_frame[col])
        data_frame[col] = numerical_values
    return data_frame, label_encoders

categorical_df = df.copy()
categorical_df, label_encoders = encode_columns(categorical_df, columns_to_update)

### One-hot encode columns
