# Clean up data, convert to dataframes, and save to csv

The BAC file is pipe separated with the following fields:
LogLevel [TimeStamp]:[LogVisibility][LogSeverity][entryType][entrySubType][eventType][EventType dependent strings]

The information changes at the end of each entry based on its EventType.  Here is a list of the supported event types and the subsequent additional information for each listed below.

GeneralMessage - [string message]

LevelChangedEvent - [load ID][loadName][roomName][rampTime][rampBaseValue][rampFinalValue]

ButtonChangedEvent - [keypad ID][keypadName][roomName][buttonNum][buttonState]

RemoteSystemEvent - [signalID][signalName][roomName][RemoteSystemEvent string]
TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent - [ID][name][roomName][message]

ConnectionStatus - [device ID][Name][room Name][connection status][Load 1 Room Name:Load 1 Name]|[Load 2 Room Name:Load 2 Name]

    NOTE: DeviceConnectionStatusWithOptions is the same format as ConnectionStatus. 
    
SignalChangedEvent - [device ID][Device Name][room Name][signal event ID][signal Value] - Signal event ID differs by device and signal value is either bool or int based on the eventID.

SignalChangedEventWithStrings - [device ID][Device Name][Signal Name][Signal Value string][Signal direction][message]


In [1]:
import os 
import pandas as pd
import numpy as np
import h5py
import json
from numpy import nan
from datetime import datetime, timedelta, date
from fractions import Fraction
from time import mktime
import time
import requests
import os.path
pd.set_option('display.max_columns', 500)

  from ._conv import register_converters as _register_converters


Function to quickly load or save dataframes as h5 files

In [2]:
def save_or_load_from_checkpoint(checkpoint_name):
    if os.path.isfile(checkpoint_name):
        return pd.read_hdf(checkpoint_name,'table')
    df.to_hdf(checkpoint_name, 'table', mode='w', append=True, complevel=9, complib='zlib', index=False)
    return df

### Loading logs and initial cleaning

First we read in the data from the original .BAC files, do a basic first pass of formatting the data, and set up a large  dataframe to hold the data for each event type. We start by separating the log level and time stamp since they are not pipe separeated, and then split the message into columns the correspond to the the log fields and the EventType dependent strings

In [3]:
# column labels
labels = ['LogLevel',
          'TimeStamp',
          'LogVisibility',
          'LogSeverity',
          'entryType',
          'entrySubType',
          'eventType']
# from label list
EVENT_TYPE_INDEX = labels.index('eventType')

# from line in .bac file
LOG_LEVEL_START = 0
LOG_LEVEL_END = 3
TIMESTAMP_START = 6
TIMESTAMP_END = 14
PIPE_SEPARATED_DATA_START = 17

event_type_labels = [
    "string message", # GeneralMessage
    "load ID", "loadName", "roomName1", "rampTime", "rampBaseValue", "rampFinalValue", # LevelChangedEvent
    "keypad ID", "keypadName", "roomName2", "buttonNum", "buttonState", # ButtonChangedEvent
    "signalID", "signalName", "roomName3", "RemoteSystemEvent string", # RemoteSystemEvent
    "ID", "name", "roomName4", "message1", # TimeClockChangedEvent/OccupancyChangedEvent/SceneChangedEvent
    "device ID1", "Name", "roomName5", "connection status", "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name", # ConnectionStatus/DeviceConnectionStatusWithOptions
    "device ID2", "Device Name1", "roomName6", "signal event ID", "signal Value", # SignalChangedEvent
    "device ID3", "Device Name2", "Signal Name", "Signal Value string", "Signal direction", "message2", # SignalChangedEventWithStrings
]

def from_bac():
    # device_id_offsets = [i for i, x in enumerate(event_type_labels) if x == "device ID"]
    clean_lines = []

    for log in os.listdir('data'):
        with open('data/{}'.format(log)) as logfile:
            for line in logfile:
                line = line.rstrip('\n')
                if line[-1] == '|':
                    line = line[:-1]
                all_data = ([line[LOG_LEVEL_START:LOG_LEVEL_END], line[TIMESTAMP_START:TIMESTAMP_END]]
                            + line[PIPE_SEPARATED_DATA_START:].split('|'))

                event_type_dependent_strings = all_data[len(labels):]
                clean_line = all_data[:len(labels)]

                START_INDEX = 7
                if clean_line[EVENT_TYPE_INDEX] == 'GeneralMessage':
                    START_INDEX += event_type_labels.index('string message')
                    # account for pipes in the message string
                    event_type_dependent_strings = ['|'.join(event_type_dependent_strings)]
                    assert START_INDEX == 7
                    assert len(event_type_dependent_strings) == 1, event_type_dependent_strings

                elif clean_line[EVENT_TYPE_INDEX] == 'LevelChangedEvent':
                    START_INDEX += event_type_labels.index('load ID')
                    assert START_INDEX == 8
                    assert len(event_type_dependent_strings) == 6

                elif clean_line[EVENT_TYPE_INDEX] == 'ButtonChangedEvent':
                    START_INDEX += event_type_labels.index('keypad ID')
                    assert START_INDEX == 14
                    assert len(event_type_dependent_strings) == 5

                elif clean_line[EVENT_TYPE_INDEX] == 'RemoteSystemEvent':
                    START_INDEX += event_type_labels.index('signalID')
                    assert START_INDEX == 19
                    assert len(event_type_dependent_strings) == 4

                elif (clean_line[EVENT_TYPE_INDEX] == 'TimeClockChangedEvent' or
                      clean_line[EVENT_TYPE_INDEX] == 'OccupancyChangedEvent' or
                      clean_line[EVENT_TYPE_INDEX] == 'SceneChangedEvent'):
                    START_INDEX += event_type_labels.index('ID')
                    assert START_INDEX == 23
                    assert len(event_type_dependent_strings) == 4

                elif (clean_line[EVENT_TYPE_INDEX] == 'ConnectionStatus' or
                      clean_line[EVENT_TYPE_INDEX] == 'DeviceConnectionStatusWithOptions'):
                    START_INDEX += event_type_labels.index('device ID1')
                    assert START_INDEX == 27
                    assert (len(event_type_dependent_strings) == 4 or
                            len(event_type_dependent_strings) == 5 or
                            len(event_type_dependent_strings) == 6)

                elif clean_line[EVENT_TYPE_INDEX] == 'SignalChangedEvent':
                    START_INDEX += event_type_labels.index('device ID2')
                    assert START_INDEX == 33
                    assert len(event_type_dependent_strings) == 5

                elif clean_line[EVENT_TYPE_INDEX] == 'SignalChangedEventWithStrings':
                    START_INDEX += event_type_labels.index('device ID3')
                    assert START_INDEX == 38
                    # to correct for the double pipe in "Basement Mudroom"
                    event_type_dependent_strings = [i for i in event_type_dependent_strings if i]
                else:
                    raise ValueError("Wrong event type: {}".format(clean_line[EVENT_TYPE_INDEX]))

                clean_line = clean_line + [np.nan]*len(event_type_labels) + [log[-14:-4]]
                clean_line[START_INDEX:START_INDEX + len(event_type_dependent_strings)] = event_type_dependent_strings
                clean_lines.append(clean_line)
    df = pd.DataFrame(clean_lines, columns=labels + event_type_labels + ["date"])
    return df

## Load data from BAC files

In [4]:
df = from_bac()

### Combine date and time

In [5]:
def to_datetime(row):
    new_date = row['date'] + ' ' + row['TimeStamp']
    dt = datetime.strptime(new_date, '%Y-%m-%d %H:%M:%S')
    dt = dt + timedelta(hours=4)
    unix_secs = mktime(dt.timetuple())
    return unix_secs

In [6]:
df['unix_time'] = df.apply(to_datetime, axis=1)
df['unix_time'] = pd.to_datetime(df['unix_time'], unit='s')
df = df.sort_values(by='unix_time')
df = df.drop(["TimeStamp","date"], axis=1)

### Drop columns that don't provide additional information that the algorithms can use

In [8]:
# Unnecessary general columns
df = df.drop(["LogLevel", "LogVisibility", "LogSeverity"],axis=1, errors='ignore')
 
# Unnecessary after deleting General Message
df = df.drop("string message", axis=1, errors='ignore')
 
# Unnecessary after deleting Button Change Event
df = df.drop(["keypad ID", "keypadName", "roomName2", "buttonNum", "buttonState"], axis=1, errors='ignore')
 
# Unnecessary after deleting Connection Status
df = df.drop(["device ID1", "Name", "roomName5", "connection status", "Load 1 Room Name:Load 1 Name", "Load 2 Room Name:Load 2 Name"], axis=1, errors='ignore')
 
# Unnecessary after deleting Remote System Event
df = df.drop(["signalID", "signalName", "roomName3", "RemoteSystemEvent string"], axis=1, errors='ignore')

# Unnecessary because redundant with Signal Name
df = df.drop(["Device Name2"], axis=1, errors='ignore')

In [33]:
df[df['Signal direction'].notnull()]

Unnamed: 0,entryType,entrySubType,eventType,rampTime,rampBaseValue,rampFinalValue,name,signal event ID,signal Value,Signal Value string,Signal direction,unix_time,room_name_merged,location_in_room,message_merged,device_entry_id
5195,DoorLock,Device,SignalChangedEventWithStrings,,,,,,,Lock,ToDevice,2017-08-04 08:00:08,Basement Mudroom,,,208DoorLock
5210,Climate,Device,SignalChangedEventWithStrings,,,,,,,Active Fan Level,Off,2017-08-04 08:11:12,Master Bed,,FromDevice,194Climate
5211,Climate,Device,SignalChangedEventWithStrings,,,,,,,Temperature,71° Fahrenheit,2017-08-04 08:11:12,Master Bed,,FromDevice,194Climate
5209,Climate,Device,SignalChangedEventWithStrings,,,,,,,Cool Stage 1,Inactive,2017-08-04 08:11:12,Master Bed,,FromDevice,194Climate
5239,Climate,Device,SignalChangedEventWithStrings,,,,,,,Temperature,72° Fahrenheit,2017-08-04 08:13:12,Master Bed,,FromDevice,194Climate
5240,Climate,Device,SignalChangedEventWithStrings,,,,,,,Temperature,73° Fahrenheit,2017-08-04 08:15:23,Master Bed,,FromDevice,194Climate
5325,Climate,Device,SignalChangedEventWithStrings,,,,,,,Temperature,72° Fahrenheit,2017-08-04 08:28:04,Master Bed,,FromDevice,194Climate
5334,Climate,Device,SignalChangedEventWithStrings,,,,,,,Temperature,73° Fahrenheit,2017-08-04 08:30:04,Master Bed,,FromDevice,194Climate
5364,Climate,Device,SignalChangedEventWithStrings,,,,,,,Temperature,72° Fahrenheit,2017-08-04 08:32:56,Master Bed,,FromDevice,194Climate
5365,Climate,Device,SignalChangedEventWithStrings,,,,,,,Temperature,73° Fahrenheit,2017-08-04 08:34:56,Master Bed,,FromDevice,194Climate


In [32]:
sorted(df['Signal direction'].astype(str).unique())

['0° Fahrenheit',
 '118° Fahrenheit',
 '119° Fahrenheit',
 '120° Fahrenheit',
 '121° Fahrenheit',
 '122° Fahrenheit',
 '2° Fahrenheit',
 '32° Fahrenheit',
 '5%',
 '59° Fahrenheit',
 '60° Fahrenheit',
 '61° Fahrenheit',
 '62° Fahrenheit',
 '63° Fahrenheit',
 '64° Fahrenheit',
 '65° Fahrenheit',
 '66° Fahrenheit',
 '67° Fahrenheit',
 '68° Fahrenheit',
 '69° Fahrenheit',
 '70° Fahrenheit',
 '71° Fahrenheit',
 '72° Fahrenheit',
 '73° Fahrenheit',
 '74° Fahrenheit',
 '75° Fahrenheit',
 '76° Fahrenheit',
 '77° Fahrenheit',
 '78° Fahrenheit',
 '79° Fahrenheit',
 '80° Fahrenheit',
 '81° Fahrenheit',
 'Active',
 'Auto',
 'Cool',
 'Disabled',
 'Enabled',
 'FromDevice',
 'Heat',
 'Heat = 67° Fahrenheit, Cool = 71° Fahrenheit, and Auto = 73° Fahrenheit',
 'Heat = 67° Fahrenheit, Cool = 72° Fahrenheit, and Auto = 72° Fahrenheit',
 'Heat = 67° Fahrenheit, Cool = 73° Fahrenheit, and Auto = 73° Fahrenheit',
 'Heat = 67° Fahrenheit, Cool = 77° Fahrenheit, and Auto = 72° Fahrenheit',
 'Heat = 68° Fahren

### Remove event types that don't provide additional information that we can use

In [21]:
df = df[df['eventType'] != 'ButtonChangedEvent']
df = df[df['eventType'] != 'GeneralMessage']
df = df[df['eventType'] != 'RemoteSystemEvent']
df = df[df['eventType'] != 'TimeClockChangedEvent']
df = df[df['eventType'] != 'ConnectionStatus']
df = df[df['eventType'] != 'DeviceConnectionStatusWithOptions']

### Merge Ids and Names

We combined all of the EventType dependent string fields that seemed to refer to the same thing in order to remore redundant columns

In [22]:
# Merge room names
df['room_name_merged'] = df['roomName1'].fillna('') + df['roomName4'].fillna('') + df['roomName6'].fillna('') + df['Signal Name'].fillna('')
df = df.drop(["roomName1","roomName4","roomName6", "Signal Name"], axis=1)

In [23]:
df['location_in_room'] = df['loadName'].fillna('') + df['Device Name1'].fillna('')
df = df.drop(["loadName","Device Name1"], axis=1)

In [24]:
# Merge IDs 
df['device_id'] = df['load ID'].fillna('') + df['ID'].fillna('') + df['device ID2'].fillna('') + df['device ID3'].fillna('')
df = df.drop(['load ID', 'ID', 'device ID2', 'device ID3'], axis=1)

In [25]:
# Merge messages 
df['message_merged'] = df['message1'].fillna('') + df['message2'].fillna('')
df = df.drop(["message1", "message2"], axis=1)

In [26]:
df = df.replace('', np.NaN)

### Combine device name and id to for truly unique ids

The devices had no unique identifier stored. We created unique identifiers for each device by combining device_name and device_id.

In [27]:
df['device_entry_id'] = df['device_id'].astype(str) + df['entryType'].astype(str)
df = df.drop(["device_id"], axis=1)

In [28]:
df = df.drop_duplicates()

In [29]:
# Checkpoint 
df = save_or_load_from_checkpoint('./checkpoints/data_0.h5')  

### Drop unneccesary rows

In [30]:
# dropping more useless data
index_to_drop = df.loc[df["entryType"] == "Auxiliary"][df['eventType'] == "SignalChangedEventWithStrings"].index.tolist()
df = df.drop(index_to_drop)

  


In [31]:
# dropping repeated data
index_to_drop = df.loc[((df['entryType'] == "System") | (df['entryType'] == "DoorLock"))& (df["name"].notnull())].index.tolist()
df = df.drop(index_to_drop)

### Get temperature values from 'Signal direction'

In [34]:
def get_setpoints(row):
    signal_direction = str(row['Signal direction'])
    temp = np.nan
    if 'Fahrenheit' in signal_direction and signal_direction[0].isdigit():
        temp = int(''.join(x for x in signal_direction if x.isdigit()))
    return temp
    

In [35]:
df['temperature'] = df.apply(get_setpoints, axis=1)

In [36]:
df = save_or_load_from_checkpoint('./checkpoints/data_1.h5')  

## Get Event Data

We looked through the different SignalChangedEventWithStrings messages that have relevant data and picked out the ones that refer to categorical events and stored the values in a separate column named 'event'

In [37]:
def check_signal_direction(df, string_val):
    return df[df["Signal Value string"] == string_val]['Signal direction'].unique().astype(str)

In [53]:
d = {}
for i in sorted(df["Signal Value string"].astype(str).unique()):
    d[i] = check_signal_direction(df, i)
for key, value in d.items():
    print(key,value)

"Leave" Event ['Heat Setpoint 68° Fahrenheit, Cool Setpoint 74° Fahrenheit, and Auto Setpoint 72° Fahrenheit']
"Return" Event ['Heat Setpoint 71° Fahrenheit, Cool Setpoint 74° Fahrenheit, and Auto Setpoint 72° Fahrenheit']
"Sleep" Event ['Heat Setpoint 67° Fahrenheit, Cool Setpoint 77° Fahrenheit, and Auto Setpoint 72° Fahrenheit'
 'Heat Setpoint 67° Fahrenheit, Cool Setpoint 72° Fahrenheit, and Auto Setpoint 72° Fahrenheit'
 'Heat Setpoint 67° Fahrenheit, Cool Setpoint 71° Fahrenheit, and Auto Setpoint 73° Fahrenheit'
 'Heat Setpoint 67° Fahrenheit, Cool Setpoint 73° Fahrenheit, and Auto Setpoint 73° Fahrenheit']
"Wake Weekend" Event ['Heat Setpoint 71° Fahrenheit, Cool Setpoint 74° Fahrenheit, and Auto Setpoint 70° Fahrenheit']
"Wake" Event ['Heat Setpoint 71° Fahrenheit, Cool Setpoint 75° Fahrenheit, and Auto Setpoint 72° Fahrenheit'
 'Heat Setpoint 71° Fahrenheit, Cool Setpoint 74° Fahrenheit, and Auto Setpoint 72° Fahrenheit']
"Weekend Wake" Event ['Heat Setpoint 71° Fahrenheit, C

After some manual inspection this is the dictionary that we settled on that contains events which are in a format that we can use

In [54]:
# d = {'Active Fan Level' : ['High','Off'],
# 'Auto Mode': ['Enabled', 'Disabled'],
# 'Cool Stage 1' : ['Active', 'Inactive'],
# 'Fan' : ['On', 'Auto'],
# 'Floor Warming' : ['Heat:False','Off:False', 'Off:True'],
# 'Heat Stage 1' : ['Active', 'Inactive'],
# 'Hold' : ['On', 'Off'],
# 'Humidifier Off' : ['Inactive', 'Active'],
# 'Mode' : ['Heat' 'Cool' 'Off']}
d = {
    'Mode': ['Heat'
             'Cool'
             'Off'],
    'Auto Mode': ['Enabled'
                  'Disabled'],
    'Single Setpoint Mode': ['Disabled'],
    'Slab 5B': ['Inactive'],
    'Humidity View': ['Enabled'],
    'Slab 2': ['Inactive'],
    'Heat Stage 1': ['Active'
                     'Inactive'],
    'Cool Stage 1': ['Inactive'
                     'Active'],
    'Active Fan Level': ['Off'
                         'High'],
    'Fan': ['On'
            'Auto'],
    'Hold': ['On'
             'Off'],
    'Humidifier Enable': ['Inactive'],
    'Slab 4B': ['Inactive'],
    'Slab 1': ['Inactive'],
    'Slab 3': ['Inactive'],
    'Humidity Mode': ['Enabled'],
    'Floor Warming': ['Heat:False'
                      'Off:False'
                      'Off:True'],
    'Cool Mode': ['Enabled'],
    'Humidifier Off': ['Inactive'
                       'Active'],
    'Slab 4A': ['Inactive'],
    'Heat Mode': ['Enabled'],
    'Slab 5A': ['Inactive']
}

This function is used to encode the events that have their values in the 'Signal Direction' <br> *This seems like this could be a bug? Are these messages supposed to be in the message column instead?

In [55]:
def encode_event(row, **kwargs):
    """kwargs = [Signal Value, values_dict]"""
    if str(row['eventType']) == 'SignalChangedEventWithStrings':
#         import pdb; pdb.set_trace()
        for key, values in kwargs.items():
            if str(row['Signal Value string']) == key:
                for val in values:
                    if str(row['Signal direction']) == val:
                        return key + val
    return np.nan

In [56]:
df['event'] = df.apply(encode_event, **d, axis=1)

This function performs similarly to the one above, but the values for the lock/unlock and occupancy/vacancy events are in a different column from the rest of the data

In [57]:
def is_locked_or_occupied(row):
    if str(row['eventType']) == 'OccupancyChangedEvent':
        return row['message_merged']
    elif str(row['Signal Value string']) == 'Lock' or str(row['Signal Value string']) == 'Unlock':
        return row['entryType'] + row['Signal Value string']
    elif pd.notna(row['event']):
        return str(row['event'])
    return np.nan

In [58]:
df['event'] = df.apply(is_locked_or_occupied, axis=1)

## Get Regression Data

In [64]:
def get_value(row):
    value = np.nan
    if pd.notna(row['rampFinalValue']):
        value = int(row['rampFinalValue'])
    elif pd.notna(row['temperature']):
        value = int(row['temperature'])
    elif str(row['entryType']) == 'Shades' and str(row['eventType']) == 'SceneChangedEvent':
        base_str = str(row['name'])
        split_str = base_str.split(' ')
        first_str = split_str[0]
        is_closed = True if split_str[1] == 'Closed' else False
        if first_str[-1].isdigit():
            frac = float(Fraction(first_str))
            if is_closed:
                frac = 1 - frac
            value = frac
        elif is_closed:
            value = 0
        else:
            value = 1
    return value

In [65]:
df['value'] = df.apply(get_value, axis=1)

In [66]:
def get_event_type(row): 
    event_type = np.nan
    if pd.notna(row['value']):
        event_type = str(row['entryType'])
    return event_type

In [67]:
df['regression_value_type'] = df.apply(get_event_type, axis=1)

In [68]:
df = save_or_load_from_checkpoint('./checkpoints/data_2.h5')  

## Extract final dataframe

Create a new dataframe containing only the columns that we will be using for statistical analysis and ML exploration## Extract final dataframe

In [71]:
df = df[['device_entry_id', 'room_name_merged', 'location_in_room', 'event', 'regression_value_type', 'value', 'unix_time']]

#### Convert to GMT

In [74]:
df['unix_time'] = df['unix_time'] + pd.DateOffset(hours=4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


#### Remove null rows

In [75]:
df = df.loc[(df['event'].notnull()) | (df['regression_value_type'].notnull()) | (df['value'].notnull())]

### Add week, day, month, hour

In [None]:
def add_dates(data_frame):
    date = pd.to_datetime(data_frame['unix_time'], unit='s')
    data_frame['week'] = date.dt.week
    data_frame['day'] = date.dt.day
    data_frame['month'] = date.dt.month
    data_frame['hour'] = date.dt.hour
    data_frame['minute'] = date.dt.minute
    data_frame['second'] = date.dt.second
    return data_frame

# df = add_dates(df)
# df = df.drop('unix_time', axis=1)

### Add temperature, sunrise/sunset data

Here we use the darksky weather API to store data about the weather on each given day in a local temperature_data.json file.

In [None]:
def create_temp_time_dict(data_frame):
    base_http = 'https://api.darksky.net/forecast/afeffdaf32e862d1c6d7279c7f5df74f/39.833851,-74.871826,'
    end_http = '?exclude=currently,flags,alerts,minutely'
    hourly_temp_dict = {}
    
    dates = data_frame['unix_time'].map(pd.Timestamp.date).unique()
    
    for date in dates:
        unix_date = int(time.mktime(date.timetuple()))
        response = requests.get(base_http + str(unix_date) + end_http) 
        temp_json = response.json()
        sunset_time = temp_json['daily']['data'][0]['sunsetTime']
        sunrise_time = temp_json['daily']['data'][0]['sunriseTime']
        for hour_data in temp_json['hourly']['data']:
            hourly_temp_dict[hour_data['time']] = hour_data
            hourly_temp_dict[hour_data['time']]['sunset_time'] = sunset_time
            hourly_temp_dict[hour_data['time']]['sunrise_time'] = sunrise_time
        
    with open('temperature_data_hourly.json', 'w') as outfile:
        json.dump(hourly_temp_dict, outfile)


In [79]:
json_data = None
if not os.path.isfile('temperature_data_hourly.json'):
    create_temp_time_dict(df)
with open('temperature_data_hourly.json') as f:
    json_data = json.load(f)

From the json object we just created, we pull the hourly temperature, and determine whether the sun was up or down 
for each row in the data. We add two rows to the dataframe, 'sun', and 'outside_temperature'. 

In [81]:
def create_weather_sunset_columns(row):
    #hard-coded year for now. 
    date = row['unix_time'].date()
    hour = row['unix_time'].hour
    minute = row['unix_time'].minute
    
    #format for API
    date_timestamp = int(time.mktime(date.timetuple()))
    hour_timestamp = date_timestamp + hour * 60 * 60
    minute_timestamp = hour_timestamp + minute * 60
    
    hour_timestamp_string = str(hour_timestamp)
    
    temperature_info = json_data[hour_timestamp_string]['temperature']
    sunrise = json_data[hour_timestamp_string]['sunrise_time'] #in GMT 
    sunset = json_data[hour_timestamp_string]['sunset_time']

    is_sun_up = 1 if sunrise <= minute_timestamp <= sunset else 0
    return is_sun_up, temperature_info

In [82]:
df['sun'], df['outside_temperature'] = zip(*df.apply(create_weather_sunset_columns, axis=1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [85]:
df = save_or_load_from_checkpoint('./checkpoints/data_3.h5')  

## Get event only data 

In [87]:
df.groupby('device_entry_id').count()

Unnamed: 0_level_0,room_name_merged,location_in_room,event,regression_value_type,value,unix_time,sun,outside_temperature
device_entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100Lights,12654,12654,0,12654,12654,12654,12654,12654
104Lights,15062,15062,0,15062,15062,15062,15062,15062
104Shades,239,0,0,239,239,239,239,239
105Shades,41,0,0,41,41,41,41,41
106Lights,10311,10311,0,10311,10311,10311,10311,10311
106Shades,1,0,0,1,1,1,1,1
108Lights,1636,1636,0,1636,1636,1636,1636,1636
108Shades,1,0,0,1,1,1,1,1
110Lights,1588,1588,0,1588,1588,1588,1588,1588
112Lights,1730,1730,0,1730,1730,1730,1730,1730


In [None]:
df

In [None]:
event_df = df[['device_id_name', 'event', 'unix_time', 'sun', 'outside_temperature']]

In [None]:
event_df = event_df[event_df["event"].notnull()]
event_df

In [None]:
test_event_df = event_df.copy()

In [None]:
test_event_df.index = pd.DatetimeIndex(test_event_df['unix_time'])

### Encode Columns

Convert Categorical to Numerical Data. 


In [None]:
from sklearn import preprocessing
columns_to_update = ["device_id_name", "event", "regression_value_type"]

def encode_columns(data_frame, column_names):
    label_encoders = {}
    for col in column_names:
        values = data_frame[col].unique()
        le = preprocessing.LabelEncoder()
        le.fit(values)
        label_encoders[col] = le
        numerical_values = le.transform(data_frame[col])
        data_frame[col] = numerical_values
    return data_frame, label_encoders

categorical_df = df.copy()
# to encode the df we need to encode the NaNs as strings
categorical_df = categorical_df.fillna({'regression_value_type':'nan', 'event': 'nan', 'value': 0})
categorical_df, label_encoders = encode_columns(categorical_df, columns_to_update)

In [None]:
categorical_df.to_hdf('./checkpoints/data_categorical.h5', 'table', mode='w', append=True, complevel=9, complib='zlib', index=False)

### One-hot encode columns


starting with binary

In [None]:
def one_hot_encode_col(data_frame, col_name):
    one_hot_encoded = pd.get_dummies(data_frame[col_name], prefix=col_name)
    return one_hot_encoded

In [None]:
binary_df = df.copy()
binary_df = binary_df.drop('regression_value_type', axis=1)
binary_df = binary_df.drop('value', axis=1)

In [None]:
event_encoded = one_hot_encode_col(binary_df, 'event')
binary_df = pd.concat([binary_df, event_encoded], axis=1)
binary_df = binary_df.drop('event', axis=1)

In [None]:
binary_df.to_hdf('./checkpoints/data_binary_encoded.h5', 'table', mode='w', append=True, complevel=9, complib='zlib', index=False)