In [None]:
# getting the utils file here
import os, sys
import xbos_services_getter as xsg
import datetime
import calendar
import pytz
import numpy as np
import pandas as pd
import itertools
import time
from pathlib import Path
import pickle
import yaml

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
import process_indoor_data as pid

In [None]:
import matplotlib.pyplot as plt

In [None]:
building_zone_names_stub = xsg.get_building_zone_names_stub()
all_buildings_zones = xsg.get_all_buildings_zones(building_zone_names_stub)

# Data Getters
Ground truth for how to store and retrieve data. The file naming is building_zone. No differentiation between training and test sets. 

In [None]:
def store_data(data, building, zone):
    data_dir = Path.cwd() / "services_data"
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    
    file_path = data_dir / (building + "_" + zone + ".pkl")
    if not os.path.isfile(file_path):
        return None
                                
    with open(str(file_path), "wb") as f:
        pickle.dump(data, f)

def load_data(building, zone):
    data_dir = Path.cwd() / "services_data"
    if not os.path.isdir(data_dir):
        return None

    file_path = data_dir / (building + "_" + zone + ".pkl")
    if not os.path.isfile(file_path):
        return None
                            
    with open(str(file_path), "rb") as f:
        return pickle.load(f)

# Get the data
num_start and num_end are hyperparameters. 

In [None]:
building = "avenal-animal-shelter"
zone = all_buildings_zones[building][0]
prediction_window = "5m"
seconds_prediction_window = xsg.get_window_in_sec(prediction_window)

In [None]:
# daterange
start = datetime.datetime(year=2018, month=7, day=1).replace(tzinfo=pytz.utc)
end = start + datetime.timedelta(days=130)

In [None]:
# TODO add check that the data we have stored is at least as long and has right prediction_window
# TODO Fix how we deal with nan's. some zone temperatures might get set to -1.
loaded_data = load_data(building, zone)
if loaded_data is None:
    processed_data = pid.get_preprocessed_data(building, zone, start, end, prediction_window)
    store_data(processed_data, building, zone)
else:
    processed_data = loaded_data

In [None]:
processed_data.shape

In [None]:
processed_data.head()

### Add features to prepocessed_data

In [None]:
processed_data = pid.indoor_data_cleaning(processed_data)
processed_data = pid.add_feature_last_temperature(processed_data)
processed_data = pid.convert_categorical_action(processed_data, num_start=4, num_end=4, interval_thermal=seconds_prediction_window)

In [None]:
processed_data.head()

# Linear Regressor with all features
This will use all available features and make it into a linear regressor. 

### Get training and test data

In [None]:
train_ratio = 0.7 # how much training data to take from given data
N = processed_data.shape[0] # number of datapoints
train_data = processed_data.iloc[:int(N*train_ratio)]
test_data = processed_data.iloc[int(N*train_ratio):]

In [None]:
columns_to_drop = ["action", "action_prev", "dt", "action_duration"]

# Training data
train_data = train_data[train_data["dt"] == seconds_prediction_window]
train_data = train_data.drop(columns_to_drop, axis=1)

train_y = train_data["t_next"].interpolate(method="time")
train_X = train_data.drop(["t_next"], axis=1).interpolate(method="time")

# Test data
test_data = test_data[test_data["dt"] == seconds_prediction_window]
test_data = test_data.drop(columns_to_drop, axis=1)

test_y = test_data["t_next"].interpolate(method="time")
test_X = test_data.drop(["t_next"], axis=1).interpolate(method="time")

In [None]:
train_X.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

reg = LinearRegression().fit(train_X, train_y)
reg.score(train_X, train_y)

# First Order Linear Regressor

In [None]:
train_ratio = 0.7 # how much training data to take from given data
N = processed_data.shape[0] # number of datapoints
train_data = processed_data.iloc[:int(N*train_ratio)]
test_data = processed_data.iloc[int(N*train_ratio):]

In [None]:
action_to_drop = []
for c in train_data.columns:
    if "action" in c and len("action") != c:
        action_to_drop.append(c)
        
columns_to_drop = action_to_drop + ["t_prev", "action_prev", "dt", "action_duration"]
  
# train data
train_data = train_data[train_data["dt"] == seconds_prediction_window]
train_data = train_data.drop(columns_to_drop, axis=1)

train_y = train_data["t_next"].interpolate(method="time")
train_X = train_data.drop(["t_next"], axis=1).interpolate(method="time")

# test data
test_data = test_data[test_data["dt"] == seconds_prediction_window]
test_data = test_data.drop(columns_to_drop, axis=1)

test_y = test_data["t_next"].interpolate(method="time")
test_X = test_data.drop(["t_next"], axis=1).interpolate(method="time")

In [None]:
train_X.head()

In [None]:
lin_reg = LinearRegression().fit(train_X, train_y)

# Create Tests for Regressor

Will try to do forecasting. If it fails, won't return anything.

In [None]:
def forecasting(thermal_model, data, start, duration, seconds_prediction_window, dt, is_second_order=False):

    true_data = data.loc[start:]
    
    if true_data.index[-1] < start + datetime.timedelta(seconds=duration) or true_data.index[0] != start:
        return None
    
    forecast = []

    curr_time = true_data.index[0]
    
    try:
        while curr_time <= start + datetime.timedelta(seconds=duration):

            curr_row = true_data.loc[curr_time].to_frame().T

            if (len(forecast) < 2 and is_second_order) or (len(forecast) < 1):
                forecast.append(float(curr_row["t_in"].values))
            else:
                curr_row["t_in"] = forecast[-1]
                if is_second_order:
                    curr_row["t_prev"] = forecast[-2]
                forecast.append(thermal_model.predict(curr_row)[0])

            curr_time += datetime.timedelta(seconds=float(dt.loc[curr_time]))

    except:
        return None
    
    forecast = forecast[:-1] # otherwise might predict beyond the set end
    return pd.Series(index=true_data.index[:len(forecast)], data=forecast)
    

In [None]:
dt = processed_data.loc[test_X.index[0]:test_X.index[-1]]["dt"]

N = test_X.shape[0]

forecasts = []

for i in range(N):

    start_time = test_X.index[i]
    forecast = forecasting(reg, test_X, start_time, 6*60*60, 5*60, dt, is_second_order=True)
    
    if forecast is not None:
        forecasts.append(forecast)
        
    if i % 500 == 0:
        print("Iteration:", i)
        print("Successful Forecasts:", len(forecasts))


# Get RMSE plot
Will do shady stuff. At least we can have duration / interval number of predictions. so we will only use that many. Otherwise dimensions don't work

In [None]:
errs = []
least_points = int(6*60*60 / (5*60))

for i in range(len(forecasts)):
    forecast = forecasts[i]
    real_data = test_X.loc[forecast.index]["t_in"]
    if real_data.shape[0] >= least_points:
        errs.append((forecast - real_data).values[:least_points])
        
print("Num before", len(forecasts))
print("Num remaining", len(errs))
errs = np.vstack(errs)
errs = np.square(errs)
errs = np.mean(errs, axis=0)
errs = np.sqrt(errs)
errs.shape

In [None]:
plt.plot(errs)

In [None]:
plt.plot(errs)

In [None]:
# occ_plot = pd.Series(index=date_range, data=test_data["occ"][date_range])

real_plot = test_X.loc[forecast.index]["t_in"]

# real_outside_plot = new_pred_horizon.loc[date_range]["t_out"]

# real_action_plt = new_pred_horizon.loc[date_range]["action"]

# real_action_plt *= 5
# real_action_plt += 65
# real_action_plt.plot(label="action", color="darkblue")

# real_outside_plot.plot(label="t_out")

real_plot.plot(label="real", color="goldenrod")
forecast.plot(label="forecast", color="firebrick")



# first_tm_plot.plot(label="First Order TM", color="firebrick")
# plt.show()

# # second_tm_plot.plot(label="Second order TM", color="steelblue")

# lti_tm_plot.plot(label="LTI TM", color="mediumpurple")
# plt.show()


# random_forest_plot.plot(label="Random Forest",  color="black")

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

plt.show()


# print("plot")
# print(occ_plot)

# occ_plot.plot()
# plt.show()

def get_rmse_plot(pred, real):
    pred = pred[:len(real)]
    real = real[:len(pred)]
    diff = pred - real
    diff = np.square(diff)
    return np.sqrt(diff.mean())

# print("lag", get_rmse_plot(tm_pred_plot, real_plot))
# print("old", get_rmse_plot(old_pred_plot, real_plot))
# print("occ lag", get_rmse_plot(lag_tm_plot, real_plot))
# print("sec order", get_rmse_plot(second_tm_pred, real_plot))
# print("sec order occ", get_rmse_plot(second_occ_tm_pred, real_plot))



In [None]:
def save_results(building, zone, start, end, prediction_window, raw_data_granularity, train_ratio, is_second_order, 
                 curr_action_timesteps, prev_action_timesteps, method, rmse_series, num_forecasts, forecasting_horizon):
    """Stores the results and the methods as a yamls file. The files is stores in a way that it can be used
        to configure the exact same model and get the results. 
    
    :param building: (string) building name
    :param zone: (string) zone name
    :param start: (datetime timezone aware) start of the dataset used
    :param end: (datetime timezone aware) start of the dataset used
    :param prediction_window: (int seconds) number of seconds between predictions
    :param raw_data_granularity: (int seconds) the window size of the raw data. needs to be less than prediction_window.
    :param train_ratio: (float) in (0, 1). the ratio in which to split train and test set from the given dataset. The train set comes before test set in time. 
    :param is_second_order: (bool) Whether we are using second order in temperature.
    :param curr_action_timesteps: (int) The order of the current action. Set 0 if there should only be one action.
    :param prev_action_timesteps: (int) The order of the previous action. Set 0 if it should not be used at all.
    :param method: (str) ["OLS", "random_forest", "LSTM"] are the available methods so far
    :param rmse_series: np.array the rmse of the forecasting procedure. 
    :param num_forecasts: (int) The number of forecasts which contributed to the RMSE. 
    :param forecasting_horizon: (int seconds) The horizon used when forecasting.
    :return: 
    """
    to_store = {"building": building,
               "zone": zone, 
               "start": start, 
                "end": end,
               "raw_data_granularity": raw_data_granularity,
                "prediction_window": prediction_window,
                "train_ratio": train_ratio,
                "is_second_order": is_second_order,
                "curr_action_timesteps": curr_action_timesteps,
                "prev_action_timesteps": prev_action_timesteps,
                "method": method,
                "rmse_series": rmse_series,
                "num_forecasts": num_forecasts,
                "forecasting_horizon": forecasting_horizon
               }
    
    data_dir = Path.cwd() / "model_results"
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    
    file_path = data_dir / (building + "_" + zone + ".pkl")
    
    if os.path.isfile(file_path):
        try:
            with open(str(file_path), "rb") as f:
                loaded_results = pickle.load(f)
            loaded_results.append(to_store)
            to_store = loaded_results
        except:
            to_store = [to_store]
    else:
        to_store = [to_store]
        
    with open(str(file_path), "wb") as f:
        pickle.dump(to_store, f)
        
def load_results(building, zone):
    data_dir = Path.cwd() / "model_results"
    if not os.path.isdir(data_dir):
        return None
    
    file_path = data_dir / (building + "_" + zone + ".pkl")
    
    if not os.path.isfile(file_path):
        return None
    
    with open(str(file_path), "rb") as f:
        return pickle.load(f)

In [None]:
save_results(building, zone, start, end, prediction_window, 60, train_ratio, True, 
                 4, 4, "OLS", errs, 6167, 6*60*60)


In [None]:
load_results(building, zone)

In [None]:
def full_automization_baby(building, zone, start, end, prediction_window, raw_data_granularity, train_ratio, is_second_order, 
                 curr_action_timesteps, prev_action_timesteps, method, rmse_series, num_forecasts, forecasting_horizon):
    pass

def create_model(building, zone, start, end, prediction_window, raw_data_granularity, train_ratio, is_second_order, 
                 curr_action_timesteps, prev_action_timesteps, method):
    
    if method != "OLS":
        raise NotImplementedError("%s is not supported. Use OLS instead." % method)
    
    # Get data
    # TODO add check that the data we have stored is at least as long and has right prediction_window
    # TODO Fix how we deal with nan's. some zone temperatures might get set to -1.
    loaded_data = load_data(building, zone)
    if loaded_data is None:
        processed_data = pid.get_preprocessed_data(building, zone, start, end, prediction_window, raw_data_granularity)
        store_data(processed_data, building, zone)
    else:
        processed_data = loaded_data
        
    # add features
    processed_data = pid.indoor_data_cleaning(processed_data)
    if is_second_order:
        processed_data = pid.add_feature_last_temperature(processed_data)
    if curr_action_timesteps > 0 or prev_action_timesteps > 0:
        processed_data = pid.convert_categorical_action(processed_data, num_start=curr_action_timesteps, num_end=prev_action_timesteps, interval_thermal=seconds_prediction_window)
    
    # split data into training and test sets
    N = processed_data.shape[0] # number of datapoints
    train_data = processed_data.iloc[:int(N*train_ratio)]
    test_data = processed_data.iloc[int(N*train_ratio):]
    
    # which columns to drop for training and testing
    columns_to_drop = ["dt", "action_duration"]
    if curr_action_timesteps != 0:
        columns_to_drop.append("action")
    if prev_action_timesteps != 0:
        columns_to_drop.append("action_prev")

    # train data
    train_data = train_data[train_data["dt"] == seconds_prediction_window]
    train_data = train_data.drop(columns_to_drop, axis=1)

    train_y = train_data["t_next"].interpolate(method="time")
    train_X = train_data.drop(["t_next"], axis=1).interpolate(method="time")

    # test data
    test_data = test_data[test_data["dt"] == seconds_prediction_window]
    test_data = test_data.drop(columns_to_drop, axis=1)

    test_y = test_data["t_next"].interpolate(method="time")
    test_X = test_data.drop(["t_next"], axis=1).interpolate(method="time")
    
    # Make OLS model
    reg = LinearRegression().fit(train_X, train_y)
    return reg

# Create Model and run on all Buildings/Zones

In [4]:
import create_test_models as ctm
import datetime
import pytz
import xbos_services_getter as xsg

building_zone_names_stub = xsg.get_building_zone_names_stub()
all_building_zones = xsg.get_all_buildings_zones(building_zone_names_stub)

start = datetime.datetime(year=2018, month=7, day=1).replace(tzinfo=pytz.utc)
end = start + datetime.timedelta(days=10)


for bldg in list(not_working.keys())[1:]:
    print("Building", bldg)
    for zone in all_building_zones[bldg]:
        print("Zone", zone)
        reg, p_data, test_X, test_y = ctm.create_model(bldg, zone, 
                             start, end, "5m", "1m", 0.7, True, 
                             1, 1, "OLS")
#         try:
#             reg, p_data, test_X, test_y = ctm.create_model(bldg, zone, 
#                              start, end, "5m", "1m", 0.7, True, 
#                              1, 1, "OLS")
#             print("Score", reg.score(test_X, test_y))

#         except:
#             if bldg not in not_working:
#                 not_working[bldg] = []
#             not_working[bldg].append(zone)
    
    print("")
        

Building hayward-station-8
Zone HVAC_Zone_F-2


_Rendezvous: <_Rendezvous of RPC that terminated with:
	status = StatusCode.INVALID_ARGUMENT
	details = "No data received from database."
	debug_error_string = "{"created":"@1553988333.716740000","description":"Error received from peer","file":"src/core/lib/surface/call.cc","file_line":1017,"grpc_message":"No data received from database.","grpc_status":3}"
>

In [None]:
test_X.head()

In [6]:
at first did 10 days

SyntaxError: invalid syntax (<ipython-input-6-66361735d7b1>, line 1)

In [None]:
not_working

In [1]:
not_working = {'hayward-station-1': ['HVAC_Zone_AC-7',
  'HVAC_Zone_AC-6',
  'HVAC_Zone_AC-5',
  'HVAC_Zone_AC-4',
  'HVAC_Zone_AC-3',
  'HVAC_Zone_AC-2',
  'HVAC_Zone_AC-1'],
 'hayward-station-8': ['HVAC_Zone_F-2', 'HVAC_Zone_F-3', 'HVAC_Zone_F-1'],
 'north-berkeley-senior-center': ['HVAC_Zone_AC-5',
  'HVAC_Zone_AC-3',
  'HVAC_Zone_AC-1'],
 'csu-dominguez-hills': ['HVAC_Zone_SAC_2134',
  'HVAC_Zone_SAC_2113A',
  'HVAC_Zone_SAC_2149',
  'HVAC_Zone_SAC_2103',
  'HVAC_Zone_SAC_2107',
  'HVAC_Zone_SAC_2144',
  'HVAC_Zone_Sac_2_Corridor',
  'HVAC_Zone_SAC_2114',
  'HVAC_Zone_SAC_2113',
  'HVAC_Zone_SAC-2106',
  'HVAC_Zone_SAC-2104',
  'HVAC_Zone_SAC-2102',
  'HVAC_Zone_SAC_2150',
  'HVAC_Zone_SAC_2105',
  'HVAC_Zone_SAC_2101',
  'HVAC_Zone_SAC_2129',
  'HVAC_Zone_SAC_2126'],
 'orinda-community-center': ['HVAC_Zone_RM2',
  'HVAC_Zone_RM1',
  'HVAC_Zone_RM6',
  'HVAC_Zone_RM7',
  'HVAC_Zone_AC-8',
  'HVAC_Zone_AC-7',
  'HVAC_Zone_AC-6',
  'HVAC_Zone_AC-5',
  'HVAC_Zone_AC-4',
  'HVAC_Zone_AC-3',
  'HVAC_Zone_AC-2',
  'HVAC_Zone_AC-1',
  'HVAC_Zone_Kinder_GYM',
  'HVAC_Zone_FRONT_OFFICE'],
 'avenal-veterans-hall': ['HVAC_Zone_AC-6',
  'HVAC_Zone_AC-5',
  'HVAC_Zone_AC-4',
  'HVAC_Zone_AC-3',
  'HVAC_Zone_AC-2',
  'HVAC_Zone_AC-1'],
 'south-berkeley-senior-center': ['HVAC_Zone_AC-3',
  'HVAC_Zone_Front_Office',
  'HVAC_Zone_AC-2']}
