# Building Data Genome - Modelling


## Installation and Setup

In [5]:
#Installations

In [7]:
# Import required packages
import os
import re
import warnings

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.pylab as pylab
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
mpl.style.use('ggplot')


import gc
import geopandas as gpd
from shapely.geometry import Point, Polygon

# Do not truncate long cell contents when DataFrames are displayed
pd.set_option('display.max_colwidth', None)

### To use working directory in Google Drive:

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')
#!pwd
#!ls '/content/drive/MyDrive/Colab Notebooks/Energy Systems/data/'

### To use local directory:

In [8]:
# Import csv data into dataframes

# Data directory. Defaults to the original local path; set the BDG_DATA_DIR
# environment variable to point the notebook at another location without
# editing this cell.
file_path = os.environ.get('BDG_DATA_DIR', 'D:/building-data-genome/project_data/')

# The reads below build each path by plain string concatenation, so the
# directory must end with a path separator.
if not file_path.endswith(('/', '\\')):
    file_path += '/'

# Energy related (one csv per meter type)
df_solar = pd.read_csv(file_path + 'solar_cleaned.csv')
df_water = pd.read_csv(file_path + 'water_cleaned.csv')
df_electricity = pd.read_csv(file_path + 'electricity_cleaned.csv')
df_gas = pd.read_csv(file_path + 'gas_cleaned.csv')
df_hotwater = pd.read_csv(file_path + 'hotwater_cleaned.csv')
df_irrigation = pd.read_csv(file_path + 'irrigation_cleaned.csv')
df_chilledwater = pd.read_csv(file_path + 'chilledwater_cleaned.csv')
df_steam = pd.read_csv(file_path + 'steam_cleaned.csv')

# Metadata
df_metadata = pd.read_csv(file_path + 'metadata.csv')

# Weather
df_weather = pd.read_csv(file_path + 'weather.csv')

### Reformat and merge data

In [9]:
# Restrict the metadata to buildings in the top 5 regions (timezones) of our dataset
df_metadata = df_metadata.loc[
    df_metadata["timezone"].isin(
        ["US/Eastern", "US/Central", "Europe/London", "US/Mountain", "US/Pacific"]
    )
]


# Reformat the meters dataframes first

def melt_meter_dataframes(df, meter_name):
    """Reshape a wide meter frame into long format and tag it with its meter type.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame with a 'timestamp' column; every other column is melted,
        its name becoming a 'building_id' value and its cells the
        'meter_reading' values.
    meter_name : str
        Value written to the 'meter' column of every output row.

    Returns
    -------
    pandas.DataFrame
        Long frame with columns 'timestamp' (datetime), 'building_id',
        'meter' and 'meter_reading', in that order.

    Notes
    -----
    The input frame is not modified: the timestamp conversion is done on a
    new frame instead of overwriting the caller's column.
    """
    # Parse timestamps once on the wide frame (far fewer rows than the long
    # one); assign() returns a new frame so the caller's df stays untouched.
    wide = df.assign(timestamp=pd.to_datetime(df['timestamp']))

    # Melt: former column names go under 'building_id', their values under 'meter_reading'
    long_df = wide.melt(
        id_vars=["timestamp"],
        var_name="building_id",
        value_name="meter_reading",
    )

    # Tag every row with the meter type
    long_df = long_df.assign(meter=meter_name)

    # Fixed column order for clarity
    return long_df[["timestamp", "building_id", "meter", "meter_reading"]]

# Convert each wide per-meter frame to long format, labelled with its meter type
df_electricity = melt_meter_dataframes(df=df_electricity, meter_name="electricity")
df_gas = melt_meter_dataframes(df=df_gas, meter_name="gas")
df_hotwater = melt_meter_dataframes(df=df_hotwater, meter_name="hotwater")
df_chilledwater = melt_meter_dataframes(df=df_chilledwater, meter_name="chilledwater")
df_steam = melt_meter_dataframes(df=df_steam, meter_name="steam")
df_water = melt_meter_dataframes(df=df_water, meter_name="water")
df_irrigation = melt_meter_dataframes(df=df_irrigation, meter_name="irrigation")
df_solar = melt_meter_dataframes(df=df_solar, meter_name="solar")


# Function to reduce the DF size ( https://www.kaggle.com/caesarlupum/ashrae-start-here-a-gentle-introduction)
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits their values.

    Integer columns are narrowed to int8/int16/int32 and float columns to
    float16/float32 whenever the column's min and max lie within the target
    dtype's range (limits included). Columns whose dtype is not one of the
    listed numeric dtypes are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this object, so the
        caller's frame is modified as well as returned.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with its numeric columns downcast.

    Notes
    -----
    Only the value range is checked for floats. float16/float32 carry fewer
    significant digits than float64, so downcast values may be rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits wins.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                # Inclusive bounds: a value equal to the dtype's min/max is
                # representable, so it must not push the column to a wider type.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN/inf): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting footprint to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Shrink the metadata frame first
df_metadata = reduce_mem_usage(df_metadata)
gc.collect()

# Stack all per-meter long frames into one table and downcast it in the same step
meters = reduce_mem_usage(
    pd.concat([
        df_electricity,
        df_gas,
        df_hotwater,
        df_chilledwater,
        df_steam,
        df_water,
        df_irrigation,
        df_solar,
    ])
)
gc.collect()

# Building attributes to attach to every meter reading
buildings_sel = df_metadata.loc[:, ["building_id", "site_id", "primaryspaceusage", "timezone"]]

# Meter readings + building metadata.
# NOTE(review): how="left" keeps readings for buildings that are absent from
# df_metadata (their metadata columns come out as NaN) - confirm this is
# intended if df_metadata was filtered beforehand.
dev = pd.merge(meters, buildings_sel, how="left", on="building_id")

# Make sure timestamp is a datetime column
dev["timestamp"] = pd.to_datetime(dev["timestamp"], format='%Y-%m-%d %H:%M:%S')

# Drop the intermediates that are no longer needed
del meters, buildings_sel, df_metadata

Mem. usage decreased to  0.33 Mb (18.9% reduction)
Mem. usage decreased to 1838.90 Mb (10.0% reduction)


In [10]:
# List the columns of the merged meter-readings + metadata frame
dev.columns

Index(['timestamp', 'building_id', 'meter', 'meter_reading', 'site_id',
       'primaryspaceusage', 'timezone'],
      dtype='object')

## Introduction

In our exploratory data analysis, we concluded that load profiles differ across regions in the following cases:

1) For yearly load profiles per usage type
2) For certain daily load profiles per usage type
3) For certain daily load profiles per meter type 

We will therefore test whether building types can be effectively classified from their load profiles, as observed in each region, on both a yearly and a daily basis.