**Notebook summary:** create OEO-friendly sub-tables from raw data for buildings techno-economic parameters. Code cells that write data files are commented out below. Please check the file paths and uncomment these cells under the headings: **1) Efficiency 2) Discount Rate 3) Lifetime 4) Investment Costs 5) Fixed Costs**

# Essentials

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sqlite3
import urllib.request
import os
import seaborn as sns
import gzip
from pandas.api.types import CategoricalDtype
#from plotnine import *
import zipfile
sns.set()

In [3]:
def get_vintages(x):
    """Return the list of periods associated with vintage ``x``.

    Vintages before 2020 yield ``x`` followed by 2020, 2025, ..., 2050.
    Vintages of 2020 or later yield ``x, x + 5, ...`` up to and including
    2050 (an empty list when ``x`` is already beyond 2050).

    Raises:
        ValueError: if ``x`` compares neither below nor at/above 2020
            (e.g. NaN), instead of failing on an unbound local variable.
    """
    if x < 2020:
        # Keep the pre-2020 vintage itself, then every 5-year step to 2050.
        return [x, *np.arange(2020, 2051, 5)]
    if x >= 2020:
        return list(np.arange(x, 2051, 5))
    # Only reachable when both comparisons are False (NaN-like input).
    raise ValueError(f"cannot derive periods for vintage {x!r}")

In [4]:
def func_weight_average_temp(group, value, weight):
    """Return the weighted mean of column ``value`` using column ``weight``.

    ``group`` is indexed with both labels (e.g. a DataFrame); the result is
    sum(value * weight) / sum(weight).
    """
    weights = group[weight]
    weighted_total = group[value].mul(weights).sum()
    return weighted_total / weights.sum()

In [5]:
# Lookup from region number to OEO region code.
region_names = {
    1: 'CA',
    2: 'NW',
    3: 'SW',
    4: 'TX',
    5: 'CEN',
    6: 'N_CEN',
    7: 'SE',
    8: 'MID_AT',
    9: 'NE',
}

# Read Master Data File

In [6]:
# Earlier input file, kept for reference:
# df = pd.read_excel('./rawData/OEO_BuildingDB_SUBSET_23APR2021.ods', engine="odf")
#
# The file read below has the same name for heat pumps across the space
# heating and space cooling end-use demand sectors.
df = pd.read_excel(io='./rawData/OEO_BuildingDB_HEATPUMPCorrected_10Aug2021.ods',
                   engine="odf")
# The 'Notes' column is not carried into the tables built below.
df_sel = df.drop(labels='Notes', axis='columns')

# Creating OEO Tables

In [7]:
# Techno-economic parameters for 2025, 2035 and 2045 are created by copying
# the 2020, 2030 and 2040 rows and shifting their vintage forward by 5 years.
df_sel_extraYears = df_sel.loc[df_sel['vintage'].isin([2020, 2030, 2040])].copy()
df_sel_extraYears['vintage'] += 5
# df_withExtra holds the original rows followed by the shifted copies
# (5-year increments from 2020 onward).
df_withExtra = pd.concat([df_sel, df_sel_extraYears], ignore_index=True)

In [8]:
# Some '_N' techs have no 2050 vintage; their 2045 rows are duplicated and
# relabelled as 2050.
df_2050_missing = df_withExtra[df_withExtra['tech'].str.endswith('_N')]
all_techs = set(df_2050_missing['tech'])
techs_in_2050 = set(
    df_2050_missing.loc[df_2050_missing['vintage'] == 2050, 'tech'])
# techs that don't have a representation for 2050
techs_withNo2050 = all_techs.difference(techs_in_2050)
df_techs_withNo2050 = (
    df_withExtra[df_withExtra['tech'].isin(list(techs_withNo2050))]
    .sort_values(by=["tech", "vintage"])  # sorting is only for visual organization
    .reset_index(drop=True)
)

df_sel_extraYear = df_techs_withNo2050[df_techs_withNo2050['vintage'] == 2045].copy()
# Every selected row has vintage 2045, so adding 5 relabels it as 2050.
df_sel_extraYear['vintage'] += 5
# df_sel_extraYear[['efficiency', 'fixed cost', 'maintenance cost', 'lifetime']] = np.nan
df_sel_extraYear = df_sel_extraYear.reset_index(drop=True)

In [9]:
# Append the synthesized 2050 rows to the techs that lacked them.
df_2050ToTechsAdded = pd.concat([df_techs_withNo2050,
                                 df_sel_extraYear]).reset_index(drop=True)
df_interp = df_2050ToTechsAdded.sort_values(by=["tech", "vintage"])
# Fill missing values along the rows (axis=0) from the nearest neighbour.
# The keyword was previously misspelled as ``xis``, which is not the
# ``axis`` argument of DataFrame.interpolate.
df_interp = df_interp.interpolate(method='nearest', axis=0)
#df_interp.groupby(['tech'])[['tech', 'vintage', 'efficiency']].apply(lambda x: x.interpolate(method = 'linear'))
# df_final has techno-economic parameters for all years in 5 year increments between 2020 and 2050
df_final = pd.concat(
    [df_interp.loc[df_interp['vintage'] == 2050], df_withExtra])
df_final = df_final.sort_values(by=["tech", "vintage"]).reset_index(drop=True)

In [10]:
# Replace the raw cost columns of df_final with per-unit costs in dollars.
# Costs quoted in '$/ton' are multiplied by the equipment capacity; costs in
# any other unit are carried over unchanged.
fixed_values = df_final['fixed cost'].mul(df_final['capacity'])
maintenance_values = df_final['maintenance cost'].mul(df_final['capacity'])
df_final['fixed cost new'] = fixed_values.where(
    df_final['fixed cost units'].eq('$/ton'), df_final['fixed cost'])
df_final['maintenance cost new'] = maintenance_values.where(
    df_final['maintenance units'].eq('$/ton'), df_final['maintenance cost'])
df_final['fixed cost units new'] = '$'
df_final['maintenance cost units new'] = '$'
# The raw cost/unit columns are superseded by the '... new' columns above.
df_final.drop(
    ['fixed cost', 'fixed cost units', 'maintenance cost', 'maintenance units'],
    axis='columns',
    inplace=True,
)

## Making copies for all OEO regions

In [11]:
# Stack nine copies of df_final, one per OEO region. reset_index() is called
# without drop=True, so the pre-stack row labels remain in an 'index' column.
num_elems_efficiency = len(df_final)
df_regions = pd.concat([df_final for _ in range(9)]).reset_index()

In [12]:
# df_regions has all OEO regions for years in 5 year increments between 2020
# and 2050 with costs per unit for all technologies.
# The k-th stacked copy of df_final (rows [(k-1)*n, k*n) of the RangeIndex)
# is labelled with region_names[k]. Using the shared region_names table
# (keys 1..9) replaces nine hand-copied assignments that could drift from it.
for _region_id, _region_code in region_names.items():
    _in_copy = (
        (df_regions.index >= num_elems_efficiency * (_region_id - 1)) &
        (df_regions.index < num_elems_efficiency * _region_id))
    df_regions.loc[_in_copy, ['Region']] = _region_code

## Efficiency

In [13]:
# Efficiency table: pick the relevant columns, rename two of them, and order
# the rows by technology code and vintage.
df_efficiency = (
    df_regions[[
        "Tech_Code_Pandas", "Region", "Subsector", "input_comm",
        "Tech Description", "tech", "vintage", "output_comm", "efficiency",
        "efficiency units"
    ]]
    .rename(columns={"Region": "regions", "efficiency units": "eff_notes"})
    .sort_values(by=["Tech_Code_Pandas", "vintage"])
)

In [14]:
# df_efficiency.to_csv('./TablesForDB/efficiency_buildings_regional.csv')

## Discount Rate

In [48]:
# One row per (region, tech, vintage) for techs whose name ends in '_N'.
df_discount = (
    df_efficiency.loc[df_efficiency['tech'].str.endswith('_N'),
                      ['regions', 'tech', 'vintage']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [61]:
# Order by region, then vintage, then tech, and renumber the rows.
df_discount = (
    df_discount
    .sort_values(by=["regions", 'vintage', 'tech'])
    .reset_index(drop=True)
)

In [66]:
# Add the two constant columns in the same order as before.
df_discount = df_discount.assign(
    tech_rate=0.3,  # 30 % hurdle rate for new technologies
    tech_rate_notes='nan',  # literal string placeholder, not a missing value
)

In [68]:
# df_discount.to_csv('./TablesForDB/discount_rate.csv')

## Lifetime

In [14]:
# Lifetime table: unique (region, tech, lifetime, units) rows with renamed
# columns.
df_lifetime = (
    df_regions[["Region", "tech", "lifetime", "lifetime units"]]
    .drop_duplicates()
    .rename(columns={
        "lifetime": "life",
        "lifetime units": "life_notes",
        "Region": "regions",
    })
)

In [15]:
# df_lifetime.to_csv('./TablesForDB/lifetime_buildings_regional.csv')

## Investment costs

In [16]:
# Investment cost table, built from the per-unit 'fixed cost new' columns.
df_invest = (
    df_regions[[
        "Region", "tech", "vintage", "fixed cost new", "fixed cost units new",
        "capacity", "capacity units"
    ]]
    .rename(columns={
        "Region": "regions",
        "fixed cost new": "cost_invest",
        "fixed cost units new": "cost_invest_units",
    })
    # duplicates arise from multiple input_comm categories (eg. ELC, ELCDIST_R)
    .drop_duplicates()
)

In [17]:
# df_invest.to_csv('./TablesForDB/costInvest_buildings_regional.csv')

## Fixed costs

In [18]:
# Fixed cost inputs: the per-unit maintenance cost columns, de-duplicated.
df_fixedCost = df_regions[[
    "Region", "tech", "vintage", "maintenance cost new",
    "maintenance cost units new", "capacity", "capacity units"
]].drop_duplicates()

In [19]:
df_fixedCost_list = df_fixedCost.values.tolist()
# Position of the vintage field inside each row list, derived from the column
# name so it stays correct if the column selection of df_fixedCost changes.
index_of_vintage = df_fixedCost.columns.get_loc('vintage')
other_elems = []
periods = []
# Repeat every row once for each period returned by get_vintages for its
# vintage; periods and other_elems stay aligned element by element.
for s_check in df_fixedCost_list:
    row_periods = get_vintages(s_check[index_of_vintage])
    periods.extend(row_periods)
    other_elems.extend([s_check] * len(row_periods))

# Each output record is the period followed by the original row's fields.
list_combined_periods = [[a, *v] for a, v in zip(periods, other_elems)]

In [20]:
# Column labels follow the record layout: the period first, then the seven
# df_fixedCost fields in their original order.
df_fixedCost_final = pd.DataFrame(
    data=list_combined_periods,
    columns=[
        'periods',
        'regions',
        'tech',
        'vintage',
        'cost_fixed',
        'cost_fixed_units',
        'capacity',
        'capacity_units',
    ],
)

In [21]:
# df_fixedCost_final.to_csv('./TablesForDB/costFixed_buildings_regional.csv')

# Derive Regional Heat Pump Coefficient of Performance

## Calculate annual temperature population weighted

In [22]:
# Inputs for the population-weighted average annual temperature estimate.
df_population = pd.read_excel(io='./rawData/STATE_populations_CENSUS.xlsx')
df_annual_temp = pd.read_excel(io="./rawData/USState_annual_avg_temp_2010.ods",
                               engine="odf")
df_OEO_states = pd.read_csv(
    filepath_or_buffer='./rawData/state_regions_cluster.csv')

# Keep only the state code and the 2010 population column.
df_pop_2010 = df_population.loc[:, ['STUSPS', 2010]].copy()

In [23]:
# Join population with temperature by state code, then attach the
# state-to-region table on the shared 'STUSPS' column.
df_popAndTemp = (
    df_pop_2010
    .merge(df_annual_temp, left_on='STUSPS', right_on='State Code')
    .merge(df_OEO_states, on='STUSPS')
)

In [24]:
# Per-region temperature, averaged with the 2010 population as the weight.
value = 'Average Temperature (F)'
weight = 2010
_series_temp = df_popAndTemp.groupby(['Region']).apply(
    func_weight_average_temp, value, weight)
# Turn the per-region series into a frame, translate the region number into
# its OEO code via region_names, and keep just the two columns used later.
_df_temperature = _series_temp.rename("avg_temperature").reset_index()
_df_temperature['Region_OEO'] = _df_temperature['Region'].map(region_names)
_df_temperature = _df_temperature[['avg_temperature', 'Region_OEO']]

## Adjusting heat pump COPs based on regional temperatures

In [25]:
# Attach each region's average temperature to the efficiency rows; the join
# key from the temperature table is dropped once the merge is done.
df_eff_copy = (
    df_efficiency
    .merge(_df_temperature, left_on='regions', right_on='Region_OEO')
    .drop(columns=['Region_OEO'])
)

In [26]:
# Rows whose description contains the literal text 'HEAT PUMP'. na=False
# treats a missing description as "no match"; without it the mask would hold
# NaN entries, which cannot be used for boolean indexing.
_df_heatPump = df_eff_copy.loc[df_eff_copy['Tech Description'].str.contains(
    'HEAT PUMP', regex=False, na=False)].copy()

In [27]:
assumed_operating_temperature = 70  # F
slope_from_Vaishnav2020 = 0.0541  # from Figure S2

# Shift each heat pump's efficiency by the slope times the gap between the
# assumed operating temperature and the region's average temperature.
_df_heatPump['efficiency_temp'] = _df_heatPump['efficiency'] - (
    slope_from_Vaishnav2020 *
    (assumed_operating_temperature - _df_heatPump['avg_temperature']))

# Use the adjusted value only where it is positive; otherwise keep the
# unadjusted efficiency. Series.where also keeps the original value when the
# adjusted one is NaN, whereas the earlier np.select call matched neither of
# its conditions for NaN and wrote its default of 0.
_df_heatPump['efficiency'] = _df_heatPump['efficiency_temp'].where(
    _df_heatPump['efficiency_temp'] > 0, _df_heatPump['efficiency'])
_df_heatPump.drop(columns=['efficiency_temp'], inplace=True)

# Write the adjusted heat pump rows back into the full table (index-aligned).
df_eff_copy.update(_df_heatPump)

In [29]:
# Write the temperature-adjusted efficiency table. df_eff_copy still carries
# the merged 'avg_temperature' column, and the path is the same one used by
# the commented-out export under the Efficiency heading.
df_eff_copy.to_csv('./TablesForDB/efficiency_buildings_regional.csv')

# Diagnostic bits of code

In [None]:
# df_final[df_final['tech'].str.endswith('_R')]
# df_final[df_final['tech'] == 'R_WH_SOLST_ELC_R']
# tech_new_or_old = [item[-1] for item in df_final['tech'].str.split('_')]
# df_final['new_or_old'] = tech_new_or_old
# df_temp = df_final[['Tech_Code_Pandas', 'new_or_old']].drop_duplicates()
# ser_temp = df_temp['Tech_Code_Pandas'].value_counts()

In [None]:
# len(df_final.loc[(df_final.tech.str.endswith('_R') & df_final.tech.str.startswith('R_'))]['tech'].unique())
# len(df_final.loc[(df_final.tech.str.endswith('_R') & df_final.tech.str.startswith('C_'))]['tech'].unique())

In [None]:
# Important!!
## Check if techs from existing capacity df are all represented in the main techno-economic parameters df
# existingCap_res = pd.read_csv('/home/adityasinha/Documents/WorkDocuments/OEO_AS_projectFiles/Initial_building_tasks_2Mar2021/residential_existingCapacity_v2.csv')
# existingCap_comm = pd.read_csv('/home/adityasinha/Documents/WorkDocuments/OEO_AS_projectFiles/Initial_building_tasks_2Mar2021/commercial_existingCapacity_v2.csv')
# var_existCap_techs = set(existingCap_res.tech.unique()) | set(existingCap_comm.tech.unique())
# var_df_final_techs = set(df_final.loc[df_final.tech.str.endswith('_R')]['tech'].unique())
# var_df_final_techs - var_existCap_techs
# var_existCap_techs - var_df_final_techs

#df_final.sort_values(by=["Tech_Code_Pandas", "vintage"]).reset_index(drop = True)

In [None]:
# from pandas._testing import assert_frame_equal

In [None]:
# assert_frame_equal(df1, df1)