Merge pull request #148 from USEPA/release-v1.1.2
Release v1.1.2
bl-young committed Oct 4, 2023
2 parents 39b9600 + aed31ec commit dbe1c5d
Showing 18 changed files with 191 additions and 310 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -51,7 +51,7 @@ jobs:
# install testing
- name: Install package and dependencies
run: |
pip install .["RCRAInfo"]
pip install .
# linting & pytest
- name: Lint with flake8
13 changes: 0 additions & 13 deletions README.md
@@ -122,20 +122,7 @@ git clone https://github.com/USEPA/standardizedinventories.git
cd standardizedinventories
pip install . # or pip install -e . for devs
```
The current version contains optional dependencies (`selenium` and `webdriver_manager`) to download RCRAInfo data using a Chrome browser interface prior to generating those stewi inventories.
See details in [RCRAInfo.py](https://github.com/USEPA/standardizedinventories/blob/master/stewi/RCRAInfo.py) for how to generate those inventories without these optional libraries.

To download these optional dependencies, use one of the following pip install commands:

```
pip install .["RCRAInfo"]
```

or

```
pip install . -r requirements.txt -r rcrainfo_requirements.txt
```

### Secondary Context Installation Steps
In order to enable calculation and assignment of urban/rural secondary contexts, please refer to
2 changes: 2 additions & 0 deletions chemicalmatcher/config.yaml
@@ -4,6 +4,8 @@ databases:
# see https://cdxapps.epa.gov/oms-substance-registry-services/swagger-ui/
queries:
caslistprefix: substances/cas?casList=
nameprefix: substance/name/
listprefix: substances/list_acronym/
sep: '%7c'
inventory_lists:
RCRAInfo:
10 changes: 5 additions & 5 deletions chemicalmatcher/globals.py
@@ -13,6 +13,7 @@

SRSconfig = config(config_path=MODULEPATH)['databases']['SRS']
base = SRSconfig['url']
queries = SRSconfig['queries']

# Certain characters return errors or missing results, but if replaced
# with '_' they work, per advice from Tim Bazel (CGI Federal) on 6/27/2018
@@ -24,11 +25,10 @@
# Return json object with SRS result
def get_SRSInfo_for_substance_name(name):
name_for_query = urllib.parse.quote(name)
nameprefix = 'substance/name/'
nameprefixexcludeSynonyms = '?excludeSynonyms=True'
for i in srs_replace_group:
name_for_query = name_for_query.replace(i, '_')
url = base + nameprefix + name_for_query + nameprefixexcludeSynonyms
url = (f'{base}{queries.get("nameprefix")}{name_for_query}'
'?excludeSynonyms=True')
flow_info = query_SRS_for_flow(url)
return flow_info

@@ -39,8 +39,8 @@ def get_SRSInfo_for_program_list(inventory):
# Base URL for queries
srs_flow_df = pd.DataFrame()
for listname in inventory_to_SRSlist_acronymns[inventory]:
log.debug('Getting %s', listname)
url = f'{base}substances/list_acronym/{urllib.parse.quote(listname)}'
log.debug(f'Getting {listname}')
url = f'{base}{queries.get("listprefix")}{urllib.parse.quote(listname)}'
flow_info = query_SRS_for_program_list(url, inventory)
if len(flow_info) == 0:
log.info(f'No flows found for {listname}')
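A minimal sketch of the config-driven query pattern these changes introduce in chemicalmatcher: the SRS endpoint prefixes now come from config.yaml rather than being hard-coded. The base URL and substance name below are illustrative assumptions, not values taken from the diff.

```python
import urllib.parse

# Illustrative values; in chemicalmatcher, `base` and `queries` are read from config.yaml
base = 'https://cdxapps.epa.gov/oms-substance-registry-services/api/'  # assumed base URL
queries = {'nameprefix': 'substance/name/',
           'listprefix': 'substances/list_acronym/',
           'caslistprefix': 'substances/cas?casList='}

def build_name_url(name: str) -> str:
    """Build an SRS substance-by-name query URL, excluding synonyms."""
    name_for_query = urllib.parse.quote(name)
    return f'{base}{queries.get("nameprefix")}{name_for_query}?excludeSynonyms=True'

print(build_name_url('formaldehyde'))
```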
6 changes: 2 additions & 4 deletions chemicalmatcher/programsynonymlookupbyCAS.py
@@ -9,20 +9,18 @@
# SRS web service docs at https://cdxnodengn.epa.gov/cdx-srs-rest/
# Base URL for queries
queries = SRSconfig['queries']
caslistprefix = queries['caslistprefix']
sep = queries['sep']  # This is the code for a pipe separator required between CAS numbers


def programsynonymlookupbyCAS(cas_list, inventories_of_interest):
caslist_for_query = ''
index_of_last = len(cas_list) - 1
for cas in cas_list[:index_of_last]:
caslist_for_query = caslist_for_query + cas + sep
caslist_for_query = caslist_for_query + cas + queries.get("sep")
# add on last CAS
caslist_for_query = caslist_for_query + cas_list[index_of_last]

# perform query
url = base + caslistprefix + caslist_for_query
url = f'{base}{queries.get("caslistprefix")}{caslist_for_query}'
chemicallistresponse = requests.get(url)
chemicallistjson = json.loads(chemicallistresponse.text)

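For reference, the CAS-list concatenation performed by the loop above is equivalent to a single join on the configured separator; a minimal sketch, assuming `queries` holds the `caslistprefix` and `sep` values from config.yaml (the base URL is illustrative):

```python
# Equivalent to the per-CAS loop above: join the list with the configured separator
base = 'https://cdxapps.epa.gov/oms-substance-registry-services/api/'  # illustrative
queries = {'caslistprefix': 'substances/cas?casList=', 'sep': '%7c'}   # from config.yaml

def build_caslist_url(cas_list):
    caslist_for_query = queries['sep'].join(cas_list)
    return f'{base}{queries["caslistprefix"]}{caslist_for_query}'

print(build_caslist_url(['71-43-2', '50-00-0']))
```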
2 changes: 0 additions & 2 deletions env_sec_ctxt.yaml
@@ -17,10 +17,8 @@ dependencies:
- pyyaml=6.0
- requests=2.31.0
- requests-ftp=0.3.1
- selenium=4.9.1
- shapely=2.0.1
- xlrd=2.0.1
- webdriver-manager=3.8.6

- pip:
- "--editable=git+https://github.com/USEPA/standardizedinventories.git#egg=StEWI"
2 changes: 0 additions & 2 deletions rcrainfo_requirements.txt

This file was deleted.

5 changes: 1 addition & 4 deletions setup.py
@@ -2,7 +2,7 @@

setup(
name="StEWI",
version="1.1.1",
version="1.1.2",
author="Ben Young, Wesley Ingwersen, Matthew Bergmann, Jose Hernandez-Betancur, Tapajyoti Ghosh, Eric Bell",
author_email="ingwersen.wesley@epa.gov",
description="Standardized Emission And Waste Inventories (StEWI)"
@@ -24,9 +24,6 @@
'openpyxl>=3.0.7',
'xlrd>=2.0.0',
],
extras_require={"RCRAInfo": ['webdriver_manager>=3.4.2',
'selenium>=3.141.0'],
},
classifiers=[
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
37 changes: 12 additions & 25 deletions stewi/DMR.py
@@ -18,7 +18,6 @@
2014-2021
"""

import requests
import pandas as pd
import argparse
import urllib
@@ -27,6 +26,7 @@
from io import BytesIO

from esupy.processed_data_mgmt import read_source_metadata
from esupy.remote import make_url_request
from stewi.globals import unit_convert,\
DATA_PATH, lb_kg, write_metadata, get_reliability_table_for_source,\
log, compile_source_metadata, config, store_inventory, set_stewi_meta,\
@@ -134,31 +134,18 @@ def download_data(url_params, filepath: Path) -> str:
df = pd.DataFrame()
url = generate_url(url_params)
log.debug(url)
for attempt in range(3):
try:
r = requests.get(url)
r.raise_for_status()
# When more than 100,000 records, need to split queries
if ((len(r.content) < 1000) and
('Maximum number of records' in str(r.content))):
for x in ('NGP', 'GPC', 'NPD'):
split_url = f'{url}&p_permit_type={x}'
r = requests.get(split_url)
r.raise_for_status()
df_sub = pd.read_csv(BytesIO(r.content), low_memory=False)
if len(df_sub) < 3: continue
df = pd.concat([df, df_sub], ignore_index=True)
else:
df = pd.read_csv(BytesIO(r.content), low_memory=False)
break
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
log.info(err)
time.sleep(20)
pass
r = make_url_request(url)
# When more than 100,000 records, need to split queries
if ((len(r.content) < 1000) and
('Maximum number of records' in str(r.content))):
for x in ('NGP', 'GPC', 'NPD'):
split_url = f'{url}&p_permit_type={x}'
r = make_url_request(split_url)
df_sub = pd.read_csv(BytesIO(r.content), low_memory=False)
if len(df_sub) < 3: continue
df = pd.concat([df, df_sub], ignore_index=True)
else:
log.warning("exceeded max attempts")
return 'other_error'
df = pd.read_csv(BytesIO(r.content), low_memory=False)
log.debug(f"saving to {filepath}")
pd.to_pickle(df, filepath)
return 'success'
110 changes: 79 additions & 31 deletions stewi/GHGRP.py
@@ -28,19 +28,21 @@

import pandas as pd
import numpy as np
import requests
import time
import argparse
import warnings
import zipfile
import io
import urllib
from pathlib import Path
from xml.dom import minidom
from xml.parsers.expat import ExpatError

from esupy.processed_data_mgmt import read_source_metadata
from stewi.globals import download_table, write_metadata, import_table, \
from esupy.remote import make_url_request
from stewi.globals import write_metadata, compile_source_metadata, aggregate, \
DATA_PATH, get_reliability_table_for_source, set_stewi_meta, config,\
store_inventory, paths, log, \
compile_source_metadata, aggregate
store_inventory, paths, log
from stewi.validate import update_validationsets_sources, validate_inventory,\
write_validation_result
from stewi.formats import StewiFormat
@@ -119,7 +121,7 @@ def get_row_count(table, report_year):
count_url += f'/REPORTING_YEAR/=/{report_year}'
count_url += '/COUNT'
try:
count_request = requests.get(count_url)
count_request = make_url_request(count_url)
count_xml = minidom.parseString(count_request.text)
table_count = count_xml.getElementsByTagName('TOTALQUERYRESULTS')
table_count = int(table_count[0].firstChild.nodeValue)
@@ -261,6 +263,47 @@ def import_or_download_table(filepath, table, year, m):
return table_df


def download_table(filepath: Path, url: str, get_time=False):
"""Download file at url to Path if it does not exist."""
if not filepath.exists():
if url.lower().endswith('zip'):
r = make_url_request(url)
zip_file = zipfile.ZipFile(io.BytesIO(r.content))
zip_file.extractall(filepath)
elif 'xls' in url.lower() or url.lower().endswith('excel'):
r = make_url_request(url)
with open(filepath, "wb") as f:
f.write(r.content)
elif 'json' in url.lower():
pd.read_json(url).to_csv(filepath, index=False)
if get_time:
try:
retrieval_time = filepath.stat().st_ctime
except OSError:
retrieval_time = time.time()
return time.ctime(retrieval_time)
elif get_time:
return time.ctime(filepath.stat().st_ctime)


def import_table(path_or_reference, get_time=False):
"""Read and return time of csv from url or Path."""
try:
df = pd.read_csv(path_or_reference, low_memory=False)
except urllib.error.URLError as exception:
log.warning(exception.reason)
log.info('retrying url...')
time.sleep(3)
df = pd.read_csv(path_or_reference, low_memory=False)
if get_time and isinstance(path_or_reference, Path):
retrieval_time = path_or_reference.stat().st_ctime
return df, time.ctime(retrieval_time)
elif get_time:
retrieval_time = time.time()
return df, time.ctime(retrieval_time)
return df


def download_and_parse_subpart_tables(year, m):
"""
Generates a list of required subpart tables, based on report year.
@@ -297,13 +340,13 @@ if table_df is None:
if table_df is None:
continue
# add 1-2 letter subpart abbreviation
table_df['SUBPART_NAME'] = list(year_tables.loc[
year_tables['TABLE'] == subpart_emissions_table, 'SUBPART'])[0]

abbv = (year_tables.query('TABLE == @subpart_emissions_table')
['SUBPART'].iloc[0])
table_df = table_df.assign(SUBPART_NAME = abbv)
# concatenate temporary dataframe to master ghgrp1 dataframe
ghgrp1 = pd.concat([ghgrp1, table_df], ignore_index=True)

ghgrp1.reset_index(drop=True, inplace=True)
ghgrp1 = ghgrp1.reset_index(drop=True)
log.info('Parsing table data...')
if 'C' in ghgrp1.SUBPART_NAME.unique():
ghgrp1 = calculate_combustion_emissions(ghgrp1)
@@ -384,32 +427,37 @@ def calculate_combustion_emissions(df):
# NOTE: 'PART_75_CO2_EMISSIONS_METHOD' includes biogenic carbon emissions,
# so there will be a slight error here, but biogenic/nonbiogenic emissions
# for Part 75 are not reported separately.
df['c_co2'] = df['TIER1_CO2_COMBUSTION_EMISSIONS'] + \
df['TIER2_CO2_COMBUSTION_EMISSIONS'] + \
df['TIER3_CO2_COMBUSTION_EMISSIONS'] + \
df['TIER_123_SORBENT_CO2_EMISSIONS'] + \
df['TIER_4_TOTAL_CO2_EMISSIONS'] - \
df['TIER_4_BIOGENIC_CO2_EMISSIONS'] + \
df['PART_75_CO2_EMISSIONS_METHOD'] -\
df['TIER123_BIOGENIC_CO2_EMISSIONS']
df = (df.assign(c_co2 = lambda x:
x['TIER1_CO2_COMBUSTION_EMISSIONS'] +
x['TIER2_CO2_COMBUSTION_EMISSIONS'] +
x['TIER3_CO2_COMBUSTION_EMISSIONS'] +
x['TIER_123_SORBENT_CO2_EMISSIONS'] +
x['TIER_4_TOTAL_CO2_EMISSIONS'] -
x['TIER_4_BIOGENIC_CO2_EMISSIONS'] +
x['PART_75_CO2_EMISSIONS_METHOD'] -
x['TIER123_BIOGENIC_CO2_EMISSIONS'])
# biogenic carbon:
df['c_co2_b'] = df['TIER123_BIOGENIC_CO2_EMISSIONS'] + \
df['TIER_4_BIOGENIC_CO2_EMISSIONS']
.assign(c_co2_b = lambda x:
x['TIER123_BIOGENIC_CO2_EMISSIONS'] +
x['TIER_4_BIOGENIC_CO2_EMISSIONS'])
# methane:
df['c_ch4'] = df['TIER1_CH4_COMBUSTION_EMISSIONS'] + \
df['TIER2_CH4_COMBUSTION_EMISSIONS'] + \
df['TIER3_CH4_COMBUSTION_EMISSIONS'] + \
df['T4CH4COMBUSTIONEMISSIONS'] + \
df['PART_75_CH4_EMISSIONS_CO2E']/CH4GWP
.assign(c_ch4 = lambda x:
x['TIER1_CH4_COMBUSTION_EMISSIONS'] +
x['TIER2_CH4_COMBUSTION_EMISSIONS'] +
x['TIER3_CH4_COMBUSTION_EMISSIONS'] +
x['T4CH4COMBUSTIONEMISSIONS'] +
x['PART_75_CH4_EMISSIONS_CO2E']/CH4GWP)
# nitrous oxide:
df['c_n2o'] = df['TIER1_N2O_COMBUSTION_EMISSIONS'] + \
df['TIER2_N2O_COMBUSTION_EMISSIONS'] + \
df['TIER3_N2O_COMBUSTION_EMISSIONS'] + \
df['T4N2OCOMBUSTIONEMISSIONS'] + \
df['PART_75_N2O_EMISSIONS_CO2E']/N2OGWP

.assign(c_n2o = lambda x:
x['TIER1_N2O_COMBUSTION_EMISSIONS'] +
x['TIER2_N2O_COMBUSTION_EMISSIONS'] +
x['TIER3_N2O_COMBUSTION_EMISSIONS'] +
x['T4N2OCOMBUSTIONEMISSIONS'] +
x['PART_75_N2O_EMISSIONS_CO2E']/N2OGWP)
# drop subpart C columns because they are no longer needed
return df.drop(columns=subpart_c_cols)
.drop(columns=subpart_c_cols)
)
return df


def parse_additional_suparts_data(addtnl_subparts_path, subpart_cols_file, year):
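The calculate_combustion_emissions refactor above replaces in-place column arithmetic with one chained expression. A minimal, self-contained sketch of that chained `DataFrame.assign` pattern, using toy column names rather than the GHGRP schema:

```python
import pandas as pd

df = pd.DataFrame({'tier1': [1.0, 2.0],
                   'tier2': [0.5, 0.25],
                   'biogenic': [0.1, 0.2]})

# Each lambda receives the frame as built so far, so later assigns can use
# columns created by earlier ones; .drop removes inputs no longer needed.
df = (df
      .assign(total=lambda x: x['tier1'] + x['tier2'])
      .assign(net=lambda x: x['total'] - x['biogenic'])
      .drop(columns=['tier1', 'tier2']))
print(df)
```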
13 changes: 2 additions & 11 deletions stewi/NEI.py
@@ -27,10 +27,10 @@

import numpy as np
import pandas as pd
import requests

from esupy.processed_data_mgmt import download_from_remote,\
read_source_metadata
from esupy.remote import make_url_request
from esupy.util import strip_file_extension
from stewi.globals import DATA_PATH, write_metadata, USton_kg, lb_kg,\
log, store_inventory, config, assign_secondary_context,\
Expand Down Expand Up @@ -128,16 +128,7 @@ def generate_national_totals(year):
url = build_url.replace('__year__', year)
url = url.replace('__file__', file)

# make http request
r = []
try:
r = requests.Session().get(url, verify=False)
except requests.exceptions.ConnectionError:
log.error(f"URL Connection Error for {url}")
try:
r.raise_for_status()
except requests.exceptions.HTTPError:
log.error('Error in URL request!')
r = make_url_request(url, verify=False)

# extract data from zip archive
z = zipfile.ZipFile(io.BytesIO(r.content))