Merge pull request #148 from USEPA/release-v1.1.2
Release v1.1.2
bl-young committed Oct 4, 2023
2 parents 39b9600 + aed31ec commit dbe1c5d
Showing 18 changed files with 191 additions and 310 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -51,7 +51,7 @@ jobs:
# install testing
- name: Install package and dependencies
run: |
pip install .["RCRAInfo"]
pip install .
# linting & pytest
- name: Lint with flake8
13 changes: 0 additions & 13 deletions README.md
@@ -122,20 +122,7 @@ git clone https://github.com/USEPA/standardizedinventories.git
cd standardizedinventories
pip install . # or pip install -e . for devs
```
The current version contains optional dependencies (`selenium` and `webdriver_manager`) to download RCRAInfo data using a Chrome browser interface prior to generating those stewi inventories.
See details in [RCRAInfo.py](https://github.com/USEPA/standardizedinventories/blob/master/stewi/RCRAInfo.py) for how to generate those inventories without these optional libraries.

To download these optional dependencies, use one of the following pip install commands:

```
pip install .["RCRAInfo"]
```

or

```
pip install . -r requirements.txt -r rcrainfo_requirements.txt
```

### Secondary Context Installation Steps
In order to enable calculation and assignment of urban/rural secondary contexts, please refer to
2 changes: 2 additions & 0 deletions chemicalmatcher/config.yaml
@@ -4,6 +4,8 @@ databases:
# see https://cdxapps.epa.gov/oms-substance-registry-services/swagger-ui/
queries:
caslistprefix: substances/cas?casList=
nameprefix: substance/name/
listprefix: substances/list_acronym/
sep: '%7c'
inventory_lists:
RCRAInfo:
10 changes: 5 additions & 5 deletions chemicalmatcher/globals.py
@@ -13,6 +13,7 @@

SRSconfig = config(config_path=MODULEPATH)['databases']['SRS']
base = SRSconfig['url']
queries = SRSconfig['queries']

# Certain characters return errors or missing results, but if replaced
# with '_' they work, per advice from Tim Bazel (CGI Federal) on 6/27/2018
@@ -24,11 +25,10 @@
# Return json object with SRS result
def get_SRSInfo_for_substance_name(name):
name_for_query = urllib.parse.quote(name)
nameprefix = 'substance/name/'
nameprefixexcludeSynonyms = '?excludeSynonyms=True'
for i in srs_replace_group:
name_for_query = name_for_query.replace(i, '_')
url = base + nameprefix + name_for_query + nameprefixexcludeSynonyms
url = (f'{base}{queries.get("nameprefix")}{name_for_query}'
'?excludeSynonyms=True')
flow_info = query_SRS_for_flow(url)
return flow_info

@@ -39,8 +39,8 @@ def get_SRSInfo_for_program_list(inventory):
# Base URL for queries
srs_flow_df = pd.DataFrame()
for listname in inventory_to_SRSlist_acronymns[inventory]:
log.debug('Getting %s', listname)
url = f'{base}substances/list_acronym/{urllib.parse.quote(listname)}'
log.debug(f'Getting {listname}')
url = f'{base}{queries.get("listprefix")}{urllib.parse.quote(listname)}'
flow_info = query_SRS_for_program_list(url, inventory)
if len(flow_info) == 0:
log.info(f'No flows found for {listname}')
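A minimal sketch of the config-driven query pattern these changes introduce in chemicalmatcher: the SRS endpoint prefixes now come from config.yaml rather than being hard-coded. The base URL and substance name below are illustrative assumptions, not values taken from the diff.

```python
import urllib.parse

# Illustrative values; in chemicalmatcher, `base` and `queries` are read from config.yaml
base = 'https://cdxapps.epa.gov/oms-substance-registry-services/api/'  # assumed base URL
queries = {'nameprefix': 'substance/name/',
           'listprefix': 'substances/list_acronym/',
           'caslistprefix': 'substances/cas?casList='}

def build_name_url(name: str) -> str:
    """Build an SRS substance-by-name query URL, excluding synonyms."""
    name_for_query = urllib.parse.quote(name)
    return f'{base}{queries.get("nameprefix")}{name_for_query}?excludeSynonyms=True'

print(build_name_url('formaldehyde'))
```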
6 changes: 2 additions & 4 deletions chemicalmatcher/programsynonymlookupbyCAS.py
@@ -9,20 +9,18 @@
# SRS web service docs at https://cdxnodengn.epa.gov/cdx-srs-rest/
# Base URL for queries
queries = SRSconfig['queries']
caslistprefix = queries['caslistprefix']
sep = queries['sep']  # This is the code for a pipe separator required between CAS numbers


def programsynonymlookupbyCAS(cas_list, inventories_of_interest):
caslist_for_query = ''
index_of_last = len(cas_list) - 1
for cas in cas_list[:index_of_last]:
caslist_for_query = caslist_for_query + cas + sep
caslist_for_query = caslist_for_query + cas + queries.get("sep")
# add on last CAS
caslist_for_query = caslist_for_query + cas_list[index_of_last]

# perform query
url = base + caslistprefix + caslist_for_query
url = f'{base}{queries.get("caslistprefix")}{caslist_for_query}'
chemicallistresponse = requests.get(url)
chemicallistjson = json.loads(chemicallistresponse.text)

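For reference, the CAS-list concatenation performed by the loop above is equivalent to a single join on the configured separator; a minimal sketch, assuming `queries` holds the `caslistprefix` and `sep` values from config.yaml (the base URL is illustrative):

```python
# Equivalent to the per-CAS loop above: join the list with the configured separator
base = 'https://cdxapps.epa.gov/oms-substance-registry-services/api/'  # illustrative
queries = {'caslistprefix': 'substances/cas?casList=', 'sep': '%7c'}   # from config.yaml

def build_caslist_url(cas_list):
    caslist_for_query = queries['sep'].join(cas_list)
    return f'{base}{queries["caslistprefix"]}{caslist_for_query}'

print(build_caslist_url(['71-43-2', '50-00-0']))
```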
2 changes: 0 additions & 2 deletions env_sec_ctxt.yaml
@@ -17,10 +17,8 @@ dependencies:
- pyyaml=6.0
- requests=2.31.0
- requests-ftp=0.3.1
- selenium=4.9.1
- shapely=2.0.1
- xlrd=2.0.1
- webdriver-manager=3.8.6

- pip:
- "--editable=git+https://github.com/USEPA/standardizedinventories.git#egg=StEWI"
2 changes: 0 additions & 2 deletions rcrainfo_requirements.txt

This file was deleted.

5 changes: 1 addition & 4 deletions setup.py
@@ -2,7 +2,7 @@

setup(
name="StEWI",
version="1.1.1",
version="1.1.2",
author="Ben Young, Wesley Ingwersen, Matthew Bergmann, Jose Hernandez-Betancur, Tapajyoti Ghosh, Eric Bell",
author_email="ingwersen.wesley@epa.gov",
description="Standardized Emission And Waste Inventories (StEWI)"
@@ -24,9 +24,6 @@
'openpyxl>=3.0.7',
'xlrd>=2.0.0',
],
extras_require={"RCRAInfo": ['webdriver_manager>=3.4.2',
'selenium>=3.141.0'],
},
classifiers=[
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
37 changes: 12 additions & 25 deletions stewi/DMR.py
@@ -18,7 +18,6 @@
2014-2021
"""

import requests
import pandas as pd
import argparse
import urllib
@@ -27,6 +26,7 @@
from io import BytesIO

from esupy.processed_data_mgmt import read_source_metadata
from esupy.remote import make_url_request
from stewi.globals import unit_convert,\
DATA_PATH, lb_kg, write_metadata, get_reliability_table_for_source,\
log, compile_source_metadata, config, store_inventory, set_stewi_meta,\
@@ -134,31 +134,18 @@ def download_data(url_params, filepath: Path) -> str:
df = pd.DataFrame()
url = generate_url(url_params)
log.debug(url)
for attempt in range(3):
try:
r = requests.get(url)
r.raise_for_status()
# When more than 100,000 records, need to split queries
if ((len(r.content) < 1000) and
('Maximum number of records' in str(r.content))):
for x in ('NGP', 'GPC', 'NPD'):
split_url = f'{url}&p_permit_type={x}'
r = requests.get(split_url)
r.raise_for_status()
df_sub = pd.read_csv(BytesIO(r.content), low_memory=False)
if len(df_sub) < 3: continue
df = pd.concat([df, df_sub], ignore_index=True)
else:
df = pd.read_csv(BytesIO(r.content), low_memory=False)
break
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
log.info(err)
time.sleep(20)
pass
r = make_url_request(url)
# When more than 100,000 records, need to split queries
if ((len(r.content) < 1000) and
('Maximum number of records' in str(r.content))):
for x in ('NGP', 'GPC', 'NPD'):
split_url = f'{url}&p_permit_type={x}'
r = make_url_request(split_url)
df_sub = pd.read_csv(BytesIO(r.content), low_memory=False)
if len(df_sub) < 3: continue
df = pd.concat([df, df_sub], ignore_index=True)
else:
log.warning("exceeded max attempts")
return 'other_error'
df = pd.read_csv(BytesIO(r.content), low_memory=False)
log.debug(f"saving to {filepath}")
pd.to_pickle(df, filepath)
return 'success'
110 changes: 79 additions & 31 deletions stewi/GHGRP.py
@@ -28,19 +28,21 @@

import pandas as pd
import numpy as np
import requests
import time
import argparse
import warnings
import zipfile
import io
import urllib
from pathlib import Path
from xml.dom import minidom
from xml.parsers.expat import ExpatError

from esupy.processed_data_mgmt import read_source_metadata
from stewi.globals import download_table, write_metadata, import_table, \
from esupy.remote import make_url_request
from stewi.globals import write_metadata, compile_source_metadata, aggregate, \
DATA_PATH, get_reliability_table_for_source, set_stewi_meta, config,\
store_inventory, paths, log, \
compile_source_metadata, aggregate
store_inventory, paths, log
from stewi.validate import update_validationsets_sources, validate_inventory,\
write_validation_result
from stewi.formats import StewiFormat
@@ -119,7 +121,7 @@ def get_row_count(table, report_year):
count_url += f'/REPORTING_YEAR/=/{report_year}'
count_url += '/COUNT'
try:
count_request = requests.get(count_url)
count_request = make_url_request(count_url)
count_xml = minidom.parseString(count_request.text)
table_count = count_xml.getElementsByTagName('TOTALQUERYRESULTS')
table_count = int(table_count[0].firstChild.nodeValue)
@@ -261,6 +263,47 @@ def import_or_download_table(filepath, table, year, m):
return table_df


def download_table(filepath: Path, url: str, get_time=False):
"""Download file at url to Path if it does not exist."""
if not filepath.exists():
if url.lower().endswith('zip'):
r = make_url_request(url)
zip_file = zipfile.ZipFile(io.BytesIO(r.content))
zip_file.extractall(filepath)
elif 'xls' in url.lower() or url.lower().endswith('excel'):
r = make_url_request(url)
with open(filepath, "wb") as f:
f.write(r.content)
elif 'json' in url.lower():
pd.read_json(url).to_csv(filepath, index=False)
if get_time:
try:
retrieval_time = filepath.stat().st_ctime
except OSError:
retrieval_time = time.time()
return time.ctime(retrieval_time)
elif get_time:
return time.ctime(filepath.stat().st_ctime)


def import_table(path_or_reference, get_time=False):
"""Read and return time of csv from url or Path."""
try:
df = pd.read_csv(path_or_reference, low_memory=False)
except urllib.error.URLError as exception:
log.warning(exception.reason)
log.info('retrying url...')
time.sleep(3)
df = pd.read_csv(path_or_reference, low_memory=False)
if get_time and isinstance(path_or_reference, Path):
retrieval_time = path_or_reference.stat().st_ctime
return df, time.ctime(retrieval_time)
elif get_time:
retrieval_time = time.time()
return df, time.ctime(retrieval_time)
return df


def download_and_parse_subpart_tables(year, m):
"""
Generates a list of required subpart tables, based on report year.
@@ -297,13 +340,13 @@ if table_df is None:
if table_df is None:
continue
# add 1-2 letter subpart abbreviation
table_df['SUBPART_NAME'] = list(year_tables.loc[
year_tables['TABLE'] == subpart_emissions_table, 'SUBPART'])[0]

abbv = (year_tables.query('TABLE == @subpart_emissions_table')
['SUBPART'].iloc[0])
table_df = table_df.assign(SUBPART_NAME = abbv)
# concatenate temporary dataframe to master ghgrp1 dataframe
ghgrp1 = pd.concat([ghgrp1, table_df], ignore_index=True)

ghgrp1.reset_index(drop=True, inplace=True)
ghgrp1 = ghgrp1.reset_index(drop=True)
log.info('Parsing table data...')
if 'C' in ghgrp1.SUBPART_NAME.unique():
ghgrp1 = calculate_combustion_emissions(ghgrp1)
@@ -384,32 +427,37 @@ def calculate_combustion_emissions(df):
# NOTE: 'PART_75_CO2_EMISSIONS_METHOD' includes biogenic carbon emissions,
# so there will be a slight error here, but biogenic/nonbiogenic emissions
# for Part 75 are not reported separately.
df['c_co2'] = df['TIER1_CO2_COMBUSTION_EMISSIONS'] + \
df['TIER2_CO2_COMBUSTION_EMISSIONS'] + \
df['TIER3_CO2_COMBUSTION_EMISSIONS'] + \
df['TIER_123_SORBENT_CO2_EMISSIONS'] + \
df['TIER_4_TOTAL_CO2_EMISSIONS'] - \
df['TIER_4_BIOGENIC_CO2_EMISSIONS'] + \
df['PART_75_CO2_EMISSIONS_METHOD'] -\
df['TIER123_BIOGENIC_CO2_EMISSIONS']
df = (df.assign(c_co2 = lambda x:
x['TIER1_CO2_COMBUSTION_EMISSIONS'] +
x['TIER2_CO2_COMBUSTION_EMISSIONS'] +
x['TIER3_CO2_COMBUSTION_EMISSIONS'] +
x['TIER_123_SORBENT_CO2_EMISSIONS'] +
x['TIER_4_TOTAL_CO2_EMISSIONS'] -
x['TIER_4_BIOGENIC_CO2_EMISSIONS'] +
x['PART_75_CO2_EMISSIONS_METHOD'] -
x['TIER123_BIOGENIC_CO2_EMISSIONS'])
# biogenic carbon:
df['c_co2_b'] = df['TIER123_BIOGENIC_CO2_EMISSIONS'] + \
df['TIER_4_BIOGENIC_CO2_EMISSIONS']
.assign(c_co2_b = lambda x:
x['TIER123_BIOGENIC_CO2_EMISSIONS'] +
x['TIER_4_BIOGENIC_CO2_EMISSIONS'])
# methane:
df['c_ch4'] = df['TIER1_CH4_COMBUSTION_EMISSIONS'] + \
df['TIER2_CH4_COMBUSTION_EMISSIONS'] + \
df['TIER3_CH4_COMBUSTION_EMISSIONS'] + \
df['T4CH4COMBUSTIONEMISSIONS'] + \
df['PART_75_CH4_EMISSIONS_CO2E']/CH4GWP
.assign(c_ch4 = lambda x:
x['TIER1_CH4_COMBUSTION_EMISSIONS'] +
x['TIER2_CH4_COMBUSTION_EMISSIONS'] +
x['TIER3_CH4_COMBUSTION_EMISSIONS'] +
x['T4CH4COMBUSTIONEMISSIONS'] +
x['PART_75_CH4_EMISSIONS_CO2E']/CH4GWP)
# nitrous oxide:
df['c_n2o'] = df['TIER1_N2O_COMBUSTION_EMISSIONS'] + \
df['TIER2_N2O_COMBUSTION_EMISSIONS'] + \
df['TIER3_N2O_COMBUSTION_EMISSIONS'] + \
df['T4N2OCOMBUSTIONEMISSIONS'] + \
df['PART_75_N2O_EMISSIONS_CO2E']/N2OGWP

.assign(c_n2o = lambda x:
x['TIER1_N2O_COMBUSTION_EMISSIONS'] +
x['TIER2_N2O_COMBUSTION_EMISSIONS'] +
x['TIER3_N2O_COMBUSTION_EMISSIONS'] +
x['T4N2OCOMBUSTIONEMISSIONS'] +
x['PART_75_N2O_EMISSIONS_CO2E']/N2OGWP)
# drop subpart C columns because they are no longer needed
return df.drop(columns=subpart_c_cols)
.drop(columns=subpart_c_cols)
)
return df


def parse_additional_suparts_data(addtnl_subparts_path, subpart_cols_file, year):
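The calculate_combustion_emissions refactor above replaces in-place column arithmetic with one chained expression. A minimal, self-contained sketch of that chained `DataFrame.assign` pattern, using toy column names rather than the GHGRP schema:

```python
import pandas as pd

df = pd.DataFrame({'tier1': [1.0, 2.0],
                   'tier2': [0.5, 0.25],
                   'biogenic': [0.1, 0.2]})

# Each lambda receives the frame as built so far, so later assigns can use
# columns created by earlier ones; .drop removes inputs no longer needed.
df = (df
      .assign(total=lambda x: x['tier1'] + x['tier2'])
      .assign(net=lambda x: x['total'] - x['biogenic'])
      .drop(columns=['tier1', 'tier2']))
print(df)
```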
13 changes: 2 additions & 11 deletions stewi/NEI.py
@@ -27,10 +27,10 @@

import numpy as np
import pandas as pd
import requests

from esupy.processed_data_mgmt import download_from_remote,\
read_source_metadata
from esupy.remote import make_url_request
from esupy.util import strip_file_extension
from stewi.globals import DATA_PATH, write_metadata, USton_kg, lb_kg,\
log, store_inventory, config, assign_secondary_context,\
Expand Down Expand Up @@ -128,16 +128,7 @@ def generate_national_totals(year):
url = build_url.replace('__year__', year)
url = url.replace('__file__', file)

# make http request
r = []
try:
r = requests.Session().get(url, verify=False)
except requests.exceptions.ConnectionError:
log.error(f"URL Connection Error for {url}")
try:
r.raise_for_status()
except requests.exceptions.HTTPError:
log.error('Error in URL request!')
r = make_url_request(url, verify=False)

# extract data from zip archive
z = zipfile.ZipFile(io.BytesIO(r.content))