Merge pull request #382 from USEPA/reviewer_revs

Update flowsa2.0 with updates from reviewer comments
USEPA · Sep 29, 2023 · f1efee7 · f1efee7
2 parents 9aa771a + 0fcafd2
commit f1efee7
Show file tree

Hide file tree

Showing 41 changed files with 434 additions and 403 deletions.
diff --git a/.github/workflows/compare_single_FBS.yml b/.github/workflows/compare_single_FBS.yml
@@ -39,7 +39,7 @@ jobs:
           METHOD: ${{ github.event.inputs.method }}
       run: |
         echo "Method: ${{ github.event.inputs.method }}"
-        python flowsa/test_single_FBS.py --method $METHOD
+        python tests/test_single_FBS.py --method $METHOD
 
     - name: Upload files
       if: always()

diff --git a/.github/workflows/generate_FBA.yml b/.github/workflows/generate_FBA.yml
@@ -45,7 +45,7 @@ jobs:
       run: |
         echo "Source: ${{ github.event.inputs.source }}"
         echo "Year: ${{ github.event.inputs.year }}"
-        python flowsa/test_single_FBA.py --source $SOURCE --year $YEAR
+        python tests/test_single_FBA.py --source $SOURCE --year $YEAR
 
     - name: Upload files
       if: always()

diff --git a/.github/workflows/test_methods.yml b/.github/workflows/test_methods.yml
@@ -34,7 +34,7 @@ jobs:
 
     - name: Test FBA config
       run: |
-        python flowsa/test_methods.py
+        python tests/test_methods.py
 
   FBS_testing:
     runs-on: macos-latest
@@ -61,9 +61,9 @@ jobs:
       id: FBS
       if: always() # Proceed even if Test FBA fails
       run: |
-        for m in $(python flowsa/test_FBS_against_remote.py list)
+        for m in $(python tests/test_FBS_against_remote.py list)
         do
-          python flowsa/test_FBS_against_remote.py "$m"
+          python tests/test_FBS_against_remote.py "$m"
         done
 
     - name: Upload csv files

diff --git a/README.md b/README.md
@@ -28,17 +28,26 @@ exception of formatting. A list of available FBA datasets can be found in
 the [Wiki](https://github.com/USEPA/flowsa/wiki/Available-Data#flow-by-activity-datasets).
 
 `import flowsa` \
+Return list of all available FBA datasets, including years 
 `flowsa.seeAvailableFlowByModels('FBA')` \
-`flowsa.getFlowByActivity(datasource="USDA_CoA_Cropland", year=2017)`
+Generate and return pandas dataframe for 2014 Energy Information 
+Administration (EIA) Manufacturing Energy Consumption Survey (MECS) land use \
+`fba = flowsa.getFlowByActivity(datasource="EIA_MECS_Land", year=2014)`
 
 ### Flow-By-Sector (FBS) Datasets
 Flow-By-Sector datasets are tables of environmental and other data 
 attributed to [sectors](https://www.census.gov/naics/). A list of available 
-FBS datasets can be found in the [Wiki](https://github.com/USEPA/flowsa/wiki/Available-Data#flow-by-sector-datasets).
+FBS datasets can be found in the 
+[Wiki](https://github.com/USEPA/flowsa/wiki/Available-Data#flow-by-sector-datasets).
 
 `import flowsa` \
+Return list of all available FBS datasets, including years 
 `flowsa.seeAvailableFlowByModels('FBS')` \
-`flowsa.getFlowBySector('Water_national_2015_m1')`
+Generate and return pandas dataframe for national water withdrawals 
+attributed to 6-digit sectors. Download all required FBA datasets from 
+Data Commons. \
+`fbs_water = flowsa.getFlowBySector('Water_national_2015_m1', 
+download_FBAs_if_missing=True)`
 
 ## Installation
 `pip install git+https://github.com/USEPA/flowsa.git@vX.X.X#egg=flowsa`
@@ -50,7 +59,8 @@ where vX.X.X can be replaced with the version you wish to install under
 For more information on `flowsa` see the [wiki](https://github.com/USEPA/flowsa/wiki).
 
 ### Accessing datasets output by FLOWSA
-FBA and FBS datasets can be accessed on [EPA's Data Commons](https://dmap-data-commons-ord.s3.amazonaws.com/index.html?prefix=flowsa/) without running the Python code. 
+FBA and FBS datasets can be accessed on 
+[EPA's Data Commons](https://dmap-data-commons-ord.s3.amazonaws.com/index.html?prefix=flowsa/) without running the Python code. 
 
 ## Disclaimer
 

diff --git a/examples/get_flows_by_activity.py b/examples/get_flows_by_activity.py
@@ -10,33 +10,33 @@
 These data are generally unchanged from the source data, with the exception
 of formatting.
 
-See source_catalog.yaml for available FlowByActivity datasets and
-available parameters for getFlowByActivity().
-Examples of use of flowsa. Read parquet files as dataframes.
+`getFlowByActivity()` has required and optional parameters
     :param datasource: str, the code of the datasource.
     :param year: int, a year, e.g. 2012
     :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
     :param geographic_level: str, a geographic level of the data.
-    Optional. E.g. 'national', 'state', 'county'.
+        Optional. E.g. 'national', 'state', 'county'.
+    :param download_FBA_if_missing: bool, if True will attempt to load from
+        remote server prior to generating if file not found locally,
+        optional, default is False
     :return: a pandas DataFrame in FlowByActivity format
 
 """
-
-import flowsa
+from flowsa import getFlowByActivity, seeAvailableFlowByModels
 from flowsa.settings import fbaoutputpath
 
 # see all datasources and years available in flowsa
-flowsa.seeAvailableFlowByModels('FBA')
+seeAvailableFlowByModels('FBA')
 
 # Load all information for EIA MECS Land
-fba_mecs = flowsa.getFlowByActivity(datasource="EIA_MECS_Land", year=2014)
+fba_mecs = getFlowByActivity(datasource="EIA_MECS_Land", year=2014)
 
 # only load state level water data and save as csv
-fba_usgs = flowsa.getFlowByActivity(datasource="USGS_NWIS_WU",
-                                    year=2015,
-                                    flowclass='Water',
-                                    geographic_level='state'
-                                    )
+fba_usgs = getFlowByActivity(datasource="USGS_NWIS_WU",
+                             year=2015,
+                             flowclass='Water',
+                             geographic_level='state'
+                             )
 
 # save output to csv, maintain leading 0s in location col
 fba_usgs.Location = fba_usgs.Location.apply('="{}"'.format)

diff --git a/examples/get_flows_by_sector.py b/examples/get_flows_by_sector.py
@@ -11,7 +11,9 @@
 master/flowsa/methods/flowbysectormethods/Water_national_2015_m1.yaml)
 captures the flow of withdrawn water through the economy. This dataset
 tracks water withdrawn by Public Supply (SectorProducedBy) that flows to
-Domestic use (SectorConsumedBy). Not all FBS contain data in both the
+Domestic use (SectorConsumedBy).
+
+Not all FBS contain data in both the
 SectorProducedBy and SectorConsumedBy columns. For example
 Employment_national_2018 (https://github.com/USEPA/flowsa/blob/master/
 flowsa/methods/flowbysectormethods/Employment_national_2018.yaml) only contains
@@ -21,22 +23,34 @@
 Tables are standardized into a table format defined in
 https://github.com/USEPA/flowsa/blob/master/format%20specs/FlowBySector.md.
 
-Retrieves stored data in the FlowBySector format
-    :param methodname: string, Name of an available method for the given class.
-    Method files found in flowsa/data/flowbysectormethods
+`getFlowBySector()` has required and optional parameters
+    :param methodname: str, name of an available method
+    :param fbsconfigpath: str, path to the FBS method file if loading a file
+        from outside the flowsa repository, optional
+    :param download_FBAs_if_missing: bool, if True will attempt to load FBAs
+        used in generating the FBS from remote server prior to generating if
+        file not found locally, optional, default is False
+    :param download_FBS_if_missing: bool, if True will attempt to load from
+        remote server prior to generating if file not found locally,
+        optional, default is False
     :return: dataframe in flow by sector format
 
 """
-
-import flowsa
+from flowsa import getFlowBySector, collapse_FlowBySector, \
+    seeAvailableFlowByModels
 
 # see available FBS models
-flowsa.seeAvailableFlowByModels('FBS')
+seeAvailableFlowByModels('FBS')
 
-# load FBS from local directory, if does not exist, method will download
-fbs_water = flowsa.getFlowBySector('Water_national_2015_m1',
-                                   download_FBAs_if_missing=True)
+# load FBS from local directory; if it does not exist, the method will run,
+# with the option to download the required FBAs from Data Commons
+# (https://dmap-data-commons-ord.s3.amazonaws.com/index.html?prefix=flowsa/)
+# to run the method
+fbs_water = getFlowBySector('Water_national_2015_m1',
+                            download_FBAs_if_missing=True)
 
 # collapse the FBS - output has 'Sector' column instead of
-# 'SectorProducedBy' and 'SectorConsumedBy' columns
-fbs_water_collapsed = flowsa.collapse_FlowBySector('Water_national_2015_m1')
+# 'SectorProducedBy' and 'SectorConsumedBy' columns. The collapsed
+# `Water_national_2015_m1` FBS will have 2 fewer rows, as the df is aggregated
+# after dropping "SectorProducedBy" information
+fbs_water_collapsed = collapse_FlowBySector('Water_national_2015_m1')
diff --git a/flowsa/__init__.py b/flowsa/__init__.py
@@ -16,187 +16,9 @@
 Flow-By-Sector files are loaded when running these functions
 """
 
-import os
-import pprint
-import pandas as pd
-import flowsa.exceptions
-from flowsa.common import load_yaml_dict
-from flowsa.flowsa_log import log
-from flowsa.settings import sourceconfigpath, flowbysectormethodpath, \
-    biboutputpath, DEFAULT_DOWNLOAD_IF_MISSING
-from flowsa.flowbyfunctions import collapse_fbs_sectors, filter_by_geoscale
-from flowsa.validation import check_for_nonetypes_in_sector_col, \
-    check_for_negative_flowamounts
-# from flowsa.bibliography import generate_fbs_bibliography
-from flowsa.datavisualization import FBSscatterplot
-from flowsa.flowbyactivity import FlowByActivity
-from flowsa.flowbysector import FlowBySector
+from flowsa.common import seeAvailableFlowByModels
+from flowsa.flowbyactivity import getFlowByActivity
+from flowsa.flowbysector import getFlowBySector, collapse_FlowBySector
+# from flowsa.bibliography import writeFlowBySectorBibliography
+# from flowsa.datavisualization import generateFBSplot
 
-
-def getFlowByActivity(
-        datasource,
-        year,
-        flowclass=None,
-        geographic_level=None,
-        download_FBA_if_missing=DEFAULT_DOWNLOAD_IF_MISSING
-        ) -> pd.DataFrame:
-    """
-    Retrieves stored data in the FlowByActivity format
-    :param datasource: str, the code of the datasource.
-    :param year: int, a year, e.g. 2012
-    :param flowclass: str or list, a 'Class' of the flow. Optional. E.g.
-    'Water' or ['Employment', 'Chemicals']
-    :param geographic_level: str, a geographic level of the data.
-                             Optional. E.g. 'national', 'state', 'county'.
-    :param download_FBA_if_missing: bool, if True will attempt to load from
-        remote server prior to generating if file not found locally
-    :return: a pandas DataFrame in FlowByActivity format
-    """
-    fba = FlowByActivity.getFlowByActivity(
-        full_name=datasource,
-        config={},
-        year=int(year),
-        download_ok=download_FBA_if_missing
-    )
-
-    if len(fba) == 0:
-        raise flowsa.exceptions.FBANotAvailableError(
-            message=f"Error generating {datasource} for {str(year)}")
-    if flowclass is not None:
-        fba = fba.query('Class == @flowclass')
-    # if geographic level specified, only load rows in geo level
-    if geographic_level is not None:
-        fba = filter_by_geoscale(fba, geographic_level)
-    return pd.DataFrame(fba.reset_index(drop=True))
-
-
-def getFlowBySector(
-        methodname,
-        fbsconfigpath=None,
-        download_FBAs_if_missing=DEFAULT_DOWNLOAD_IF_MISSING,
-        download_FBS_if_missing=DEFAULT_DOWNLOAD_IF_MISSING,
-        **kwargs
-        ) -> pd.DataFrame:
-    """
-    Loads stored FlowBySector output or generates it if it doesn't exist,
-    then loads
-    :param methodname: str, name of an available method for the given class
-    :param fbsconfigpath: str, path to the FBS method file if loading a file
-        from outside the flowsa repository
-    :param download_FBAs_if_missing: bool, if True will attempt to load FBAS
-        used in generating the FBS from remote server prior to generating if
-        file not found locally
-    :param download_FBS_if_missing: bool, if True will attempt to load from
-        remote server prior to generating if file not found locally
-    :return: dataframe in flow by sector format
-    """
-    fbs = FlowBySector.getFlowBySector(
-        method=methodname,
-        external_config_path=fbsconfigpath,
-        download_sources_ok=download_FBAs_if_missing,
-        download_fbs_ok=download_FBS_if_missing,
-        **kwargs
-    )
-    return pd.DataFrame(fbs)
-
-
-def collapse_FlowBySector(
-        methodname,
-        fbsconfigpath=None,
-        download_FBAs_if_missing=DEFAULT_DOWNLOAD_IF_MISSING,
-        download_FBS_if_missing=DEFAULT_DOWNLOAD_IF_MISSING
-        ) -> pd.DataFrame:
-    """
-    Returns fbs with one sector column in place of two
-    :param methodname: string, Name of an available method for the given class
-    :return: dataframe in flow by sector format
-    """
-    fbs = flowsa.getFlowBySector(methodname, fbsconfigpath,
-                                 download_FBAs_if_missing,
-                                 download_FBS_if_missing)
-    fbs_collapsed = collapse_fbs_sectors(fbs)
-
-    # check data for NoneType in sector column
-    fbs_collapsed = check_for_nonetypes_in_sector_col(fbs_collapsed)
-    # check data for negative FlowAmount values
-    fbs_collapsed = check_for_negative_flowamounts(fbs_collapsed)
-
-    return fbs_collapsed
-
-# todo: will reintroduce option to create bibliography post 2.0 release
-# def writeFlowBySectorBibliography(methodname):
-#     """
-#     Generate bibliography for FlowBySectorMethod in local directory
-#     :param methodname: string, FBS methodname for which to create .bib file
-#     :return: .bib file save to local directory
-#     """
-#     # Generate a single .bib file for a list of Flow-By-Sector method names
-#     # and save file to local directory
-#     log.info(f'Write bibliography to {biboutputpath / methodname}.bib')
-#     generate_fbs_bibliography(methodname)
-
-
-def seeAvailableFlowByModels(flowbytype, print_method=True):
-    """
-    Console print and return available Flow-By-Activity or Flow-By-Sector models
-    :param flowbytype: 'FBA' or 'FBS'
-    :param print_method: False to skip printing to console
-    :return: dict or list of available models
-    """
-
-    # fb directory contents dependent on FBA or FBS
-    if flowbytype == 'FBA':
-        fb_dir = os.listdir(sourceconfigpath)
-    elif flowbytype == 'FBS':
-        fb_dir = os.listdir(flowbysectormethodpath)
-    else:
-        raise ValueError("flowbytype must be 'FBA' or 'FBS'")
-
-    # list of file names (drop extension) for yaml files in flow directory
-    fb_names = [os.path.splitext(f)[0] for f in fb_dir if f.endswith('.yaml')]
-
-    # further reduce list of file names by excluding common and summary_target
-    exclude = ["_common", "_summary_target"]
-    fb_names = [f for f in fb_names if all(s not in f for s in exclude)]
-
-    if flowbytype == 'FBA':
-        # create empty dictionary, this will be the data format to print FBA
-        data_print = {}
-        # iterate over names to build dict for FBA and handling years
-        for f in fb_names:
-            s = load_yaml_dict(f, 'FBA')
-            try:
-                years = s['years']
-            except KeyError:
-                years = 'YAML missing information on years'
-            data_print.update({f: years})
-    else:
-        # data format to print FBS
-        data_print = fb_names
-
-    if print_method:
-        # print data in human-readable format
-        pprint.pprint(data_print, width=79, compact=True)
-
-    return data_print
-
-
-# todo: revise data vis fxns for recursive method
-# def generateFBSplot(method_dict, plottype, sector_length_display=None,
-#                     sectors_to_include=None, plot_title=None):
-#     """
-#     Plot the results of FBS models. Graphic can either be a faceted
-#     scatterplot or a method comparison
-#     :param method_dict: dictionary, key is the label, value is the FBS
-#         methodname
-#     :param plottype: str, 'facet_graph' or 'method_comparison'
-#     :param sector_length_display: numeric, sector length by which to
-#     aggregate, default is 'None' which returns the max sector length in a
-#     dataframe
-#     :param sectors_to_include: list, sectors to include in output. Sectors
-#     are subset by all sectors that "start with" the values in this list
-#     :return: graphic displaying results of FBS models
-#     """
-#
-#     FBSscatterplot(method_dict, plottype, sector_length_display,
-#                    sectors_to_include, plot_title)