diff --git a/recidiviz/ingest/direct/direct_ingest_documentation_generator.py b/recidiviz/ingest/direct/direct_ingest_documentation_generator.py index d0d674d7a7..ad3db4045d 100644 --- a/recidiviz/ingest/direct/direct_ingest_documentation_generator.py +++ b/recidiviz/ingest/direct/direct_ingest_documentation_generator.py @@ -36,6 +36,9 @@ from recidiviz.ingest.direct.views.direct_ingest_view_query_builder_collector import ( DirectIngestViewQueryBuilderCollector, ) +from recidiviz.tools.raw_data_reference_reasons_yaml_loader import ( + RawDataReferenceReasonsYamlLoader, +) from recidiviz.utils.string import StrictStringFormatter STATE_RAW_DATA_FILE_HEADER_TEMPLATE = """# {state_name} Raw Data Description @@ -70,6 +73,10 @@ def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]: state_code = StateCode(region_code.upper()) state_name = state_code.get_state().name + downstream_views_by_raw_file = self.get_downstream_referencing_views( + state_code + ) + file_header = StrictStringFormatter().format( STATE_RAW_DATA_FILE_HEADER_TEMPLATE, state_name=state_name, @@ -85,6 +92,7 @@ def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]: ), ) else: + downstream_views_by_raw_file = defaultdict(list) file_header = "" raw_file_configs = [ @@ -109,6 +117,7 @@ def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]: config_paths_by_file_tag, file_tags_with_raw_file_configs, views_by_raw_file, + downstream_views_by_raw_file, ) docs_per_file: Dict[str, str] = { @@ -205,6 +214,7 @@ def _generate_raw_file_table( config_paths_by_file_tag: Dict[str, str], file_tags_with_raw_file_configs: List[str], views_by_raw_file: Dict[str, List[str]], + downstream_views_by_raw_file: Dict[str, List[str]], ) -> str: """Generates a Markdown-formatted table of contents to be included in a raw file specification.""" table_matrix = [ @@ -215,11 +225,16 @@ def _generate_raw_file_table( else f"{file_tag}" ), ",
".join(sorted(views_by_raw_file[file_tag])), + ",
".join(sorted(downstream_views_by_raw_file[file_tag])), ] for file_tag in sorted(config_paths_by_file_tag) ] writer = MarkdownTableWriter( - headers=["**Table**", "**Referencing Views**"], + headers=[ + "**Table**", + "**Referencing Ingest Views**", + "**Referencing Downstream Views**", + ], value_matrix=table_matrix, # Margin values other than 0 have nondeterministic spacing. Do not change. margin=0, @@ -240,3 +255,19 @@ def get_referencing_views( views_by_raw_file[config.file_tag].append(ingest_view.ingest_view_name) return views_by_raw_file + + @staticmethod + def get_downstream_referencing_views( + state_code: StateCode, + ) -> Dict[str, List[str]]: + """Generates a dictionary mapping raw files to downstream views that reference them.""" + raw_data_references = ( + RawDataReferenceReasonsYamlLoader.get_downstream_referencing_views( + state_code + ) + ) + downstream_views_by_raw_file = defaultdict(list) + for file_tag, views in raw_data_references.items(): + downstream_views_by_raw_file[file_tag] = [view.to_str() for view in views] + + return downstream_views_by_raw_file diff --git a/recidiviz/tests/ingest/direct/direct_ingest_documentation_generator_test.py b/recidiviz/tests/ingest/direct/direct_ingest_documentation_generator_test.py index 3a2a093532..295a343ee6 100644 --- a/recidiviz/tests/ingest/direct/direct_ingest_documentation_generator_test.py +++ b/recidiviz/tests/ingest/direct/direct_ingest_documentation_generator_test.py @@ -16,10 +16,12 @@ # ============================================================================= """Tests for DirectIngestDocumentationGenerator.""" import unittest +from collections import defaultdict from typing import List from mock import MagicMock, patch +from recidiviz.big_query.big_query_address import BigQueryAddress from recidiviz.common.constants import states from recidiviz.common.constants.states import TEST_STATE_CODE_DOCS from recidiviz.ingest.direct.direct_ingest_documentation_generator import ( @@ -94,8 +96,13 @@ def 
tearDown(self) -> None: "recidiviz.ingest.direct.direct_ingest_documentation_generator.DirectIngestDocumentationGenerator" ".get_referencing_views" ) + @patch( + "recidiviz.ingest.direct.direct_ingest_documentation_generator.RawDataReferenceReasonsYamlLoader" + ".get_downstream_referencing_views" + ) def test_generate_raw_file_docs_for_region( self, + mock_downstream_referencing_views: MagicMock, mock_referencing_views: MagicMock, _mock_region: MagicMock, mock_raw_config: MagicMock, @@ -112,6 +119,16 @@ def test_generate_raw_file_docs_for_region( "tagNotHistorical": [], "tagPrimaryKeyColsMissing": [], } + mock_downstream_referencing_views.return_value = defaultdict( + set, + { + "multiLineDescription": { + BigQueryAddress.from_str("dataset.view_three"), + BigQueryAddress.from_str("dataset.view_four"), + }, + "tagColumnsMissing": {BigQueryAddress.from_str("dataset.view_four")}, + }, + ) documentation_generator = DirectIngestDocumentationGenerator() documentation = documentation_generator.generate_raw_file_docs_for_region( @@ -126,12 +143,12 @@ def test_generate_raw_file_docs_for_region( ## Table of Contents -| **Table** | **Referencing Views** | -|----------------------------------------------------------------|-----------------------| -|[multiLineDescription](raw_data/multiLineDescription.md) |view_one,
view_two| -|[tagColumnsMissing](raw_data/tagColumnsMissing.md) |view_one | -|[tagNotHistorical](raw_data/tagNotHistorical.md) | | -|[tagPrimaryKeyColsMissing](raw_data/tagPrimaryKeyColsMissing.md)| | +| **Table** |**Referencing Ingest Views**| **Referencing Downstream Views** | +|----------------------------------------------------------------|----------------------------|------------------------------------------| +|[multiLineDescription](raw_data/multiLineDescription.md) |view_one,
view_two |dataset.view_four,
dataset.view_three| +|[tagColumnsMissing](raw_data/tagColumnsMissing.md) |view_one |dataset.view_four | +|[tagNotHistorical](raw_data/tagNotHistorical.md) | | | +|[tagPrimaryKeyColsMissing](raw_data/tagPrimaryKeyColsMissing.md)| | | """ expected_multi_line = """## multiLineDescription diff --git a/recidiviz/tests/tools/enforce_raw_data_reference_documentation_test.py b/recidiviz/tests/tools/enforce_raw_data_reference_documentation_test.py index 47f5b3c94c..9753d1aeee 100644 --- a/recidiviz/tests/tools/enforce_raw_data_reference_documentation_test.py +++ b/recidiviz/tests/tools/enforce_raw_data_reference_documentation_test.py @@ -15,26 +15,20 @@ # along with this program. If not, see . # ============================================================================= """Tests for enforcing documentation of views that reference raw data.""" -import os import unittest from typing import Any, Dict, List, Set, Tuple from unittest.mock import patch -import yaml - -import recidiviz from recidiviz.big_query.big_query_address import BigQueryAddress from recidiviz.common.constants.states import StateCode from recidiviz.tools.find_direct_raw_data_references import ( find_direct_raw_data_references, ) -from recidiviz.view_registry.deployed_views import all_deployed_view_builders - -RAW_DATA_REFERENCES_YAML = "view_registry/raw_data_reference_reasons.yaml" -RAW_DATA_REFERENCES_YAML_PATH = os.path.join( - os.path.dirname(recidiviz.__file__), +from recidiviz.tools.raw_data_reference_reasons_yaml_loader import ( RAW_DATA_REFERENCES_YAML, + RawDataReferenceReasonsYamlLoader, ) +from recidiviz.view_registry.deployed_views import all_deployed_view_builders class TestEnforceRawDataReferenceDocumentation(unittest.TestCase): @@ -50,13 +44,7 @@ class TestEnforceRawDataReferenceDocumentation(unittest.TestCase): def setUpClass(cls) -> None: cls.project_id_patcher = patch("recidiviz.utils.metadata.project_id") cls.project_id_patcher.start().return_value = "recidiviz-testing" - with 
open(RAW_DATA_REFERENCES_YAML_PATH, "r", encoding="utf-8") as yaml_file: - cls.yaml_raw_data = yaml.safe_load(yaml_file) - cls.yaml_data = ( - TestEnforceRawDataReferenceDocumentation._convert_raw_yaml_data_to_objs( - cls.yaml_raw_data - ) - ) + cls.yaml_data = RawDataReferenceReasonsYamlLoader.get_yaml_data() cls.deployed_views_references = find_direct_raw_data_references( all_deployed_view_builders() ) @@ -67,7 +55,9 @@ def tearDownClass(cls) -> None: def test_verify_yaml_entries_in_alphabetical_order(self) -> None: self.assertTrue( - TestEnforceRawDataReferenceDocumentation._is_sorted(self.yaml_raw_data), + TestEnforceRawDataReferenceDocumentation._is_sorted( + RawDataReferenceReasonsYamlLoader.get_raw_yaml_data() + ), f"Entries in {RAW_DATA_REFERENCES_YAML} must be in alphabetical order.", ) @@ -130,15 +120,3 @@ def _find_missing_references( for view in views if view not in actual.get(state, {}).get(file_tag, set()) ] - - @staticmethod - def _convert_raw_yaml_data_to_objs( - references: Dict[str, Dict[str, Set[str]]] - ) -> Dict[StateCode, Dict[str, Set[BigQueryAddress]]]: - return { - StateCode(state_code): { - file_tag: {BigQueryAddress.from_str(view) for view in views} - for file_tag, views in file_tags.items() - } - for state_code, file_tags in references.items() - } diff --git a/recidiviz/tests/tools/raw_data_reference_reasons_yaml_loader_test.py b/recidiviz/tests/tools/raw_data_reference_reasons_yaml_loader_test.py new file mode 100644 index 0000000000..72f429d8aa --- /dev/null +++ b/recidiviz/tests/tools/raw_data_reference_reasons_yaml_loader_test.py @@ -0,0 +1,121 @@ +# Recidiviz - a data platform for criminal justice reform +# Copyright (C) 2023 Recidiviz, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# ============================================================================= +"""Tests for raw_data_reference_reasons_yaml_loader.py.""" +import unittest +from collections import defaultdict +from unittest.mock import mock_open, patch + +import yaml +from mock import MagicMock + +from recidiviz.big_query.big_query_address import BigQueryAddress +from recidiviz.common.constants.states import StateCode +from recidiviz.tools.raw_data_reference_reasons_yaml_loader import ( + RawDataReferenceReasonsYamlLoader, +) + +mock_yaml_content = """ +US_XX: + table1: + dataset1.table1: |- + Usage reason unknown. + dataset2.table2: |- + Usage reason unknown. +US_YY: + table2: + dataset3.table3: |- + Usage reason unknown. +""" +mock_yaml_invalid_content = """ +US_NOT_REAL: + table1: + dataset1.table1: |- + Usage reason unknown. 
+""" +mock_raw_data = { + "US_XX": { + "table1": { + "dataset1.table1": "Usage reason unknown.", + "dataset2.table2": "Usage reason unknown.", + } + }, + "US_YY": {"table2": {"dataset3.table3": "Usage reason unknown."}}, +} +mock_converted_data = { + StateCode.US_XX: { + "table1": { + BigQueryAddress.from_str("dataset1.table1"), + BigQueryAddress.from_str("dataset2.table2"), + } + }, + StateCode.US_YY: {"table2": {BigQueryAddress.from_str("dataset3.table3")}}, +} + + +class TestRawDataReferenceReasonsYamlLoader(unittest.TestCase): + """Test raw data reference reasons yaml loader.""" + + def setUp(self) -> None: + RawDataReferenceReasonsYamlLoader.reset_data() + + @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content) + @patch("yaml.safe_load", side_effect=yaml.YAMLError("error parsing YAML")) + def test_load_yaml_failure(self, _1: MagicMock, _2: MagicMock) -> None: + with self.assertRaises(RuntimeError): + RawDataReferenceReasonsYamlLoader.get_yaml_data() + with self.assertRaises(RuntimeError): + RawDataReferenceReasonsYamlLoader.get_raw_yaml_data() + + @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_invalid_content) + def test_parse_yaml_failure(self, _: MagicMock) -> None: + with self.assertRaises(RuntimeError): + RawDataReferenceReasonsYamlLoader.get_yaml_data() + + @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content) + def test_load_yaml(self, _: MagicMock) -> None: + self.assertEqual( + RawDataReferenceReasonsYamlLoader.get_yaml_data(), mock_converted_data + ) + self.assertEqual( + RawDataReferenceReasonsYamlLoader.get_raw_yaml_data(), mock_raw_data + ) + + @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content) + def test_get_downstream_referencing_views(self, _: MagicMock) -> None: + result = RawDataReferenceReasonsYamlLoader.get_downstream_referencing_views( + StateCode.US_XX + ) + self.assertEqual( + result, + { + "table1": { + 
BigQueryAddress.from_str("dataset1.table1"), + BigQueryAddress.from_str("dataset2.table2"), + } + }, + ) + + @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content) + def test_get_downstream_referencing_views_invalid_state(self, _: MagicMock) -> None: + result = RawDataReferenceReasonsYamlLoader.get_downstream_referencing_views( + StateCode.US_WW + ) + self.assertEqual( + result, + defaultdict(set), + ) + self.assertEqual(result["non_existent_file_tag"], set()) diff --git a/recidiviz/tools/raw_data_reference_reasons_yaml_loader.py b/recidiviz/tools/raw_data_reference_reasons_yaml_loader.py new file mode 100644 index 0000000000..9a8d21b4d9 --- /dev/null +++ b/recidiviz/tools/raw_data_reference_reasons_yaml_loader.py @@ -0,0 +1,93 @@ +# Recidiviz - a data platform for criminal justice reform +# Copyright (C) 2023 Recidiviz, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# ============================================================================= +"""Load and parse RAW_DATA_REFERENCES_YAML. 
+"""
+
+import os
+from collections import defaultdict
+from typing import Dict, Set
+
+import yaml
+
+import recidiviz
+from recidiviz.big_query.big_query_address import BigQueryAddress
+from recidiviz.common.constants.states import StateCode
+from recidiviz.utils import environment
+
+RAW_DATA_REFERENCES_YAML = "view_registry/raw_data_reference_reasons.yaml"
+RAW_DATA_REFERENCES_YAML_PATH = os.path.join(
+    os.path.dirname(recidiviz.__file__),
+    RAW_DATA_REFERENCES_YAML,
+)
+
+
+class RawDataReferenceReasonsYamlLoader:
+    """Loads RAW_DATA_REFERENCES_YAML and caches the result at class level.
+
+    Two forms of the file are cached: the parsed YAML as returned by
+    yaml.safe_load(), and a converted form keyed by StateCode whose views are
+    BigQueryAddress objects. Both caches are filled together by _load_yaml()
+    and cleared together by reset_data().
+    """
+
+    # Converted form: state code -> raw file tag -> referencing view addresses.
+    _yaml_data: Dict[StateCode, Dict[str, Set[BigQueryAddress]]] = {}
+    # Parsed YAML, unconverted.
+    # NOTE(review): the loader tests feed a view -> reason mapping as the
+    # innermost value rather than a set of view names; only its keys are read
+    # by convert_raw_yaml_data_to_objs(). Confirm and tighten the annotation.
+    _raw_yaml_data: Dict[str, Dict[str, Set[str]]] = {}
+
+    @classmethod
+    def get_raw_yaml_data(cls) -> Dict[str, Dict[str, Set[str]]]:
+        """Returns the parsed YAML contents, loading the file if nothing is cached.
+
+        Raises:
+            RuntimeError: if the file cannot be read, parsed or converted.
+        """
+        if not cls._raw_yaml_data:
+            cls._load_yaml()
+        return cls._raw_yaml_data
+
+    @classmethod
+    @environment.test_only
+    def get_yaml_data(cls) -> Dict[StateCode, Dict[str, Set[BigQueryAddress]]]:
+        """Returns the converted YAML contents, loading the file if nothing is cached.
+
+        Raises:
+            RuntimeError: if the file cannot be read, parsed or converted.
+        """
+        if not cls._yaml_data:
+            cls._load_yaml()
+        return cls._yaml_data
+
+    @classmethod
+    @environment.test_only
+    def reset_data(cls) -> None:
+        """Clears both caches so that the next getter call reloads the file."""
+        cls._yaml_data = {}
+        cls._raw_yaml_data = {}
+
+    @classmethod
+    def get_downstream_referencing_views(
+        cls, state_code: StateCode
+    ) -> Dict[str, Set[BigQueryAddress]]:
+        """Get raw data filetags and downstream referencing views for a given region code.
+
+        The result is always a fresh defaultdict(set): looking up a file tag
+        with no documented references yields an empty set whether or not the
+        state appears in the YAML, and mutating the result (including the
+        implicit inserts a defaultdict performs) never touches the cache.
+
+        Raises:
+            RuntimeError: if the file cannot be read, parsed or converted.
+        """
+        if not cls._yaml_data:
+            cls._load_yaml()
+        views_by_file_tag: Dict[str, Set[BigQueryAddress]] = defaultdict(set)
+        for file_tag, views in cls._yaml_data.get(state_code, {}).items():
+            # Copy each set so callers cannot modify the cached one.
+            views_by_file_tag[file_tag] = set(views)
+        return views_by_file_tag
+
+    @classmethod
+    def _load_yaml(cls, yaml_path: str = RAW_DATA_REFERENCES_YAML_PATH) -> None:
+        """Reads and converts the YAML file, then stores both cached forms.
+
+        Parsing and conversion run on local variables and the class-level
+        caches are assigned only after both succeed, so a failure never leaves
+        raw data cached without its converted counterpart.
+
+        Raises:
+            RuntimeError: wrapping any exception raised while opening, parsing
+                or converting the file.
+        """
+        try:
+            with open(yaml_path, "r", encoding="utf-8") as yaml_file:
+                raw_yaml_data = yaml.safe_load(yaml_file)
+            yaml_data = cls.convert_raw_yaml_data_to_objs(raw_yaml_data)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to load or parse YAML data from {yaml_path}: {e}"
+            ) from e
+        cls._raw_yaml_data = raw_yaml_data
+        cls._yaml_data = yaml_data
+
+    @staticmethod
+    def convert_raw_yaml_data_to_objs(
+        references: Dict[str, Dict[str, Set[str]]]
+    ) -> Dict[StateCode, Dict[str, Set[BigQueryAddress]]]:
+        """Converts parsed YAML into StateCode / BigQueryAddress objects.
+
+        Each top-level key is passed to StateCode(...) and each view string to
+        BigQueryAddress.from_str(...). The innermost container is only
+        iterated, so when it is a mapping just its keys are used. Exceptions
+        raised by either conversion propagate to the caller.
+        """
+        return {
+            StateCode(state_code): {
+                file_tag: {BigQueryAddress.from_str(view) for view in views}
+                for file_tag, views in file_tags.items()
+            }
+            for state_code, file_tags in references.items()
+        }