diff --git a/recidiviz/ingest/direct/direct_ingest_documentation_generator.py b/recidiviz/ingest/direct/direct_ingest_documentation_generator.py
index d0d674d7a7..ad3db4045d 100644
--- a/recidiviz/ingest/direct/direct_ingest_documentation_generator.py
+++ b/recidiviz/ingest/direct/direct_ingest_documentation_generator.py
@@ -36,6 +36,9 @@
from recidiviz.ingest.direct.views.direct_ingest_view_query_builder_collector import (
DirectIngestViewQueryBuilderCollector,
)
+from recidiviz.tools.raw_data_reference_reasons_yaml_loader import (
+ RawDataReferenceReasonsYamlLoader,
+)
from recidiviz.utils.string import StrictStringFormatter
STATE_RAW_DATA_FILE_HEADER_TEMPLATE = """# {state_name} Raw Data Description
@@ -70,6 +73,10 @@ def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
state_code = StateCode(region_code.upper())
state_name = state_code.get_state().name
+ downstream_views_by_raw_file = self.get_downstream_referencing_views(
+ state_code
+ )
+
file_header = StrictStringFormatter().format(
STATE_RAW_DATA_FILE_HEADER_TEMPLATE,
state_name=state_name,
@@ -85,6 +92,7 @@ def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
),
)
else:
+ downstream_views_by_raw_file = defaultdict(list)
file_header = ""
raw_file_configs = [
@@ -109,6 +117,7 @@ def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
config_paths_by_file_tag,
file_tags_with_raw_file_configs,
views_by_raw_file,
+ downstream_views_by_raw_file,
)
docs_per_file: Dict[str, str] = {
@@ -205,6 +214,7 @@ def _generate_raw_file_table(
config_paths_by_file_tag: Dict[str, str],
file_tags_with_raw_file_configs: List[str],
views_by_raw_file: Dict[str, List[str]],
+ downstream_views_by_raw_file: Dict[str, List[str]],
) -> str:
"""Generates a Markdown-formatted table of contents to be included in a raw file specification."""
table_matrix = [
@@ -215,11 +225,16 @@ def _generate_raw_file_table(
else f"{file_tag}"
),
",<br />".join(sorted(views_by_raw_file[file_tag])),
+ ",<br />".join(sorted(downstream_views_by_raw_file[file_tag])),
]
for file_tag in sorted(config_paths_by_file_tag)
]
writer = MarkdownTableWriter(
- headers=["**Table**", "**Referencing Views**"],
+ headers=[
+ "**Table**",
+ "**Referencing Ingest Views**",
+ "**Referencing Downstream Views**",
+ ],
value_matrix=table_matrix,
# Margin values other than 0 have nondeterministic spacing. Do not change.
margin=0,
@@ -240,3 +255,19 @@ def get_referencing_views(
views_by_raw_file[config.file_tag].append(ingest_view.ingest_view_name)
return views_by_raw_file
+
+    @staticmethod
+    def get_downstream_referencing_views(
+        state_code: StateCode,
+    ) -> Dict[str, List[str]]:
+        """Maps each raw file tag to the string addresses of the downstream views
+        that RawDataReferenceReasonsYamlLoader reports for the given state."""
+        references_by_file_tag = (
+            RawDataReferenceReasonsYamlLoader.get_downstream_referencing_views(
+                state_code
+            )
+        )
+        # Wrapping the result in a defaultdict lets callers index a file tag that
+        # has no downstream references and get an empty list back.
+        return defaultdict(
+            list,
+            {
+                file_tag: [address.to_str() for address in addresses]
+                for file_tag, addresses in references_by_file_tag.items()
+            },
+        )
diff --git a/recidiviz/tests/ingest/direct/direct_ingest_documentation_generator_test.py b/recidiviz/tests/ingest/direct/direct_ingest_documentation_generator_test.py
index 3a2a093532..295a343ee6 100644
--- a/recidiviz/tests/ingest/direct/direct_ingest_documentation_generator_test.py
+++ b/recidiviz/tests/ingest/direct/direct_ingest_documentation_generator_test.py
@@ -16,10 +16,12 @@
# =============================================================================
"""Tests for DirectIngestDocumentationGenerator."""
import unittest
+from collections import defaultdict
from typing import List
from mock import MagicMock, patch
+from recidiviz.big_query.big_query_address import BigQueryAddress
from recidiviz.common.constants import states
from recidiviz.common.constants.states import TEST_STATE_CODE_DOCS
from recidiviz.ingest.direct.direct_ingest_documentation_generator import (
@@ -94,8 +96,13 @@ def tearDown(self) -> None:
"recidiviz.ingest.direct.direct_ingest_documentation_generator.DirectIngestDocumentationGenerator"
".get_referencing_views"
)
+ @patch(
+ "recidiviz.ingest.direct.direct_ingest_documentation_generator.RawDataReferenceReasonsYamlLoader"
+ ".get_downstream_referencing_views"
+ )
def test_generate_raw_file_docs_for_region(
self,
+ mock_downstream_referencing_views: MagicMock,
mock_referencing_views: MagicMock,
_mock_region: MagicMock,
mock_raw_config: MagicMock,
@@ -112,6 +119,16 @@ def test_generate_raw_file_docs_for_region(
"tagNotHistorical": [],
"tagPrimaryKeyColsMissing": [],
}
+ mock_downstream_referencing_views.return_value = defaultdict(
+ set,
+ {
+ "multiLineDescription": {
+ BigQueryAddress.from_str("dataset.view_three"),
+ BigQueryAddress.from_str("dataset.view_four"),
+ },
+ "tagColumnsMissing": {BigQueryAddress.from_str("dataset.view_four")},
+ },
+ )
documentation_generator = DirectIngestDocumentationGenerator()
documentation = documentation_generator.generate_raw_file_docs_for_region(
@@ -126,12 +143,12 @@ def test_generate_raw_file_docs_for_region(
## Table of Contents
-| **Table** | **Referencing Views** |
-|----------------------------------------------------------------|-----------------------|
-|[multiLineDescription](raw_data/multiLineDescription.md) |view_one,<br />view_two|
-|[tagColumnsMissing](raw_data/tagColumnsMissing.md) |view_one |
-|[tagNotHistorical](raw_data/tagNotHistorical.md) | |
-|[tagPrimaryKeyColsMissing](raw_data/tagPrimaryKeyColsMissing.md)| |
+| **Table** |**Referencing Ingest Views**| **Referencing Downstream Views** |
+|----------------------------------------------------------------|----------------------------|------------------------------------------|
+|[multiLineDescription](raw_data/multiLineDescription.md) |view_one,<br />view_two |dataset.view_four,<br />dataset.view_three|
+|[tagColumnsMissing](raw_data/tagColumnsMissing.md) |view_one |dataset.view_four |
+|[tagNotHistorical](raw_data/tagNotHistorical.md) | | |
+|[tagPrimaryKeyColsMissing](raw_data/tagPrimaryKeyColsMissing.md)| | |
"""
expected_multi_line = """## multiLineDescription
diff --git a/recidiviz/tests/tools/enforce_raw_data_reference_documentation_test.py b/recidiviz/tests/tools/enforce_raw_data_reference_documentation_test.py
index 47f5b3c94c..9753d1aeee 100644
--- a/recidiviz/tests/tools/enforce_raw_data_reference_documentation_test.py
+++ b/recidiviz/tests/tools/enforce_raw_data_reference_documentation_test.py
@@ -15,26 +15,20 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# =============================================================================
"""Tests for enforcing documentation of views that reference raw data."""
-import os
import unittest
from typing import Any, Dict, List, Set, Tuple
from unittest.mock import patch
-import yaml
-
-import recidiviz
from recidiviz.big_query.big_query_address import BigQueryAddress
from recidiviz.common.constants.states import StateCode
from recidiviz.tools.find_direct_raw_data_references import (
find_direct_raw_data_references,
)
-from recidiviz.view_registry.deployed_views import all_deployed_view_builders
-
-RAW_DATA_REFERENCES_YAML = "view_registry/raw_data_reference_reasons.yaml"
-RAW_DATA_REFERENCES_YAML_PATH = os.path.join(
- os.path.dirname(recidiviz.__file__),
+from recidiviz.tools.raw_data_reference_reasons_yaml_loader import (
RAW_DATA_REFERENCES_YAML,
+ RawDataReferenceReasonsYamlLoader,
)
+from recidiviz.view_registry.deployed_views import all_deployed_view_builders
class TestEnforceRawDataReferenceDocumentation(unittest.TestCase):
@@ -50,13 +44,7 @@ class TestEnforceRawDataReferenceDocumentation(unittest.TestCase):
def setUpClass(cls) -> None:
cls.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
cls.project_id_patcher.start().return_value = "recidiviz-testing"
- with open(RAW_DATA_REFERENCES_YAML_PATH, "r", encoding="utf-8") as yaml_file:
- cls.yaml_raw_data = yaml.safe_load(yaml_file)
- cls.yaml_data = (
- TestEnforceRawDataReferenceDocumentation._convert_raw_yaml_data_to_objs(
- cls.yaml_raw_data
- )
- )
+ cls.yaml_data = RawDataReferenceReasonsYamlLoader.get_yaml_data()
cls.deployed_views_references = find_direct_raw_data_references(
all_deployed_view_builders()
)
@@ -67,7 +55,9 @@ def tearDownClass(cls) -> None:
def test_verify_yaml_entries_in_alphabetical_order(self) -> None:
self.assertTrue(
- TestEnforceRawDataReferenceDocumentation._is_sorted(self.yaml_raw_data),
+ TestEnforceRawDataReferenceDocumentation._is_sorted(
+ RawDataReferenceReasonsYamlLoader.get_raw_yaml_data()
+ ),
f"Entries in {RAW_DATA_REFERENCES_YAML} must be in alphabetical order.",
)
@@ -130,15 +120,3 @@ def _find_missing_references(
for view in views
if view not in actual.get(state, {}).get(file_tag, set())
]
-
- @staticmethod
- def _convert_raw_yaml_data_to_objs(
- references: Dict[str, Dict[str, Set[str]]]
- ) -> Dict[StateCode, Dict[str, Set[BigQueryAddress]]]:
- return {
- StateCode(state_code): {
- file_tag: {BigQueryAddress.from_str(view) for view in views}
- for file_tag, views in file_tags.items()
- }
- for state_code, file_tags in references.items()
- }
diff --git a/recidiviz/tests/tools/raw_data_reference_reasons_yaml_loader_test.py b/recidiviz/tests/tools/raw_data_reference_reasons_yaml_loader_test.py
new file mode 100644
index 0000000000..72f429d8aa
--- /dev/null
+++ b/recidiviz/tests/tools/raw_data_reference_reasons_yaml_loader_test.py
@@ -0,0 +1,121 @@
+# Recidiviz - a data platform for criminal justice reform
+# Copyright (C) 2023 Recidiviz, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# =============================================================================
+"""Tests for raw_data_reference_reasons_yaml_loader.py."""
+import unittest
+from collections import defaultdict
+from unittest.mock import mock_open, patch
+
+import yaml
+from mock import MagicMock
+
+from recidiviz.big_query.big_query_address import BigQueryAddress
+from recidiviz.common.constants.states import StateCode
+from recidiviz.tools.raw_data_reference_reasons_yaml_loader import (
+ RawDataReferenceReasonsYamlLoader,
+)
+
+# YAML text served by the mocked `open`. The nesting is significant: each state
+# code maps to file tags, and each file tag maps view addresses to a reason.
+mock_yaml_content = """
+US_XX:
+  table1:
+    dataset1.table1: |-
+      Usage reason unknown.
+    dataset2.table2: |-
+      Usage reason unknown.
+US_YY:
+  table2:
+    dataset3.table3: |-
+      Usage reason unknown.
+"""
+# Same shape as above, but keyed by a string that the tests expect to be
+# rejected when it is converted to a StateCode.
+mock_yaml_invalid_content = """
+US_NOT_REAL:
+  table1:
+    dataset1.table1: |-
+      Usage reason unknown.
+"""
+# What yaml.safe_load is expected to return for `mock_yaml_content`.
+mock_raw_data = {
+    "US_XX": {
+        "table1": {
+            "dataset1.table1": "Usage reason unknown.",
+            "dataset2.table2": "Usage reason unknown.",
+        }
+    },
+    "US_YY": {"table2": {"dataset3.table3": "Usage reason unknown."}},
+}
+# The converted form of `mock_raw_data`: StateCode keys and sets of
+# BigQueryAddress values (the reason strings are dropped).
+mock_converted_data = {
+    StateCode.US_XX: {
+        "table1": {
+            BigQueryAddress.from_str("dataset1.table1"),
+            BigQueryAddress.from_str("dataset2.table2"),
+        }
+    },
+    StateCode.US_YY: {"table2": {BigQueryAddress.from_str("dataset3.table3")}},
+}
+
+
+class TestRawDataReferenceReasonsYamlLoader(unittest.TestCase):
+    """Unit tests for RawDataReferenceReasonsYamlLoader with file reads mocked out."""
+
+    def setUp(self) -> None:
+        # The loader caches its data on the class, so clear it before each test.
+        RawDataReferenceReasonsYamlLoader.reset_data()
+
+    @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content)
+    @patch("yaml.safe_load", side_effect=yaml.YAMLError("error parsing YAML"))
+    def test_load_yaml_failure(self, _1: MagicMock, _2: MagicMock) -> None:
+        # A YAML parsing error must surface as a RuntimeError from both accessors.
+        for accessor in (
+            RawDataReferenceReasonsYamlLoader.get_yaml_data,
+            RawDataReferenceReasonsYamlLoader.get_raw_yaml_data,
+        ):
+            with self.assertRaises(RuntimeError):
+                accessor()
+
+    @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_invalid_content)
+    def test_parse_yaml_failure(self, _: MagicMock) -> None:
+        # The YAML parses, but its state code cannot be converted.
+        self.assertRaises(
+            RuntimeError, RawDataReferenceReasonsYamlLoader.get_yaml_data
+        )
+
+    @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content)
+    def test_load_yaml(self, _: MagicMock) -> None:
+        converted = RawDataReferenceReasonsYamlLoader.get_yaml_data()
+        self.assertEqual(converted, mock_converted_data)
+
+        raw = RawDataReferenceReasonsYamlLoader.get_raw_yaml_data()
+        self.assertEqual(raw, mock_raw_data)
+
+    @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content)
+    def test_get_downstream_referencing_views(self, _: MagicMock) -> None:
+        expected_views = {
+            "table1": {
+                BigQueryAddress.from_str("dataset1.table1"),
+                BigQueryAddress.from_str("dataset2.table2"),
+            }
+        }
+        actual_views = (
+            RawDataReferenceReasonsYamlLoader.get_downstream_referencing_views(
+                StateCode.US_XX
+            )
+        )
+        self.assertEqual(actual_views, expected_views)
+
+    @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content)
+    def test_get_downstream_referencing_views_invalid_state(self, _: MagicMock) -> None:
+        # US_WW has no entry in the mocked YAML.
+        unknown_state_views = (
+            RawDataReferenceReasonsYamlLoader.get_downstream_referencing_views(
+                StateCode.US_WW
+            )
+        )
+        self.assertEqual(unknown_state_views, defaultdict(set))
+        # The fallback is a defaultdict, so unknown file tags yield an empty set.
+        self.assertEqual(unknown_state_views["non_existent_file_tag"], set())
diff --git a/recidiviz/tools/raw_data_reference_reasons_yaml_loader.py b/recidiviz/tools/raw_data_reference_reasons_yaml_loader.py
new file mode 100644
index 0000000000..9a8d21b4d9
--- /dev/null
+++ b/recidiviz/tools/raw_data_reference_reasons_yaml_loader.py
@@ -0,0 +1,93 @@
+# Recidiviz - a data platform for criminal justice reform
+# Copyright (C) 2023 Recidiviz, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# =============================================================================
+"""Load and parse RAW_DATA_REFERENCES_YAML.
+"""
+
+import os
+from collections import defaultdict
+from typing import Dict, Set
+
+import yaml
+
+import recidiviz
+from recidiviz.big_query.big_query_address import BigQueryAddress
+from recidiviz.common.constants.states import StateCode
+from recidiviz.utils import environment
+
+# Location of the reference-reasons YAML relative to the `recidiviz` package.
+RAW_DATA_REFERENCES_YAML = "view_registry/raw_data_reference_reasons.yaml"
+# Directory containing the `recidiviz` package's own module file.
+_RECIDIVIZ_PACKAGE_DIR = os.path.dirname(recidiviz.__file__)
+# Path to the YAML file, resolved against the package directory.
+RAW_DATA_REFERENCES_YAML_PATH = os.path.join(
+    _RECIDIVIZ_PACKAGE_DIR, RAW_DATA_REFERENCES_YAML
+)
+
+
+class RawDataReferenceReasonsYamlLoader:
+    """Class responsible for loading and parsing RAW_DATA_REFERENCES_YAML.
+
+    The file is read lazily on first access. Both the parsed YAML and its
+    converted form are cached on the class, and a non-empty cache is reused.
+    """
+
+    # Converted data: StateCode -> file tag -> addresses of referencing views.
+    _yaml_data: Dict[StateCode, Dict[str, Set[BigQueryAddress]]] = {}
+    # Data as returned by yaml.safe_load.
+    # NOTE(review): the innermost value looks like a mapping of view address to a
+    # reason string rather than a Set[str] -- confirm and tighten the annotation.
+    _raw_yaml_data: Dict[str, Dict[str, Set[str]]] = {}
+
+    @classmethod
+    def get_raw_yaml_data(cls) -> Dict[str, Dict[str, Set[str]]]:
+        """Returns the unconverted YAML contents, loading the file if needed."""
+        if not cls._raw_yaml_data:
+            cls._load_yaml()
+        return cls._raw_yaml_data
+
+    @classmethod
+    @environment.test_only
+    def get_yaml_data(cls) -> Dict[StateCode, Dict[str, Set[BigQueryAddress]]]:
+        """Returns the converted YAML contents, loading the file if needed."""
+        if not cls._yaml_data:
+            cls._load_yaml()
+        return cls._yaml_data
+
+    @classmethod
+    @environment.test_only
+    def reset_data(cls) -> None:
+        """Clears both caches so that the next access reloads the file."""
+        cls._yaml_data = {}
+        cls._raw_yaml_data = {}
+
+    @classmethod
+    def get_downstream_referencing_views(
+        cls, state_code: StateCode
+    ) -> Dict[str, Set[BigQueryAddress]]:
+        """Get raw data filetags and downstream referencing views for a given region code.
+
+        Returns the cached mapping for |state_code|, or an empty defaultdict(set)
+        when the state has no entry in the YAML.
+        """
+        if not cls._yaml_data:
+            cls._load_yaml()
+        return cls._yaml_data.get(state_code, defaultdict(set))
+
+    @classmethod
+    def _load_yaml(cls, yaml_path: str = RAW_DATA_REFERENCES_YAML_PATH) -> None:
+        """Reads |yaml_path|, converts it, and stores both forms on the class.
+
+        Both caches are assigned only after reading and conversion have succeeded,
+        so a failed load never leaves one cache populated without the other.
+
+        Raises:
+            RuntimeError: if the file cannot be opened, parsed or converted.
+        """
+        try:
+            with open(yaml_path, "r", encoding="utf-8") as yaml_file:
+                raw_yaml_data = yaml.safe_load(yaml_file)
+            yaml_data = cls.convert_raw_yaml_data_to_objs(raw_yaml_data)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to load or parse YAML data from {yaml_path}: {e}"
+            ) from e
+        # Commit both results together now that nothing else can fail.
+        cls._raw_yaml_data = raw_yaml_data
+        cls._yaml_data = yaml_data
+
+    @staticmethod
+    def convert_raw_yaml_data_to_objs(
+        references: Dict[str, Dict[str, Set[str]]]
+    ) -> Dict[StateCode, Dict[str, Set[BigQueryAddress]]]:
+        """Converts state code strings to StateCode and view strings to BigQueryAddress.
+
+        Only the items produced by iterating each file tag's value are used as
+        view addresses; for a mapping that means its keys.
+        """
+        return {
+            StateCode(state_code): {
+                file_tag: {BigQueryAddress.from_str(view) for view in views}
+                for file_tag, views in file_tags.items()
+            }
+            for state_code, file_tags in references.items()
+        }