Skip to content

Commit

Permalink
Merge pull request #1467 from Sage-Bionetworks/develop-filepath2-manifest-gen-FDS-2278
Browse files Browse the repository at this point in the history

Update existing file paths in manifests at generation to conform to new convention
  • Loading branch information
GiaJordan committed Aug 30, 2024
2 parents def3bde + 8c77ccf commit 12b63e3
Show file tree
Hide file tree
Showing 7 changed files with 309 additions and 59 deletions.
68 changes: 55 additions & 13 deletions schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,19 @@ def query_fileview(
self.storageFileviewTable = self.syn.tableQuery(
query=self.fileview_query,
).asDataFrame()
except SynapseHTTPError:
raise AccessCredentialsError(self.storageFileview)
except SynapseHTTPError as exc:
exception_text = str(exc)
if "Unknown column path" in exception_text:
raise ValueError(
"The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation."
)
elif "Unknown column" in exception_text:
missing_column = exception_text.split("Unknown column ")[-1]
raise ValueError(
f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
)
else:
raise AccessCredentialsError(self.storageFileview)

def _build_query(
self, columns: Optional[list] = None, where_clauses: Optional[list] = None
Expand Down Expand Up @@ -788,18 +799,49 @@ def fill_in_entity_id_filename(
# note that if there is an existing manifest and there are files in the dataset
# the columns Filename and entityId are assumed to be present in manifest schema
# TODO: use idiomatic panda syntax
if dataset_files:
new_files = self._get_file_entityIds(
dataset_files=dataset_files, only_new_files=True, manifest=manifest
)
if not dataset_files:
manifest = manifest.fillna("")
return dataset_files, manifest

# update manifest so that it contains new dataset files
new_files = pd.DataFrame(new_files)
manifest = (
pd.concat([manifest, new_files], sort=False)
.reset_index()
.drop("index", axis=1)
)
all_files = self._get_file_entityIds(
dataset_files=dataset_files, only_new_files=False, manifest=manifest
)
new_files = self._get_file_entityIds(
dataset_files=dataset_files, only_new_files=True, manifest=manifest
)

all_files = pd.DataFrame(all_files)
new_files = pd.DataFrame(new_files)

# update manifest so that it contains new dataset files
manifest = (
pd.concat([manifest, new_files], sort=False)
.reset_index()
.drop("index", axis=1)
)

# Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
manifest_reindex = manifest.set_index("entityId")
all_files_reindex = all_files.set_index("entityId")
all_files_reindex_like_manifest = all_files_reindex.reindex_like(
manifest_reindex
)

# Check if individual file paths in manifest and from synapse match
file_paths_match = (
manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
)

# If all the paths do not match, update the manifest with the filepaths from synapse
if not file_paths_match.all():
manifest_reindex.loc[
~file_paths_match, "Filename"
] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]

# reformat manifest for further use
manifest = manifest_reindex.reset_index()
entityIdCol = manifest.pop("entityId")
manifest.insert(len(manifest.columns), "entityId", entityIdCol)

manifest = manifest.fillna("")
return dataset_files, manifest
Expand Down
11 changes: 11 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from dotenv import load_dotenv

from schematic.configuration.configuration import CONFIG
from schematic.models.metadata import MetadataModel
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
from schematic.store.synapse import SynapseStorage
Expand Down Expand Up @@ -149,3 +150,13 @@ def DMGE(helpers: Helpers) -> DataModelGraphExplorer:
"""Fixture to instantiate a DataModelGraphExplorer object."""
dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld")
return dmge


def metadata_model(helpers, data_model_labels: str) -> MetadataModel:
    """Build a MetadataModel wired to the example JSON-LD data model.

    Deliberately a plain function rather than a pytest fixture so that tests
    can call it directly with the label convention they need per case.

    Args:
        helpers: test Helpers object; used to resolve the on-disk path of the
            example data model via ``get_data_path``.
        data_model_labels: label convention forwarded to ``MetadataModel``
            (e.g. ``"class_label"``).

    Returns:
        A ``MetadataModel`` backed by the local ``example.model.jsonld`` file.
    """
    # Use a distinct local name so it does not shadow this function's name.
    model = MetadataModel(
        inputMModelLocation=helpers.get_data_path("example.model.jsonld"),
        data_model_labels=data_model_labels,
        inputMModelLocationType="local",
    )

    return model
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA,Id,entityId
schematic - main/Test Filename Upload/txt1.txt,1.0,,BulkRNA-seqAssay,,,01ded8fc-0915-4959-85ab-64e9644c8787,syn62276954
schematic - main/Test Filename Upload/txt2.txt,2.0,,BulkRNA-seqAssay,,,fd122bb5-3353-4c94-b1f5-0bb93a3e9fc9,syn62276956
55 changes: 55 additions & 0 deletions tests/integration/test_metadata_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import logging
from contextlib import nullcontext as does_not_raise

from pytest_mock import MockerFixture

from schematic.store.synapse import SynapseStorage
from tests.conftest import metadata_model

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


class TestMetadataModel:
    """Integration tests for submitting manifests through ``MetadataModel``."""

    def test_submit_filebased_manifest(self, helpers, mocker: MockerFixture):
        """Submit a file-based manifest and verify it is uploaded as a CSV with
        file annotations, and that the table / combo upload paths are not used.

        NOTE(review): hits live Synapse entities (dataset syn62276880) — assumes
        valid Synapse credentials are available in the test environment; confirm.
        """
        # spies on the SynapseStorage upload paths so we can assert which one ran
        spy_upload_file_as_csv = mocker.spy(SynapseStorage, "upload_manifest_as_csv")
        spy_upload_file_as_table = mocker.spy(
            SynapseStorage, "upload_manifest_as_table"
        )
        spy_upload_file_combo = mocker.spy(SynapseStorage, "upload_manifest_combo")
        spy_add_annotations = mocker.spy(
            SynapseStorage, "add_annotations_to_entities_files"
        )

        # GIVEN a metadata model object using class labels
        meta_data_model = metadata_model(helpers, "class_label")

        # AND a file-based test manifest
        manifest_path = helpers.get_data_path(
            "mock_manifests/filepath_submission_test_manifest.csv"
        )

        # WHEN the manifest is submitted
        # THEN submission should complete without error
        with does_not_raise():
            manifest_id = meta_data_model.submit_metadata_manifest(
                manifest_path=manifest_path,
                dataset_id="syn62276880",
                manifest_record_type="file_and_entities",
                restrict_rules=False,
                file_annotations_upload=True,
                hide_blanks=False,
            )

        # AND the manifest should be submitted to the correct place
        assert manifest_id == "syn62280543"

        # AND the manifest should be uploaded as a CSV
        spy_upload_file_as_csv.assert_called_once()
        # AND annotations should be added to the files
        spy_add_annotations.assert_called_once()

        # AND the manifest should not be uploaded as a table or combination of table, file, and entities
        spy_upload_file_as_table.assert_not_called()
        spy_upload_file_combo.assert_not_called()
60 changes: 49 additions & 11 deletions tests/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,39 +763,77 @@ def test_create_manifests(
assert all_results == expected_result

@pytest.mark.parametrize(
"component,datasetId",
[("Biospecimen", "syn61260107"), ("BulkRNA-seqAssay", "syn61374924")],
"component,datasetId,expected_file_based,expected_rows,expected_files",
[
("Biospecimen", "syn61260107", False, 4, None),
(
"BulkRNA-seqAssay",
"syn61374924",
True,
4,
pd.Series(
[
"schematic - main/BulkRNASeq and files/txt1.txt",
"schematic - main/BulkRNASeq and files/txt2.txt",
"schematic - main/BulkRNASeq and files/txt4.txt",
"schematic - main/BulkRNASeq and files/txt3.txt",
],
name="Filename",
),
),
],
ids=["Record based", "File based"],
)
def test_get_manifest_with_files(self, helpers, component, datasetId):
def test_get_manifest_with_files(
self,
helpers,
component,
datasetId,
expected_file_based,
expected_rows,
expected_files,
):
"""
Test to ensure that when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well
Test to ensure that
when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well
when generating a file based manifest from a dataset thathas had files added that the files are added correctly
"""
# GIVEN the example data model
path_to_data_model = helpers.get_data_path("example.model.jsonld")

# AND a graph data model
graph_data_model = generate_graph_data_model(
helpers,
path_to_data_model=path_to_data_model,
data_model_labels="class_label",
)

# AND a manifest generator
generator = ManifestGenerator(
path_to_data_model=path_to_data_model,
graph=graph_data_model,
root=component,
use_annotations=True,
)

# WHEN a manifest is generated for the appropriate dataset as a dataframe
manifest = generator.get_manifest(
dataset_id=datasetId, output_format="dataframe"
)

filename_in_manifest_columns = "Filename" in manifest.columns
# AND it is determined if the manifest is filebased
is_file_based = "Filename" in manifest.columns

# AND the number of rows are checked
n_rows = manifest.shape[0]

if component == "Biospecimen":
assert not filename_in_manifest_columns
assert n_rows == 4
elif component == "BulkRNA-seqAssay":
assert filename_in_manifest_columns
assert n_rows == 4
# THEN the manifest should have the expected number of rows
assert n_rows == expected_rows

# AND the manifest should be filebased or not as expected
assert is_file_based == expected_file_based

# AND if the manifest is file based
if expected_file_based:
# THEN the manifest should have the expected files
assert manifest["Filename"].equals(expected_files)
2 changes: 1 addition & 1 deletion tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import logging
import os
from typing import Optional, Generator
from pathlib import Path
from typing import Generator, Optional
from unittest.mock import patch

import pytest
Expand Down
Loading

0 comments on commit 12b63e3

Please sign in to comment.