Skip to content

Commit

Permalink
Merge pull request #1467 from Sage-Bionetworks/develop-filepath2-manifest-gen-FDS-2278
Browse files Browse the repository at this point in the history

Update existing file paths in manifests at generation to conform to new convention
  • Loading branch information
GiaJordan committed Aug 30, 2024
2 parents def3bde + 8c77ccf commit 12b63e3
Show file tree
Hide file tree
Showing 7 changed files with 309 additions and 59 deletions.
68 changes: 55 additions & 13 deletions schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,19 @@ def query_fileview(
self.storageFileviewTable = self.syn.tableQuery(
query=self.fileview_query,
).asDataFrame()
except SynapseHTTPError:
raise AccessCredentialsError(self.storageFileview)
except SynapseHTTPError as exc:
exception_text = str(exc)
if "Unknown column path" in exception_text:
raise ValueError(
"The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation."
)
elif "Unknown column" in exception_text:
missing_column = exception_text.split("Unknown column ")[-1]
raise ValueError(
f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
)
else:
raise AccessCredentialsError(self.storageFileview)

def _build_query(
self, columns: Optional[list] = None, where_clauses: Optional[list] = None
Expand Down Expand Up @@ -788,18 +799,49 @@ def fill_in_entity_id_filename(
# note that if there is an existing manifest and there are files in the dataset
# the columns Filename and entityId are assumed to be present in manifest schema
# TODO: use idiomatic panda syntax
if dataset_files:
new_files = self._get_file_entityIds(
dataset_files=dataset_files, only_new_files=True, manifest=manifest
)
if not dataset_files:
manifest = manifest.fillna("")
return dataset_files, manifest

# update manifest so that it contains new dataset files
new_files = pd.DataFrame(new_files)
manifest = (
pd.concat([manifest, new_files], sort=False)
.reset_index()
.drop("index", axis=1)
)
all_files = self._get_file_entityIds(
dataset_files=dataset_files, only_new_files=False, manifest=manifest
)
new_files = self._get_file_entityIds(
dataset_files=dataset_files, only_new_files=True, manifest=manifest
)

all_files = pd.DataFrame(all_files)
new_files = pd.DataFrame(new_files)

# update manifest so that it contains new dataset files
manifest = (
pd.concat([manifest, new_files], sort=False)
.reset_index()
.drop("index", axis=1)
)

# Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
manifest_reindex = manifest.set_index("entityId")
all_files_reindex = all_files.set_index("entityId")
all_files_reindex_like_manifest = all_files_reindex.reindex_like(
manifest_reindex
)

# Check if individual file paths in manifest and from synapse match
file_paths_match = (
manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
)

# If all the paths do not match, update the manifest with the filepaths from synapse
if not file_paths_match.all():
manifest_reindex.loc[
~file_paths_match, "Filename"
] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]

# reformat manifest for further use
manifest = manifest_reindex.reset_index()
entityIdCol = manifest.pop("entityId")
manifest.insert(len(manifest.columns), "entityId", entityIdCol)

manifest = manifest.fillna("")
return dataset_files, manifest
Expand Down
11 changes: 11 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from dotenv import load_dotenv

from schematic.configuration.configuration import CONFIG
from schematic.models.metadata import MetadataModel
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
from schematic.store.synapse import SynapseStorage
Expand Down Expand Up @@ -149,3 +150,13 @@ def DMGE(helpers: Helpers) -> DataModelGraphExplorer:
"""Fixture to instantiate a DataModelGraphExplorer object."""
dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld")
return dmge


def metadata_model(helpers, data_model_labels: str) -> MetadataModel:
    """Build a MetadataModel wired to the example JSON-LD data model.

    Deliberately a plain function rather than a pytest fixture so that tests
    can call it directly with the label convention they need per case.

    Args:
        helpers: test Helpers object; used to resolve the on-disk path of the
            example data model via ``get_data_path``.
        data_model_labels: label convention forwarded to ``MetadataModel``
            (e.g. ``"class_label"``).

    Returns:
        A ``MetadataModel`` backed by the local ``example.model.jsonld`` file.
    """
    # Use a distinct local name so it does not shadow this function's name.
    model = MetadataModel(
        inputMModelLocation=helpers.get_data_path("example.model.jsonld"),
        data_model_labels=data_model_labels,
        inputMModelLocationType="local",
    )

    return model
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA,Id,entityId
schematic - main/Test Filename Upload/txt1.txt,1.0,,BulkRNA-seqAssay,,,01ded8fc-0915-4959-85ab-64e9644c8787,syn62276954
schematic - main/Test Filename Upload/txt2.txt,2.0,,BulkRNA-seqAssay,,,fd122bb5-3353-4c94-b1f5-0bb93a3e9fc9,syn62276956
55 changes: 55 additions & 0 deletions tests/integration/test_metadata_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import logging
from contextlib import nullcontext as does_not_raise

from pytest_mock import MockerFixture

from schematic.store.synapse import SynapseStorage
from tests.conftest import metadata_model

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


class TestMetadataModel:
    """Integration tests for submitting manifests through ``MetadataModel``."""

    def test_submit_filebased_manifest(self, helpers, mocker: MockerFixture):
        """Submit a file-based manifest and verify it is uploaded as a CSV with
        file annotations, and that the table / combo upload paths are not used.

        NOTE(review): hits live Synapse entities (dataset syn62276880) — assumes
        valid Synapse credentials are available in the test environment; confirm.
        """
        # spies on the SynapseStorage upload paths so we can assert which one ran
        spy_upload_file_as_csv = mocker.spy(SynapseStorage, "upload_manifest_as_csv")
        spy_upload_file_as_table = mocker.spy(
            SynapseStorage, "upload_manifest_as_table"
        )
        spy_upload_file_combo = mocker.spy(SynapseStorage, "upload_manifest_combo")
        spy_add_annotations = mocker.spy(
            SynapseStorage, "add_annotations_to_entities_files"
        )

        # GIVEN a metadata model object using class labels
        meta_data_model = metadata_model(helpers, "class_label")

        # AND a file-based test manifest
        manifest_path = helpers.get_data_path(
            "mock_manifests/filepath_submission_test_manifest.csv"
        )

        # WHEN the manifest is submitted
        # THEN submission should complete without error
        with does_not_raise():
            manifest_id = meta_data_model.submit_metadata_manifest(
                manifest_path=manifest_path,
                dataset_id="syn62276880",
                manifest_record_type="file_and_entities",
                restrict_rules=False,
                file_annotations_upload=True,
                hide_blanks=False,
            )

        # AND the manifest should be submitted to the correct place
        assert manifest_id == "syn62280543"

        # AND the manifest should be uploaded as a CSV
        spy_upload_file_as_csv.assert_called_once()
        # AND annotations should be added to the files
        spy_add_annotations.assert_called_once()

        # AND the manifest should not be uploaded as a table or combination of table, file, and entities
        spy_upload_file_as_table.assert_not_called()
        spy_upload_file_combo.assert_not_called()
60 changes: 49 additions & 11 deletions tests/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,39 +763,77 @@ def test_create_manifests(
assert all_results == expected_result

@pytest.mark.parametrize(
"component,datasetId",
[("Biospecimen", "syn61260107"), ("BulkRNA-seqAssay", "syn61374924")],
"component,datasetId,expected_file_based,expected_rows,expected_files",
[
("Biospecimen", "syn61260107", False, 4, None),
(
"BulkRNA-seqAssay",
"syn61374924",
True,
4,
pd.Series(
[
"schematic - main/BulkRNASeq and files/txt1.txt",
"schematic - main/BulkRNASeq and files/txt2.txt",
"schematic - main/BulkRNASeq and files/txt4.txt",
"schematic - main/BulkRNASeq and files/txt3.txt",
],
name="Filename",
),
),
],
ids=["Record based", "File based"],
)
def test_get_manifest_with_files(self, helpers, component, datasetId):
def test_get_manifest_with_files(
self,
helpers,
component,
datasetId,
expected_file_based,
expected_rows,
expected_files,
):
"""
Test to ensure that when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well
Test to ensure that
when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well
when generating a file based manifest from a dataset thathas had files added that the files are added correctly
"""
# GIVEN the example data model
path_to_data_model = helpers.get_data_path("example.model.jsonld")

# AND a graph data model
graph_data_model = generate_graph_data_model(
helpers,
path_to_data_model=path_to_data_model,
data_model_labels="class_label",
)

# AND a manifest generator
generator = ManifestGenerator(
path_to_data_model=path_to_data_model,
graph=graph_data_model,
root=component,
use_annotations=True,
)

# WHEN a manifest is generated for the appropriate dataset as a dataframe
manifest = generator.get_manifest(
dataset_id=datasetId, output_format="dataframe"
)

filename_in_manifest_columns = "Filename" in manifest.columns
# AND it is determined if the manifest is filebased
is_file_based = "Filename" in manifest.columns

# AND the number of rows are checked
n_rows = manifest.shape[0]

if component == "Biospecimen":
assert not filename_in_manifest_columns
assert n_rows == 4
elif component == "BulkRNA-seqAssay":
assert filename_in_manifest_columns
assert n_rows == 4
# THEN the manifest should have the expected number of rows
assert n_rows == expected_rows

# AND the manifest should be filebased or not as expected
assert is_file_based == expected_file_based

# AND if the manifest is file based
if expected_file_based:
# THEN the manifest should have the expected files
assert manifest["Filename"].equals(expected_files)
2 changes: 1 addition & 1 deletion tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import logging
import os
from typing import Optional, Generator
from pathlib import Path
from typing import Generator, Optional
from unittest.mock import patch

import pytest
Expand Down
Loading

0 comments on commit 12b63e3

Please sign in to comment.