fix(dataset): unset date_created after import (#2373)
m-alisafaee committed Sep 28, 2021
1 parent 449ec7b commit 8e120fe
Showing 12 changed files with 61 additions and 33 deletions.
2 changes: 1 addition & 1 deletion renku/core/commands/dataset.py
@@ -262,7 +262,7 @@ def _add_to_dataset(
     dataset.update_metadata_from(with_metadata)
 
     # TODO: Remove this once we have a proper database dispatcher for injection
-    # we need to commit because "project clone" changes injection, so tha database instance here
+    # we need to commit because "project clone" changes injection, so the database instance here
     # is not the same as the one in CommandBuilder
     database_gateway.commit()
12 changes: 6 additions & 6 deletions renku/core/management/migrations/models/v9.py
@@ -63,7 +63,7 @@
 from renku.core.models.git import get_user_info
 from renku.core.models.provenance.annotation import AnnotationSchema
 from renku.core.models.refs import LinkReference
-from renku.core.utils.datetime8601 import parse_date
+from renku.core.utils.datetime8601 import fix_datetime, parse_date
 from renku.core.utils.doi import extract_doi, is_doi
 from renku.core.utils.urls import get_host, get_slug
 from renku.version import __version__, version_url
@@ -1875,7 +1875,7 @@ def fix_datetimes(self, obj, many=False, **kwargs):
"""Pre dump hook."""
if many:
return [self.fix_datetimes(o, many=False, **kwargs) for o in obj]
obj.created = self._fix_timezone(obj.created)
obj.created = fix_datetime(obj.created)
return obj


@@ -1978,7 +1978,7 @@ def fix_datetimes(self, obj, many=False, **kwargs):
"""Pre dump hook."""
if many:
return [self.fix_datetimes(o, many=False, **kwargs) for o in obj]
object.__setattr__(obj, "created", self._fix_timezone(obj.created))
object.__setattr__(obj, "created", fix_datetime(obj.created))
return obj


@@ -2018,7 +2018,7 @@ def fix_datetimes(self, obj, many=False, **kwargs):
"""Pre dump hook."""
if many:
return [self.fix_datetimes(o, many=False, **kwargs) for o in obj]
obj.added = self._fix_timezone(obj.added)
obj.added = fix_datetime(obj.added)
return obj


@@ -2079,8 +2079,8 @@ def fix_datetimes(self, obj, many=False, **kwargs):
"""Pre dump hook."""
if many:
return [self.fix_datetimes(o, many=False, **kwargs) for o in obj]
obj.date_published = self._fix_timezone(obj.date_published)
obj.date_created = self._fix_timezone(obj.date_created)
obj.date_published = fix_datetime(obj.date_published)
obj.date_created = fix_datetime(obj.date_created)
return obj


6 changes: 0 additions & 6 deletions renku/core/models/calamus.py
@@ -26,8 +26,6 @@
 from calamus.utils import normalize_type, normalize_value
 from marshmallow.base import SchemaABC
 
-from renku.core.utils.datetime8601 import fix_timezone
-
 prov = fields.Namespace("http://www.w3.org/ns/prov#")
 rdfs = fields.Namespace("http://www.w3.org/2000/01/rdf-schema#")
 renku = fields.Namespace("https://swissdatasciencecenter.github.io/renku-ontology#")
@@ -80,10 +78,6 @@ def _add_field_to_data(self, data, name, value):
             raise ValueError(f"Field {name} is already in data {data}")
         data[name] = value
 
-    def _fix_timezone(self, value):
-        """Fix timezone of non-aware datetime objects."""
-        return fix_timezone(value)
-
 
 class Uri(fields._JsonLDField, marshmallow.fields.String, marshmallow.fields.Dict):
     """A Dict/String field."""
21 changes: 13 additions & 8 deletions renku/core/models/dataset.py
@@ -35,7 +35,7 @@
 from renku.core.models.entity import CollectionSchema, Entity, EntitySchema
 from renku.core.models.provenance.agent import Person, PersonSchema, SoftwareAgent
 from renku.core.models.provenance.annotation import Annotation, AnnotationSchema
-from renku.core.utils.datetime8601 import fix_timezone, local_now, parse_date
+from renku.core.utils.datetime8601 import fix_datetime, local_now, parse_date
 from renku.core.utils.git import get_path
 from renku.core.utils.urls import get_slug

@@ -237,8 +237,8 @@ def __init__(
         super().__init__()
 
         self.based_on: RemoteEntity = based_on
-        self.date_added: datetime = fix_timezone(date_added) or local_now()
-        self.date_removed: datetime = fix_timezone(date_removed)
+        self.date_added: datetime = fix_datetime(date_added) or local_now()
+        self.date_removed: datetime = fix_datetime(date_removed)
         self.entity: Entity = entity
         self.id: str = id or DatasetFile.generate_id()
         self.is_external: bool = is_external
@@ -292,7 +292,7 @@ def is_equal_to(self, other: "DatasetFile"):

     def remove(self, date: datetime = None):
         """Create a new instance and mark it as removed."""
-        date_removed = fix_timezone(date) or local_now()
+        date_removed = fix_datetime(date) or local_now()
         self.date_removed = date_removed

def is_removed(self) -> bool:
Expand Down Expand Up @@ -341,6 +341,8 @@ def __init__(
         # if `date_published` is set, we are probably dealing with an imported dataset so `date_created` is not needed
         if date_published:
             date_created = None
+        else:
+            date_created = fix_datetime(date_created) or local_now()
         if initial_identifier is None:
             assert identifier is None, "Initial identifier can be None only when creating a new Dataset."
             initial_identifier = identifier = uuid4().hex
@@ -352,9 +354,9 @@
         self.creators: List[Person] = creators or []
         # `dataset_files` includes existing files and those that have been removed in the previous version
         self.dataset_files: List[DatasetFile] = dataset_files or []
-        self.date_created: datetime = fix_timezone(date_created) or local_now()
-        self.date_published: datetime = fix_timezone(date_published)
-        self.date_removed: datetime = fix_timezone(date_removed)
+        self.date_created: datetime = date_created
+        self.date_published: datetime = fix_datetime(date_published)
+        self.date_removed: datetime = fix_datetime(date_removed)
         self.derived_from: Url = derived_from
         self.description: str = description
         self.images: List[ImageObject] = images or []
@@ -478,7 +480,7 @@ def _assign_new_identifier(self, identifier: str):

     def remove(self, date: datetime = None):
         """Mark the dataset as removed."""
-        self.date_removed = fix_timezone(date) or local_now()
+        self.date_removed = fix_datetime(date) or local_now()
 
     def is_removed(self) -> bool:
         """Return true if dataset is removed."""
@@ -534,6 +536,9 @@ def update_metadata_from(self, other: "Dataset"):
             value = getattr(other, name)
             setattr(self, name, value)
 
+        if self.date_published is not None:
+            self.date_created = None
+
     def update_metadata(self, **kwargs):
         """Updates metadata."""
         editable_attributes = ["creators", "description", "keywords", "title"]
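The two hunks above carry the core of this fix as a single invariant: an imported dataset (one with `date_published` set) never carries a `date_created`, while a locally created dataset always gets one. A minimal sketch of that invariant, using a hypothetical stand-in class rather than the real Dataset model:

from datetime import datetime, timezone

class DatasetDates:
    """Illustrative stand-in for Dataset's date handling; not the real class."""

    def __init__(self, date_created=None, date_published=None):
        if date_published:
            # Imported/published dataset: the creation date is unset on purpose.
            self.date_created = None
        else:
            # Local dataset: default to "now" at second granularity.
            self.date_created = date_created or datetime.now(timezone.utc).replace(microsecond=0)
        self.date_published = date_published

imported = DatasetDates(date_published=datetime(2020, 8, 10, tzinfo=timezone.utc))
assert imported.date_created is None

local = DatasetDates()
assert local.date_created is not None and local.date_published is None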
4 changes: 2 additions & 2 deletions renku/core/models/project.py
@@ -28,7 +28,7 @@
 from renku.core.models.calamus import DateTimeList, JsonLDSchema, Nested, StringList, fields, oa, prov, renku, schema
 from renku.core.models.provenance.agent import Person, PersonSchema
 from renku.core.models.provenance.annotation import Annotation, AnnotationSchema
-from renku.core.utils.datetime8601 import fix_timezone, local_now, parse_date
+from renku.core.utils.datetime8601 import fix_datetime, local_now, parse_date
 from renku.core.utils.scm import normalize_to_ascii


@@ -67,7 +67,7 @@ def __init__(
         self.annotations: List[Annotation] = annotations or []
         self.automated_update: bool = automated_update
         self.creator: Person = creator
-        self.date_created: datetime = fix_timezone(date_created) or local_now()
+        self.date_created: datetime = fix_datetime(date_created) or local_now()
         self.description: str = description
         self.id: str = id
         self.immutable_template_files: List[str] = immutable_template_files
15 changes: 10 additions & 5 deletions renku/core/utils/datetime8601.py
@@ -54,12 +54,17 @@ def parse_date(value):
     return date
 
 
-def fix_timezone(value):
-    """Fix timezone of non-aware datetime objects."""
+def fix_datetime(value):
+    """Fix timezone of non-aware datetime objects and remove microseconds."""
     if value is None:
         return
-    if isinstance(value, datetime) and not value.tzinfo:
-        value = _set_to_local_timezone(value)
+
+    if isinstance(value, datetime):
+        if not value.tzinfo:
+            value = _set_to_local_timezone(value)
+        if value.microsecond:
+            value = value.replace(microsecond=0)
+
     return value


@@ -70,4 +75,4 @@ def _set_to_local_timezone(value):

 def local_now():
     """Return current datetime in local timezone."""
-    return datetime.now(timezone.utc).astimezone()
+    return datetime.now(timezone.utc).replace(microsecond=0).astimezone()
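Together, fix_datetime and local_now normalize every timestamp the models store: naive datetimes get a timezone attached and microseconds are dropped, so values round-trip cleanly through second-granularity serialization. A rough, self-contained sketch of the behavior (datetime.astimezone() stands in for the _set_to_local_timezone helper, whose body is not shown in this diff):

from datetime import datetime

def fix_datetime(value):
    """Fix timezone of non-aware datetime objects and remove microseconds."""
    if value is None:
        return
    if isinstance(value, datetime):
        if not value.tzinfo:
            value = value.astimezone()  # naive values are assumed to be local time
        if value.microsecond:
            value = value.replace(microsecond=0)
    return value

naive = datetime(2021, 9, 28, 12, 0, 0, 115412)
fixed = fix_datetime(naive)
assert fixed.tzinfo is not None and fixed.microsecond == 0
assert fix_datetime(None) is None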
6 changes: 5 additions & 1 deletion tests/cli/test_datasets.py
@@ -21,6 +21,7 @@
 import os
 import shutil
 import textwrap
+import time
 from pathlib import Path
 
 import git
@@ -1680,20 +1681,23 @@ def test_immutability_for_files(directory_tree, runner, client, load_dataset_with_injection):

     old_dataset = load_dataset_with_injection("my-data", client)
 
+    time.sleep(1)
     # Add some files
     assert 0 == runner.invoke(cli, ["dataset", "add", "my-data", str(directory_tree)]).exit_code
 
     dataset = load_dataset_with_injection("my-data", client)
     assert_dataset_is_mutated(old=old_dataset, new=dataset)
     old_dataset = dataset
 
+    time.sleep(1)
     # Add the same files again; it should mutate because files addition dates change
     assert 0 == runner.invoke(cli, ["dataset", "add", "my-data", "--overwrite", str(directory_tree)]).exit_code
 
     dataset = load_dataset_with_injection("my-data", client)
     assert_dataset_is_mutated(old=old_dataset, new=dataset)
     old_dataset = dataset
 
+    time.sleep(1)
     # Remove some files
     assert 0 == runner.invoke(cli, ["dataset", "unlink", "my-data", "-I", "file1", "--yes"]).exit_code

@@ -1856,7 +1860,7 @@ def test_datasets_provenance_after_add_with_overwrite(
 ):
     """Test datasets provenance is updated if adding and overwriting same files."""
     assert 0 == runner.invoke(cli, ["dataset", "add", "my-data", "--create", str(directory_tree)]).exit_code
-
+    time.sleep(1)
     assert 0 == runner.invoke(cli, ["dataset", "add", "my-data", "--overwrite", str(directory_tree)]).exit_code
 
     with get_datasets_provenance_with_injection(client) as datasets_provenance:
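The new time.sleep(1) calls compensate for the timestamps losing their microseconds: two dataset mutations within the same second now get identical dates, and the mutation assertions can no longer order the versions. A small illustrative sketch of the failure mode (the helper name is made up for the example):

import time
from datetime import datetime, timezone

def second_granularity_now():
    # Mirrors local_now() above: microseconds are stripped.
    return datetime.now(timezone.utc).replace(microsecond=0)

first = second_granularity_now()
second = second_granularity_now()
assert first <= second  # without a sleep, equality is entirely possible

time.sleep(1)
third = second_granularity_now()
assert first < third  # a one-second sleep guarantees a strictly later timestamp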
5 changes: 4 additions & 1 deletion tests/cli/test_integration_datasets.py
@@ -35,7 +35,6 @@
 from tests.utils import assert_dataset_is_mutated, format_result_exception, retry_failed, with_dataset
 
 
-@pytest.mark.skip("Add dataset doesn't store the dataset, investigate why this fails")
 @pytest.mark.integration
 @retry_failed
 @pytest.mark.parametrize(
@@ -87,6 +86,8 @@ def test_dataset_import_real_doi(runner, client, doi, prefix, sleep_after, load_dataset_with_injection):

     dataset = load_dataset_with_injection(doi["name"], client)
     assert doi["doi"] in dataset.same_as.url
+    assert dataset.date_created is None
+    assert dataset.date_published is not None
 
 
 @pytest.mark.parametrize(
@@ -1013,6 +1014,8 @@ def test_dataset_update_zenodo(client, runner, doi, load_dataset_with_injection)
     assert after_dataset.derived_from is None
     assert after_dataset.same_as is not None
     assert after_dataset.same_as != before_dataset.same_as
+    assert after_dataset.date_created is None
+    assert after_dataset.date_published is not None
 
 
 @pytest.mark.integration
4 changes: 2 additions & 2 deletions tests/cli/test_migrate.py
@@ -193,7 +193,7 @@ def test_comprehensive_dataset_migration(
assert "Cornell University" == dataset.creators[0].affiliation
assert "Rooth, Mats" == dataset.creators[0].name
assert dataset.date_published is None
assert "2020-08-10T21:35:05.115412+00:00" == dataset.date_created.isoformat("T")
assert "2020-08-10T21:35:05+00:00" == dataset.date_created.isoformat("T")
assert "Replication material for a paper to be presented" in dataset.description
assert "https://doi.org/10.7910/DVN/EV6KLF" == dataset.same_as.url
assert "1" == tags[0].name
@@ -203,7 +203,7 @@

     file_ = dataset.find_file("data/dataverse/copy.sh")
     assert "https://dataverse.harvard.edu/api/access/datafile/3050656" == file_.source
-    assert "2020-08-10T21:35:10.877832+00:00" == file_.date_added.isoformat("T")
+    assert "2020-08-10T21:35:10+00:00" == file_.date_added.isoformat("T")
     assert file_.based_on is None
     assert not hasattr(file_, "creators")

8 changes: 8 additions & 0 deletions tests/cli/test_rerun.py
@@ -19,6 +19,7 @@

 import os
 import subprocess
+import time
 from pathlib import Path
 
 import git
@@ -247,7 +248,9 @@ def test_rerun_overridden_output(project, renku_cli, runner):
     write_and_commit_file(repo, a, "content")
 
     assert 0 == runner.invoke(cli, ["run", "--name", "r1", "cp", a, b]).exit_code
+    time.sleep(1)
     assert 0 == runner.invoke(cli, ["run", "--name", "r2", "cp", b, c]).exit_code
+    time.sleep(1)
     assert 0 == renku_cli("run", "--name", "r3", "wc", a, stdout=c).exit_code
 
     result = runner.invoke(cli, ["rerun", "--dry-run", c])
@@ -269,7 +272,9 @@ def test_rerun_overridden_outputs_partially(project, renku_cli, runner):
     write_and_commit_file(repo, a, "content")
 
     assert 0 == runner.invoke(cli, ["run", "--name", "r1", "cp", a, b]).exit_code
+    time.sleep(1)
     assert 0 == renku_cli("run", "--name", "r2", "tee", c, d, stdin=b).exit_code
+    time.sleep(1)
     assert 0 == renku_cli("run", "--name", "r3", "wc", a, stdout=c).exit_code
 
     result = runner.invoke(cli, ["rerun", "--dry-run", c])
@@ -299,8 +304,11 @@ def test_rerun_multiple_paths_common_output(project, renku_cli, runner):
     write_and_commit_file(repo, a, "content")
 
     assert 0 == runner.invoke(cli, ["run", "--name", "r1", "cp", a, b]).exit_code
+    time.sleep(1)
     assert 0 == runner.invoke(cli, ["run", "--name", "r2", "cp", b, d]).exit_code
+    time.sleep(1)
     assert 0 == runner.invoke(cli, ["run", "--name", "r3", "cp", a, c]).exit_code
+    time.sleep(1)
     assert 0 == renku_cli("run", "--name", "r4", "wc", c, stdout=d).exit_code
 
     result = runner.invoke(cli, ["rerun", "--dry-run", d])
8 changes: 8 additions & 0 deletions tests/cli/test_update.py
@@ -18,6 +18,7 @@
"""Test ``update`` command."""

import os
import time
from pathlib import Path

import git
@@ -348,7 +349,9 @@ def test_update_overridden_output(project, renku_cli, runner):
     write_and_commit_file(repo, a, "content")
 
     assert 0 == runner.invoke(cli, ["run", "--name", "r1", "cp", a, b]).exit_code
+    time.sleep(1)
     assert 0 == runner.invoke(cli, ["run", "--name", "r2", "cp", b, c]).exit_code
+    time.sleep(1)
     assert 0 == renku_cli("run", "--name", "r3", "wc", a, stdout=c).exit_code
 
     write_and_commit_file(repo, a, "new content")
@@ -372,7 +375,9 @@ def test_update_overridden_outputs_partially(project, renku_cli, runner):
     write_and_commit_file(repo, a, "content")
 
     assert 0 == runner.invoke(cli, ["run", "--name", "r1", "cp", a, b]).exit_code
+    time.sleep(1)
     assert 0 == renku_cli("run", "--name", "r2", "tee", c, d, stdin=b).exit_code
+    time.sleep(1)
     assert 0 == renku_cli("run", "--name", "r3", "wc", a, stdout=c).exit_code
 
     write_and_commit_file(repo, a, "new content")
@@ -397,8 +402,11 @@ def test_update_multiple_paths_common_output(project, renku_cli, runner):
     write_and_commit_file(repo, a, "content")
 
     assert 0 == runner.invoke(cli, ["run", "--name", "r1", "cp", a, b]).exit_code
+    time.sleep(1)
     assert 0 == runner.invoke(cli, ["run", "--name", "r2", "cp", b, d]).exit_code
+    time.sleep(1)
     assert 0 == runner.invoke(cli, ["run", "--name", "r3", "cp", a, c]).exit_code
+    time.sleep(1)
     assert 0 == renku_cli("run", "--name", "r4", "wc", c, stdout=d).exit_code
 
     write_and_commit_file(repo, a, "new content")
3 changes: 2 additions & 1 deletion tests/utils.py
@@ -82,7 +82,8 @@ def assert_dataset_is_mutated(old: Dataset, new: Dataset, mutator=None):
     assert old.id != new.id
     assert old.identifier != new.identifier
     assert old.id == new.derived_from.url_id
-    assert old.date_created != new.date_created
+    if old.date_created and new.date_created:
+        assert old.date_created <= new.date_created
     assert new.same_as is None
     assert new.date_published is None
     assert new.identifier in new.id
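With microseconds stripped, an original dataset and its mutated copy can legitimately fall within the same creation second, so the strict inequality above becomes a monotonicity check, and one that is skipped when either date is unset (as for imported datasets, whose date_created is now None). An illustrative sketch:

from datetime import datetime, timezone

# Both versions truncate to the same second, yet the mutation is valid.
old_date = new_date = datetime(2021, 9, 28, 12, 0, 0, tzinfo=timezone.utc)
assert old_date <= new_date  # `old_date != new_date` would wrongly fail here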
