Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions renku/core/commands/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ def add_to_dataset(
identifier = extract_doi(
with_metadata.identifier
) if with_metadata else None

try:
with client.with_dataset(
name=name, identifier=identifier, create=create
Expand Down
10 changes: 10 additions & 0 deletions renku/core/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,16 @@ def __init__(self):
super(NothingToCommit, self).__init__('There is nothing to commit.')


class DatasetFileExists(RenkuException):
    """Raise when file is already in dataset."""

    def __init__(self):
        """Build a custom message."""
        message = 'File already exists in dataset. Use --force to add.'
        super(DatasetFileExists, self).__init__(message)


class CommitMessageEmpty(RenkuException):
"""Raise invalid commit message."""

Expand Down
44 changes: 23 additions & 21 deletions renku/core/management/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,16 +146,10 @@ def with_dataset(self, name=None, identifier=None, create=False):
shutil.rmtree(path.parent, ignore_errors=True)
raise

# TODO
# if path is None:
# path = dataset_path / self.METADATA
# if path.exists():
# raise ValueError('Dataset already exists')

dataset.to_yaml()

def create_dataset(
self, name, short_name=None, description='', creators=()
self, name, short_name=None, description='', creators=None
):
"""Create a dataset."""
if not name:
Expand All @@ -176,13 +170,17 @@ def create_dataset(

identifier = str(uuid.uuid4())
path = (self.renku_datasets_path / identifier / self.METADATA)

try:
path.parent.mkdir(parents=True, exist_ok=False)
except FileExistsError:
raise errors.DatasetExistsError(
'Dataset with reference {} exists'.format(path.parent)
)

if creators is None:
creators = [Person.from_git(self.repo)]

with with_reference(path):
dataset = Dataset(
client=self,
Expand All @@ -196,8 +194,8 @@ def create_dataset(
dataset_ref = LinkReference.create(
client=self, name='datasets/' + short_name
)
dataset_ref.set_reference(path)

dataset_ref.set_reference(path)
dataset.to_yaml()

return dataset, path, dataset_ref
Expand Down Expand Up @@ -260,32 +258,36 @@ def add_data_to_dataset(
else:
raise errors.IgnoredFiles(ignored)

if dataset.contains_any(files) and force is False:
raise errors.DatasetFileExists()

# commit all new data
file_paths = {str(data['path']) for data in files if str(data['path'])}
self.repo.git.add(*(file_paths - set(ignored)))
files_to_add = (file_paths - set(ignored))

if not self.repo.is_dirty():
return warning_message
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The bug reported in #881 can be solved by moving the commit statement to this if block and removing return from here:

if not self.repo.is_dirty():
    self.repo.index.commit(
         'renku dataset: commiting {} newly added files'.
         format(len(file_paths) + len(ignored))
    )

Like this we avoid an empty commit if there is no change in data files and we allow the metadata to be updated (which was prevented previously by returning early).

Copy link
Contributor Author

@jsam jsam Dec 20, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And in that case adding ignored files would fail to be added?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. Added ignored files show up as dirty.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How? If they are ignored it means that they are part of the .gitignore and git won't register them at all.

Copy link
Contributor

@mohammad-alisafaee mohammad-alisafaee Dec 20, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[~/playground/tmp]$ rm -rf * .* ; renku init
OK
Initialized empty project in /home/mohammad/playground/tmp
[~/playground/tmp]$ grep .scrapy .gitignore 
.scrapy
[~/playground/tmp]$ echo .scrapy > .scrapy
[~/playground/tmp]$ git add .scrapy 
The following paths are ignored by one of your .gitignore files:
.scrapy
Use -f if you really want to add them.
[~/playground/tmp]$ git add -f .scrapy
[~/playground/tmp]$ git status -s
A  .scrapy

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If users do not use --force and there are ignored files then this function raises an exception and won't reach this point.

self.repo.git.add(*files_to_add)

self.repo.index.commit(
'renku dataset: commiting {} newly added files'.
format(len(file_paths) + len(ignored))
)
if self.repo.is_dirty():
commit_msg = ('renku dataset: '
'committing {} newly added files'
).format(len(file_paths) + len(ignored))

self.repo.index.commit(commit_msg)

# Generate the DatasetFiles
dataset_files = []
for data in files:
if os.path.basename(str(data['path'])) == '.git':
continue

datasetfile = DatasetFile.from_revision(self, **data)
dataset_file = DatasetFile.from_revision(self, **data)

# Set dataset file path relative to projects root for submodules
if datasetfile.client != self:
datasetfile.path = str(data['path'])
dataset_files.append(datasetfile)
dataset.update_files(dataset_files)
# Set dataset file path relative to root for submodules.
if dataset_file.client != self:
dataset_file.path = str(data['path'])
dataset_files.append(dataset_file)

dataset.update_files(dataset_files)
return warning_message

def _add_from_local(self, dataset, path, link, destination):
Expand Down
7 changes: 7 additions & 0 deletions renku/core/models/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,13 @@ def editable(self):
data = {field_: obj.pop(field_) for field_ in self.EDITABLE_FIELDS}
return data

def contains_any(self, files):
    """Check if files are already within a dataset."""
    # A file dict matches when its ``path`` resolves via ``find_file``.
    return any(self.find_file(entry['path']) for entry in files)

def find_file(self, filename, return_index=False):
"""Find a file in files container."""
for index, file_ in enumerate(self.files):
Expand Down
16 changes: 15 additions & 1 deletion renku/service/serializers/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Renku service datasets serializers."""
import marshmallow
from marshmallow import Schema, fields, post_load, pre_load

from renku.service.serializers.rpc import JsonRPCResponse
Expand Down Expand Up @@ -64,7 +65,8 @@ class DatasetCreateResponseRPC(JsonRPCResponse):
class DatasetAddFile(Schema):
"""Schema for dataset add file view."""

file_id = fields.String(required=True)
file_id = fields.String()
file_path = fields.String()


class DatasetAddRequest(Schema):
Expand All @@ -86,6 +88,18 @@ def default_commit_message(self, data, **kwargs):

return data

@post_load()
def check_files(self, data, **kwargs):
    """Validate the deserialized file list.

    Each file reference must carry exactly one of ``file_id`` or
    ``file_path``. Raises ``marshmallow.ValidationError`` when a
    reference has both (ambiguous) or neither (nothing to resolve) —
    the latter would otherwise crash the view when it formats an error
    from a ``None`` path.
    """
    for _file in data['files']:
        has_id = 'file_id' in _file
        has_path = 'file_path' in _file

        if has_id == has_path:
            raise marshmallow.ValidationError((
                'invalid reference found: '
                'use either `file_id` or `file_path`'
            ))

    return data


class DatasetAddResponse(Schema):
"""Response schema for dataset add file view."""
Expand Down
20 changes: 15 additions & 5 deletions renku/service/views/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# limitations under the License.
"""Renku service datasets view."""
import json
from pathlib import Path

from flask import Blueprint, jsonify, request
from flask_apispec import marshal_with, use_kwargs
Expand Down Expand Up @@ -141,7 +142,7 @@ def list_dataset_files_view(user, cache):
@requires_cache
@requires_identity
def add_file_to_dataset_view(user, cache):
"""Add uploaded file to cloned repository."""
"""Add the uploaded file to cloned repository."""
ctx = DatasetAddRequest().load(request.json)
project = cache.get_project(user, ctx['project_id'])
project_path = make_project_path(user, project)
Expand All @@ -160,14 +161,23 @@ def add_file_to_dataset_view(user, cache):
)

local_paths = []
for file_ in ctx['files']:
file = cache.get_file(user, file_['file_id'])
local_path = make_file_path(user, file)
for _file in ctx['files']:
local_path = None

if 'file_id' in _file:
file = cache.get_file(user, _file['file_id'])
local_path = make_file_path(user, file)

elif 'file_path' in _file:
local_path = project_path / Path(_file['file_path'])

if not local_path or not local_path.exists():
return jsonify(
error={
'code': INVALID_PARAMS_ERROR_CODE,
'message': 'invalid file_id: {0}'.format(file_['file_id'])
'message':
'invalid file reference: {0}'.
format(local_path.relative_to(project_path))
}
)

Expand Down
79 changes: 71 additions & 8 deletions tests/cli/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from pathlib import Path

import pytest
import requests
import yaml

from renku.cli import cli
Expand Down Expand Up @@ -1278,6 +1279,7 @@ def test_avoid_empty_commits(runner, client, directory_tree):
result = runner.invoke(
cli, ['dataset', 'add', 'my-dataset', directory_tree.strpath]
)

assert 0 == result.exit_code

commit_sha_after = client.repo.head.object.hexsha
Expand All @@ -1291,18 +1293,59 @@ def test_avoid_empty_commits(runner, client, directory_tree):

commit_sha_after = client.repo.head.object.hexsha
assert commit_sha_before == commit_sha_after
assert 'Error: There is nothing to commit.' in result.output
assert 'Error: File already exists in dataset.' in result.output


def test_add_removes_credentials(runner, client):
"""Test credentials are removed when adding to a dataset."""
url = 'https://username:password@example.com/index.html'
result = runner.invoke(cli, ['dataset', 'add', '-c', 'my-dataset', url])
def test_multiple_dataset_commits(runner, client, directory_tree):
"""Check adding existing data to multiple datasets."""
commit_sha_before = client.repo.head.object.hexsha
result = runner.invoke(
cli, ['dataset', 'add', '-c', 'my-dataset1', directory_tree.strpath]
)

assert 0 == result.exit_code

with client.with_dataset('my-dataset') as dataset:
file_ = dataset.files[0]
assert file_.url == 'https://example.com/index.html'
commit_sha_after = client.repo.head.object.hexsha
assert commit_sha_before != commit_sha_after

commit_sha_before = commit_sha_after
result = runner.invoke(
cli, ['dataset', 'add', '-c', 'my-dataset2', directory_tree.strpath]
)
assert 0 == result.exit_code

commit_sha_after = client.repo.head.object.hexsha
assert commit_sha_before != commit_sha_after


def test_add_same_filename_multiple(runner, client, directory_tree):
    """Check adding same filename multiple times."""
    # Initial add creates the dataset and imports the files.
    create_result = runner.invoke(
        cli, ['dataset', 'add', '-c', 'my-dataset1', directory_tree.strpath]
    )

    assert 0 == create_result.exit_code

    # Re-adding the same files without --force is rejected.
    repeat_result = runner.invoke(
        cli, ['dataset', 'add', 'my-dataset1', directory_tree.strpath]
    )
    assert 1 == repeat_result.exit_code
    assert 'Error: File already exists in dataset.' in repeat_result.output

    # With --force the duplicate check is skipped, but identical data
    # leaves nothing to commit.
    forced_result = runner.invoke(
        cli,
        ['dataset', 'add', '--force', 'my-dataset1', directory_tree.strpath]
    )
    assert 1 == forced_result.exit_code
    assert 'Error: There is nothing to commit.' in forced_result.output

    # Adding a genuinely new file alongside the duplicates succeeds.
    mixed_result = runner.invoke(
        cli, [
            'dataset', 'add', '--force', 'my-dataset1',
            directory_tree.strpath, 'Dockerfile'
        ]
    )
    assert 0 == mixed_result.exit_code


def test_add_removes_local_path_information(runner, client, directory_tree):
Expand All @@ -1316,3 +1359,23 @@ def test_add_removes_local_path_information(runner, client, directory_tree):
for file_ in dataset.files:
assert file_.url.startswith('file://../')
assert file_.url.endswith(file_.name)


def test_add_remove_credentials(runner, client, monkeypatch):
"""Check removal of credentials during adding of remote data files."""
url = 'https://username:password@example.com/index.html'

def get(u):
"""Mocked response."""
response = requests.Response()
response._content = b'{}'
return response

result = runner.invoke(cli, ['dataset', 'create', 'my-dataset'])
assert 0 == result.exit_code

monkeypatch.setattr(requests, 'get', get)
dataset = client.load_dataset('my-dataset')
o = client._add_from_url(dataset, url, client.path)
Copy link
Contributor

@mohammad-alisafaee mohammad-alisafaee Dec 20, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to avoid calling a non-public method in this test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That client abstraction should really be fixed (IMHO), cause isolating pieces in testable units is almost impossible without resorting to such calls.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't get why you removed the cli call to add files. I thought perhaps it was because of the requests mocking that something wasn't working in between.
Perhaps it's better to remove this test and have it only as integration test because it is mocking an implementation detail and it is calling a protected method.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't remove anything. The way that test was written, it was an integration test that was not marked as such — I just marked it as an integration test and wrote an actual unit test for the part that test is supposed to be testing, since @rokroskar requested to have it.

I would also be fine just with having it as an integration test, tbh.


assert 'https://example.com/index.html' == o[0]['url']
3 changes: 2 additions & 1 deletion tests/cli/test_gitignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_dataset_add(tmpdir, runner, client):
assert 0 == result.exit_code
assert 'OK' in result.output

# Using an extension from gitignore.default
# Using an extension from gitignore.default defined as *.spec
ignored_file = tmpdir.join('my.spec')
ignored_file.write('My Specification')

Expand All @@ -36,6 +36,7 @@ def test_dataset_add(tmpdir, runner, client):
['dataset', 'add', 'testing', ignored_file.strpath],
catch_exceptions=False,
)

assert 1 == result.exit_code

client.repo.git.clean('-dff')
Expand Down
12 changes: 12 additions & 0 deletions tests/cli/test_integration_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -968,3 +968,15 @@ def test_renku_clone_uses_project_name(
result = runner.invoke(cli, ['clone', REMOTE, path])
assert 0 == result.exit_code
assert (Path(project_path) / expected_path / 'Dockerfile').exists()


@pytest.mark.integration
def test_add_removes_credentials(runner, client):
    """Check removal of credentials during adding of remote data files."""
    url_with_auth = 'https://username:password@example.com/index.html'
    result = runner.invoke(
        cli, ['dataset', 'add', '-c', 'my-dataset', url_with_auth]
    )
    assert 0 == result.exit_code

    # The stored URL must have the userinfo component stripped.
    with client.with_dataset('my-dataset') as dataset:
        added_file = dataset.files[0]
        assert added_file.url == 'https://example.com/index.html'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah right, it should be other way around - thanks!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's do not start the discussion about expected and actual ;).

Loading