Skip to content

Commit

Permalink
feat(core): renku doctor check for lfs migrate info (#1234)
Browse files Browse the repository at this point in the history
* feat(core): renku doctor check for lfs migrate info
  • Loading branch information
m-alisafaee committed May 14, 2020
1 parent 8405ce5 commit 480da06
Show file tree
Hide file tree
Showing 7 changed files with 182 additions and 6 deletions.
7 changes: 7 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,9 @@ def repository():
def project(repository):
"""Create a test project."""
from git import Repo
from renku.cli import cli

runner = CliRunner()

repo = Repo(repository, search_parent_directories=True)
commit = repo.head.commit
Expand All @@ -179,6 +182,10 @@ def project(repository):
# remove any extra non-tracked files (.pyc, etc)
repo.git.clean('-xdff')

assert 0 == runner.invoke(
cli, ['githooks', 'install', '--force']
).exit_code


@pytest.fixture
def project_metadata(project):
Expand Down
17 changes: 17 additions & 0 deletions renku/cli/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,20 @@ def clean(client, paths):
)

click.secho('OK', fg='green')


@storage.command()
@click.option('--all', is_flag=True, help='Include all branches.')
@pass_local_client
def check(client, all):
"""Check if large files are committed to Git history."""
files = client.check_lfs_migrate_info(everything=all)
if files:
message = (
WARNING + 'Git history contains large files\n\t' +
'\n\t'.join(files)
)
click.echo(message)
exit(1)
else:
click.secho('OK', fg='green')
2 changes: 2 additions & 0 deletions renku/core/commands/checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from .githooks import check_git_hooks_installed
from .migration import check_migration
from .references import check_missing_references
from .storage import check_lfs_info
from .validate_shacl import check_datasets_structure, check_project_structure

# Checks will be executed in the order as they are listed in __all__.
Expand All @@ -35,4 +36,5 @@
'check_project_structure',
'check_datasets_structure',
'check_missing_external_files',
'check_lfs_info',
)
38 changes: 38 additions & 0 deletions renku/core/commands/checks/storage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Check for large files in Git history."""

from renku.core.commands.echo import WARNING


def check_lfs_info(client):
"""Checks if files in history should be in LFS."""
if not client.has_external_storage:
return True, None

files = client.check_lfs_migrate_info()

if not files:
return True, None

message = (
WARNING + 'Git history contains large files - consider moving them ' +
'to external storage like git LFS\n\t' + '\n\t'.join(files) + '\n'
)

return False, message
15 changes: 15 additions & 0 deletions renku/core/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import json
from pathlib import Path

import pathspec
import pyld

try:
Expand Down Expand Up @@ -78,6 +79,20 @@ def set(self, active_ctx, local_ctx, result):

cgi.escape = html.escape


class RenkuGitWildMatchPattern(pathspec.patterns.GitWildMatchPattern):
"""Custom GitWildMatchPattern matcher."""

__slots__ = ('pattern', )

def __init__(self, pattern, include=None):
"""Initialize RenkuRegexPattern."""
super().__init__(pattern, include)
self.pattern = pattern


pathspec.util.register_pattern('renku_gitwildmatch', RenkuGitWildMatchPattern)

__all__ = (
'FileNotFoundError',
'Path',
Expand Down
64 changes: 62 additions & 2 deletions renku/core/management/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"""Client for handling a data storage."""
import functools
import os
import re
import shlex
import tempfile
from collections import defaultdict
Expand Down Expand Up @@ -79,6 +80,10 @@ class StorageApiMixin(RepositoryApiMixin):

_CMD_STORAGE_PULL = ['git', 'lfs', 'pull', '-I']

_CMD_STORAGE_MIGRATE_INFO = [
'git', 'lfs', 'migrate', 'info', '--top', '42000'
]

_CMD_STORAGE_LIST = ['git', 'lfs', 'ls-files', '-n']

_CMD_STORAGE_STATUS = ['git', 'lfs', 'status']
Expand Down Expand Up @@ -114,9 +119,9 @@ def renku_lfs_ignore(self):
"""Gets pathspec for files to not add to LFS."""
ignore_path = self.path / self.RENKU_LFS_IGNORE_PATH
if not os.path.exists(ignore_path):
return pathspec.PathSpec.from_lines('gitwildmatch', [])
return pathspec.PathSpec.from_lines('renku_gitwildmatch', [])
with ignore_path.open('r') as f:
return pathspec.PathSpec.from_lines('gitwildmatch', f)
return pathspec.PathSpec.from_lines('renku_gitwildmatch', f)

@property
def minimum_lfs_file_size(self):
Expand Down Expand Up @@ -390,3 +395,58 @@ def checkout_paths_from_storage(self, *paths):
stderr=STDOUT,
check=True,
)

def check_lfs_migrate_info(self, everything=False):
"""Return list of file groups in history should be in LFS."""
ref = ['--everything'] if everything else [
'--include-ref', self.repo.active_branch.name
]

includes = []
excludes = []
for p in self.renku_lfs_ignore.patterns:
if p.regex is None:
continue

pattern = p.pattern.replace(os.linesep, '').replace('\n', '')
if pattern.startswith('!'):
pattern.replace('!', '', 1)

if p.include: # File ignored by LFS
excludes.append(pattern)
else:
includes.append(pattern)

if excludes:
excludes = ['--exclude', ','.join(excludes)]
if includes:
includes = ['--include', ','.join(includes)]

above = ['--above', str(self.minimum_lfs_file_size)]

command = (
self._CMD_STORAGE_MIGRATE_INFO + ref + above + includes + excludes
)

try:
lfs_output = run(
command,
stdout=PIPE,
stderr=STDOUT,
cwd=self.path,
universal_newlines=True
)
except (KeyboardInterrupt, OSError) as e:
raise errors.GitError(
'Couldn\'t run \'git lfs migrate info\':\n{0}'.format(e)
)

groups = []
files_re = re.compile(r'(.*\s+[\d.]+\s+\S+).*')

for line in lfs_output.stdout.split('\n'):
match = files_re.match(line)
if match:
groups.append(match.groups()[0])

return groups
45 changes: 41 additions & 4 deletions tests/core/commands/test_doctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,16 @@
from renku.cli import cli


def test_git_hooks(runner, project):
"""Test detection of not-installed git hooks."""
def test_new_project_is_ok(runner, project):
"""Test renku doctor initially is OK on a new project."""
# Initially, every thing is OK
result = runner.invoke(cli, ['doctor'])
assert 0 == result.exit_code
assert 'Everything seems to be ok.' in result.output


def test_git_hooks_not_available(runner, project):
"""Test detection of not-installed git hooks."""
result = runner.invoke(cli, ['githooks', 'uninstall'])
assert 0 == result.exit_code

Expand All @@ -42,7 +45,7 @@ def test_git_hooks_modified(runner, project):
assert 0 == result.exit_code

hook_path = Path(project) / '.git' / 'hooks' / 'pre-commit'
lines = hook_path.read_text().split('/n')
lines = hook_path.read_text().split('\n')

# Append some more commands
appended = lines + ['# Some more commands', 'ls']
Expand All @@ -54,8 +57,42 @@ def test_git_hooks_modified(runner, project):
assert 'Everything seems to be ok.' in result.output

# Modify Renku hook
hook_path.write_text('\n'.join(lines[:-5]))
modified = [l for l in lines if '# END RENKU HOOK.' not in l]
hook_path.write_text('\n'.join(modified))

result = runner.invoke(cli, ['doctor'])
assert 1 == result.exit_code
assert 'Git hooks are outdated or not installed.' in result.output


def test_lfs_broken_history(runner, client, tmp_path):
"""Test lfs migrate info check on a broken history."""
big_file = tmp_path / 'big-file.ipynb'
with open(big_file, 'w') as file_:
file_.seek(client.minimum_lfs_file_size)
file_.write('some-data')

# Add a file without adding it to LFS
result = runner.invoke(
cli, [
'--no-external-storage', 'dataset', 'add', '--create',
'new-dataset',
str(big_file)
],
catch_exceptions=False
)
assert 0 == result.exit_code

result = runner.invoke(cli, ['doctor'])
assert 1 == result.exit_code
assert 'Git history contains large files' in result.output
assert '*.ipynb' in result.output

# Exclude *.ipynb files from LFS in .renkulfsignore
(client.path / client.RENKU_LFS_IGNORE_PATH).write_text(
'\n'.join(['*swp', '*ipynb', '.DS_Store'])
)

result = runner.invoke(cli, ['doctor'])
assert 0 == result.exit_code
assert 'Git history contains large files' not in result.output

0 comments on commit 480da06

Please sign in to comment.