Skip to content

Commit

Permalink
fix(service): normalize git url to avoid duplicate cache entries (#3606)
Browse files Browse the repository at this point in the history
  • Loading branch information
m-alisafaee committed Sep 1, 2023
1 parent e0ff587 commit 19142c6
Show file tree
Hide file tree
Showing 14 changed files with 127 additions and 44 deletions.
3 changes: 3 additions & 0 deletions renku/core/util/contexts.py
Expand Up @@ -26,6 +26,7 @@
from renku.core import errors
from renku.core.interface.database_gateway import IDatabaseGateway
from renku.core.interface.project_gateway import IProjectGateway
from renku.ui.service.utils import normalize_git_url


@contextlib.contextmanager
Expand Down Expand Up @@ -114,6 +115,8 @@ def renku_project_context(path, check_git_path=True):
if check_git_path:
path = get_git_path(path)

path = normalize_git_url(str(path))

with project_context.with_path(path=path):
project_context.external_storage_requested = True
yield project_context.path
Expand Down
12 changes: 3 additions & 9 deletions renku/domain_model/git.py
Expand Up @@ -25,6 +25,7 @@

from renku.core import errors
from renku.core.util.os import is_ascii, normalize_to_ascii
from renku.ui.service.utils import normalize_git_url

_RE_SCHEME = r"(?P<scheme>(git\+)?(https?|git|ssh|rsync))\://"

Expand Down Expand Up @@ -70,13 +71,6 @@ def _build(*parts):
]


def filter_repo_name(repo_name: str) -> str:
"""Remove the .git extension from the repo name."""
if repo_name is not None and repo_name.endswith(".git"):
return repo_name[: -len(".git")]
return repo_name


@attr.s()
class GitURL:
"""Parser for common Git URLs."""
Expand All @@ -90,14 +84,14 @@ class GitURL:
password = attr.ib(default=None)
port = attr.ib(default=None)
owner = attr.ib(default=None)
name: Optional[str] = attr.ib(default=None, converter=filter_repo_name)
name: Optional[str] = attr.ib(default=None, converter=normalize_git_url)
slug = attr.ib(default=None)
_regex = attr.ib(default=None, eq=False, order=False)

def __attrs_post_init__(self):
"""Derive basic information."""
if not self.name and self.path:
self.name = filter_repo_name(Path(self.path).name)
self.name = normalize_git_url(Path(self.path).name)

self.slug = normalize_to_ascii(self.name)

Expand Down
9 changes: 5 additions & 4 deletions renku/ui/service/cache/models/project.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -15,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Renku service cache project related models."""

import os
import shutil
from datetime import datetime
Expand All @@ -26,6 +26,7 @@

from renku.ui.service.cache.base import BaseCache
from renku.ui.service.config import CACHE_PROJECTS_PATH
from renku.ui.service.utils import normalize_git_url

MAX_CONCURRENT_PROJECT_REQUESTS = 10
LOCK_TIMEOUT = 15
Expand Down Expand Up @@ -61,7 +62,7 @@ def abs_path(self) -> Path:
branch = self.branch
if not self.branch:
branch = NO_BRANCH_FOLDER
return CACHE_PROJECTS_PATH / self.user_id / self.owner / self.slug / branch
return CACHE_PROJECTS_PATH / self.user_id / self.owner / normalize_git_url(self.slug) / branch

def read_lock(self, timeout: Optional[float] = None):
"""Shared read lock on the project."""
Expand Down
2 changes: 1 addition & 1 deletion renku/ui/service/cache/projects.py
Expand Up @@ -31,7 +31,7 @@ class ProjectManagementCache(BaseCache):

project_schema = ProjectSchema()

def make_project(self, user, project_data, persist=True):
def make_project(self, user, project_data, persist=True) -> Project:
"""Store user project metadata."""
project_data.update({"user_id": user.user_id})

Expand Down
9 changes: 6 additions & 3 deletions renku/ui/service/cache/serializers/project.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -22,6 +21,7 @@

from renku.ui.service.cache.models.project import Project
from renku.ui.service.serializers.common import AccessSchema, CreationSchema, MandatoryUserSchema
from renku.ui.service.utils import normalize_git_url


class ProjectSchema(CreationSchema, AccessSchema, MandatoryUserSchema):
Expand All @@ -43,4 +43,7 @@ class ProjectSchema(CreationSchema, AccessSchema, MandatoryUserSchema):
@post_load
def make_project(self, data, **options):
"""Construct project object."""
data["git_url"] = normalize_git_url(data["git_url"])
data["name"] = normalize_git_url(data["name"])
data["slug"] = normalize_git_url(data["slug"])
return Project(**data)
36 changes: 26 additions & 10 deletions renku/ui/service/controllers/api/mixins.py
Expand Up @@ -18,6 +18,7 @@
from abc import ABCMeta, abstractmethod
from functools import wraps
from pathlib import Path
from typing import Optional, Union

import portalocker

Expand All @@ -41,6 +42,7 @@
from renku.ui.service.jobs.contexts import enqueue_retry
from renku.ui.service.jobs.delayed_ctrl import delayed_ctrl_job
from renku.ui.service.serializers.common import DelayedResponseRPC
from renku.ui.service.utils import normalize_git_url

PROJECT_FETCH_TIME = 30

Expand Down Expand Up @@ -96,15 +98,30 @@ def __init__(
self.migrate_project = self.request_data.get("migrate_project", False)

# NOTE: This is absolute project path and its set before invocation of `renku_op`,
# so its safe to use it in controller operations. Its type will always be `pathlib.Path`.
self.project_path = None
# so it's safe to use it in controller operations. Its type will always be `pathlib.Path`.
self._project_path = None

@property
@abstractmethod
def context(self):
"""Operation context."""
raise NotImplementedError

@property
def project_path(self) -> Optional[Path]:
"""Absolute project's path."""
return self._project_path

@project_path.setter
def project_path(self, path: Optional[Union[str, Path]]):
"""Set absolute project's path."""
if not path:
self._project_path = None
return

path = normalize_git_url(str(path))
self._project_path = Path(path)

@abstractmethod
def renku_op(self):
"""Implements operation for the controller."""
Expand Down Expand Up @@ -138,7 +155,7 @@ def execute_op(self):

if self.context.get("is_delayed", False) and "user_id" in self.user_data:
# NOTE: After pushing the controller to delayed execution,
# its important to remove the delayed mark,
# it's important to remove the delayed mark,
# otherwise job will keep recursively enqueuing itself.
self.context.pop("is_delayed")

Expand All @@ -149,13 +166,12 @@ def execute_op(self):

return job

if "git_url" in self.context and "user_id" not in self.user_data:
# NOTE: Anonymous session support.
return self.remote()

elif "git_url" in self.context and "user_id" in self.user_data:
return self.local()

if "git_url" in self.context:
if "user_id" not in self.user_data:
# NOTE: Anonymous session support.
return self.remote()
else:
return self.local()
else:
raise RenkuException("context does not contain `project_id` or `git_url`")

Expand Down
9 changes: 6 additions & 3 deletions renku/ui/service/controllers/project_lock_status.py
@@ -1,6 +1,5 @@
#
# Copyright 2021 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -24,6 +23,7 @@
from renku.ui.service.controllers.api.mixins import RenkuOperationMixin
from renku.ui.service.errors import IntermittentProjectIdError
from renku.ui.service.serializers.project import ProjectLockStatusRequest, ProjectLockStatusResponseRPC
from renku.ui.service.utils import normalize_git_url
from renku.ui.service.views import result_response


Expand All @@ -39,6 +39,9 @@ def __init__(self, cache, user_data, request_data):

super().__init__(cache, user_data, request_data)

if "git_url" in self.ctx:
self.ctx["git_url"] = normalize_git_url(self.ctx["git_url"])

@property
def context(self):
"""Controller operation context."""
Expand Down
8 changes: 4 additions & 4 deletions renku/ui/service/controllers/utils/remote_project.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -26,6 +25,7 @@
from renku.core.util.contexts import renku_project_context
from renku.infrastructure.repository import Repository
from renku.ui.service.serializers.cache import ProjectCloneContext
from renku.ui.service.utils import normalize_git_url

ANONYMOUS_SESSION = "anonymous"

Expand All @@ -44,7 +44,7 @@ def __init__(self, user_data, request_data):

self.ctx = ProjectCloneContext().load({**user_data, **request_data}, unknown=EXCLUDE)

self.git_url = self.ctx["url_with_auth"]
self.git_url = normalize_git_url(self.ctx["url_with_auth"])
self.branch = self.ctx["branch"]

@property
Expand Down
2 changes: 1 addition & 1 deletion renku/ui/service/gateways/gitlab_api_provider.py
Expand Up @@ -37,7 +37,7 @@ class GitlabAPIProvider(IGitAPIProvider):
target_folder: Folder to use to download the files.
remote: Remote repository URL.
token: User bearer token.
ref: optional reference to checkout,
ref: optional reference to check out,
Raises:
errors.ProjectNotFound: If the remote URL is not accessible.
errors.AuthenticationError: If the bearer token is invalid in any way.
Expand Down
6 changes: 5 additions & 1 deletion renku/ui/service/gateways/repository_cache.py
Expand Up @@ -38,6 +38,7 @@
from renku.ui.service.errors import IntermittentCacheError, IntermittentLockError
from renku.ui.service.interfaces.repository_cache import IRepositoryCache
from renku.ui.service.logger import service_log
from renku.ui.service.utils import normalize_git_url


class LocalRepositoryCache(IRepositoryCache):
Expand All @@ -50,6 +51,7 @@ def get(
if git_url is None:
raise ValidationError("Invalid `git_url`, URL is empty", "git_url")

git_url = normalize_git_url(git_url)
try:
project = Project.get(
(Project.user_id == user.user_id) & (Project.git_url == git_url) & (Project.branch == branch)
Expand Down Expand Up @@ -101,6 +103,8 @@ def _clone_project(
self, cache: ServiceCache, git_url: str, branch: Optional[str], user: User, shallow: bool = True
) -> Project:
"""Clone a project to cache."""
git_url = normalize_git_url(git_url)

try:
parsed_git_url = GitURL.parse(git_url)
except UnicodeError as e:
Expand Down Expand Up @@ -228,7 +232,7 @@ def _maybe_update_cache(self, project: Project, user: User):

def git_url_with_auth(project: Project, user: User):
"""Format url with auth."""
git_url = urlparse(project.git_url)
git_url = urlparse(normalize_git_url(project.git_url))

url = "oauth2:{}@{}".format(user.token, git_url.netloc)
return git_url._replace(netloc=url).geturl()
11 changes: 10 additions & 1 deletion renku/ui/service/serializers/common.py
Expand Up @@ -23,13 +23,22 @@

from renku.ui.service.errors import UserRepoUrlInvalidError
from renku.ui.service.serializers.rpc import JsonRPCResponse
from renku.ui.service.utils import normalize_git_url


class RemoteRepositoryBaseSchema(Schema):
"""Schema for tracking a remote repository."""

git_url = fields.String(metadata={"description": "Remote git repository url."})

@pre_load
def normalize_url(self, data, **_):
"""Remove ``.git`` extension from the git url."""
if "git_url" in data and data["git_url"]:
data["git_url"] = normalize_git_url(data["git_url"])

return data

@validates("git_url")
def validate_git_url(self, value):
"""Validates git url."""
Expand All @@ -48,7 +57,7 @@ class RemoteRepositorySchema(RemoteRepositoryBaseSchema):
branch = fields.String(load_default=None, metadata={"description": "Remote git branch (or tag or commit SHA)."})

@pre_load
def set_branch_from_ref(self, data, **kwargs):
def set_branch_from_ref(self, data, **_):
"""Set `branch` field from `ref` if present."""
if "ref" in data and not data.get("branch"):
# Backward compatibility: branch and ref were both used. Let's keep branch as the exposed field
Expand Down
2 changes: 2 additions & 0 deletions renku/ui/service/serializers/templates.py
Expand Up @@ -26,6 +26,7 @@
from renku.ui.service.config import TEMPLATE_CLONE_DEPTH_DEFAULT
from renku.ui.service.serializers.cache import ProjectCloneContext, RepositoryCloneRequest
from renku.ui.service.serializers.rpc import JsonRPCResponse
from renku.ui.service.utils import normalize_git_url


class ManifestTemplatesRequest(RepositoryCloneRequest):
Expand Down Expand Up @@ -74,6 +75,7 @@ class ProjectTemplateRequest(ProjectCloneContext, ManifestTemplatesRequest):
def add_required_fields(self, data, **kwargs):
"""Add necessary fields."""
project_name_stripped = normalize_to_ascii(data["project_name"])
project_name_stripped = normalize_git_url(project_name_stripped)
if len(project_name_stripped) == 0:
raise ValidationError("Project name contains only unsupported characters")
new_project_url = f"{data['project_repository']}/{data['project_namespace']}/{project_name_stripped}"
Expand Down

0 comments on commit 19142c6

Please sign in to comment.