Fix: only make one call to S3 (instead of 1500) #14

Merged: 1 commit, Apr 27, 2020
2 changes: 2 additions & 0 deletions bananas_server/application/bananas_server.py
@@ -123,6 +123,8 @@ def receive_PACKET_CONTENT_CLIENT_CONTENT(self, source, content_infos):
         )
 
     def reload_md5sum_mapping(self):
+        self.storage.clear_cache()
+
         for content_type in ContentType:
             if content_type == ContentType.CONTENT_TYPE_END:
                 continue
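
For context, a hedged sketch of the call flow this hunk relies on (the `ContentType` values and the `storage` attribute's wiring are assumptions drawn from this diff, not verified against the rest of the repo): `reload_md5sum_mapping()` is the one entry point that re-walks every folder, so it invalidates the cache once up front and lets the first `list_folder()` call afterwards repopulate it.

```python
from enum import Enum


# Hypothetical stand-ins; the real enum and storage backend live elsewhere.
class ContentType(Enum):
    CONTENT_TYPE_BASE_GRAPHICS = 1
    CONTENT_TYPE_NEWGRF = 2
    CONTENT_TYPE_END = 3  # sentinel value, skipped in the loop below


def reload_md5sum_mapping(storage):
    # Invalidate once, up front; the next list_folder() call repopulates the
    # cache with a single paginated listing instead of one request per folder.
    storage.clear_cache()

    for content_type in ContentType:
        if content_type == ContentType.CONTENT_TYPE_END:
            continue
        for unique_id in storage.list_folder(content_type):
            ...  # rebuild the md5sum mapping for this entry
```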
3 changes: 3 additions & 0 deletions bananas_server/storage/local.py
@@ -32,6 +32,9 @@ def _get_filename(self, content_entry):
 
         return f"{self.folder}/{content_type_folder_name}/{unique_id}/{md5sum}.tar.gz"
 
+    def clear_cache(self):
+        pass
+
     def list_folder(self, content_type, unique_id=None):
         content_type_folder_name = get_folder_name_from_content_type(content_type)
 
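
The local backend reads the filesystem directly, so there is nothing to invalidate; the no-op `clear_cache()` only keeps the two backends interchangeable. A minimal sketch of that duck-typed contract (the `Storage` protocol is hypothetical, added here for illustration; the repo itself relies on plain duck typing):

```python
from typing import Iterable, Protocol


class Storage(Protocol):
    # Hypothetical interface: both the local and S3 backends now expose
    # clear_cache(), so callers can invoke it without knowing the backend.
    def clear_cache(self) -> None: ...
    def list_folder(self, content_type, unique_id=None) -> Iterable[str]: ...
```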
30 changes: 24 additions & 6 deletions bananas_server/storage/s3.py
@@ -27,6 +27,7 @@ def __init__(self):
             raise Exception("--storage-s3-bucket has to be given if storage is s3")
 
         self._s3 = boto3.client("s3")
+        self._folder_cache = None
 
     def _get_filename(self, content_entry):
         content_type_folder_name = get_folder_name_from_content_type(content_entry.content_type)
@@ -35,7 +36,7 @@ def _get_filename(self, content_entry):
 
         return f"{content_type_folder_name}/{unique_id}/{md5sum}.tar.gz"
 
-    def _get_folder_list(self, folder, continuation_token=None):
+    def _get_full_folder_list(self, folder, continuation_token=None):
         kwargs = {}
         if continuation_token:
             kwargs["ContinuationToken"] = continuation_token
@@ -49,19 +50,36 @@ def _get_folder_list(self, folder, continuation_token=None):
                 objects.add(obj["Key"])
 
         if response.get("NextContinuationToken"):
-            objects.update(self._get_folder_list(folder, continuation_token=response["NextContinuationToken"]))
+            objects.update(self._get_full_folder_list(folder, continuation_token=response["NextContinuationToken"]))
 
         return objects
 
+    def _get_folder_list(self, folder_search):
+        # List all files on the S3, and cache it. Otherwise we will be doing
+        # a lot of API calls, and that is very slow.
+        if self._folder_cache is None:
+            self._folder_cache = self._get_full_folder_list("")
+
+        # Filter out the request based on the cache. We are a generator to
+        # not create yet-an-other-list in memory.
+        for folder in self._folder_cache:
+            if folder.startswith(folder_search):
+                yield folder
+
+    def clear_cache(self):
+        self._folder_cache = None
+
     def list_folder(self, content_type, unique_id=None):
         content_type_folder_name = get_folder_name_from_content_type(content_type)
 
         if unique_id is None:
             folders = self._get_folder_list(content_type_folder_name)
-            return [folder.split("/")[1] for folder in folders]
-
-        folders = self._get_folder_list(f"{content_type_folder_name}/{unique_id}")
-        return [folder.split("/")[2] for folder in folders]
+            for folder in folders:
+                yield folder.split("/")[1]
+        else:
+            folders = self._get_folder_list(f"{content_type_folder_name}/{unique_id}")
+            for folder in folders:
+                yield folder.split("/")[2]
 
     def get_stream(self, content_entry):
         filename = self._get_filename(content_entry)
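
To make the "one call instead of 1500" claim concrete, here is a standalone sketch of the pattern this diff adopts (the class, bucket name, and prefixes are placeholders, and it uses boto3's paginator instead of the PR's recursive continuation-token walk): list the whole bucket once, cache the keys, and answer every later prefix query by filtering the cache with a generator.

```python
import boto3


class CachedBucketListing:
    """Illustrative sketch of the caching pattern, not the actual class."""

    def __init__(self, bucket):
        self._s3 = boto3.client("s3")
        self._bucket = bucket
        self._cache = None

    def _full_listing(self):
        # One paginated ListObjectsV2 walk over the whole bucket; the
        # paginator handles the ContinuationToken loop for us.
        keys = set()
        paginator = self._s3.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=self._bucket):
            for obj in page.get("Contents", []):
                keys.add(obj["Key"])
        return keys

    def keys_with_prefix(self, prefix):
        if self._cache is None:
            self._cache = self._full_listing()  # the only S3 round trips
        # Filter in memory instead of issuing a new API call per prefix.
        return (key for key in self._cache if key.startswith(prefix))

    def clear_cache(self):
        self._cache = None


# Usage: many prefix queries, one (paginated) bucket listing.
listing = CachedBucketListing("example-bucket")  # placeholder bucket name
for prefix in ("base-graphics/", "newgrf/"):
    for key in listing.keys_with_prefix(prefix):
        pass  # process key
```

One wrinkle worth noting: the PR's `_get_folder_list()` is a generator, so the cached listing is only fetched when its result is first iterated, not when the method is called.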