Skip to content

Commit

Permalink
Replace media_url with url in provider scripts (#1891)
Browse files Browse the repository at this point in the history
* Replace `media_url` with `url` in provider scripts

* Update provider_data_ingester tests

* Fix tests
  • Loading branch information
obulat committed May 17, 2023
1 parent 20949ac commit 614720f
Show file tree
Hide file tree
Showing 55 changed files with 214 additions and 237 deletions.
9 changes: 3 additions & 6 deletions catalog/dags/common/storage/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(
def add_item(
self,
foreign_landing_url: str,
audio_url: str,
url: str,
license_info: LicenseInfo,
foreign_identifier: str,
thumbnail_url: str | None = None,
Expand Down Expand Up @@ -79,7 +79,7 @@ def add_item(
foreign_landing_url: URL of page where the audio lives on the
source website.
audio_url: Direct link to the audio file
url: Direct link to the audio file
license_info: LicenseInfo object that has
- the URL of the license for the audio,
- string representation of the license,
Expand Down Expand Up @@ -154,7 +154,7 @@ def add_item(

audio_data = {
"foreign_landing_url": foreign_landing_url,
"audio_url": audio_url,
"url": url,
"license_info": license_info,
"thumbnail_url": thumbnail_url,
"filesize": filesize,
Expand Down Expand Up @@ -188,9 +188,6 @@ def _get_audio(self, **kwargs) -> Audio | None:
audio_metadata = self.clean_media_metadata(**kwargs)
if audio_metadata is None:
return None
# Convert the `audio_url` key used in AudioStore, TSV and
# provider API scripts into `url` key used in db
audio_metadata["url"] = audio_metadata.pop("audio_url")
# Validate that duration does not exceed Postgres int maximum
audio_metadata["duration"] = self._validate_integer(
audio_metadata.get("duration")
Expand Down
9 changes: 3 additions & 6 deletions catalog/dags/common/storage/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(
def add_item(
self,
foreign_landing_url: str,
image_url: str,
url: str,
license_info: LicenseInfo,
foreign_identifier: str,
thumbnail_url: str | None = None,
Expand All @@ -70,7 +70,7 @@ def add_item(
Required Arguments:
foreign_landing_url: URL of page where the image lives on the
source website.
image_url: Direct link to the image file
url: Direct link to the image file
license_info: LicenseInfo object that has
- the URL of the license for the image,
Expand Down Expand Up @@ -125,7 +125,7 @@ def add_item(

image_data = {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"thumbnail_url": thumbnail_url,
"filesize": filesize,
"filetype": filetype,
Expand Down Expand Up @@ -153,9 +153,6 @@ def _get_image(self, **kwargs) -> Image | None:
image_metadata = self.clean_media_metadata(**kwargs)
if image_metadata is None:
return None
# Convert the `image_url` key used in ImageStore, TSV and
# provider API scripts into `url` key used in db
image_metadata["url"] = image_metadata.pop("image_url")
return Image(**image_metadata)


Expand Down
6 changes: 3 additions & 3 deletions catalog/dags/common/storage/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,13 @@ def clean_media_metadata(self, **media_data) -> dict | None:
"license_info",
"foreign_identifier",
"foreign_landing_url",
f"{self.media_type}_url",
"url",
]:
if media_data.get(field) is None:
raise ValueError(f"Record missing required field: `{field}`")

for field in [
f"{self.media_type}_url",
"url",
"foreign_landing_url",
"thumbnail_url",
"creator_url",
Expand All @@ -143,7 +143,7 @@ def clean_media_metadata(self, **media_data) -> dict | None:
media_data["ingestion_type"] = "provider_api"

media_data["filetype"] = self._validate_filetype(
media_data["filetype"], media_data[f"{self.media_type}_url"]
media_data["filetype"], media_data["url"]
)
media_data["filesize"] = self._validate_integer(media_data.get("filesize"))

Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/common/tsv_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def _process_row(tsv_row):
image_store = _image_store_dict[row_image.provider]
image_store.add_item(
foreign_landing_url=row_image.foreign_landing_url,
image_url=row_image.url,
url=row_image.url,
thumbnail_url=row_image.thumbnail_url,
license_info=get_license_info(
license_url=get_license_url(row_meta_data),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _handle_object_data(data, license_info: LicenseInfo) -> list[dict]:
images.append(
{
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": image_url,
"license_info": license_info,
"foreign_identifier": foreign_identifier,
"width": width,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_record_data(self, data):
"foreign_landing_url": foreign_landing_url,
"title": data.get("title", None),
"creator": creator_name,
"image_url": image["url"],
"url": image["url"],
"width": self._get_int_value(image, "width"),
"height": self._get_int_value(image, "height"),
"filesize": self._get_int_value(image, "filesize"),
Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/europeana.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_record_data(self, data: dict) -> dict | None:
try:
record = {
"foreign_landing_url": self._get_foreign_landing_url(data),
"image_url": self._get_image_url(data),
"url": self._get_image_url(data),
"foreign_identifier": self._get_foreign_identifier(data),
"meta_data": self._get_meta_data_dict(data),
"title": self._get_title(data),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def get_record_data(self, data):
"license_info": license_info,
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": image_url,
"title": title,
"source": source,
"creator": creator,
Expand Down
4 changes: 2 additions & 2 deletions catalog/dags/providers/provider_api_scripts/flickr.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def get_record_data(self, data):
return None

image_size = self._get_largest_image_size(data)
if not (image_url := data.get(f"url_{image_size}")):
if not (url := data.get(f"url_{image_size}")):
return None

if not (foreign_identifier := data.get("id")):
Expand Down Expand Up @@ -254,7 +254,7 @@ def get_record_data(self, data):

return {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"license_info": license_info,
"foreign_identifier": foreign_identifier,
"width": width,
Expand Down
6 changes: 3 additions & 3 deletions catalog/dags/providers/provider_api_scripts/freesound.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def _get_audio_files(
return None, None

main_file = {
"audio_url": preview_url,
"url": preview_url,
"filetype": self.preferred_preview.split("-")[-1],
"bit_rate": FreesoundDataIngester.preview_bitrates[self.preferred_preview],
"filesize": int(filesize),
Expand Down Expand Up @@ -227,7 +227,7 @@ def get_record_data(self, media_data: dict) -> dict | list[dict] | None:
if not (item_license := get_license_info(media_data.get("license"))):
return None

# We use the mp3-hq preview url as `audio_url` as the main url
# We use the mp3-hq preview url as `url` as the main url
# for playing on the frontend,
# and the actual uploaded file as an alt_file that is available
# for download (and requires a user to be authenticated to download)
Expand Down Expand Up @@ -258,7 +258,7 @@ def get_record_data(self, media_data: dict) -> dict | list[dict] | None:
"audio_set": audio_set,
"set_url": set_url,
"alt_files": alt_files,
# audio_url, filetype, bit_rate
# url, filetype, bit_rate
**main_audio,
}

Expand Down
10 changes: 5 additions & 5 deletions catalog/dags/providers/provider_api_scripts/jamendo.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,12 @@ def _get_audio_url(self, data):
>>> _remove_param_from_url(url, "from")
'https://prod-1.storage.jamendo.com/?trackid=1532771&format=mp31'
:return: Tuple with main audio file information:
- audio_url
- url
- duration (in milliseconds)
"""
if not (audio_url := data.get("audio")):
if not (url := data.get("audio")):
return None
return self._remove_param_from_url(audio_url, "from")
return self._remove_param_from_url(url, "from")

@staticmethod
def _get_creator_data(data):
Expand Down Expand Up @@ -184,7 +184,7 @@ def get_record_data(self, data):
if not (foreign_landing_url := data.get("shareurl")):
return None

if not (audio_url := self._get_audio_url(data)):
if not (url := self._get_audio_url(data)):
return None

if not (license_info := get_license_info(data.get("license_ccurl"))):
Expand Down Expand Up @@ -221,7 +221,7 @@ def get_record_data(self, data):
"creator_url": creator_url,
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"audio_url": audio_url,
"url": url,
"duration": duration,
"filetype": filetype,
"thumbnail_url": thumbnail,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def get_record_data(self, object_id):
return [
{
"foreign_landing_url": foreign_landing_url,
"image_url": img,
"url": img,
"license_info": self.DEFAULT_LICENSE_INFO,
"foreign_identifier": self._get_foreign_id(object_id, img),
"creator": artist,
Expand Down
10 changes: 4 additions & 6 deletions catalog/dags/providers/provider_api_scripts/museum_victoria.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

class ImageDetails(TypedDict, total=False):
foreign_identifier: str
image_url: str
url: str
license_info: LicenseInfo
foreign_landing_url: str
title: str
Expand Down Expand Up @@ -97,17 +97,15 @@ def _get_images(media_data) -> list[ImageDetails]:
continue
if not (foreign_identifier := media.get("id")):
continue
image_url, height, width, filesize = VictoriaDataIngester._get_image_data(
media
)
url, height, width, filesize = VictoriaDataIngester._get_image_data(media)
license_info = VictoriaDataIngester._get_license_info(media)
if not image_url or not license_info:
if not url or not license_info:
continue
creator = VictoriaDataIngester._get_creator(media)

image: ImageDetails = {
"foreign_identifier": foreign_identifier,
"image_url": image_url,
"url": url,
"height": height,
"width": width,
"license_info": license_info,
Expand Down
4 changes: 2 additions & 2 deletions catalog/dags/providers/provider_api_scripts/nappy.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def get_record_data(self, data: dict) -> dict | None:
if not (foreign_landing_url := data.get("foreign_landing_url")):
return None

if not (image_url := data.get("url")):
if not (url := data.get("url")):
return None

if not (foreign_identifier := data.get("foreign_identifier")):
Expand All @@ -95,7 +95,7 @@ def get_record_data(self, data: dict) -> dict | None:

return {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"thumbnail_url": thumbnail_url,
"license_info": self.license_info,
"foreign_identifier": foreign_identifier,
Expand Down
22 changes: 10 additions & 12 deletions catalog/dags/providers/provider_api_scripts/nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class NyplDataIngester(ProviderDataIngester):
# NYPL returns a list of image objects, with the dimension encoded
# in the URL's query parameter.
# This list is in order from the largest image to the smallest one.
image_url_dimensions = ["g", "v", "q", "w", "r"]
url_dimensions = ["g", "v", "q", "w", "r"]

def __init__(self, *args, **kwargs):
NYPL_API = Variable.get("API_KEY_NYPL")
Expand Down Expand Up @@ -117,23 +117,21 @@ def get_record_data(self, data):
continue

image_link = capture.get("imageLinks", {}).get("imageLink", [])
image_url, filetype = NyplDataIngester._get_image_data(image_link)
if not image_url:
url, filetype = NyplDataIngester._get_image_data(image_link)
if not url:
continue

foreign_landing_url = capture.get("itemLink", {}).get("$")
if not foreign_landing_url:
continue
if not (foreign_landing_url := capture.get("itemLink", {}).get("$")):
continue

license_url = capture.get("rightsStatementURI", {}).get("$")
if not (license_info := get_license_info(license_url)):
continue

image_data = {
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"license_info": license_info,
"title": title,
"creator": creator,
Expand Down Expand Up @@ -186,9 +184,9 @@ def _get_image_data(images) -> tuple[None, None] | tuple[str, str]:
"description": "Cropped .jpeg (1600 pixels on the long side)"
}
Selects the largest image based on the image URL's `t` query parameter
and image_url_dimensions.
and url_dimensions.
"""
# Create a dict with the NyplDataIngester.image_url_dimensions as keys,
# Create a dict with the NyplDataIngester.url_dimensions as keys,
# and image data as value.
image_types = {
parse_qs(urlparse(img["$"]).query)["t"][0]: i
Expand All @@ -200,17 +198,17 @@ def _get_image_data(images) -> tuple[None, None] | tuple[str, str]:
# Select the dict containing the URL for the largest image.
# The image size is encoded in the URL query parameter `t`.
# The list of dimensions is sorted by size of the corresponding image.
for dimension in NyplDataIngester.image_url_dimensions:
for dimension in NyplDataIngester.url_dimensions:
preferred_image_index = image_types.get(dimension)
if preferred_image_index is not None:
preferred_image = images[preferred_image_index]

# Removes the `download` query to get the viewable image URL
image_url = preferred_image["$"].replace("&download=1", "")
url = preferred_image["$"].replace("&download=1", "")
filetype = NyplDataIngester._get_filetype(
preferred_image["description"]
)
return image_url, filetype
return url, filetype

return None, None

Expand Down
30 changes: 15 additions & 15 deletions catalog/dags/providers/provider_api_scripts/phylopic.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,31 +89,31 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None:
TODO: Adapt `url` and `creator_url` to avoid redirects.
"""

uid = data.get("uuid")
if not uid:
if not (foreign_identifier := data.get("uuid")):
return None

data = data.get("_links", {})
img_url = data.get("sourceFile", {}).get("href")
foreign_url = data.get("self", {}).get("href")
if not img_url or not foreign_url:
links = data.get("_links", {})

if not (url := links.get("sourceFile", {}).get("href")):
return None

license_url = data.get("license", {}).get("href")
if not (license_info := get_license_info(license_url)):
if not (foreign_url_path := links.get("self", {}).get("href")):
return None
foreign_landing_url = self.host + foreign_url_path

foreign_url = self.host + foreign_url
license_url = links.get("license", {}).get("href")
if not (license_info := get_license_info(license_url)):
return None

title = data.get("self", {}).get("title")
creator, creator_url = self._get_creator(data.get("contributor", {}))
width, height = self._get_image_sizes(data)
title = links.get("self", {}).get("title")
creator, creator_url = self._get_creator(links.get("contributor", {}))
width, height = self._get_image_sizes(links)

return {
"license_info": license_info,
"foreign_identifier": uid,
"foreign_landing_url": foreign_url,
"image_url": img_url,
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"url": url,
"title": title,
"creator": creator,
"creator_url": creator_url,
Expand Down

0 comments on commit 614720f

Please sign in to comment.