diff --git a/openverse_catalog/dags/common/extensions.py b/openverse_catalog/dags/common/extensions.py index ddae3429b..0a82e1263 100644 --- a/openverse_catalog/dags/common/extensions.py +++ b/openverse_catalog/dags/common/extensions.py @@ -5,9 +5,7 @@ def extract_filetype(url: str, media_type: str) -> str | None: - """ - Extracts the filetype from a media url extension. - """ + """Extract the filetype from a media url extension.""" possible_filetype = url.split(".")[-1] if possible_filetype in EXTENSIONS.get(media_type, {}): return possible_filetype diff --git a/openverse_catalog/dags/common/github.py b/openverse_catalog/dags/common/github.py index c2c81586d..9dc503fa8 100644 --- a/openverse_catalog/dags/common/github.py +++ b/openverse_catalog/dags/common/github.py @@ -4,6 +4,8 @@ class GitHubAPI: def __init__(self, pat: str): """ + Initialize. + :param pat: GitHub Personal Access Token to use to authenticate requests """ self.session = requests.Session() diff --git a/openverse_catalog/dags/common/helpers.py b/openverse_catalog/dags/common/helpers.py index 66155a7c0..1631a072f 100644 --- a/openverse_catalog/dags/common/helpers.py +++ b/openverse_catalog/dags/common/helpers.py @@ -8,7 +8,7 @@ class IngestionInput(NamedTuple): def get_partitioned_reingestion_days(inputs: list[IngestionInput]): """ - This method calculates day-shift lists for Provider API workflows. + Calculate day-shift lists for Provider API workflows. The input should be a list of pairs of integers: diff --git a/openverse_catalog/dags/common/licenses/licenses.py b/openverse_catalog/dags/common/licenses/licenses.py index f7c34b2cf..65ad18dce 100644 --- a/openverse_catalog/dags/common/licenses/licenses.py +++ b/openverse_catalog/dags/common/licenses/licenses.py @@ -1,4 +1,6 @@ """ +License utilities. + This module has a number of public methods which are useful for working with licenses. """ @@ -27,7 +29,7 @@ class InvalidLicenseURLException(Exception): @lru_cache(maxsize=1024) def get_license_info(license_url=None, license_=None, license_version=None): """ - Returns a valid license, version, license URL tuple if possible. + Return a valid license, version, license URL tuple if possible. Three optional arguments: license_url: String URL to a CC license page. @@ -182,7 +184,7 @@ def get_license_info_from_license_pair( license_, license_version, pair_map=REVERSE_LICENSE_PATH_MAP ) -> tuple[str | None, str | None, str | None]: """ - Validates a given license pair, and derives a license URL from it. + Validate a given license pair, and derive a license URL from it. Returns both the validated pair and the derived license URL. """ diff --git a/openverse_catalog/dags/common/loader/loader.py b/openverse_catalog/dags/common/loader/loader.py index 76e201cf3..2ef929a77 100644 --- a/openverse_catalog/dags/common/loader/loader.py +++ b/openverse_catalog/dags/common/loader/loader.py @@ -43,8 +43,9 @@ def upsert_data( duplicates_count: tuple[int, int], ) -> RecordMetrics: """ - Upserts data into the catalog DB from the loading table, and calculates - final record metrics. + Upsert data into the catalog DB from the loading table. + + This also calculates final record metrics. 
""" missing_columns, foreign_id_dup = duplicates_count upserted = sql.upsert_records_to_db_table( diff --git a/openverse_catalog/dags/common/loader/provider_details.py b/openverse_catalog/dags/common/loader/provider_details.py index 92c22a0ad..98d6887f4 100644 --- a/openverse_catalog/dags/common/loader/provider_details.py +++ b/openverse_catalog/dags/common/loader/provider_details.py @@ -1,4 +1,6 @@ """ +Provider information. + This file holds the default provider names for each provider API and a dictionary of related sub providers. The key of the dictionary reflects the sub provider name and the corresponding item is a value (or set of values) diff --git a/openverse_catalog/dags/common/loader/reporting.py b/openverse_catalog/dags/common/loader/reporting.py index 4d3e1d4f0..d336d3bb7 100644 --- a/openverse_catalog/dags/common/loader/reporting.py +++ b/openverse_catalog/dags/common/loader/reporting.py @@ -96,6 +96,7 @@ def report_completion( ) -> str: """ Send a Slack notification when the load_data task has completed. + Messages are only sent out in production and if a Slack connection is defined. In all cases the data is logged. diff --git a/openverse_catalog/dags/common/loader/s3.py b/openverse_catalog/dags/common/loader/s3.py index 509b85850..c7ace6c37 100644 --- a/openverse_catalog/dags/common/loader/s3.py +++ b/openverse_catalog/dags/common/loader/s3.py @@ -22,8 +22,9 @@ def copy_file_to_s3_staging( staging_prefix=STAGING_PREFIX, ): """ - Copy a media file from the staging directory, using a media, staging, - and identifiers to construct the S3 key. + Copy a media file from the staging directory. + + This uses the media, staging prefix, and identifiers to construct the S3 key. """ logger.info(f"Creating staging object in s3_bucket: {s3_bucket}") s3 = S3Hook(aws_conn_id=aws_conn_id) @@ -44,6 +45,7 @@ def copy_file_to_s3( ): """ Copy a TSV file to S3 with the given prefix. + The TSV's version is pushed to the `tsv_version` XCom, and the constructed S3 key is pushed to the `s3_key` XCom. The TSV is removed after the upload is complete. diff --git a/openverse_catalog/dags/common/loader/sql.py b/openverse_catalog/dags/common/loader/sql.py index 6659f79f5..2c07aef93 100644 --- a/openverse_catalog/dags/common/loader/sql.py +++ b/openverse_catalog/dags/common/loader/sql.py @@ -51,8 +51,12 @@ def create_column_definitions(table_columns: list[Column], is_loading=True): - """Loading table should not have 'NOT NULL' constraints: all TSV values - are copied, and then the items without required columns are dropped""" + """ + Create column definitions for a table. + + Loading table should not have 'NOT NULL' constraints: all TSV values + are copied, and then the items without required columns are dropped. + """ definitions = [column.create_definition(is_loading) for column in table_columns] return ",\n ".join(definitions) @@ -62,9 +66,7 @@ def create_loading_table( identifier: str, media_type: str = IMAGE, ): - """ - Create intermediary table and indices if they do not exist - """ + """Create intermediary table and indices if they do not exist.""" load_table = _get_load_table_name(identifier, media_type=media_type) postgres = PostgresHook(postgres_conn_id=postgres_conn_id) loading_table_columns = TSV_COLUMNS[media_type] @@ -128,8 +130,10 @@ def load_local_data_to_intermediate_table( def _handle_s3_load_result(cursor) -> int: """ - Handle the results of the aws_s3.table_import_from_s3 function. 
Locally this will
-    return an integer, but on AWS infrastructure it will return a string similar to:
+    Handle the results of the aws_s3.table_import_from_s3 function.
+
+    Locally this will return an integer, but on AWS infrastructure it will return a
+    string similar to:
 
     500 rows imported into relation "..." from file ... of ... bytes
     """
@@ -175,8 +179,9 @@ def clean_intermediate_table_data(
     media_type: MediaType = IMAGE,
 ) -> tuple[int, int]:
     """
-    Necessary for old TSV files that have not been cleaned up,
-    using `MediaStore` class:
+    Clean the data in the intermediate table.
+
+    Necessary for old TSV files that have not been cleaned up, using `MediaStore` class:
     Removes any rows without any of the required fields:
     `url`, `license`, `license_version`, `foreign_id`.
     Also removes any duplicate rows that have the same `provider`
@@ -215,9 +220,11 @@ def _is_tsv_column_from_different_version(
     column: Column, media_type: str, tsv_version: str
 ) -> bool:
     """
-    Checks that column is a column that exists in TSV files (unlike the db-only
+    Check that a column appears in the available columns for a TSV version.
+
+    Check that column is a column that exists in TSV files (unlike the db-only
     columns like IDENTIFIER or CREATED_ON), but is not available for `tsv_version`.
-    For example, Category column was added to Image TSV in version 001
+    For example, Category column was added to Image TSV in version 001:
     >>> from common.storage import CATEGORY, DIRECT_URL
     >>> _is_tsv_column_from_different_version(CATEGORY, IMAGE, '000')
     True
@@ -226,7 +233,6 @@ def _is_tsv_column_from_different_version(
     >>> from common.storage import IDENTIFIER
     >>> _is_tsv_column_from_different_version(IDENTIFIER, IMAGE, '000')
     False
-
     """
     return (
         column not in COLUMNS[media_type][tsv_version]
@@ -242,9 +248,11 @@ def upsert_records_to_db_table(
     tsv_version: str = CURRENT_TSV_VERSION,
 ):
     """
-    Upserts newly ingested records from loading table into the main db table.
+    Upsert newly ingested records from loading table into the main db table.
+
     For tsv columns that do not exist in the `tsv_version` for `media_type`,
     NULL value is used.
+
     :param postgres_conn_id
     :param identifier
     :param db_table
diff --git a/openverse_catalog/dags/common/log_cleanup.py b/openverse_catalog/dags/common/log_cleanup.py
index c27615cd9..c61ab6b04 100644
--- a/openverse_catalog/dags/common/log_cleanup.py
+++ b/openverse_catalog/dags/common/log_cleanup.py
@@ -29,7 +29,9 @@ def dir_size_in_mb(dir_paths: list[Path] | Path):
 def get_folders_to_delete(dag_log_folder: Path, max_log_age_in_days: int) -> list[Path]:
-    """Returns a list of log folders that are older `than max_log_age_in_days`
+    """
+    Return a list of log folders that are older than `max_log_age_in_days`.
+
     The folder structure is as follows: `{dag_id}/{task_id}/{timestamp}/{try}.log`
 
     This function iterates over all `{timestamp}` folders, detects the
@@ -90,10 +92,11 @@ def clean_up(
     should_delete: bool | str,
     **kwargs,
 ) -> list[Path]:
-    """Finds all log folders that were modified more than
-    `max_log_age_in_days` days ago, and
-    deletes them, if `should_delete` is True, or
-    logs them, if `should_delete` is False.
+    """
+    Find and delete all log folders modified more than `max_log_age_in_days` days ago.
+
+    Deletion only happens if `should_delete` is True; otherwise the folders are
+    only logged.
 
     :param base_log_folder: the folder in which dag log folders are located.
diff --git a/openverse_catalog/dags/common/popularity/sql.py b/openverse_catalog/dags/common/popularity/sql.py index a7c8d7a6e..f744ce674 100644 --- a/openverse_catalog/dags/common/popularity/sql.py +++ b/openverse_catalog/dags/common/popularity/sql.py @@ -302,9 +302,7 @@ def create_standardized_media_popularity_function( def create_audioset_view_query(): - """ - Returns SQL to create the audioset_view. - """ + """Return SQL to create the audioset_view.""" return dedent( f""" CREATE VIEW public.{AUDIOSET_VIEW_NAME} diff --git a/openverse_catalog/dags/common/requester.py b/openverse_catalog/dags/common/requester.py index eda23b3e2..160e22207 100644 --- a/openverse_catalog/dags/common/requester.py +++ b/openverse_catalog/dags/common/requester.py @@ -22,15 +22,15 @@ class SocketConnectBlockedError(Exception): class RetriesExceeded(Exception): - """ - Custom exception for when the number of allowed retries has been exceeded. - """ + """Custom exception for when the number of allowed retries has been exceeded.""" pass class DelayedRequester: """ + Requester class with a built-in delay. + Provides methods `get` and `head` that are wrappers around the `requests` module methods with the same name (i.e., it simply passes along whatever arguments it receives). The difference is that when this class is initialized diff --git a/openverse_catalog/dags/common/sensors/single_run_external_dags_sensor.py b/openverse_catalog/dags/common/sensors/single_run_external_dags_sensor.py index a9d86f7e0..81afd0a71 100644 --- a/openverse_catalog/dags/common/sensors/single_run_external_dags_sensor.py +++ b/openverse_catalog/dags/common/sensors/single_run_external_dags_sensor.py @@ -10,6 +10,8 @@ class SingleRunExternalDAGsSensor(BaseSensorOperator): """ + Sensor for ensuring only one DAG runs at a time. + Waits for a list of related DAGs, each assumed to have a similar Sensor, to not be running. A related DAG is considered to be 'running' if it is itself in the running state, and its corresponding wait task completed diff --git a/openverse_catalog/dags/common/slack.py b/openverse_catalog/dags/common/slack.py index e67682bb0..52c7aa2a4 100644 --- a/openverse_catalog/dags/common/slack.py +++ b/openverse_catalog/dags/common/slack.py @@ -1,5 +1,5 @@ """ -# Slack Block Message Builder +# Slack Block Message Builder. 
TODO: - track number of characters, raise error after 4k @@ -81,7 +81,7 @@ class SilencedSlackNotification(TypedDict): class SlackMessage: - """Slack Block Message Builder""" + """Slack Block Message Builder.""" def __init__( self, @@ -131,7 +131,7 @@ def clear(self) -> None: self._payload = self._base_payload.copy() def display(self) -> None: - """Prints current payload, intended for local development only.""" + """Print current payload, intended for local development only.""" if self._context: self._append_context() self._payload.update({"blocks": self.blocks}) @@ -163,7 +163,7 @@ def _add_context( raise ValueError("Unable to include more than 10 context elements") def add_context(self, message: str, plain_text: bool = False) -> None: - """Display context above or below a text block""" + """Display context above or below a text block.""" self._add_context( self._text_block, message, @@ -171,7 +171,7 @@ def add_context(self, message: str, plain_text: bool = False) -> None: ) def add_context_image(self, url: str, alt_text: str | None = None) -> None: - """Display context image inline within a text block""" + """Display context image inline within a text block.""" self._add_context(self._image_block, url, alt_text=alt_text) #################################################################################### @@ -204,7 +204,7 @@ def add_image( def send(self, notification_text: str = "Airflow notification") -> Response: """ - Sends message payload to the channel configured by the webhook. + Send message payload to the channel configured by the webhook. Any notification text provided will only show up as the content within the notification pushed to various devices. @@ -231,6 +231,8 @@ def send(self, notification_text: str = "Airflow notification") -> Response: def should_silence_message(text, username, dag_id): """ + Determine if a Slack message should be silenced. + Checks the `SILENCED_SLACK_NOTIFICATIONS` Airflow variable to see if the message should be silenced for this DAG. """ @@ -252,6 +254,8 @@ def should_send_message( text, username, dag_id, http_conn_id=SLACK_NOTIFICATIONS_CONN_ID ): """ + Determine if a Slack message should actually be sent. + Returns True if: * A Slack connection is defined * The DAG is not configured to silence messages of this type @@ -314,6 +318,8 @@ def send_alert( unfurl_media: bool = False, ): """ + Send a slack alert. + Wrapper for send_message that allows sending a message to the configured alerts channel instead of the default notification channel. """ @@ -332,6 +338,7 @@ def send_alert( def on_failure_callback(context: dict) -> None: """ Send an alert out regarding a failure to Slack. + Errors are only sent out in production and if a Slack connection is defined. """ # Get relevant info diff --git a/openverse_catalog/dags/common/storage/columns.py b/openverse_catalog/dags/common/storage/columns.py index ae3d99598..523c02210 100644 --- a/openverse_catalog/dags/common/storage/columns.py +++ b/openverse_catalog/dags/common/storage/columns.py @@ -43,8 +43,9 @@ def _newest_non_null(column: str) -> str: def _merge_jsonb_objects(column: str) -> str: """ - This function returns SQL that merges the top-level keys of the - a JSONB column, taking the newest available non-null value. + Return an SQL that merges the top-level keys of a JSONB column. + + This takes the newest available non-null value. """ return f"""{column} = COALESCE( jsonb_strip_nulls(old.{column}) @@ -112,6 +113,8 @@ def __init__( nullable: bool | None = None, ): """ + Initialize a column. 
+ :param name: The column name used in TSV, ImageStore and provider API scripts, can be different from the name in the database. :param required: If True, the database column will be set to 'NOT NULL' @@ -226,7 +229,7 @@ def __init__( def prepare_string(self, value): """ - Returns a string representation to the best integer approx of input. + Return a string representation to the best integer approx of input. If there is no sane mapping from the input to an integer, returns None. @@ -271,7 +274,7 @@ def __init__( def prepare_string(self, value): """ - Returns a string `t` or `f`, as appropriate to input. + Return a string `t` or `f`, as appropriate to input. If there is no sane mapping from the input to a boolean, returns None. @@ -318,7 +321,7 @@ def __init__( def prepare_string(self, value): """ - Returns a json string as appropriate to input. + Return a json string as appropriate to input. Also sanitizes values within the json to ensure they are loadable into a PostgreSQL table. @@ -335,8 +338,9 @@ def prepare_string(self, value): def _sanitize_json_values(self, value, recursion_limit=100): """ - Recursively sanitizes the non-dict, non-list values of an input - dictionary or list in preparation for dumping to JSON string. + Recursively sanitize the non-dict/non-list values of an input dict or list. + + This is done in preparation for dumping to a JSON string. """ input_type = type(value) @@ -395,9 +399,7 @@ def __init__( ) def prepare_string(self, value): - """ - Sanitizes input and enforces the character limit, returning a string. - """ + """Sanitizes input and enforces the character limit, returning a string.""" return self._Column__enforce_char_limit( self._Column__sanitize_string(value), self.SIZE, self.TRUNCATE ) @@ -406,6 +408,7 @@ def prepare_string(self, value): class UUIDColumn(Column): """ Represents the PrimaryKey `identifier` column in PostgreSQL. + name: Column name """ @@ -491,7 +494,7 @@ def __init__( def prepare_string(self, value): """ - Returns input unchanged, as long as it is a valid URL string. + Return input unchanged, as long as it is a valid URL string. Also enforces the character limit of the column. If the input value fails a validation, returns None. @@ -506,15 +509,15 @@ def prepare_string(self, value): class ArrayColumn(Column): """ - Represents a PostgreSQL column of type Array, which should hold elements - of the given base_column type. + Represents a PostgreSQL column of type Array. + + Arrays should hold elements of the given base_column type. name: name of the corresponding column in the DB required: whether the column should be considered required by the instantiating script. (Not necessarily mapping to `not null` columns in the PostgreSQL table) base_column: type of the elements in the array, another column - """ def __init__( @@ -536,9 +539,9 @@ def __init__( def prepare_string(self, value): """ - Returns a string representation of an array in the PostgreSQL format: - `{, ...}`. + Return a string representation of an array. + The format in PostgreSQL is: `{, ...}`. Apply changes and validations of the corresponding base column type. 
""" input_type = type(value) diff --git a/openverse_catalog/dags/common/storage/db_columns.py b/openverse_catalog/dags/common/storage/db_columns.py index 7933ac18a..d8e4165d3 100644 --- a/openverse_catalog/dags/common/storage/db_columns.py +++ b/openverse_catalog/dags/common/storage/db_columns.py @@ -1,6 +1,8 @@ """ -This module contains the lists of database columns -in the same order as in the main image / audio databases. +Database columns. + +This module contains the lists of database columns in the same order as in the +main media tables within the database. """ from common.storage import columns as col diff --git a/openverse_catalog/dags/common/storage/image.py b/openverse_catalog/dags/common/storage/image.py index 9a3ceb7ce..4f8e0d02a 100644 --- a/openverse_catalog/dags/common/storage/image.py +++ b/openverse_catalog/dags/common/storage/image.py @@ -146,7 +146,7 @@ def add_item( return self.total_items def _get_image(self, **kwargs) -> Image | None: - """Validates image information and returns Image namedtuple""" + """Validate image information and return Image namedtuple.""" image_metadata = self.clean_media_metadata(**kwargs) if image_metadata is None: return None @@ -158,8 +158,9 @@ def _get_image(self, **kwargs) -> Image | None: class MockImageStore(ImageStore): """ - A class that mocks the role of the ImageStore class. This class replaces - all functionality of ImageStore that calls the internet. + A class that mocks the role of the ImageStore class. + + This class replaces all functionality of ImageStore that calls the internet. It also allows for easy introspection into the images added to the media_buffer by making it a public attribute, and not converting the images to TSV. diff --git a/openverse_catalog/dags/common/storage/media.py b/openverse_catalog/dags/common/storage/media.py index 3dd479e29..e3eb988da 100644 --- a/openverse_catalog/dags/common/storage/media.py +++ b/openverse_catalog/dags/common/storage/media.py @@ -70,8 +70,7 @@ def __init__( def save_item(self, media) -> None: """ - Appends item data to the buffer as a tsv row, - only if data is valid. + Append item data to the buffer as a tsv row, only if data is valid. Args: media: a namedtuple with validated media metadata @@ -85,14 +84,13 @@ def save_item(self, media) -> None: @abc.abstractmethod def add_item(self, **kwargs): - """ - Abstract method to clean the item data and add it to the store - """ + """Abstract method to clean the item data and add it to the store.""" pass def clean_media_metadata(self, **media_data) -> dict | None: """ - Cleans and enriches the base media metadata common for all media types. + Clean and enrich the base media metadata common for all media types. + Even though we clean license info in the provider API scripts, we validate it here, too, to make sure we don't have invalid license information in the database. @@ -145,7 +143,7 @@ def clean_media_metadata(self, **media_data) -> dict | None: return media_data def commit(self): - """Writes all remaining media items in the buffer to disk.""" + """Write all remaining media items in the buffer to disk.""" self._flush_buffer() return self.total_items @@ -154,7 +152,9 @@ def _initialize_output_path( provider: str | None, version: str | None = None, ) -> str: - """Creates the path for the tsv file. + """ + Create the path for the tsv file. 
+
         If output_dir and output_file ar not given, the following filename is used:
         `/tmp/{provider_name}_{media_type}_{timestamp}.tsv`
@@ -217,7 +217,8 @@ def _flush_buffer(self) -> int:
     @staticmethod
     def _tag_blacklisted(tag: str | dict) -> bool:
         """
-        Tag is banned or contains a banned substring.
+        Determine if the tag is banned or contains a banned substring.
+
         :param tag: the tag to be verified against the blacklist
         :return: true if tag is blacklisted, else returns false
         """
@@ -232,10 +233,7 @@ def _tag_blacklisted(tag: str | dict) -> bool:
     @staticmethod
     def _enrich_meta_data(meta_data, license_url, raw_license_url) -> dict:
-        """
-        Makes sure that meta_data is a dictionary, and contains
-        license_url and raw_license_url
-        """
+        """Ensure that meta_data is a dict and contains both license URLs."""
         if type(meta_data) != dict:
             logger.debug(f"`meta_data` is not a dictionary: {meta_data}")
             enriched_meta_data = {
@@ -250,7 +248,8 @@ def _enrich_meta_data(meta_data, license_url, raw_license_url) -> dict:
         return enriched_meta_data
 
     def _enrich_tags(self, raw_tags) -> list | None:
-        """Takes a list of tags and adds provider information to them
+        """
+        Add provider information to tags.
 
         Args:
             raw_tags: List of strings or dictionaries
@@ -279,7 +278,8 @@ def _format_raw_tag(self, tag):
     def _validate_filetype(self, filetype: str | None, url: str) -> str | None:
         """
-        Extracts filetype from the media URL if filetype is None.
+        Extract filetype from the media URL if filetype is None.
+
         Unifies filetypes that have variants such as jpg/jpeg and tiff/tif.
         :param filetype: Optional filetype string.
         :return: filetype string or None
@@ -293,8 +293,9 @@ def _validate_filetype(self, filetype: str | None, url: str) -> str | None:
     @staticmethod
     def _validate_integer(value: int | None) -> int | None:
         """
-        Checks to ensure that a provided integer value is less than the Postgres
-        maximum integer value. If the value exceeds this maximum, None is returned.
+        Ensure the provided int value is less than the Postgres maximum integer value.
+
+        If the value exceeds this maximum, None is returned.
         TODO: Remove this logic once the column has been embiggened
         https://github.com/WordPress/openverse-catalog/issues/730
         https://github.com/WordPress/openverse-catalog/issues/873
@@ -306,9 +307,7 @@ def _validate_integer(value: int | None) -> int | None:
     @staticmethod
     def _get_source(source, provider):
-        """
-        Returns `source` if given, otherwise `provider`
-        """
+        """Return `source` if given, otherwise `provider`."""
         if not source:
             source = provider
diff --git a/openverse_catalog/dags/common/storage/util.py b/openverse_catalog/dags/common/storage/util.py
index d81cd00f7..3f6b75f22 100644
--- a/openverse_catalog/dags/common/storage/util.py
+++ b/openverse_catalog/dags/common/storage/util.py
@@ -1,6 +1,4 @@
-"""
-This module has public methods which are useful for storage operations.
-"""
+"""This module has public methods which are useful for storage operations."""
 import logging
 
 from common.storage.audio import AudioStore
diff --git a/openverse_catalog/dags/common/urls.py b/openverse_catalog/dags/common/urls.py
index ac43f7463..a646faa71 100644
--- a/openverse_catalog/dags/common/urls.py
+++ b/openverse_catalog/dags/common/urls.py
@@ -1,4 +1,6 @@
 """
+URL utilities.
+
 This module has a number of public methods which are useful for verifying and
 cleaning URLs.
 """
@@ -19,8 +21,7 @@ def validate_url_string(url_string, strip_slash: bool = True):
     """
-    Determines whether the given `url_string` is a valid URL with an https
-    scheme.
+ Determine whether the given `url_string` is a valid URL with an https scheme. If not, attempts to mangle the URL scheme into the desired form, falling back to an http scheme in the event TLS is not supported. @@ -63,9 +64,9 @@ def validate_url_string(url_string, strip_slash: bool = True): @lru_cache(maxsize=2048) def rewrite_redirected_url(url_string): """ - Requests the given `url_string`, and rewrites it to the final URL - after any redirects. Caches the result to avoid repetitive network - requests. + Request the given url and rewrite it to the final URL after any redirects. + + Caches the result to avoid repetitive network requests. """ try: response = requests_get(url_string) @@ -87,8 +88,7 @@ def rewrite_redirected_url(url_string): def add_url_scheme(url_string, scheme="http", strip_slash: bool = True): """ - Replaces the scheme of `url_string` with `scheme`, - or adds the given `scheme` if necessary. + Replace the scheme of `url_string` with `scheme` or adds the `scheme` if necessary. Only strip the leading/trailing slash of url if flag is True. """ diff --git a/openverse_catalog/dags/data_refresh/dag_factory.py b/openverse_catalog/dags/data_refresh/dag_factory.py index 0815ae44d..3f3ef9771 100644 --- a/openverse_catalog/dags/data_refresh/dag_factory.py +++ b/openverse_catalog/dags/data_refresh/dag_factory.py @@ -1,5 +1,6 @@ """ -# Data Refresh DAG Factory +# Data Refresh DAG Factory. + This file generates our data refresh DAGs using a factory function. For the given media type these DAGs will first refresh the popularity data, then initiate a data refresh on the data refresh server and await the @@ -83,8 +84,9 @@ def _single_value(cursor): @provide_session def _month_check(dag_id: str, session: SASession = None) -> str: """ - Checks whether there has been a previous DagRun this month. If so, - returns the task_id for the matview refresh task; else, returns the + Check whether there has been a previous DagRun this month. + + If so, return the task_id for the matview refresh task; else, return the task_id for refresh popularity metrics task. Required Arguments: @@ -137,8 +139,9 @@ def _month_check(dag_id: str, session: SASession = None) -> str: def _month_check_with_reporting(dag_id: str, media_type: str) -> str: """ - Wrapper for the monthly check function to report which step is starting - and which step is next to slack. + Wrap the monthly check function. + + This reports which step is starting and which step is next to slack. """ next_task_id = _month_check(dag_id) next_step = { @@ -153,8 +156,10 @@ def _month_check_with_reporting(dag_id: str, media_type: str) -> str: def create_data_refresh_dag(data_refresh: DataRefresh, external_dag_ids: Sequence[str]): """ - This factory method instantiates a DAG that will run the popularity calculation and - subsequent data refresh for the given `media_type`. + Instantiate a DAG for a data refresh. + + This DAG will run the popularity calculation and subsequent data refresh for the + given `media_type`. Required Arguments: diff --git a/openverse_catalog/dags/data_refresh/data_refresh_task_factory.py b/openverse_catalog/dags/data_refresh/data_refresh_task_factory.py index f94c47101..c78e7e370 100644 --- a/openverse_catalog/dags/data_refresh/data_refresh_task_factory.py +++ b/openverse_catalog/dags/data_refresh/data_refresh_task_factory.py @@ -1,5 +1,6 @@ """ -# Data Refresh TaskGroup Factory +# Data Refresh TaskGroup Factory. + This file generates the data refresh TaskGroup using a factory function. 
This TaskGroup initiates a data refresh for a given media type and awaits the success or failure of the refresh. Importantly, it is also configured to @@ -71,9 +72,10 @@ def response_filter_stat(response: Response) -> str: """ - Filter for the `get_current_index` task, used to extract the name of the current - index that the concerned alias points to. This index name will be availabe via XCom - in the downstream tasks. + Handle the response for the `get_current_index` task. + + This is used to extract the name of the current index that the concerned alias + points to. This index name will be available via XCom in the downstream tasks. """ index_name = response.json()["alt_names"] # Indices are named as '-', so everything after the first @@ -84,9 +86,11 @@ def response_filter_stat(response: Response) -> str: def response_filter_data_refresh(response: Response) -> str: """ - Filter for the `trigger_data_refresh` task, used to grab the endpoint needed - to poll for the status of the triggered data refresh. This information will - then be available via XCom in the downstream tasks. + Handle the response for `trigger_data_refresh` task. + + This is used to grab the endpoint needed to poll for the status of the triggered + data refresh. This information will then be available via XCom in the downstream + tasks. """ status_check_url = response.json()["status_check"] return urlparse(status_check_url).path @@ -94,8 +98,9 @@ def response_filter_data_refresh(response: Response) -> str: def response_check_wait_for_completion(response: Response) -> bool: """ - Response check to the `wait_for_completion` Sensor. Processes the response to - determine whether the task can complete. + Handle the response for `wait_for_completion` Sensor. + + Processes the response to determine whether the task can complete. """ data = response.json() @@ -116,6 +121,8 @@ def create_data_refresh_task_group( data_refresh: DataRefresh, external_dag_ids: Sequence[str] ): """ + Create the data refresh tasks. + This factory method instantiates a DAG that will run the data refresh for the given `media_type`. diff --git a/openverse_catalog/dags/data_refresh/data_refresh_types.py b/openverse_catalog/dags/data_refresh/data_refresh_types.py index 9ff950bcb..447a416f6 100644 --- a/openverse_catalog/dags/data_refresh/data_refresh_types.py +++ b/openverse_catalog/dags/data_refresh/data_refresh_types.py @@ -1,7 +1,8 @@ """ -# Data Refresh DAG Configuration +# Data Refresh DAG Configuration. + This file defines the type for the `DataRefresh`, a dataclass containing -configuation for a Data Refresh DAG, and defines the actual `DATA_REFRESH_CONFIGS` +configuration for a Data Refresh DAG, and defines the actual `DATA_REFRESH_CONFIGS` for each of our media types. This configuration information is used to generate the dynamic Data Refresh dags. """ diff --git a/openverse_catalog/dags/data_refresh/refresh_view_data_task_factory.py b/openverse_catalog/dags/data_refresh/refresh_view_data_task_factory.py index e0b582f1e..7a86b58ab 100644 --- a/openverse_catalog/dags/data_refresh/refresh_view_data_task_factory.py +++ b/openverse_catalog/dags/data_refresh/refresh_view_data_task_factory.py @@ -1,5 +1,6 @@ """ -# Refresh Materialized View Task Factory +# Refresh Materialized View Task Factory. + This file generates a Task that refreshes the materialized view for a given media type, using a factory function. @@ -23,6 +24,8 @@ def create_refresh_view_data_task(data_refresh: DataRefresh): """ + Create the refresh related tasks. 
+ The task refreshes the materialized view for the given media type. The view collates popularity data for each record. Refreshing has the effect of adding popularity data for records that were ingested since the last time the view was refreshed, and diff --git a/openverse_catalog/dags/database/image_expiration_workflow.py b/openverse_catalog/dags/database/image_expiration_workflow.py index 6d6282f21..7619df7ee 100644 --- a/openverse_catalog/dags/database/image_expiration_workflow.py +++ b/openverse_catalog/dags/database/image_expiration_workflow.py @@ -1,4 +1,6 @@ """ +# Image expiration. + This file configures the Apache Airflow DAG to expire the outdated images in the image table by setting the removed_from_source column value to true """ diff --git a/openverse_catalog/dags/database/recreate_popularity_calculation_dag_factory.py b/openverse_catalog/dags/database/recreate_popularity_calculation_dag_factory.py index bae487a47..9efaf2e8c 100644 --- a/openverse_catalog/dags/database/recreate_popularity_calculation_dag_factory.py +++ b/openverse_catalog/dags/database/recreate_popularity_calculation_dag_factory.py @@ -1,4 +1,6 @@ """ +Popularity table recreation. + This file generates Apache Airflow DAGs that, for the given media type, completely wipe out the PostgreSQL relations and functions involved in calculating our standardized popularity metric. It then recreates relations diff --git a/openverse_catalog/dags/database/report_pending_reported_media.py b/openverse_catalog/dags/database/report_pending_reported_media.py index b55ff8b87..c1c864c89 100644 --- a/openverse_catalog/dags/database/report_pending_reported_media.py +++ b/openverse_catalog/dags/database/report_pending_reported_media.py @@ -1,5 +1,6 @@ """ -# Report Pending Reported Media DAG +# Report Pending Reported Media DAG. + This DAG checks for any user-reported media pending manual review, and alerts via Slack. @@ -51,8 +52,7 @@ def get_pending_report_counts( db_conn_id: str, media_type: str, ti: TaskInstance ) -> ReportCountsByReason: """ - For the given media type, builds a dict of pending report counts grouped by - report reason. + Build a dict of pending report counts grouped by report reason for a media type. Required Arguments: diff --git a/openverse_catalog/dags/maintenance/airflow_log_cleanup_workflow.py b/openverse_catalog/dags/maintenance/airflow_log_cleanup_workflow.py index 0ef39bbbc..f0640e62d 100644 --- a/openverse_catalog/dags/maintenance/airflow_log_cleanup_workflow.py +++ b/openverse_catalog/dags/maintenance/airflow_log_cleanup_workflow.py @@ -1,4 +1,6 @@ """ +Clean up airflow logs. + A maintenance workflow that you can deploy into Airflow to periodically clean out the task logs to avoid those getting too big. By default, this will also clean child process logs from the 'scheduler' directory. diff --git a/openverse_catalog/dags/maintenance/check_silenced_dags.py b/openverse_catalog/dags/maintenance/check_silenced_dags.py index 8efa8b8a0..4c4cb787a 100644 --- a/openverse_catalog/dags/maintenance/check_silenced_dags.py +++ b/openverse_catalog/dags/maintenance/check_silenced_dags.py @@ -1,6 +1,7 @@ """ -Checks for DAGs that have silenced Slack alerts which may need to be turned back -on. +# Silenced DAGs check. + +Check for DAGs that have silenced Slack alerts which may need to be turned back on. When a DAG has known failures, it can be ommitted from Slack error reporting by adding an entry to the `SILENCED_SLACK_NOTIFICATIONS` Airflow variable. 
This is a dictionary @@ -38,9 +39,7 @@ def get_issue_info(issue_url: str) -> tuple[str, str, str]: - """ - Parses out the owner, repo, and issue_number from a GitHub issue url. - """ + """Parse out the owner, repo, and issue_number from a GitHub issue url.""" url_split = issue_url.split("/") if len(url_split) < 4: raise AirflowException(f"Issue url {issue_url} could not be parsed.") diff --git a/openverse_catalog/dags/maintenance/pr_review_reminders/pr_review_reminders.py b/openverse_catalog/dags/maintenance/pr_review_reminders/pr_review_reminders.py index b27d398a4..19ebec622 100644 --- a/openverse_catalog/dags/maintenance/pr_review_reminders/pr_review_reminders.py +++ b/openverse_catalog/dags/maintenance/pr_review_reminders/pr_review_reminders.py @@ -58,6 +58,8 @@ def pr_urgency(pr: dict) -> Urgency.Urgency: def days_without_weekends(today: datetime, updated_at: datetime) -> int: """ + Return the number of days between two dates, excluding weekends. + Adapted from: https://stackoverflow.com/a/3615984 CC BY-SA 2.5 """ diff --git a/openverse_catalog/dags/maintenance/pr_review_reminders/pr_review_reminders_dag.py b/openverse_catalog/dags/maintenance/pr_review_reminders/pr_review_reminders_dag.py index f2ba486ef..70793fb12 100644 --- a/openverse_catalog/dags/maintenance/pr_review_reminders/pr_review_reminders_dag.py +++ b/openverse_catalog/dags/maintenance/pr_review_reminders/pr_review_reminders_dag.py @@ -1,4 +1,6 @@ """ +# PR Review Reminders. + Iterates through open PRs in our repositories and pings assigned reviewers who have not yet approved the PR or explicitly requested changes. diff --git a/openverse_catalog/dags/maintenance/rotate_db_snapshots.py b/openverse_catalog/dags/maintenance/rotate_db_snapshots.py index 98243d210..764e24fc6 100644 --- a/openverse_catalog/dags/maintenance/rotate_db_snapshots.py +++ b/openverse_catalog/dags/maintenance/rotate_db_snapshots.py @@ -1,6 +1,8 @@ """ -Manages weekly database snapshots. RDS does not support weekly snapshots -schedules on its own, so we need a DAG to manage this for us. +Manages weekly database snapshots. + +RDS does not support weekly snapshots schedules on its own, so we need a DAG to manage +this for us. It runs on Saturdays at 00:00 UTC in order to happen before the data refresh. diff --git a/openverse_catalog/dags/oauth2/authorize_dag.py b/openverse_catalog/dags/oauth2/authorize_dag.py index 2bd5a43b3..0b31b737b 100644 --- a/openverse_catalog/dags/oauth2/authorize_dag.py +++ b/openverse_catalog/dags/oauth2/authorize_dag.py @@ -1,11 +1,4 @@ -""" -# OAuth Provider Authorization - -**Author**: Madison Swain-Bowden - -**Created**: 2021-10-13 - -""" +"""# OAuth Provider Authorization.""" from datetime import datetime import oauth2 diff --git a/openverse_catalog/dags/oauth2/oauth2.py b/openverse_catalog/dags/oauth2/oauth2.py index 9173add8e..f51169df8 100644 --- a/openverse_catalog/dags/oauth2/oauth2.py +++ b/openverse_catalog/dags/oauth2/oauth2.py @@ -31,9 +31,7 @@ class OauthProvider(NamedTuple): def _var_get(key: str) -> dict[str, Any]: - """ - Helper function for Variable retrieval with deserialization and dictionary default. - """ + """Shortcut for Variable retrieval with deserialization and dictionary default.""" return Variable.get(key, default_var={}, deserialize_json=True) @@ -42,8 +40,9 @@ def _update_tokens( tokens: dict[str, str], ) -> None: """ - Update the access/refresh tokens for a specific provider in the Airflow Variable - store. This update does not affect the tokens for any other existing providers. 
+ Update the access/refresh tokens for a provider in the Airflow Variable store. + + This update does not affect the tokens for any other existing providers. """ log.info(f"Updating tokens for provider: {provider_name}") current_tokens = _var_get(OAUTH2_TOKEN_KEY) @@ -58,9 +57,9 @@ def _get_provider_secrets( name: str, provider_secrets: dict[str, dict] = None ) -> dict[str, str]: """ - Retrieve provider secrets from the Airflow Variable store. Optionally provide - a previously retrieved Variable value for improved performance. + Retrieve provider secrets from the Airflow Variable store. + Optionally provide a previously retrieved Variable value for improved performance. Providers are expected to *at least* have a `client_id`, and may have more information defined as necessary. """ @@ -78,8 +77,10 @@ def _get_provider_secrets( def get_oauth_client(provider_name: str) -> OAuth2Session: """ - Create an OAuth2 client. This client behaves like a `requests.Session` instance, - but will automatically add the authorization necessary for a particular provider. + Create an OAuth2 client. + + This client behaves like a `requests.Session` instance, but will automatically add + the authorization necessary for a particular provider. """ secrets = _get_provider_secrets(provider_name) tokens = _var_get(OAUTH2_TOKEN_KEY) @@ -93,7 +94,8 @@ def get_oauth_client(provider_name: str) -> OAuth2Session: def authorize_providers(providers: Collection[OauthProvider]) -> None: """ - Iterate through all of the specified providers and authorize those that may need it. + Iterate through all the specified providers and authorize those that may need it. + The authorization flow will only be attempted if a provider has an authorization key defined in the Airflow Variable store. """ @@ -118,10 +120,11 @@ def authorize_providers(providers: Collection[OauthProvider]) -> None: def refresh(provider: OauthProvider) -> None: """ - Refresh the tokens for a given provider. This will use the stored refresh token to - attempt a fetch of a new access/refresh token pair. The new tokens will be updated - in the Airflow Variable store. Raises an AirflowSkipException if no tokens are - defined for the provider. + Refresh the tokens for a given provider. + + This will use the stored refresh token to attempt a fetch of a new access/refresh + token pair. The new tokens will be updated in the Airflow Variable store. Raises + an AirflowSkipException if no tokens are defined for the provider. """ current_tokens = _var_get(OAUTH2_TOKEN_KEY) if provider.name not in current_tokens: diff --git a/openverse_catalog/dags/oauth2/token_refresh_dag.py b/openverse_catalog/dags/oauth2/token_refresh_dag.py index f4459a6fa..97fb0b23c 100644 --- a/openverse_catalog/dags/oauth2/token_refresh_dag.py +++ b/openverse_catalog/dags/oauth2/token_refresh_dag.py @@ -1,10 +1,4 @@ -""" -# OAuth Provider Token Refresh - -**Author**: Madison Swain-Bowden - -**Created**: 2021-10-13 -""" +"""# OAuth Provider Token Refresh.""" from datetime import datetime import oauth2 diff --git a/openverse_catalog/dags/providers/factory_utils.py b/openverse_catalog/dags/providers/factory_utils.py index ea2ac8354..35b3fc352 100644 --- a/openverse_catalog/dags/providers/factory_utils.py +++ b/openverse_catalog/dags/providers/factory_utils.py @@ -20,10 +20,11 @@ def generate_tsv_filenames( args: Sequence = None, ) -> None: """ - Calculate the output directories for each media store to XComs. Output locations - are pushed under keys with the format `_tsv`. 
This is a temporary - workaround due to the nature of the current provider scripts. Once - https://github.com/WordPress/openverse-catalog/issues/229 is addressed and the + Calculate the output directories for each media store to XComs. + + Output locations are pushed under keys with the format `_tsv`. + This is a temporary workaround due to the nature of the current provider scripts. + Once https://github.com/WordPress/openverse-catalog/issues/229 is addressed and the provider scripts are refactored into classes, we can alter the process by which MediaStores create/set their output directory so that they are always expected to receive it. @@ -57,6 +58,8 @@ def pull_media_wrapper( args: Sequence = None, ): """ + Wrap the primary data pull function. + Run the provided callable after pushing setting the output directories for each media store using the provided values. This is a temporary workaround due to the nature of the current provider scripts. Once @@ -105,9 +108,9 @@ def date_partition_for_prefix( reingestion_date: datetime, ) -> str: """ - Given a schedule interval and the logical date for a DAG run, determine an - appropriate partition for the run. This partition will be used as part of the S3 - key prefix for a given TSV. + Determine the date partitions based on schedule, logical date, and reingestion date. + + This partition will be used as part of the S3 key prefix for a given TSV. Prefix mapping (schedule interval to partition): - Hourly -> `year=YYYY/month=MM/day=DD` diff --git a/openverse_catalog/dags/providers/provider_api_scripts/cleveland_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/cleveland_museum.py index 571b6029a..0afac3c4b 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/cleveland_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/cleveland_museum.py @@ -83,7 +83,8 @@ def _get_image_data(image_data): @staticmethod def _get_int_value(data: dict, key: str) -> int | None: """ - Converts the value of the key `key` in `data` to an integer. + Convert the value of the key `key` in `data` to an integer. + Returns None if the value is not convertible to an integer, or if the value doesn't exist. """ diff --git a/openverse_catalog/dags/providers/provider_api_scripts/flickr.py b/openverse_catalog/dags/providers/provider_api_scripts/flickr.py index c08f04ad6..e6e402f9a 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/flickr.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/flickr.py @@ -63,10 +63,11 @@ def __init__(self, *args, **kwargs): @staticmethod def _derive_timestamp_pair_list(date): """ - Build a list of timestamp pairs that divide the given date into equal - portions of the 24-hour period. Ingestion will be run separately for - each of these time divisions. This is necessary because requesting data - for too long a period may cause unexpected behavior from the API: + Create a list of start/end timestamps for equal portions of the day. + + Ingestion will be run separately for each of these time divisions. + This is necessary because requesting data for too long a period may cause + unexpected behavior from the API: https://github.com/WordPress/openverse-catalog/issues/26. """ seconds_in_a_day = 86400 @@ -206,9 +207,7 @@ def _url_join(self, *args): @staticmethod def _get_largest_image_size(image_data): - """ - Returns the key for the largest image size available. 
- """ + """Return the key for the largest image size available.""" for size in ["l", "m", "s"]: if f"url_{size}" in image_data: return size @@ -261,6 +260,8 @@ def _create_tags_list(image_data, max_tag_string_length=2000): @staticmethod def _get_category(image_data): """ + Get the category. + Flickr has three types: 0 for photos 1 for screenshots diff --git a/openverse_catalog/dags/providers/provider_api_scripts/freesound.py b/openverse_catalog/dags/providers/provider_api_scripts/freesound.py index 00ecddbed..fb91e3491 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/freesound.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/freesound.py @@ -168,6 +168,7 @@ def _get_audio_set_info(self, media_data): def _get_audio_file_size(self, url): """ Get the content length of a provided URL. + Freesound can be finicky, so we want to retry it a few times on these conditions: * SSLError - 'EOF occurred in violation of protocol (_ssl.c:1129)' @@ -214,7 +215,8 @@ def _get_audio_files( def get_record_data(self, media_data: dict) -> dict | list[dict] | None: """ - Extracts metadata about the audio file. + Extract metadata about the audio file. + Freesound does not have audio thumbnails. """ foreign_landing_url = media_data.get("url") diff --git a/openverse_catalog/dags/providers/provider_api_scripts/inaturalist.py b/openverse_catalog/dags/providers/provider_api_scripts/inaturalist.py index 4b77e8716..9b12b491b 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/inaturalist.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/inaturalist.py @@ -64,9 +64,7 @@ def get_next_query_params(self, prev_query_params=None, **kwargs): return {"offset_num": next_offset} def get_response_json(self, query_params: dict): - """ - Call the SQL to pull json from Postgres, where the raw data has been loaded. - """ + """Call the SQL to pull json from Postgres, where the raw data is loaded.""" sql_string = self.sql_template.format( batch_limit=self.batch_limit, offset_num=query_params["offset_num"] ) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/jamendo.py b/openverse_catalog/dags/providers/provider_api_scripts/jamendo.py index dcddc936a..8afc50cbe 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/jamendo.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/jamendo.py @@ -66,9 +66,7 @@ def get_batch_data(self, response_json): @staticmethod def _remove_param_from_url(url: str, param: str) -> str: - """ - Remove a parameter from a provided URL. - """ + """Remove a parameter from a provided URL.""" parsed_url = urlsplit(url) query = parse_qs(parsed_url.query) query.pop(param, None) @@ -76,6 +74,8 @@ def _remove_param_from_url(url: str, param: str) -> str: def _remove_trackid(self, thumbnail_url: str | None) -> str | None: """ + Remove the track ID from a URL. + ``audio_set`` data is used to create a separate database table in the API. To make sure that any given ``audio_set`` appears in that table only once, all the fields for that ``audio_set`` need to have the same values. In @@ -94,9 +94,10 @@ def _remove_trackid(self, thumbnail_url: str | None) -> str | None: def _get_audio_url(self, data): """ - Parses out audio URL, and removes the "from" parameter. Audio URLs have a "from" - param which seems to encapsulate information about the calling application. - Example from the API: + Parse out audio URL and remove the "from" parameter. 
+ + Audio URLs have a "from" param which seems to encapsulate information about the + calling application. Example from the API: https://prod-1.storage.jamendo.com/?trackid=1532771&format=mp31&from=app-devsite This information looks like an API key or secret when returned, so we remove it since it's not necessary for serving the audio files. diff --git a/openverse_catalog/dags/providers/provider_api_scripts/nypl.py b/openverse_catalog/dags/providers/provider_api_scripts/nypl.py index b226ab757..408db7ff0 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/nypl.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/nypl.py @@ -17,9 +17,9 @@ def get_value_from_dict_or_list( dict_or_list: dict | list, keys: list[str] ) -> dict | list | str | None: """ - Returns the nested value. - If dict_or_list is a list, returns the value from the first - dictionary in the list. + Return the nested value. + + If dict_or_list is a list, returns the value from the first dictionary in the list. If it is a dict, returns the value from the dict. """ if not keys or not dict_or_list: @@ -147,7 +147,9 @@ def get_record_data(self, data): @staticmethod def _get_filetype(description: str): """ - Extracts the filetype from a description string like: + Extract the filetype from a description string. + + Example: "Cropped .jpeg (1600 pixels on the long side)" This is required because the filetype is not present/extractable from the url via the MediaStore class. @@ -161,6 +163,8 @@ def _get_filetype(description: str): @staticmethod def _get_image_data(images) -> tuple[None, None] | tuple[str, str]: """ + Get the image data from the list of images. + Receives a list of dictionaries of the following shape: { "$": "http://images.nypl.org/index.php?id=56738467&t=q&download=1 diff --git a/openverse_catalog/dags/providers/provider_api_scripts/phylopic.py b/openverse_catalog/dags/providers/provider_api_scripts/phylopic.py index ad45b14a2..1028c2161 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/phylopic.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/phylopic.py @@ -43,6 +43,8 @@ def __init__(self, *args, **kwargs): @property def endpoint(self) -> str: """ + Dynamically determine the endpoint. + If this is being run as a dated DAG, **only one request is ever issued** to retrieve all updated IDs. As such, the dated version will only return one endpoint. The full run DAG does require the typical offset + limit, which gets @@ -63,6 +65,8 @@ def endpoint(self) -> str: def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: """ + Get the query parameters for the next request. + Since the query range is determined via endpoint, this only increments the range to query. """ @@ -74,16 +78,15 @@ def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dic def get_should_continue(self, response_json): """ - Override for upstream "return True". Dated runs will only ever make 1 query so - they should not continue to loop. + Override for upstream "return True". + + Dated runs will only ever make 1 query, so they should not continue to loop. """ return not bool(self.date) @staticmethod def _get_response_data(response_json) -> dict | list | None: - """ - Intermediate method for pulling out results from a Phylopic API request. 
- """ + """Intermediate method for pulling out results from a Phylopic API request.""" if response_json and response_json.get("success") is True: return response_json.get("result") diff --git a/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py b/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py index 8ee35b3a1..ec7c44697 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py @@ -16,6 +16,8 @@ class AggregateIngestionError(Exception): """ + Combined error for all ingestion errors. + Custom exception when multiple ingestion errors are skipped and then raised in aggregate at the end of ingestion. """ @@ -25,6 +27,8 @@ class AggregateIngestionError(Exception): class IngestionError(Exception): """ + Ingestion error which occurred during processing. + Custom exception which includes information about the query_params that were being used when the error was encountered. """ @@ -45,8 +49,7 @@ def repr_with_traceback(self): class ProviderDataIngester(ABC): """ - An abstract base class that initializes media stores and ingests records - from a given provider. + ABC which initializes media stores and ingests records from a given provider. Class variables of note: providers: a dictionary whose keys are the supported `media_types`, and values are @@ -69,6 +72,8 @@ class ProviderDataIngester(ABC): @abstractmethod def providers(self) -> dict[str, str]: """ + Providers supported by this ingester. + A dictionary mapping each supported media type to its corresponding `provider` string (the string that will populate the `provider` field in the Catalog DB). These strings should be defined as constants in @@ -89,13 +94,13 @@ def providers(self) -> dict[str, str]: @property @abstractmethod def endpoint(self): - """ - The URL with which to request records from the API. - """ + """The URL with which to request records from the API.""" pass def __init__(self, conf: dict = None, dag_id: str = None, date: str = None): """ + Initialize the ingester. + Optional Arguments: conf: The configuration dict for the running DagRun @@ -152,10 +157,7 @@ def __init__(self, conf: dict = None, dag_id: str = None, date: str = None): self.override_query_params = (qp for qp in query_params_list) def _init_media_stores(self) -> dict[str, MediaStore]: - """ - Initialize a media store for each media type supported by this - provider. - """ + """Initialize a media store for each media type supported by this provider.""" media_stores = {} for media_type, provider in self.providers.items(): @@ -166,7 +168,9 @@ def _init_media_stores(self) -> dict[str, MediaStore]: def ingest_records(self, **kwargs) -> None: """ - The main ingestion function that is called during the `pull_data` task. + Ingest all records. + + This is the main ingestion function that is called during the `pull_data` task. Optional Arguments: **kwargs: Optional arguments to be passed to `get_next_query_params`. @@ -238,10 +242,12 @@ def ingest_records(self, **kwargs) -> None: def _get_ingestion_errors(self) -> AggregateIngestionError | None: """ + Retrieve all ingestion errors that have been caught during processing. + If any errors were skipped during ingestion, log them as well as the associated query parameters. Then return an AggregateIngestionError. - It there are no errors to report, returns None. + If there are no errors to report, returns None. 
""" if self.ingestion_errors: # Log the affected query_params @@ -269,9 +275,10 @@ def _get_query_params( self, prev_query_params: dict | None, **kwargs ) -> dict | None: """ - Returns the next set of query_params for the next request, handling - optional overrides via the dag_run conf. This method should not be overridden; - instead override get_next_query_params. + Return the next set of query_params for the next request. + + This handles optional overrides via the dag_run conf. + This method should not be overridden; instead override get_next_query_params. """ # If we are getting query_params for the first batch and initial_query_params # have been set, return them. @@ -298,6 +305,8 @@ def _get_query_params( @abstractmethod def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: """ + Get the next set of query parameters. + Given the last set of query params, return the query params for the next request. Depending on the API, this may involve incrementing an `offset` or `page` param, for example. @@ -312,6 +321,8 @@ def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dic def get_batch(self, query_params: dict) -> tuple[list | None, bool]: """ + Get a batch of records from the API. + Given query params, request the next batch of records from the API and return them in a list. @@ -343,8 +354,10 @@ def get_response_json( self, query_params: dict, endpoint: str | None = None, **kwargs ): """ - Make the actual API requests needed to ingest a batch. This can be overridden - in order to support APIs that require multiple requests, for example. + Make the actual API requests needed to ingest a batch. + + This can be overridden in order to support APIs that require multiple requests, + for example. """ return self.delayed_requester.get_response_json( endpoint or self.endpoint, @@ -356,6 +369,8 @@ def get_response_json( def get_should_continue(self, response_json): """ + Determine whether ingestion should continue after this batch has been processed. + This method should be overridden when an API has additional logic for determining whether ingestion should continue. For example, this can be used to check for the existence of a `continue_token` in the @@ -365,14 +380,13 @@ def get_should_continue(self, response_json): @abstractmethod def get_batch_data(self, response_json) -> None | list[dict]: - """ - Take an API response and return the list of records. - """ + """Take an API response and return the list of records.""" pass def process_batch(self, media_batch) -> int: """ Process a batch of records by adding them to the appropriate MediaStore. + Returns the total count of records ingested up to this point, for all media types. """ @@ -410,8 +424,10 @@ def process_batch(self, media_batch) -> int: def get_media_type(self, record: dict) -> str: """ - For a given record, return the media type it represents (eg "image", "audio", - etc.) If a provider only supports a single media type, this method defaults + Return the media type a record represents. + + (eg "image", "audio", etc.) + If a provider only supports a single media type, this method defaults to returning the only media type defined in the ``providers`` attribute. """ if len(self.providers) == 1: @@ -425,11 +441,11 @@ def get_media_type(self, record: dict) -> str: @abstractmethod def get_record_data(self, data: dict) -> dict | list[dict] | None: """ - Parse out the necessary information (license info, urls, etc) from the record - data into a dictionary. 
+ Parse out the necessary information from the record data into a dictionary. - If the record being parsed contains data for additional related records, a list - may be returned of multiple record dictionaries. + Examples include license info, urls, etc. If the record being parsed contains + data for additional related records, a list may be returned of multiple + record dictionaries. """ pass diff --git a/openverse_catalog/dags/providers/provider_api_scripts/rawpixel.py b/openverse_catalog/dags/providers/provider_api_scripts/rawpixel.py index 1432f35d7..07fd7c78a 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/rawpixel.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/rawpixel.py @@ -85,6 +85,8 @@ def get_media_type(self, record: dict) -> str: def _get_signature(self, query_params: dict) -> str: """ + Get the query signature for a request. + URL encode the ordered parameters in a way that matches Node's querystring.stringify as closely as possible See: https://docs.python.org/3.10/library/urllib.parse.html#urllib.parse.urlencode # noqa @@ -132,6 +134,8 @@ def get_batch_data(self, response_json): @staticmethod def _get_image_url(data: dict, size_option: str) -> str | None: """ + Get the URL for an image. + Rawpixel provides a "style_uri" string which can be formatted with these values: 'image_24', 'image_png_24', 'image_48', 'image_png_48', 'image_100', 'image_png_100', 'image_150', 'image_png_150', 'image_200', 'image_png_200', @@ -176,6 +180,8 @@ def _clean_text(text: str) -> str: @staticmethod def _get_title(metadata: dict) -> str | None: """ + Get the title for an image. + Titles come in the following form, so we clean them up a bit: Bull elk searches for food | Free Photo - rawpixel Desktop wallpaper summer beach landscape, | Free Photo - rawpixel diff --git a/openverse_catalog/dags/providers/provider_api_scripts/science_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/science_museum.py index d6fe0715a..e90b6ee3a 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/science_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/science_museum.py @@ -45,6 +45,8 @@ def __init__(self, *args, **kwargs): @staticmethod def _get_year_ranges(final_year: int) -> list[tuple[int, int]]: """ + Get the year ranges based on a final year. + The Science Museum API currently raises a 400 when attempting to access any page number higher than 50 (https://github.com/TheScienceMuseum/collectionsonline/issues/1470). @@ -181,8 +183,10 @@ def check_url(image_url: str | None) -> str | None: @staticmethod def _get_dimensions(image_data: dict) -> tuple[int | None, int | None]: """ - Returns the height and width of the image from "image_data"."measurements" - with keys of "dimension", "units", "value". + Return the height and width of the image. + + Uses the values from "image_data"."measurements" with keys of "dimension", + "units", "value". 
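+
+        A hypothetical ``image_data`` fragment (illustrative values only, not real
+        API output) might look like:
+        {"measurements": {"dimensions": [
+            {"dimension": "height", "units": "mm", "value": 84},
+            {"dimension": "width", "units": "mm", "value": 60},
+        ]}}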
""" size = {} dimensions = image_data.get("measurements", {}).get("dimensions") diff --git a/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py b/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py index 2c23bd1a3..1149797c0 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py @@ -58,18 +58,11 @@ def get_should_continue(self, response_json): return bool(response_json.get("nextPage")) def get_batch_data(self, response_json): - """ - Take an API response and return the list of records. - """ if response_json: return response_json.get("results") return None def get_record_data(self, data): - """ - Parse out the necessary information (license info, urls, etc) from the record - data (single image dict-like thing) into a dictionary. - """ try: foreign_id = data["img_id"] except (TypeError, KeyError, AttributeError): @@ -122,8 +115,10 @@ def _get_image_info(item): @staticmethod def _get_creator_data(item): """ - Get the author's name and website preferring their custom link over the - StockSnap profile. The latter is used if the first is not found. + Get the author's name and website. + + This prefers their custom link over the StockSnap profile. + The latter is used if the first is not found. """ creator_name = item.get("author_name") if creator_name is None: @@ -139,7 +134,9 @@ def _get_creator_data(item): @staticmethod def _get_title(item): """ - Get the first two photo's tags/keywords to make the title and transform it + Get the title. + + Gets the first two photo's tags/keywords to make the title and transform it to title case, as shown on its page. """ tags = item.get("keywords", [])[:2] @@ -148,9 +145,7 @@ def _get_title(item): return img_title.title() def _get_filesize(self, image_url): - """ - Get the size of the image in bytes. - """ + """Get the size of the image in bytes.""" resp = self.delayed_requester.head(image_url) if resp: filesize = int(resp.headers.get("Content-Length", 0)) @@ -158,9 +153,7 @@ def _get_filesize(self, image_url): @staticmethod def _get_metadata(item): - """ - Include popularity statistics. - """ + """Include popularity statistics.""" extras = ["downloads_raw", "page_views_raw", "favorites_raw"] metadata = {} for key in extras: diff --git a/openverse_catalog/dags/providers/provider_api_scripts/wikimedia_commons.py b/openverse_catalog/dags/providers/provider_api_scripts/wikimedia_commons.py index 118cc2b8a..4d5424175 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/wikimedia_commons.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/wikimedia_commons.py @@ -78,11 +78,12 @@ def get_next_query_params(self, prev_query_params, **kwargs): } def get_media_type(self, record): - """Get the media_type of a parsed Record""" return record["media_type"] def get_response_json(self, query_params): """ + Get the response data from the API. + Overrides the parent function to make multiple requests until we see "batchcomplete", rather than a single request to the endpoint. 
This ensures that global usage data used for calculating @@ -179,7 +180,7 @@ def get_record_data(self, record): return funcs[valid_media_type](record_data, media_info) def get_image_record_data(self, record_data, media_info): - """Extend record_data with image-specific fields""" + """Extend record_data with image-specific fields.""" record_data["image_url"] = record_data.pop("media_url") if record_data["filetype"] == "svg": record_data["category"] = "illustration" @@ -191,7 +192,7 @@ def get_image_record_data(self, record_data, media_info): } def get_audio_record_data(self, record_data, media_info): - """Extend record_data with audio-specific fields""" + """Extend record_data with audio-specific fields.""" record_data["audio_url"] = record_data.pop("media_url") duration = int(float(media_info.get("duration", 0)) * 1000) @@ -215,7 +216,7 @@ def get_audio_record_data(self, record_data, media_info): return record_data def parse_audio_file_meta_data(self, media_info): - """Parse out audio file metadata""" + """Parse out audio file metadata.""" metadata = media_info.get("metadata", []) streams = self.get_value_by_name(metadata, "streams") @@ -242,8 +243,7 @@ def extract_media_info_dict(media_data): @staticmethod def get_value_by_name(key_value_list: list, prop_name: str): - """Gets the first value for the given `prop_name` in a list of - key value pairs.""" + """Get the first value for the given prop_name in a list of key value pairs.""" if key_value_list is None: key_value_list = [] @@ -257,8 +257,7 @@ def get_value_by_name(key_value_list: list, prop_name: str): @staticmethod def get_value_by_names(key_value_list: list, prop_names: list): - """Gets the first available value for one of the `prop_names` - property names""" + """Get the first available value for one of the `prop_names` property names.""" for prop_name in prop_names: if val := WikimediaCommonsDataIngester.get_value_by_name( key_value_list, prop_name @@ -282,8 +281,12 @@ def extract_media_type(media_info): @staticmethod def extract_audio_category(parsed_data): - """Set category to "pronunciation" for any audio with - pronunciation of a word or a phrase""" + """ + Determine the audio category. + + Sets category to "pronunciation" for any audio with pronunciation + of a word or a phrase. + """ for category in parsed_data["meta_data"].get("categories", []): if "pronunciation" in category.lower(): return "pronunciation" diff --git a/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py b/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py index 6046bd5a8..838ff072c 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py @@ -89,9 +89,7 @@ def get_should_continue(self, response_json): return True def get_record_data(self, data): - """ - Extract data for individual item. - """ + """Extract data for individual item.""" if (foreign_identifier := data.get("slug")) is None: return None diff --git a/openverse_catalog/dags/providers/provider_dag_factory.py b/openverse_catalog/dags/providers/provider_dag_factory.py index cfd63929b..465a3444a 100644 --- a/openverse_catalog/dags/providers/provider_dag_factory.py +++ b/openverse_catalog/dags/providers/provider_dag_factory.py @@ -1,5 +1,6 @@ """ -# Provider DAG Factory +# Provider DAG Factory. + This file contains two factory functions which generate the bulk of our provider workflow DAGs. 
These DAGs pull data in from a particular provider, and produce one or several TSVs of the results. @@ -100,8 +101,10 @@ def create_ingestion_workflow( conf: ProviderWorkflow, day_shift: int = 0, is_reingestion: bool = False ): """ - Creates a TaskGroup that performs the ingestion tasks, first pulling and then - loading data. Returns the TaskGroup, and a dictionary of reporting metrics. + Create a TaskGroup that performs the ingestion tasks. + + This flow first pulls and then loads the data. Returns the TaskGroup, and a + dictionary of reporting metrics. Required Arguments: @@ -301,8 +304,7 @@ def create_report_load_completion( def create_provider_api_workflow_dag(conf: ProviderWorkflow): """ - This factory method instantiates a DAG that will run the given - `main_function`. + Instantiate a DAG that will run the given `main_function`. Required Arguments: @@ -345,8 +347,10 @@ def _build_partitioned_ingest_workflows( partitioned_reingestion_days: list[list[int]], conf: ProviderReingestionWorkflow ): """ - Builds a list of lists of ingestion tasks, parameterized by the given - dag conf and a list of day shifts. Calculation is explained below. + Build a list of lists of ingestion tasks. + + These are parameterized by the given dag conf and a list of day shifts. + Calculation is explained below. Required Arguments: @@ -427,10 +431,10 @@ def create_day_partitioned_reingestion_dag( conf: ProviderReingestionWorkflow, partitioned_reingestion_days: list[list[int]] ): """ - Given a `conf` object and `reingestion_day_list_list`, this - factory method instantiates a DAG that will run ingestion using the - given configuration, parameterized by a number of dates calculated - using the reingestion day list. + Instantiate a DAG that will run ingestion using the given configuration. + + In addition to a `conf` object and `reingestion_day_list_list`, this is + parameterized by a number of dates calculated using the reingestion day list. Required Arguments: diff --git a/openverse_catalog/dags/providers/provider_reingestion_workflows.py b/openverse_catalog/dags/providers/provider_reingestion_workflows.py index e49e8031c..27cb1af53 100644 --- a/openverse_catalog/dags/providers/provider_reingestion_workflows.py +++ b/openverse_catalog/dags/providers/provider_reingestion_workflows.py @@ -14,6 +14,8 @@ @dataclass class ProviderReingestionWorkflow(ProviderWorkflow): """ + Provider reingestion workflow configurations. + Extends the ProviderWorkflow with configuration options used to set up a day-partitioned ingestion workflow DAG. diff --git a/openverse_catalog/dags/providers/provider_workflow_dag_factory.py b/openverse_catalog/dags/providers/provider_workflow_dag_factory.py index 7f242f0f2..9fe0be06e 100644 --- a/openverse_catalog/dags/providers/provider_workflow_dag_factory.py +++ b/openverse_catalog/dags/providers/provider_workflow_dag_factory.py @@ -1,5 +1,5 @@ -"""" -# Provider Workflow Dag Factory +""" +# Provider Workflow DAG Factory. This file iterates over the configurations defined in PROVIDER_WORKFLOWS and generates a provider workflow DAG in Airflow for each. diff --git a/openverse_catalog/dags/providers/provider_workflows.py b/openverse_catalog/dags/providers/provider_workflows.py index e668affef..182c3f541 100644 --- a/openverse_catalog/dags/providers/provider_workflows.py +++ b/openverse_catalog/dags/providers/provider_workflows.py @@ -30,6 +30,8 @@ @dataclass class ProviderWorkflow: """ + Provider workflow definition. 
+ Required Arguments: ingester_class: ProviderDataIngester class whose `ingest_records` method is diff --git a/openverse_catalog/templates/create_provider_ingester.py b/openverse_catalog/templates/create_provider_ingester.py index f37f1ee05..1b51ef3e6 100644 --- a/openverse_catalog/templates/create_provider_ingester.py +++ b/openverse_catalog/templates/create_provider_ingester.py @@ -1,6 +1,4 @@ -""" -Script used to generate a templated ProviderDataIngester. -""" +"""Script used to generate a templated ProviderDataIngester.""" import argparse import re @@ -16,9 +14,7 @@ def _render_provider_configuration(provider: str, media_type: str): - """ - Render the provider configuration string for a particular media type. - """ + """Render the provider configuration string for a particular media type.""" return f'"{media_type}": prov.{provider}_{media_type.upper()}_PROVIDER,' @@ -117,6 +113,8 @@ def fill_template( def sanitize_provider(provider: str) -> str: """ + Sanitize the provider name. + Takes a provider string from user input and sanitizes it by: - removing trailing whitespace - replacing spaces and periods with underscores @@ -133,7 +131,9 @@ def sanitize_provider(provider: str) -> str: def parse_media_types(media_types: list[str]) -> list[str]: """ - Parses valid media types out from user input. Defaults to ["image",] + Parse valid media types out from user input. + + Defaults to ["image",]. """ valid_media_types = [] diff --git a/tests/conftest.py b/tests/conftest.py index 3c6c7d5b0..72f866cd1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,8 @@ def pytest_addoption(parser): """ + Add options to pytest. + This functions alters the pytest CLI command options. It adds an "extended" flag which will run tests that take a significant amount of time that may not be useful for rapid local iteration. diff --git a/tests/dags/common/conftest.py b/tests/dags/common/conftest.py index a5ae2b732..e23944935 100644 --- a/tests/dags/common/conftest.py +++ b/tests/dags/common/conftest.py @@ -16,7 +16,9 @@ def _delete_bucket(bucket): def pytest_configure(config): """ - Dynamically allow the S3 host during testing. This is required because: + Dynamically allow the S3 host during testing. + + This is required because: * Docker will use different internal ports depending on what's available * Boto3 will open a socket with the IP address directly rather than the hostname * We can't add the allow_hosts mark to the empty_s3_bucket fixture directly diff --git a/tests/dags/common/test_resources/fake_provider_data_ingester.py b/tests/dags/common/test_resources/fake_provider_data_ingester.py index 4b28f687b..7edb6c3f3 100644 --- a/tests/dags/common/test_resources/fake_provider_data_ingester.py +++ b/tests/dags/common/test_resources/fake_provider_data_ingester.py @@ -1,5 +1,6 @@ """ This is a fake provider module used in test_dag_factory. + It is used to check that the output path acquisition logic is correct. """ diff --git a/tests/dags/common/test_resources/fake_provider_module.py b/tests/dags/common/test_resources/fake_provider_module.py index 811ca852e..7b5959267 100644 --- a/tests/dags/common/test_resources/fake_provider_module.py +++ b/tests/dags/common/test_resources/fake_provider_module.py @@ -1,5 +1,6 @@ """ This is a fake provider module used in test_dag_factory. + It is used to check that the output path acquisition logic is correct. 
""" from common.storage.audio import AudioStore diff --git a/tests/dags/conftest.py b/tests/dags/conftest.py index db86a78c9..8ff82f945 100644 --- a/tests/dags/conftest.py +++ b/tests/dags/conftest.py @@ -36,8 +36,10 @@ def oauth_provider_var_mock(): def _make_response(*args, **kwargs): """ - Mock the request used during license URL validation. Most times the results of this - function are expected to end with a `/`, so if the URL provided does not we add it. + Mock the request used during license URL validation. + + Most times the results of this function are expected to end with a `/`, so if the + URL provided does not we add it. """ response: Response = mock.Mock(spec=Response) if args: @@ -52,8 +54,9 @@ def _make_response(*args, **kwargs): @pytest.fixture(autouse=True) def requests_get_mock(): """ - Mock request.get calls that occur during testing done by the - `common.urls.rewrite_redirected_url` function. + Mock request.get calls that occur during testing. + + This is primarily done by the `common.urls.rewrite_redirected_url` function. """ with mock.patch("common.urls.requests_get", autospec=True) as mock_get: mock_get.side_effect = _make_response @@ -63,9 +66,9 @@ def requests_get_mock(): @pytest.fixture def freeze_time(monkeypatch): """ - Now() manager patches datetime return a fixed, settable, value - (freezes time) + Patch the `datetime.datetime.now` function to return a fixed, settable time. + This effectively freezes time. https://stackoverflow.com/a/28073449 CC BY-SA 3.0 """ import datetime @@ -90,7 +93,7 @@ def now(cls): @classmethod def delta(cls, timedelta=None, **kwargs): - """Moves time fwd/bwd by the delta""" + """Move time fwd/bwd by the delta.""" from datetime import timedelta as td if not timedelta: diff --git a/tests/dags/providers/provider_api_scripts/resources/inaturalist/pull_sample_records.py b/tests/dags/providers/provider_api_scripts/resources/inaturalist/pull_sample_records.py index afa56adae..969aef744 100644 --- a/tests/dags/providers/provider_api_scripts/resources/inaturalist/pull_sample_records.py +++ b/tests/dags/providers/provider_api_scripts/resources/inaturalist/pull_sample_records.py @@ -47,7 +47,7 @@ def pull_sample_records( input_path=RAW_DATA, ): """ - Reads through a full gzip file and keeps just the selected ID records + Read through a full gzip file and keep just the selected ID records. This is not wildly efficient for large ID lists and large files. """ @@ -95,6 +95,8 @@ def pull_sample_records( def get_sample_id_list(sample_file, joined_on): """ + Get the list of sample IDs from the file. + sample_file: from a table that you have already drawn sample records a file object so that you can handle if it's compressed or not, set up to read text diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py index 7bc90be2e..e58f69adc 100644 --- a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py @@ -1,5 +1,6 @@ """ Ad hoc exploratory analysis for identifying test cases for the API workflow. 
+ To execute this, download MetObjects.csv from here: https://github.com/metmuseum/openaccess/blob/master/MetObjects.csv """ diff --git a/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py b/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py index ea8b69b9f..9ff37e170 100644 --- a/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py +++ b/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py @@ -18,8 +18,7 @@ class MockProviderDataIngesterMixin: """ - A very simple concrete implementation of the ProviderDataIngester class, - for testing purposes. + A simple concrete ProviderDataIngester class for testing purposes. Excludes ``get_media_type`` to allow for testing implementations that do not require it (single media type providers). @@ -70,9 +69,7 @@ class MockAudioOnlyProviderDataIngester( class IncorrectlyConfiguredMockProviderDataIngester( MockProviderDataIngesterMixin, ProviderDataIngester ): - """ - Used for testing default method implementions. - """ + """Used for testing default method implementations.""" # Do not configure ``get_media_type`` to test the failure case # for the default implementation diff --git a/tests/factories/github/__init__.py b/tests/factories/github/__init__.py index eb72d4c97..9067c56ff 100644 --- a/tests/factories/github/__init__.py +++ b/tests/factories/github/__init__.py @@ -39,7 +39,8 @@ def walk_backwards_in_time_until_weekday_count(today: datetime.datetime, count: def make_pull(urgency: Urgency, past_due: bool) -> dict: """ - Creates a PR object like the one returned by the GitHub API. + Create a PR object like the one returned by the GitHub API. + The PR will also be created specifically to have the priority label associated with the passed in urgency.
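+
+    For example, the relevant fragment of the returned PR object might look like
+    this (the label name is illustrative, not the repository's exact label):
+    {"labels": [{"name": "priority: high"}], ...}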