Merge pull request #423 from Wikidata/rotten-urls
Rotten URLs for catalog providers
marfox committed Aug 23, 2021
2 parents 81a8bb5 + 1b90cd7 commit 9ed8b4e
Showing 3 changed files with 93 additions and 56 deletions.
35 changes: 16 additions & 19 deletions soweego/commons/url_utils.py
@@ -5,13 +5,14 @@
 
 __author__ = 'Marco Fossati'
 __email__ = 'fossati@spaziodati.eu'
-__version__ = '1.0'
+__version__ = '2.0'
 __license__ = 'GPL-3.0'
-__copyright__ = 'Copyleft 2018, Hjfocs'
+__copyright__ = 'Copyleft 2021, Hjfocs'
 
 import logging
 import re
 from functools import lru_cache
+from typing import Optional
 from urllib.parse import unquote, urlsplit
 
 import regex
@@ -100,7 +101,13 @@ def validate(url):
 
 
 @lru_cache()
-def resolve(url):
+def resolve(url: str) -> Optional[str]:
+    """Try to resolve an URL via a set of strategies.
+
+    :param url: an URL
+    :return: the resolved URL (may differ from the given one), or ``None``
+      if the resolution attempt failed
+    """
     # Don't show warnings in case of unverified HTTPS requests
     disable_warnings(InsecureRequestWarning)
     # Some Web sites return 4xx just because of a non-browser user agent header
@@ -122,42 +129,32 @@ def resolve(url):
         response = get(url, headers=browser_ua, stream=True, verify=False)
     except Exception as unexpected_error:
         LOGGER.warning(
-            'Dropping URL that led to an unexpected error: <%s> - Reason: %s',
-            url,
-            unexpected_error,
+            'Unexpected error: <%s> - Reason: %s', url, unexpected_error,
         )
         return None
     except requests.exceptions.Timeout as timeout:
         LOGGER.info(
-            'Dropping URL that led to a request timeout: <%s> - Reason: %s',
-            url,
-            timeout,
+            'Request timeout: <%s> - Reason: %s', url, timeout,
         )
         return None
     except requests.exceptions.TooManyRedirects as too_many_redirects:
         LOGGER.info(
-            'Dropping URL because of too many redirects: <%s> - %s',
-            url,
-            too_many_redirects,
+            'Too many redirects: <%s> - %s', url, too_many_redirects,
         )
         return None
     except requests.exceptions.ConnectionError as connection_error:
         LOGGER.info(
-            'Dropping URL that led to an aborted connection: <%s> - Reason: %s',
-            url,
-            connection_error,
+            'Aborted connection: <%s> - Reason: %s', url, connection_error,
         )
         return None
     except Exception as unexpected_error:
         LOGGER.warning(
-            'Dropping URL that led to an unexpected error: <%s> - Reason: %s',
-            url,
-            unexpected_error,
+            'Unexpected error: <%s> - Reason: %s', url, unexpected_error,
        )
         return None
     if not response.ok:
         LOGGER.info(
-            "Dropping dead URL that returned HTTP status '%s' (%d): <%s>",
+            "HTTP status '%s' (%d): <%s>",
             response.reason,
             response.status_code,
             url,
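The new signature makes resolve()'s contract explicit: the resolved URL on success (possibly different from the input, because of redirects), None on failure. A minimal usage sketch, assuming an installed soweego and a purely hypothetical catalog link:

    from soweego.commons.url_utils import resolve

    # Hypothetical URL, for illustration only
    resolved = resolve('http://example.org/artist/42')
    if resolved is None:
        print('Rotten URL, candidate for dropping')
    else:
        print(f'Alive, resolves to {resolved}')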
8 changes: 4 additions & 4 deletions soweego/importer/cli.py
@@ -5,15 +5,15 @@
 
 __author__ = 'Marco Fossati'
 __email__ = 'fossati@spaziodati.eu'
-__version__ = '1.0'
+__version__ = '2.0'
 __license__ = 'GPL-3.0'
-__copyright__ = 'Copyleft 2018, Hjfocs'
+__copyright__ = 'Copyleft 2021, Hjfocs'
 
 import click
 
-from soweego.importer.importer import check_links_cli, import_cli
+from soweego.importer.importer import check_urls_cli, import_cli
 
-CLI_COMMANDS = {'import': import_cli, 'check_urls': check_links_cli}
+CLI_COMMANDS = {'import': import_cli, 'check_urls': check_urls_cli}
 
 
 @click.group(name='importer', commands=CLI_COMMANDS)
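With this rename, the check_urls command and the check_urls_cli function it maps to finally match. Assuming soweego's usual python -m soweego entry point, an invocation would look like:

    python -m soweego importer check_urls musicbrainz
    # Same, but also drop rotten URLs from the DB on the fly
    python -m soweego importer check_urls musicbrainz -d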
106 changes: 73 additions & 33 deletions soweego/importer/importer.py
@@ -3,11 +3,11 @@
 
 """Download, extract, and import a supported catalog."""
 
-__author__ = 'Massimo Frasson'
-__email__ = 'maxfrax@gmail.com'
-__version__ = '1.0'
+__author__ = 'Massimo Frasson, Marco Fossati'
+__email__ = 'maxfrax@gmail.com, fossati@spaziodati.eu'
+__version__ = '2.0'
 __license__ = 'GPL-3.0'
-__copyright__ = 'Copyleft 2018, MaxFrax96'
+__copyright__ = 'Copyleft 2018-2021, MaxFrax96, Hjfocs'
 
 import datetime
 import logging
@@ -33,6 +33,7 @@
     keys.IMDB: IMDbDumpExtractor,
     keys.MUSICBRAINZ: MusicBrainzDumpExtractor,
 }
+ROTTEN_URLS_FNAME = '{catalog}_{entity}_rotten_urls.txt'
 
 
 @click.command()
@@ -48,7 +49,6 @@
     ),
 )
 @click.option(
-    '-d',
     '--dir-io',
     type=click.Path(file_okay=False),
     default=constants.SHARED_FOLDER,
@@ -70,47 +70,87 @@ def _resolve_url(res):
 @click.argument(
     'catalog', type=click.Choice(target_database.supported_targets())
 )
-def check_links_cli(catalog: str):
-    """Check for rotten URLs of an imported catalog."""
-    for entity_type in target_database.supported_entities_for_target(catalog):
+@click.option(
+    '-d', '--drop', is_flag=True, help=f'Drop rotten URLs from the DB.',
+)
+@click.option(
+    '--dir-io',
+    type=click.Path(file_okay=False),
+    default=constants.SHARED_FOLDER,
+    help=f'Input/output directory, default: {constants.SHARED_FOLDER}.',
+)
+def check_urls_cli(catalog, drop, dir_io):
+    """Check for rotten URLs of an imported catalog.
+
+    For every catalog entity, dump a text file with rotten URLs, one per line.
+
+    Use '-d' to drop rotten URLs from the DB on the fly.
+    """
+    for entity in target_database.supported_entities_for_target(catalog):
+        out_path = os.path.join(
+            dir_io, ROTTEN_URLS_FNAME.format(catalog=catalog, entity=entity)
+        )
 
-        LOGGER.info("Validating %s %s links...", catalog, entity_type)
-        entity = target_database.get_link_entity(catalog, entity_type)
-        if not entity:
+        LOGGER.info('Starting check of %s %s URLs ...', catalog, entity)
+        link_entity = target_database.get_link_entity(catalog, entity)
+        if not link_entity:
             LOGGER.info(
-                "%s %s does not have a links table. Skipping...",
+                '%s %s does not have a links table. Skipping ...',
                 catalog,
-                entity_type,
+                entity,
             )
             continue
 
-        session = DBManager.connect_to_db()
-        total = session.query(entity).count()
-        removed = 0
+        query_session = DBManager.connect_to_db()
+        total = query_session.query(link_entity).count()
 
-        with Pool() as pool:
-            # Validate each link
+        rotten = 0
+        if drop:
+            removed = 0
+
+        # Parallel operation
+        with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
+            # Try to resolve every URL
             for resolved, res_entity in tqdm(
-                pool.imap_unordered(_resolve_url, session.query(entity)),
+                pool.imap_unordered(
+                    _resolve_url, query_session.query(link_entity)
+                ),
                 total=total,
             ):
                 if not resolved:
-                    session_delete = DBManager.connect_to_db()
-                    # if not valid delete
-                    session_delete.delete(res_entity)
-                    try:
-                        session_delete.commit()
-                        removed += 1
-                    except:
-                        session.rollback()
-                        raise
-                    finally:
-                        session_delete.close()
-
-        session.close()
+                    # Dump
+                    fout.write(res_entity.url + '\n')
+                    rotten += 1
+
+                    # Drop from DB
+                    if drop:
+                        delete_session = DBManager.connect_to_db()
+                        delete_session.delete(res_entity)
+                        try:
+                            delete_session.commit()
+                            removed += 1
+                        except:
+                            delete_session.rollback()
+                            raise
+                        finally:
+                            delete_session.close()
+        query_session.close()
 
         LOGGER.info(
-            "Removed %s/%s from %s %s", removed, total, catalog, entity_type
+            "Total %s %s rotten URLs dumped to '%s': %d / %d",
+            catalog,
+            entity,
+            out_path,
+            rotten,
+            total,
         )
+        if drop:
+            LOGGER.info(
+                'Total %s %s rotten URLs dropped from the DB: %d / %d',
+                catalog,
+                entity,
+                removed,
+                rotten,
+            )


 class Importer:
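The heart of check_urls_cli is the loop above: resolve URLs in parallel with a process Pool, stream rotten ones to a line-buffered dump file, and optionally delete them from the DB. Below is a self-contained sketch of that pattern; the plain requests-based resolver and the hard-coded URL list are stand-ins for soweego's _resolve_url and the SQLAlchemy query.

    import os
    from multiprocessing import Pool

    import requests


    def _resolve_url(url):
        # Mirror the (resolved, entity) pairs yielded in the loop above
        try:
            with requests.get(url, stream=True, timeout=10) as response:
                return response.ok, url
        except requests.exceptions.RequestException:
            return False, url


    if __name__ == '__main__':
        # Illustrative URLs: the second one should be rotten
        urls = ['https://www.wikidata.org', 'http://example.org/rotten']
        out_path = os.path.join('.', 'rotten_urls.txt')
        rotten = 0
        # buffering=1 line-buffers the file: each rotten URL hits disk
        # as soon as it is found, even if the run is interrupted
        with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
            for resolved, url in pool.imap_unordered(_resolve_url, urls):
                if not resolved:
                    fout.write(url + '\n')
                    rotten += 1
        print(f'{rotten} rotten URLs dumped to {out_path}')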
