Merge pull request #423 from Wikidata/rotten-urls
Rotten URLs for catalog providers
marfox committed Aug 23, 2021
2 parents 81a8bb5 + 1b90cd7 commit 9ed8b4e
Showing 3 changed files with 93 additions and 56 deletions.
35 changes: 16 additions & 19 deletions soweego/commons/url_utils.py
@@ -5,13 +5,14 @@
 
 __author__ = 'Marco Fossati'
 __email__ = 'fossati@spaziodati.eu'
-__version__ = '1.0'
+__version__ = '2.0'
 __license__ = 'GPL-3.0'
-__copyright__ = 'Copyleft 2018, Hjfocs'
+__copyright__ = 'Copyleft 2021, Hjfocs'
 
 import logging
 import re
 from functools import lru_cache
+from typing import Optional
 from urllib.parse import unquote, urlsplit
 
 import regex
@@ -100,7 +101,13 @@ def validate(url):
 
 
 @lru_cache()
-def resolve(url):
+def resolve(url: str) -> Optional[str]:
+    """Try to resolve an URL via a set of strategies.
+
+    :param url: an URL
+    :return: the resolved URL (may differ from the given one), or ``None``
+      if the resolution attempt failed
+    """
     # Don't show warnings in case of unverified HTTPS requests
     disable_warnings(InsecureRequestWarning)
     # Some Web sites return 4xx just because of a non-browser user agent header
@@ -122,42 +129,32 @@ def resolve(url):
         response = get(url, headers=browser_ua, stream=True, verify=False)
     except Exception as unexpected_error:
         LOGGER.warning(
-            'Dropping URL that led to an unexpected error: <%s> - Reason: %s',
-            url,
-            unexpected_error,
+            'Unexpected error: <%s> - Reason: %s', url, unexpected_error,
         )
         return None
     except requests.exceptions.Timeout as timeout:
         LOGGER.info(
-            'Dropping URL that led to a request timeout: <%s> - Reason: %s',
-            url,
-            timeout,
+            'Request timeout: <%s> - Reason: %s', url, timeout,
         )
         return None
     except requests.exceptions.TooManyRedirects as too_many_redirects:
         LOGGER.info(
-            'Dropping URL because of too many redirects: <%s> - %s',
-            url,
-            too_many_redirects,
+            'Too many redirects: <%s> - %s', url, too_many_redirects,
         )
         return None
     except requests.exceptions.ConnectionError as connection_error:
         LOGGER.info(
-            'Dropping URL that led to an aborted connection: <%s> - Reason: %s',
-            url,
-            connection_error,
+            'Aborted connection: <%s> - Reason: %s', url, connection_error,
         )
         return None
     except Exception as unexpected_error:
         LOGGER.warning(
-            'Dropping URL that led to an unexpected error: <%s> - Reason: %s',
-            url,
-            unexpected_error,
+            'Unexpected error: <%s> - Reason: %s', url, unexpected_error,
        )
         return None
     if not response.ok:
         LOGGER.info(
-            "Dropping dead URL that returned HTTP status '%s' (%d): <%s>",
+            "HTTP status '%s' (%d): <%s>",
             response.reason,
             response.status_code,
             url,
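The new signature makes resolve()'s contract explicit: the resolved URL on success (possibly different from the input, because of redirects), None on failure. A minimal usage sketch, assuming an installed soweego and a purely hypothetical catalog link:

    from soweego.commons.url_utils import resolve

    # Hypothetical URL, for illustration only
    resolved = resolve('http://example.org/artist/42')
    if resolved is None:
        print('Rotten URL, candidate for dropping')
    else:
        print(f'Alive, resolves to {resolved}')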
8 changes: 4 additions & 4 deletions soweego/importer/cli.py
@@ -5,15 +5,15 @@
 
 __author__ = 'Marco Fossati'
 __email__ = 'fossati@spaziodati.eu'
-__version__ = '1.0'
+__version__ = '2.0'
 __license__ = 'GPL-3.0'
-__copyright__ = 'Copyleft 2018, Hjfocs'
+__copyright__ = 'Copyleft 2021, Hjfocs'
 
 import click
 
-from soweego.importer.importer import check_links_cli, import_cli
+from soweego.importer.importer import check_urls_cli, import_cli
 
-CLI_COMMANDS = {'import': import_cli, 'check_urls': check_links_cli}
+CLI_COMMANDS = {'import': import_cli, 'check_urls': check_urls_cli}
 
 
 @click.group(name='importer', commands=CLI_COMMANDS)
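With this rename, the check_urls command and the check_urls_cli function it maps to finally match. Assuming soweego's usual python -m soweego entry point, an invocation would look like:

    python -m soweego importer check_urls musicbrainz
    # Same, but also drop rotten URLs from the DB on the fly
    python -m soweego importer check_urls musicbrainz -d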
106 changes: 73 additions & 33 deletions soweego/importer/importer.py
@@ -3,11 +3,11 @@
 
 """Download, extract, and import a supported catalog."""
 
-__author__ = 'Massimo Frasson'
-__email__ = 'maxfrax@gmail.com'
-__version__ = '1.0'
+__author__ = 'Massimo Frasson, Marco Fossati'
+__email__ = 'maxfrax@gmail.com, fossati@spaziodati.eu'
+__version__ = '2.0'
 __license__ = 'GPL-3.0'
-__copyright__ = 'Copyleft 2018, MaxFrax96'
+__copyright__ = 'Copyleft 2018-2021, MaxFrax96, Hjfocs'
 
 import datetime
 import logging
@@ -33,6 +33,7 @@
     keys.IMDB: IMDbDumpExtractor,
     keys.MUSICBRAINZ: MusicBrainzDumpExtractor,
 }
+ROTTEN_URLS_FNAME = '{catalog}_{entity}_rotten_urls.txt'
 
 
 @click.command()
@@ -48,7 +49,6 @@
     ),
 )
 @click.option(
-    '-d',
     '--dir-io',
     type=click.Path(file_okay=False),
     default=constants.SHARED_FOLDER,
@@ -70,47 +70,87 @@ def _resolve_url(res):
 @click.argument(
     'catalog', type=click.Choice(target_database.supported_targets())
 )
-def check_links_cli(catalog: str):
-    """Check for rotten URLs of an imported catalog."""
-    for entity_type in target_database.supported_entities_for_target(catalog):
+@click.option(
+    '-d', '--drop', is_flag=True, help=f'Drop rotten URLs from the DB.',
+)
+@click.option(
+    '--dir-io',
+    type=click.Path(file_okay=False),
+    default=constants.SHARED_FOLDER,
+    help=f'Input/output directory, default: {constants.SHARED_FOLDER}.',
+)
+def check_urls_cli(catalog, drop, dir_io):
+    """Check for rotten URLs of an imported catalog.
+
+    For every catalog entity, dump a text file with rotten URLs, one per line.
+
+    Use '-d' to drop rotten URLs from the DB on the fly.
+    """
+    for entity in target_database.supported_entities_for_target(catalog):
+        out_path = os.path.join(
+            dir_io, ROTTEN_URLS_FNAME.format(catalog=catalog, entity=entity)
+        )
 
-        LOGGER.info("Validating %s %s links...", catalog, entity_type)
-        entity = target_database.get_link_entity(catalog, entity_type)
-        if not entity:
+        LOGGER.info('Starting check of %s %s URLs ...', catalog, entity)
+        link_entity = target_database.get_link_entity(catalog, entity)
+        if not link_entity:
             LOGGER.info(
-                "%s %s does not have a links table. Skipping...",
+                '%s %s does not have a links table. Skipping ...',
                 catalog,
-                entity_type,
+                entity,
             )
             continue
 
-        session = DBManager.connect_to_db()
-        total = session.query(entity).count()
-        removed = 0
+        query_session = DBManager.connect_to_db()
+        total = query_session.query(link_entity).count()
 
-        with Pool() as pool:
-            # Validate each link
+        rotten = 0
+        if drop:
+            removed = 0
+
+        # Parallel operation
+        with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
+            # Try to resolve every URL
             for resolved, res_entity in tqdm(
-                pool.imap_unordered(_resolve_url, session.query(entity)),
+                pool.imap_unordered(
+                    _resolve_url, query_session.query(link_entity)
+                ),
                 total=total,
             ):
                 if not resolved:
-                    session_delete = DBManager.connect_to_db()
-                    # if not valid delete
-                    session_delete.delete(res_entity)
-                    try:
-                        session_delete.commit()
-                        removed += 1
-                    except:
-                        session.rollback()
-                        raise
-                    finally:
-                        session_delete.close()
-
-        session.close()
+                    # Dump
+                    fout.write(res_entity.url + '\n')
+                    rotten += 1
+
+                    # Drop from DB
+                    if drop:
+                        delete_session = DBManager.connect_to_db()
+                        delete_session.delete(res_entity)
+                        try:
+                            delete_session.commit()
+                            removed += 1
+                        except:
+                            delete_session.rollback()
+                            raise
+                        finally:
+                            delete_session.close()
+        query_session.close()
 
         LOGGER.info(
-            "Removed %s/%s from %s %s", removed, total, catalog, entity_type
+            "Total %s %s rotten URLs dumped to '%s': %d / %d",
+            catalog,
+            entity,
+            out_path,
+            rotten,
+            total,
         )
+        if drop:
+            LOGGER.info(
+                'Total %s %s rotten URLs dropped from the DB: %d / %d',
+                catalog,
+                entity,
+                removed,
+                rotten,
+            )


 class Importer:
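The heart of check_urls_cli is the loop above: resolve URLs in parallel with a process Pool, stream rotten ones to a line-buffered dump file, and optionally delete them from the DB. Below is a self-contained sketch of that pattern; the plain requests-based resolver and the hard-coded URL list are stand-ins for soweego's _resolve_url and the SQLAlchemy query.

    import os
    from multiprocessing import Pool

    import requests


    def _resolve_url(url):
        # Mirror the (resolved, entity) pairs yielded in the loop above
        try:
            with requests.get(url, stream=True, timeout=10) as response:
                return response.ok, url
        except requests.exceptions.RequestException:
            return False, url


    if __name__ == '__main__':
        # Illustrative URLs: the second one should be rotten
        urls = ['https://www.wikidata.org', 'http://example.org/rotten']
        out_path = os.path.join('.', 'rotten_urls.txt')
        rotten = 0
        # buffering=1 line-buffers the file: each rotten URL hits disk
        # as soon as it is found, even if the run is interrupted
        with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
            for resolved, url in pool.imap_unordered(_resolve_url, urls):
                if not resolved:
                    fout.write(url + '\n')
                    rotten += 1
        print(f'{rotten} rotten URLs dumped to {out_path}')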
