Merge pull request #430 from Wikidata/rotten-to-records
Dump catalog IDs alongside rotten URLs
marfox committed Sep 3, 2021
2 parents 2f339cb + 637f1a2 commit 84dab46
Showing 2 changed files with 108 additions and 32 deletions.
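In short, the rotten-URL dump moves from one bare URL per line in a .txt file to two-column CSV records that pair each URL with its catalog identifier; a purely hypothetical record (both values invented for illustration) would look like:

https://www.example.org/artist/12345,987654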
53 changes: 53 additions & 0 deletions scripts/basic_url_stats.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Simple statistics for rotten URL datasets, as output by
``python -m soweego importer check_urls``.
Dump two JSON files: Web domains ranked in descending order of frequency,
and URLs grouped by Web domains.
"""

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
__version__ = '2.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2021, Hjfocs'

import csv
import json
import sys
from collections import defaultdict, OrderedDict
from urllib.parse import urlsplit


def main(args):
    if len(args) != 2:
        print(f'Usage: python {__file__} URLS_CSV')
        return 1

    file_in = args[1]
    rank_out = file_in.replace('.csv', '_domain_freq.json')
    urls_out = file_in.replace('.csv', '_by_domain.json')

    freq, urls = defaultdict(int), defaultdict(list)

    with open(args[1]) as fin:
        reader = csv.reader(fin)
        for url, _ in reader:
            domain = urlsplit(url).netloc
            freq[domain] += 1
            urls[domain].append(url)

    rank = OrderedDict(sorted(freq.items(), key=lambda x: x[1], reverse=True))

    with open(rank_out, 'w') as fout:
        json.dump(rank, fout, indent=2)
    with open(urls_out, 'w') as fout:
        json.dump(urls, fout, indent=2)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
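As a quick, non-authoritative illustration of what the script above computes (not part of the commit; the URLs and IDs are invented), feeding it a two-column rotten-URL CSV, e.g. a file named per the ROTTEN_URLS_FNAME template below, produces a domain-frequency ranking and a by-domain grouping roughly like this:

# Hypothetical walk-through of the aggregation in basic_url_stats.py
# (sample data invented; the real entry point is: python scripts/basic_url_stats.py URLS_CSV)
from collections import defaultdict
from urllib.parse import urlsplit

rows = [  # (URL, catalog_ID) pairs, as dumped by `check_urls`
    ('https://www.example.org/artist/1', '111'),
    ('https://www.example.org/artist/2', '222'),
    ('https://dead.example.com/band/3', '333'),
]

freq, urls = defaultdict(int), defaultdict(list)
for url, _catalog_id in rows:  # the stats script ignores the catalog ID column
    domain = urlsplit(url).netloc  # e.g. 'www.example.org'
    freq[domain] += 1
    urls[domain].append(url)

# freq -> {'www.example.org': 2, 'dead.example.com': 1}, dumped to *_domain_freq.json
# urls -> {'www.example.org': [...], 'dead.example.com': [...]}, dumped to *_by_domain.json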

87 changes: 55 additions & 32 deletions soweego/importer/importer.py
@@ -7,14 +7,16 @@
__email__ = 'maxfrax@gmail.com, fossati@spaziodati.eu'
__version__ = '2.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2018-2021, MaxFrax96, Hjfocs'
__copyright__ = 'Copyleft 2021, MaxFrax96, Hjfocs'

import csv
import datetime
import logging
import os
from multiprocessing import Pool

import click
from sqlalchemy.exc import SQLAlchemyError
from tqdm import tqdm

from soweego.commons import constants
@@ -33,7 +35,7 @@
    keys.IMDB: IMDbDumpExtractor,
    keys.MUSICBRAINZ: MusicBrainzDumpExtractor,
}
ROTTEN_URLS_FNAME = '{catalog}_{entity}_rotten_urls.txt'
ROTTEN_URLS_FNAME = '{catalog}_{entity}_rotten_urls.csv'


@click.command()
@@ -62,10 +64,6 @@ def import_cli(catalog: str, url_check: bool, dir_io: str) -> None:
    Importer().refresh_dump(dir_io, extractor, url_check)


def _resolve_url(res):
    return url_utils.resolve(res.url), res


@click.command()
@click.argument(
    'catalog', type=click.Choice(target_database.supported_targets())
@@ -81,7 +79,9 @@ def _resolve_url(res):
)
def check_urls_cli(catalog, drop, dir_io):
"""Check for rotten URLs of an imported catalog.
For every catalog entity, dump a text file with rotten URLs, one per line.
For every catalog entity, dump rotten URLs to a file.
CSV format: URL,catalog_ID
Use '-d' to drop rotten URLs from the DB on the fly.
"""
@@ -109,32 +109,50 @@ def check_urls_cli(catalog, drop, dir_io):

    # Parallel operation
    with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
        # Try to resolve every URL
        for resolved, res_entity in tqdm(
            pool.imap_unordered(
                _resolve_url, query_session.query(link_entity)
            ),
            total=total,
        ):
            if not resolved:
                # Dump
                fout.write(res_entity.url + '\n')
                rotten += 1

                # Drop from DB
                if drop:
                    delete_session = DBManager.connect_to_db()
                    delete_session.delete(res_entity)
                    try:
                        delete_session.commit()
                        removed += 1
                    except:
                        delete_session.rollback()
                        raise
                    finally:
                        delete_session.close()
    query_session.close()
        writer = csv.writer(fout)
        try:
            # Resolve every URL
            for resolved, result in tqdm(
                pool.imap_unordered(
                    _resolve, query_session.query(link_entity)
                ),
                total=total,
            ):
                if not resolved:
                    # Dump
                    writer.writerow((result.url, result.catalog_id))
                    rotten += 1

                    # Drop from DB
                    if drop:
                        delete_session = DBManager.connect_to_db()
                        delete_session.delete(result)
                        try:
                            delete_session.commit()
                            removed += 1
                        except SQLAlchemyError as error:
                            LOGGER.error(
                                'Failed deletion of %s: %s',
                                result,
                                error.__class__.__name__,
                            )
                            LOGGER.debug(error)
                            delete_session.rollback()
                        finally:
                            delete_session.close()
        except SQLAlchemyError as error:
            LOGGER.error(
                '%s while querying %s %s URLs',
                error.__class__.__name__,
                catalog,
                entity,
            )
            LOGGER.debug(error)
            session.rollback()
        finally:
            query_session.close()

    LOGGER.debug('Cache information: %s', url_utils.resolve.cache_info())
    LOGGER.info(
        "Total %s %s rotten URLs dumped to '%s': %d / %d",
        catalog,
@@ -143,6 +161,7 @@ def check_urls_cli(catalog, drop, dir_io):
        rotten,
        total,
    )

    if drop:
        LOGGER.info(
            'Total %s %s rotten URLs dropped from the DB: %d / %d',
@@ -153,6 +172,10 @@
        )


def _resolve(link_entity):
    return url_utils.resolve(link_entity.url), link_entity


class Importer:
"""Handle a catalog dump: check its freshness and dispatch the appropriate
extractor."""
