Merge pull request #430 from Wikidata/rotten-to-records
Dump catalog IDs alongside rotten URLs
marfox committed Sep 3, 2021
2 parents 2f339cb + 637f1a2 commit 84dab46
Showing 2 changed files with 108 additions and 32 deletions.
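In short, the rotten-URL dump moves from one bare URL per line in a .txt file to two-column CSV records that pair each URL with its catalog identifier; a purely hypothetical record (both values invented for illustration) would look like:

https://www.example.org/artist/12345,987654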
53 changes: 53 additions & 0 deletions scripts/basic_url_stats.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Simple statistics for rotten URL datasets, as output by
``python -m soweego importer check_urls``.
Dump two JSON files: Web domains ranked in descending order of frequency,
and URLs grouped by Web domains.
"""

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
__version__ = '2.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2021, Hjfocs'

import csv
import json
import sys
from collections import defaultdict, OrderedDict
from urllib.parse import urlsplit


def main(args):
    if len(args) != 2:
        print(f'Usage: python {__file__} URLS_CSV')
        return 1

    file_in = args[1]
    rank_out = file_in.replace('.csv', '_domain_freq.json')
    urls_out = file_in.replace('.csv', '_by_domain.json')

    freq, urls = defaultdict(int), defaultdict(list)

    with open(args[1]) as fin:
        reader = csv.reader(fin)
        for url, _ in reader:
            domain = urlsplit(url).netloc
            freq[domain] += 1
            urls[domain].append(url)

    rank = OrderedDict(sorted(freq.items(), key=lambda x: x[1], reverse=True))

    with open(rank_out, 'w') as fout:
        json.dump(rank, fout, indent=2)
    with open(urls_out, 'w') as fout:
        json.dump(urls, fout, indent=2)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
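As a quick, non-authoritative illustration of what the script above computes (not part of the commit; the URLs and IDs are invented), feeding it a two-column rotten-URL CSV, e.g. a file named per the ROTTEN_URLS_FNAME template below, produces a domain-frequency ranking and a by-domain grouping roughly like this:

# Hypothetical walk-through of the aggregation in basic_url_stats.py
# (sample data invented; the real entry point is: python scripts/basic_url_stats.py URLS_CSV)
from collections import defaultdict
from urllib.parse import urlsplit

rows = [  # (URL, catalog_ID) pairs, as dumped by `check_urls`
    ('https://www.example.org/artist/1', '111'),
    ('https://www.example.org/artist/2', '222'),
    ('https://dead.example.com/band/3', '333'),
]

freq, urls = defaultdict(int), defaultdict(list)
for url, _catalog_id in rows:  # the stats script ignores the catalog ID column
    domain = urlsplit(url).netloc  # e.g. 'www.example.org'
    freq[domain] += 1
    urls[domain].append(url)

# freq -> {'www.example.org': 2, 'dead.example.com': 1}, dumped to *_domain_freq.json
# urls -> {'www.example.org': [...], 'dead.example.com': [...]}, dumped to *_by_domain.json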

87 changes: 55 additions & 32 deletions soweego/importer/importer.py
@@ -7,14 +7,16 @@
__email__ = 'maxfrax@gmail.com, fossati@spaziodati.eu'
__version__ = '2.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2018-2021, MaxFrax96, Hjfocs'
__copyright__ = 'Copyleft 2021, MaxFrax96, Hjfocs'

import csv
import datetime
import logging
import os
from multiprocessing import Pool

import click
from sqlalchemy.exc import SQLAlchemyError
from tqdm import tqdm

from soweego.commons import constants
@@ -33,7 +35,7 @@
    keys.IMDB: IMDbDumpExtractor,
    keys.MUSICBRAINZ: MusicBrainzDumpExtractor,
}
ROTTEN_URLS_FNAME = '{catalog}_{entity}_rotten_urls.txt'
ROTTEN_URLS_FNAME = '{catalog}_{entity}_rotten_urls.csv'


@click.command()
@@ -62,10 +64,6 @@ def import_cli(catalog: str, url_check: bool, dir_io: str) -> None:
    Importer().refresh_dump(dir_io, extractor, url_check)


def _resolve_url(res):
    return url_utils.resolve(res.url), res


@click.command()
@click.argument(
    'catalog', type=click.Choice(target_database.supported_targets())
@@ -81,7 +79,9 @@ def _resolve_url(res):
)
def check_urls_cli(catalog, drop, dir_io):
"""Check for rotten URLs of an imported catalog.
For every catalog entity, dump a text file with rotten URLs, one per line.
For every catalog entity, dump rotten URLs to a file.
CSV format: URL,catalog_ID
Use '-d' to drop rotten URLs from the DB on the fly.
"""
@@ -109,32 +109,50 @@ def check_urls_cli(catalog, drop, dir_io):

    # Parallel operation
    with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
        # Try to resolve every URL
        for resolved, res_entity in tqdm(
            pool.imap_unordered(
                _resolve_url, query_session.query(link_entity)
            ),
            total=total,
        ):
            if not resolved:
                # Dump
                fout.write(res_entity.url + '\n')
                rotten += 1

                # Drop from DB
                if drop:
                    delete_session = DBManager.connect_to_db()
                    delete_session.delete(res_entity)
                    try:
                        delete_session.commit()
                        removed += 1
                    except:
                        delete_session.rollback()
                        raise
                    finally:
                        delete_session.close()
    query_session.close()
        writer = csv.writer(fout)
        try:
            # Resolve every URL
            for resolved, result in tqdm(
                pool.imap_unordered(
                    _resolve, query_session.query(link_entity)
                ),
                total=total,
            ):
                if not resolved:
                    # Dump
                    writer.writerow((result.url, result.catalog_id))
                    rotten += 1

                    # Drop from DB
                    if drop:
                        delete_session = DBManager.connect_to_db()
                        delete_session.delete(result)
                        try:
                            delete_session.commit()
                            removed += 1
                        except SQLAlchemyError as error:
                            LOGGER.error(
                                'Failed deletion of %s: %s',
                                result,
                                error.__class__.__name__,
                            )
                            LOGGER.debug(error)
                            delete_session.rollback()
                        finally:
                            delete_session.close()
        except SQLAlchemyError as error:
            LOGGER.error(
                '%s while querying %s %s URLs',
                error.__class__.__name__,
                catalog,
                entity,
            )
            LOGGER.debug(error)
            session.rollback()
        finally:
            query_session.close()

    LOGGER.debug('Cache information: %s', url_utils.resolve.cache_info())
    LOGGER.info(
        "Total %s %s rotten URLs dumped to '%s': %d / %d",
        catalog,
@@ -143,6 +161,7 @@ def check_urls_cli(catalog, drop, dir_io):
        rotten,
        total,
    )

    if drop:
        LOGGER.info(
            'Total %s %s rotten URLs dropped from the DB: %d / %d',
@@ -153,6 +172,10 @@
        )


def _resolve(link_entity):
    return url_utils.resolve(link_entity.url), link_entity


class Importer:
"""Handle a catalog dump: check its freshness and dispatch the appropriate
extractor."""
