Merge pull request #405 from Wikidata/criterion-2
Validation criterion 2: links
marfox committed Aug 2, 2021
2 parents 7cb850f + dee5cf4 commit e1ffc70
Showing 11 changed files with 491 additions and 144 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -75,7 +75,7 @@ pymysql==0.9.3 # via -r requirements.in
pyparsing==2.4.6 # via packaging
python-dateutil==2.8.1 # via pandas
pytz==2019.3 # via babel, pandas
pywikibot==3.0.20200326 # via -r requirements.in
pywikibot==3.0.20200703 # MANUALLY SET
pyyaml==5.3.1 # via keras
recordlinkage==0.14 # via -r requirements.in
regex==2020.2.20 # via -r requirements.in, black
121 changes: 121 additions & 0 deletions scripts/build_web_domains_table.py
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Build a wiki table holding Web domains, frequency, and random examples
from a given URL dataset, as output by ``python -m soweego sync links``.
The input file name must start with ``CATALOG_ENTITY_urls``,
e.g., ``musicbrainz_band_urls_to_be_added.csv``.
"""

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
__version__ = '2.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2021, Hjfocs'

import csv
import json
import os
import sys
from collections import defaultdict, OrderedDict
from random import sample
from urllib.parse import urlsplit

HEADER = """====TARGET====
{| class="sortable wikitable" style="font-size: 100%; text-align: center;"
! Domain
! Frequency
! Examples
|-
"""
FOOTER = '|}'
ROW = '| {domain} || {freq} || {examples}\n|-\n'
FREQ_THRESHOLD = 100
N_EXAMPLES = 3
CATALOG_URL_PREFIXES = {
    'discogs_band': 'https://www.discogs.com/artist/',
    'discogs_musician': 'https://www.discogs.com/artist/',
    'discogs_musical_work': 'https://www.discogs.com/master/',
    'musicbrainz_band': 'https://musicbrainz.org/artist/',
    'musicbrainz_musician': 'https://musicbrainz.org/artist/',
    'musicbrainz_musical_work': 'https://musicbrainz.org/release-group/',
}
WIKI_PROJECTS = (
    'wikipedia',
    'wikibooks',
    'wiktionary',
    'wikiquote',
    'commons.wikimedia',
    'wikisource',
    'wikiversity',
    'wikidata',
    'mediawiki',
    'wikivoyage',
    'meta.wikimedia',
)


def main(args):
    if len(args) != 2:
        print(
            f"Usage: python {__file__} URLS_CSV\n"
            "URLS_CSV file name must start with 'CATALOG_ENTITY_urls', "
            "e.g., 'discogs_band_urls'"
        )
        return 1

    file_in = args[1]
    catalog_and_entity = os.path.split(file_in)[1].partition('_urls')[0]
    file_out = f'{catalog_and_entity}_web_domains_table.mediawiki'
    json_out = f'{catalog_and_entity}.json'
    header = HEADER.replace('TARGET', catalog_and_entity.replace('_', ' ').title())
    prefix = CATALOG_URL_PREFIXES.get(catalog_and_entity)

    if prefix is None:
        print(f'Invalid input file name: {file_in}', file=sys.stderr)
        return 2

    freq = defaultdict(int)
    urls = defaultdict(list)
    wiki_urls = 0

    with open(file_in) as fin:
        r = csv.reader(fin)
        for (_, _, url, tid,) in r:
            domain = urlsplit(url).netloc
            if any(wiki_project in domain for wiki_project in WIKI_PROJECTS):
                wiki_urls += 1
                continue
            freq[domain] += 1
            urls[domain].append((url, tid,))

    print(f'Total wiki URLs found: {wiki_urls}')

    rank = OrderedDict(sorted(freq.items(), key=lambda x: x[1], reverse=True))

    with open(json_out, 'w') as jout:
        json.dump(rank, jout)

    with open(file_out, 'w') as fout:
        fout.write(header)

        for domain, freq in rank.items():
            if freq < FREQ_THRESHOLD:
                continue

            examples = sample(urls[domain], N_EXAMPLES)
            buffer = []
            for i, (url, tid,) in enumerate(examples, 1):
                buffer.append(f'{i}. [{url} URL], [{prefix}{tid} record]; ')

            fout.write(ROW.format(
                domain=domain, freq=freq, examples=''.join(buffer)
            ))
        fout.write(FOOTER)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
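
A hedged usage sketch (the input file name is assumed, following the docstring's naming convention): the output file names are derived from whatever precedes '_urls' in the input name.

import os

file_in = 'musicbrainz_band_urls_to_be_added.csv'  # hypothetical input
catalog_and_entity = os.path.split(file_in)[1].partition('_urls')[0]
print(f'{catalog_and_entity}_web_domains_table.mediawiki')  # musicbrainz_band_web_domains_table.mediawiki
print(f'{catalog_and_entity}.json')                         # musicbrainz_band.json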

2 changes: 1 addition & 1 deletion soweego/commons/constants.py
@@ -56,7 +56,7 @@

# As per https://meta.wikimedia.org/wiki/User-Agent_policy
HTTP_USER_AGENT = (
'soweego/1.0 ([[:m:Grants:Project/Hjfocs/soweego]]; [[:m:User:Hjfocs]])'
'soweego/2.0 ([[:m:Grants:Project/Hjfocs/soweego_2]]; [[:m:User:Hjfocs]])'
)

# Wikidata items & properties regexes
67 changes: 39 additions & 28 deletions soweego/commons/data_gathering.py
@@ -18,6 +18,7 @@

import regex
from sqlalchemy import or_
from tqdm import tqdm

from soweego.commons import constants, keys, target_database, url_utils
from soweego.commons.db_manager import DBManager
@@ -427,32 +428,42 @@ def gather_relevant_pids():
url_pids = set()
for result in sparql_queries.url_pids():
url_pids.add(result)

ext_id_pids_to_urls = defaultdict(dict)
for result in sparql_queries.external_id_pids_and_urls():
for pid, formatters in result.items():
for formatter_url, formatter_regex in formatters.items():
if formatter_regex:
try:
compiled_regex = re.compile(formatter_regex)
except re.error:
LOGGER.debug(
"Using 'regex' third-party library. Formatter regex not supported by the 're' standard library: %s",
formatter_regex,
)
try:
compiled_regex = regex.compile(formatter_regex)
except regex.error:
LOGGER.debug(
"Giving up. Formatter regex not supported by 'regex': %s",
formatter_regex,
)
compiled_regex = None
else:
compiled_regex = None
ext_id_pids_to_urls[pid][formatter_url] = compiled_regex
for (pid, formatter_url, id_regex, url_regex,) in sparql_queries.external_id_pids_and_urls():
compiled_id_regex = _compile(id_regex, 'ID')
compiled_url_regex = _compile(url_regex, 'URL')

ext_id_pids_to_urls[pid][formatter_url] = (
compiled_id_regex, compiled_url_regex,
)

return url_pids, ext_id_pids_to_urls


def _compile(regexp, id_or_url):
if regexp is None:
return None

try:
compiled = re.compile(regexp)
except re.error:
LOGGER.debug(
"Using 'regex' third-party library. %s regex not supported by the 're' standard library: %s",
id_or_url, regexp,
)
try:
compiled = regex.compile(regexp)
except regex.error:
LOGGER.debug(
"Giving up. %s regex not supported by 'regex': %s",
id_or_url, regexp,
)
return None

return compiled
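
A minimal sketch of the fallback that _compile implements, assuming a hypothetical formatter regex with a Unicode property class, which the standard 're' module rejects but the third-party 'regex' module accepts:

import re
import regex

pattern = r'\p{Lu}\d+'  # hypothetical formatter regex
try:
    compiled = re.compile(pattern)
except re.error:
    compiled = regex.compile(pattern)  # 'regex' supports \p{...} classes
print(compiled.match('A123'))  # a match object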


def gather_target_ids(entity, catalog, catalog_pid, aggregated):
LOGGER.info(
'Gathering Wikidata %s items with %s identifiers ...', entity, catalog
@@ -496,13 +507,13 @@ def extract_ids_from_urls(to_be_added, ext_id_pids_to_urls):
LOGGER.info('Starting extraction of IDs from target links to be added ...')
ext_ids_to_add = []
urls_to_add = []
for qid, urls in to_be_added.items():
for (qid, tid,), urls in tqdm(to_be_added.items(), total=len(to_be_added)):
for url in urls:
ext_id, pid = url_utils.get_external_id_from_url(
(ext_id, pid,) = url_utils.get_external_id_from_url(
url, ext_id_pids_to_urls
)
if ext_id:
ext_ids_to_add.append((qid, pid, ext_id))
if ext_id is not None:
ext_ids_to_add.append((qid, pid, ext_id, tid,))
else:
urls_to_add.append((qid, vocabulary.DESCRIBED_AT_URL, url))
return ext_ids_to_add, urls_to_add
urls_to_add.append((qid, vocabulary.DESCRIBED_AT_URL, url, tid,))
return (ext_ids_to_add, urls_to_add,)
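
A hedged illustration of the new return shape (all values hypothetical; P1953 stands in as an example external-ID property, P973 is Wikidata's 'described at URL'): each tuple now carries the target catalog identifier as its last element.

ext_ids_to_add = [('Q123456', 'P1953', '270493', 'some-target-id',)]                  # (QID, external-ID PID, extracted ID, target ID)
urls_to_add = [('Q123456', 'P973', 'https://example.org/band/x', 'some-target-id',)]  # (QID, P973, URL, target ID)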
5 changes: 4 additions & 1 deletion soweego/commons/logging.py
@@ -15,11 +15,14 @@
import logging
import logging.config
import os
from datetime import datetime
from io import StringIO
from urllib.parse import unquote_plus

import tqdm

# Unix-friendly timestamp to be used as log file name
TIMESTAMP = datetime.now().strftime('%Y-%m-%d_%H.%M.%S.%f')
LEVELS = {
'DEBUG': logging.DEBUG,
'INFO': logging.INFO,
@@ -50,7 +53,7 @@
'debug_file_handler': {
'formatter': 'soweego',
'level': 'DEBUG',
'filename': 'debug.log',
'filename': f'{TIMESTAMP}.log',
'mode': 'w',
'class': 'logging.FileHandler',
'encoding': 'utf8',
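
A hedged example of the resulting log file name (the timestamp is assumed); the handler config appends '.log' to the TIMESTAMP constant.

from datetime import datetime

stamp = datetime(2021, 8, 2, 10, 15, 42, 123456).strftime('%Y-%m-%d_%H.%M.%S.%f')
print(f'{stamp}.log')  # 2021-08-02_10.15.42.123456.log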
108 changes: 73 additions & 35 deletions soweego/commons/url_utils.py
@@ -209,57 +209,95 @@ def tokenize(url, domain_only=False) -> set:


def get_external_id_from_url(url, ext_id_pids_to_urls):
LOGGER.debug('Trying to extract an identifier from URL <%s>', url)
url = url.rstrip('/')
# Always use HTTPS
if not url.startswith('https'):
url = url.replace('http', 'https')
LOGGER.debug('Trying to extract an identifier from <%s>', url)

# Tidy up: remove trailing slash & use HTTPS
tidy = url.rstrip('/')
if not tidy.startswith('https'):
tidy = tidy.replace('http', 'https', 1)

# Start extraction
for pid, formatters in ext_id_pids_to_urls.items():
for formatter_url, formatter_regex in formatters.items():
for formatter_url, (id_regex, url_regex,) in formatters.items():

# Optimal case: match the original input URL against a full URL regex
if url_regex is not None:
match = (
re.match(url_regex, url)
if isinstance(url_regex, re.Pattern)
else regex.match(url_regex, url)
)
if match is not None:
groups = match.groups()
# This shouldn't happen, but who knows?
# For some reason, we have plenty of groups
# with `None` as the second element
if len(groups) > 1 and groups[1] is not None:
LOGGER.warning(
'Found multiple matching groups in <%s>: '
'Will use the first of %s',
url, groups,
)
ext_id = groups[0]
LOGGER.debug(
'Input URL matches the full URL regex. '
'URL: %s -> ID: %s - URL regex: %s',
url, ext_id, url_regex,
)
return (ext_id, pid,)

# No URL regex: best matching effort using the tidy URL
# Look for matching head & tail
before, _, after = formatter_url.partition('$1')
after = after.rstrip('/')
if url.startswith(before) and url.endswith(after):
if tidy.startswith(before) and tidy.endswith(after):
LOGGER.debug(
'Input URL matches external ID formatter URL: <%s> -> <%s>',
url,
formatter_url,
'Clean URL matches external ID formatter URL: <%s> -> <%s>',
tidy, formatter_url,
)
url_fragment = (
url[len(before) : -len(after)]
tidy[len(before) : -len(after)]
if len(after)
else url[len(before) :]
else tidy[len(before) :]
)
if not formatter_regex:

# No ID regex: use the partitioned substring
if id_regex is None:
LOGGER.debug(
'Missing formatter regex, will assume the URL substring as the ID. URL: %s - URL substring: %s',
url,
url_fragment,
'Missing ID regex, '
'will assume the URL substring as the ID. '
'URL: %s -> substring: %s',
tidy, url_fragment,
)
return url_fragment, pid
ext_id_match = (
re.search(formatter_regex, url_fragment)
if isinstance(formatter_regex, re.Pattern)
else regex.search(formatter_regex, url_fragment)

# Use `re.match` instead of `re.search`
# More precision, less recall:
# valid IDs may be left in the URLs output
match = (
re.match(id_regex, url_fragment)
if isinstance(id_regex, re.Pattern)
else regex.match(id_regex, url_fragment)
)
if not ext_id_match:
# Give up if the ID regex doesn't match
if match is None:
LOGGER.debug(
"Skipping target URL <%s> with fragment '%s' not matching the expected formatter regex %s",
url,
url_fragment,
formatter_regex.pattern,
"Skipping clean URL <%s> with substring '%s' "
"not matching the expected ID regex %s",
tidy, url_fragment, id_regex.pattern,
)
return None, None
ext_id = ext_id_match.group()
return (None, None,)

ext_id = match.group()
LOGGER.debug(
'URL: %s - URL substring: %s - formatter regex: %s - extracted ID: %s',
url,
url_fragment,
formatter_regex,
ext_id,
'Clean URL: %s -> ID: %s - substring: %s - ID regex: %s',
tidy, ext_id, url_fragment, id_regex,
)
return ext_id, pid
LOGGER.debug('Could not extract any identifier from cleaned URL <%s>', url)
return None, None
return (ext_id, pid,)

# Nothing worked: give up
LOGGER.debug('Could not extract any identifier from <%s>', url)
return (None, None,)
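
A minimal sketch of the head-and-tail matching path above, assuming the MusicBrainz artist formatter URL and a made-up identifier:

formatter_url = 'https://musicbrainz.org/artist/$1'
tidy = 'https://musicbrainz.org/artist/abcd1234-0000-4000-8000-000000000000'  # hypothetical clean URL
before, _, after = formatter_url.partition('$1')
after = after.rstrip('/')
if tidy.startswith(before) and tidy.endswith(after):
    url_fragment = tidy[len(before):-len(after)] if len(after) else tidy[len(before):]
    print(url_fragment)  # abcd1234-0000-4000-8000-000000000000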


def is_wiki_link(url):
