Merge pull request #405 from Wikidata/criterion-2
Validation criterion 2: links
marfox committed Aug 2, 2021
2 parents 7cb850f + dee5cf4 commit e1ffc70
Showing 11 changed files with 491 additions and 144 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -75,7 +75,7 @@ pymysql==0.9.3 # via -r requirements.in
pyparsing==2.4.6 # via packaging
python-dateutil==2.8.1 # via pandas
pytz==2019.3 # via babel, pandas
pywikibot==3.0.20200326 # via -r requirements.in
pywikibot==3.0.20200703 # MANUALLY SET
pyyaml==5.3.1 # via keras
recordlinkage==0.14 # via -r requirements.in
regex==2020.2.20 # via -r requirements.in, black
121 changes: 121 additions & 0 deletions scripts/build_web_domains_table.py
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Build a wiki table holding Web domains, frequency, and random examples
from a given URL dataset, as output by ``python -m soweego sync links``.
The input file name must start with ``CATALOG_ENTITY_urls``,
e.g., ``musicbrainz_band_urls_to_be_added.csv``.
"""

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
__version__ = '2.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2021, Hjfocs'

import csv
import json
import os
import sys
from collections import defaultdict, OrderedDict
from random import sample
from urllib.parse import urlsplit

HEADER = """====TARGET====
{| class="sortable wikitable" style="font-size: 100%; text-align: center;"
! Domain
! Frequency
! Examples
|-
"""
FOOTER = '|}'
ROW = '| {domain} || {freq} || {examples}\n|-\n'
FREQ_THRESHOLD = 100
N_EXAMPLES = 3
CATALOG_URL_PREFIXES = {
    'discogs_band': 'https://www.discogs.com/artist/',
    'discogs_musician': 'https://www.discogs.com/artist/',
    'discogs_musical_work': 'https://www.discogs.com/master/',
    'musicbrainz_band': 'https://musicbrainz.org/artist/',
    'musicbrainz_musician': 'https://musicbrainz.org/artist/',
    'musicbrainz_musical_work': 'https://musicbrainz.org/release-group/',
}
WIKI_PROJECTS = (
    'wikipedia',
    'wikibooks',
    'wiktionary',
    'wikiquote',
    'commons.wikimedia',
    'wikisource',
    'wikiversity',
    'wikidata',
    'mediawiki',
    'wikivoyage',
    'meta.wikimedia',
)


def main(args):
    if len(args) != 2:
        print(
            f"Usage: python {__file__} URLS_CSV\n"
            "URLS_CSV file name must start with 'CATALOG_ENTITY_urls', "
            "e.g., 'discogs_band_urls'"
        )
        return 1

    file_in = args[1]
    catalog_and_entity = os.path.split(file_in)[1].partition('_urls')[0]
    file_out = f'{catalog_and_entity}_web_domains_table.mediawiki'
    json_out = f'{catalog_and_entity}.json'
    header = HEADER.replace('TARGET', catalog_and_entity.replace('_', ' ').title())
    prefix = CATALOG_URL_PREFIXES.get(catalog_and_entity)

    if prefix is None:
        print(f'Invalid input file name: {file_in}', file=sys.stderr)
        return 2

    freq = defaultdict(int)
    urls = defaultdict(list)
    wiki_urls = 0

    with open(file_in) as fin:
        r = csv.reader(fin)
        for (_, _, url, tid,) in r:
            domain = urlsplit(url).netloc
            if any(wiki_project in domain for wiki_project in WIKI_PROJECTS):
                wiki_urls += 1
                continue
            freq[domain] += 1
            urls[domain].append((url, tid,))

    print(f'Total wiki URLs found: {wiki_urls}')

    rank = OrderedDict(sorted(freq.items(), key=lambda x: x[1], reverse=True))

    with open(json_out, 'w') as jout:
        json.dump(rank, jout)

    with open(file_out, 'w') as fout:
        fout.write(header)

        for domain, freq in rank.items():
            if freq < FREQ_THRESHOLD:
                continue

            examples = sample(urls[domain], N_EXAMPLES)
            buffer = []
            for i, (url, tid,) in enumerate(examples, 1):
                buffer.append(f'{i}. [{url} URL], [{prefix}{tid} record]; ')

            fout.write(ROW.format(
                domain=domain, freq=freq, examples=''.join(buffer)
            ))
        fout.write(FOOTER)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
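
A hedged usage sketch (the input file name is assumed, following the docstring's naming convention): the output file names are derived from whatever precedes '_urls' in the input name.

import os

file_in = 'musicbrainz_band_urls_to_be_added.csv'  # hypothetical input
catalog_and_entity = os.path.split(file_in)[1].partition('_urls')[0]
print(f'{catalog_and_entity}_web_domains_table.mediawiki')  # musicbrainz_band_web_domains_table.mediawiki
print(f'{catalog_and_entity}.json')                         # musicbrainz_band.json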

2 changes: 1 addition & 1 deletion soweego/commons/constants.py
@@ -56,7 +56,7 @@

# As per https://meta.wikimedia.org/wiki/User-Agent_policy
HTTP_USER_AGENT = (
'soweego/1.0 ([[:m:Grants:Project/Hjfocs/soweego]]; [[:m:User:Hjfocs]])'
'soweego/2.0 ([[:m:Grants:Project/Hjfocs/soweego_2]]; [[:m:User:Hjfocs]])'
)

# Wikidata items & properties regexes
67 changes: 39 additions & 28 deletions soweego/commons/data_gathering.py
@@ -18,6 +18,7 @@

import regex
from sqlalchemy import or_
from tqdm import tqdm

from soweego.commons import constants, keys, target_database, url_utils
from soweego.commons.db_manager import DBManager
@@ -427,32 +428,42 @@ def gather_relevant_pids():
url_pids = set()
for result in sparql_queries.url_pids():
url_pids.add(result)

ext_id_pids_to_urls = defaultdict(dict)
for result in sparql_queries.external_id_pids_and_urls():
for pid, formatters in result.items():
for formatter_url, formatter_regex in formatters.items():
if formatter_regex:
try:
compiled_regex = re.compile(formatter_regex)
except re.error:
LOGGER.debug(
"Using 'regex' third-party library. Formatter regex not supported by the 're' standard library: %s",
formatter_regex,
)
try:
compiled_regex = regex.compile(formatter_regex)
except regex.error:
LOGGER.debug(
"Giving up. Formatter regex not supported by 'regex': %s",
formatter_regex,
)
compiled_regex = None
else:
compiled_regex = None
ext_id_pids_to_urls[pid][formatter_url] = compiled_regex
for (pid, formatter_url, id_regex, url_regex,) in sparql_queries.external_id_pids_and_urls():
compiled_id_regex = _compile(id_regex, 'ID')
compiled_url_regex = _compile(url_regex, 'URL')

ext_id_pids_to_urls[pid][formatter_url] = (
compiled_id_regex, compiled_url_regex,
)

return url_pids, ext_id_pids_to_urls


def _compile(regexp, id_or_url):
if regexp is None:
return None

try:
compiled = re.compile(regexp)
except re.error:
LOGGER.debug(
"Using 'regex' third-party library. %s regex not supported by the 're' standard library: %s",
id_or_url, regexp,
)
try:
compiled = regex.compile(regexp)
except regex.error:
LOGGER.debug(
"Giving up. %s regex not supported by 'regex': %s",
id_or_url, regexp,
)
return None

return compiled
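
A minimal sketch of the fallback that _compile implements, assuming a hypothetical formatter regex with a Unicode property class, which the standard 're' module rejects but the third-party 'regex' module accepts:

import re
import regex

pattern = r'\p{Lu}\d+'  # hypothetical formatter regex
try:
    compiled = re.compile(pattern)
except re.error:
    compiled = regex.compile(pattern)  # 'regex' supports \p{...} classes
print(compiled.match('A123'))  # a match object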


def gather_target_ids(entity, catalog, catalog_pid, aggregated):
LOGGER.info(
'Gathering Wikidata %s items with %s identifiers ...', entity, catalog
@@ -496,13 +507,13 @@ def extract_ids_from_urls(to_be_added, ext_id_pids_to_urls):
LOGGER.info('Starting extraction of IDs from target links to be added ...')
ext_ids_to_add = []
urls_to_add = []
for qid, urls in to_be_added.items():
for (qid, tid,), urls in tqdm(to_be_added.items(), total=len(to_be_added)):
for url in urls:
ext_id, pid = url_utils.get_external_id_from_url(
(ext_id, pid,) = url_utils.get_external_id_from_url(
url, ext_id_pids_to_urls
)
if ext_id:
ext_ids_to_add.append((qid, pid, ext_id))
if ext_id is not None:
ext_ids_to_add.append((qid, pid, ext_id, tid,))
else:
urls_to_add.append((qid, vocabulary.DESCRIBED_AT_URL, url))
return ext_ids_to_add, urls_to_add
urls_to_add.append((qid, vocabulary.DESCRIBED_AT_URL, url, tid,))
return (ext_ids_to_add, urls_to_add,)
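
A hedged illustration of the new return shape (all values hypothetical; P1953 stands in as an example external-ID property, P973 is Wikidata's 'described at URL'): each tuple now carries the target catalog identifier as its last element.

ext_ids_to_add = [('Q123456', 'P1953', '270493', 'some-target-id',)]                  # (QID, external-ID PID, extracted ID, target ID)
urls_to_add = [('Q123456', 'P973', 'https://example.org/band/x', 'some-target-id',)]  # (QID, P973, URL, target ID)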
5 changes: 4 additions & 1 deletion soweego/commons/logging.py
@@ -15,11 +15,14 @@
import logging
import logging.config
import os
from datetime import datetime
from io import StringIO
from urllib.parse import unquote_plus

import tqdm

# Unix-friendly timestamp to be used as log file name
TIMESTAMP = datetime.now().strftime('%Y-%m-%d_%H.%M.%S.%f')
LEVELS = {
'DEBUG': logging.DEBUG,
'INFO': logging.INFO,
@@ -50,7 +53,7 @@
'debug_file_handler': {
'formatter': 'soweego',
'level': 'DEBUG',
'filename': 'debug.log',
'filename': f'{TIMESTAMP}.log',
'mode': 'w',
'class': 'logging.FileHandler',
'encoding': 'utf8',
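
A hedged example of the resulting log file name (the timestamp is assumed); the handler config appends '.log' to the TIMESTAMP constant.

from datetime import datetime

stamp = datetime(2021, 8, 2, 10, 15, 42, 123456).strftime('%Y-%m-%d_%H.%M.%S.%f')
print(f'{stamp}.log')  # 2021-08-02_10.15.42.123456.log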
108 changes: 73 additions & 35 deletions soweego/commons/url_utils.py
@@ -209,57 +209,95 @@ def tokenize(url, domain_only=False) -> set:


def get_external_id_from_url(url, ext_id_pids_to_urls):
LOGGER.debug('Trying to extract an identifier from URL <%s>', url)
url = url.rstrip('/')
# Always use HTTPS
if not url.startswith('https'):
url = url.replace('http', 'https')
LOGGER.debug('Trying to extract an identifier from <%s>', url)

# Tidy up: remove trailing slash & use HTTPS
tidy = url.rstrip('/')
if not tidy.startswith('https'):
tidy = tidy.replace('http', 'https', 1)

# Start extraction
for pid, formatters in ext_id_pids_to_urls.items():
for formatter_url, formatter_regex in formatters.items():
for formatter_url, (id_regex, url_regex,) in formatters.items():

# Optimal case: match the original input URL against a full URL regex
if url_regex is not None:
match = (
re.match(url_regex, url)
if isinstance(url_regex, re.Pattern)
else regex.match(url_regex, url)
)
if match is not None:
groups = match.groups()
# This shouldn't happen, but who knows?
# For some reason, we have plenty of groups
# with `None` as the second element
if len(groups) > 1 and groups[1] is not None:
LOGGER.warning(
'Found multiple matching groups in <%s>: '
'Will use the first of %s',
url, groups,
)
ext_id = groups[0]
LOGGER.debug(
'Input URL matches the full URL regex. '
'URL: %s -> ID: %s - URL regex: %s',
url, ext_id, url_regex,
)
return (ext_id, pid,)

# No URL regex: best matching effort using the tidy URL
# Look for matching head & tail
before, _, after = formatter_url.partition('$1')
after = after.rstrip('/')
if url.startswith(before) and url.endswith(after):
if tidy.startswith(before) and tidy.endswith(after):
LOGGER.debug(
'Input URL matches external ID formatter URL: <%s> -> <%s>',
url,
formatter_url,
'Clean URL matches external ID formatter URL: <%s> -> <%s>',
tidy, formatter_url,
)
url_fragment = (
url[len(before) : -len(after)]
tidy[len(before) : -len(after)]
if len(after)
else url[len(before) :]
else tidy[len(before) :]
)
if not formatter_regex:

# No ID regex: use the partitioned substring
if id_regex is None:
LOGGER.debug(
'Missing formatter regex, will assume the URL substring as the ID. URL: %s - URL substring: %s',
url,
url_fragment,
'Missing ID regex, '
'will assume the URL substring as the ID. '
'URL: %s -> substring: %s',
tidy, url_fragment,
)
return url_fragment, pid
ext_id_match = (
re.search(formatter_regex, url_fragment)
if isinstance(formatter_regex, re.Pattern)
else regex.search(formatter_regex, url_fragment)

# Use `re.match` instead of `re.search`
# More precision, less recall:
# valid IDs may be left in the URLs output
match = (
re.match(id_regex, url_fragment)
if isinstance(id_regex, re.Pattern)
else regex.match(id_regex, url_fragment)
)
if not ext_id_match:
# Give up if the ID regex doesn't match
if match is None:
LOGGER.debug(
"Skipping target URL <%s> with fragment '%s' not matching the expected formatter regex %s",
url,
url_fragment,
formatter_regex.pattern,
"Skipping clean URL <%s> with substring '%s' "
"not matching the expected ID regex %s",
tidy, url_fragment, id_regex.pattern,
)
return None, None
ext_id = ext_id_match.group()
return (None, None,)

ext_id = match.group()
LOGGER.debug(
'URL: %s - URL substring: %s - formatter regex: %s - extracted ID: %s',
url,
url_fragment,
formatter_regex,
ext_id,
'Clean URL: %s -> ID: %s - substring: %s - ID regex: %s',
tidy, ext_id, url_fragment, id_regex,
)
return ext_id, pid
LOGGER.debug('Could not extract any identifier from cleaned URL <%s>', url)
return None, None
return (ext_id, pid,)

# Nothing worked: give up
LOGGER.debug('Could not extract any identifier from <%s>', url)
return (None, None,)
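
A minimal sketch of the head-and-tail matching path above, assuming the MusicBrainz artist formatter URL and a made-up identifier:

formatter_url = 'https://musicbrainz.org/artist/$1'
tidy = 'https://musicbrainz.org/artist/abcd1234-0000-4000-8000-000000000000'  # hypothetical clean URL
before, _, after = formatter_url.partition('$1')
after = after.rstrip('/')
if tidy.startswith(before) and tidy.endswith(after):
    url_fragment = tidy[len(before):-len(after)] if len(after) else tidy[len(before):]
    print(url_fragment)  # abcd1234-0000-4000-8000-000000000000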


def is_wiki_link(url):
