format code & organize imports
marfox committed Aug 2, 2021
1 parent e1ffc70 commit 49ac13e
Showing 6 changed files with 162 additions and 69 deletions.
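The reformatting pattern throughout this commit (one argument per line, trailing commas, lines wrapped under 80 characters, single quotes preserved) and the reordered imports look like the output of an automatic code formatter plus an import sorter. The commit message does not name the tools, so the snippet below is only a hypothetical sketch of such a pass, assuming black with string normalization disabled and isort:

# Hypothetical reproduction of a formatting pass like this one.
# The tool names and the 'soweego' target path are assumptions, not taken from the commit.
import subprocess

for command in (
    ['isort', 'soweego'],
    ['black', '--skip-string-normalization', 'soweego'],
):
    subprocess.run(command, check=True)  # requires isort and black to be installed
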
25 changes: 19 additions & 6 deletions soweego/commons/data_gathering.py
@@ -430,12 +430,18 @@ def gather_relevant_pids():
url_pids.add(result)

ext_id_pids_to_urls = defaultdict(dict)
for (pid, formatter_url, id_regex, url_regex,) in sparql_queries.external_id_pids_and_urls():
for (
pid,
formatter_url,
id_regex,
url_regex,
) in sparql_queries.external_id_pids_and_urls():
compiled_id_regex = _compile(id_regex, 'ID')
compiled_url_regex = _compile(url_regex, 'URL')

ext_id_pids_to_urls[pid][formatter_url] = (
compiled_id_regex, compiled_url_regex,
compiled_id_regex,
compiled_url_regex,
)

return url_pids, ext_id_pids_to_urls
@@ -450,14 +456,16 @@ def _compile(regexp, id_or_url):
except re.error:
LOGGER.debug(
"Using 'regex' third-party library. %s regex not supported by the 're' standard library: %s",
id_or_url, regexp,
id_or_url,
regexp,
)
try:
compiled = regex.compile(regexp)
except regex.error:
LOGGER.debug(
"Giving up. %s regex not supported by 'regex': %s",
id_or_url, regexp,
id_or_url,
regexp,
)
return None

@@ -515,5 +523,10 @@ def extract_ids_from_urls(to_be_added, ext_id_pids_to_urls):
if ext_id is not None:
ext_ids_to_add.append((qid, pid, ext_id, tid,))
else:
urls_to_add.append((qid, vocabulary.DESCRIBED_AT_URL, url, tid,))
return (ext_ids_to_add, urls_to_add,)
urls_to_add.append(
(qid, vocabulary.DESCRIBED_AT_URL, url, tid,)
)
return (
ext_ids_to_add,
urls_to_add,
)
42 changes: 32 additions & 10 deletions soweego/commons/url_utils.py
@@ -236,15 +236,21 @@ def get_external_id_from_url(url, ext_id_pids_to_urls):
LOGGER.warning(
'Found multiple matching groups in <%s>: '
'Will use the first of %s',
url, groups,
url,
groups,
)
ext_id = groups[0]
LOGGER.debug(
'Input URL matches the full URL regex. '
'URL: %s -> ID: %s - URL regex: %s',
url, ext_id, url_regex,
url,
ext_id,
url_regex,
)
return (
ext_id,
pid,
)
return (ext_id, pid,)

# No URL regex: best matching effort using the tidy URL
# Look for matching head & tail
@@ -253,7 +259,8 @@ def get_external_id_from_url(url, ext_id_pids_to_urls):
if tidy.startswith(before) and tidy.endswith(after):
LOGGER.debug(
'Clean URL matches external ID formatter URL: <%s> -> <%s>',
tidy, formatter_url,
tidy,
formatter_url,
)
url_fragment = (
tidy[len(before) : -len(after)]
@@ -267,7 +274,8 @@ def get_external_id_from_url(url, ext_id_pids_to_urls):
'Missing ID regex, '
'will assume the URL substring as the ID. '
'URL: %s -> substring: %s',
tidy, url_fragment,
tidy,
url_fragment,
)
return url_fragment, pid

@@ -284,20 +292,34 @@ def get_external_id_from_url(url, ext_id_pids_to_urls):
LOGGER.debug(
"Skipping clean URL <%s> with substring '%s' "
"not matching the expected ID regex %s",
tidy, url_fragment, id_regex.pattern,
tidy,
url_fragment,
id_regex.pattern,
)
return (
None,
None,
)
return (None, None,)

ext_id = match.group()
LOGGER.debug(
'Clean URL: %s -> ID: %s - substring: %s - ID regex: %s',
tidy, ext_id, url_fragment, id_regex,
tidy,
ext_id,
url_fragment,
id_regex,
)
return (
ext_id,
pid,
)
return (ext_id, pid,)

# Nothing worked: give up
LOGGER.debug('Could not extract any identifier from <%s>', url)
return (None, None,)
return (
None,
None,
)


def is_wiki_link(url):
133 changes: 92 additions & 41 deletions soweego/ingester/wikidata_bot.py
@@ -65,9 +65,7 @@

# Approved task 2: URL-based validation, criterion 2
# https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_2
URL_VALIDATION_SUMMARY = (
'[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] with extra P887 and catalog ID reference'
)
URL_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] with extra P887 and catalog ID reference'

# Approved task 3: works by people
# https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_3
@@ -210,24 +208,31 @@ def people_cli(catalog, statements, sandbox):

if sandbox:
LOGGER.info(
'Running on the Wikidata sandbox item %s ...',
vocabulary.SANDBOX_2
'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2
)

stmt_reader = csv.reader(statements)
for statement in stmt_reader:
person, predicate, value, person_tid = statement
if sandbox:
_add_or_reference(
vocabulary.SANDBOX_2, predicate, value,
catalog_qid, person_pid, person_tid,
summary=URL_VALIDATION_SUMMARY
vocabulary.SANDBOX_2,
predicate,
value,
catalog_qid,
person_pid,
person_tid,
summary=URL_VALIDATION_SUMMARY,
)
else:
_add_or_reference(
person, predicate, value,
catalog_qid, person_pid, person_tid,
summary=URL_VALIDATION_SUMMARY
person,
predicate,
value,
catalog_qid,
person_pid,
person_tid,
summary=URL_VALIDATION_SUMMARY,
)


@@ -326,7 +331,9 @@ def add_identifiers(
)


def add_people_statements(catalog: str, statements: Iterable, sandbox: bool) -> None:
def add_people_statements(
catalog: str, statements: Iterable, sandbox: bool
) -> None:
"""Add statements to existing Wikidata people.
Statements typically come from validation criteria 2 or 3
@@ -349,15 +356,23 @@ def add_people_statements(catalog: str, statements: Iterable, sandbox: bool) ->
)
if sandbox:
_add_or_reference(
vocabulary.SANDBOX_2, predicate, value,
catalog_qid, person_pid, person_tid,
summary=URL_VALIDATION_SUMMARY
vocabulary.SANDBOX_2,
predicate,
value,
catalog_qid,
person_pid,
person_tid,
summary=URL_VALIDATION_SUMMARY,
)
else:
_add_or_reference(
subject, predicate, value,
catalog_qid, person_pid, person_tid,
summary=URL_VALIDATION_SUMMARY
subject,
predicate,
value,
catalog_qid,
person_pid,
person_tid,
summary=URL_VALIDATION_SUMMARY,
)


@@ -501,12 +516,22 @@ def _add_or_reference_works(


def _add_or_reference(
subject: str, predicate: str, value: str,
catalog_qid: str, person_pid: str, person_tid: str, summary=None
subject: str,
predicate: str,
value: str,
catalog_qid: str,
person_pid: str,
person_tid: str,
summary=None,
) -> None:
subject_item, claims = _essential_checks(
subject, predicate, value, catalog_qid,
person_pid=person_pid, person_tid=person_tid, summary=summary
subject,
predicate,
value,
catalog_qid,
person_pid=person_pid,
person_tid=person_tid,
summary=summary,
)

if None in (subject_item, claims):
@@ -518,8 +543,11 @@ def _add_or_reference(
# See https://www.wikidata.org/wiki/User_talk:Jura1#Thanks_for_your_feedback_on_User:Soweego_bot_task_2
if _check_for_same_value(
claims,
subject, vocabulary.OFFICIAL_WEBSITE, value, catalog_qid,
summary=summary
subject,
vocabulary.OFFICIAL_WEBSITE,
value,
catalog_qid,
summary=summary,
):
return

@@ -607,12 +635,16 @@ def _handle_addition(
if case_insensitive:
for claim in given_predicate_claims:
if claim.getTarget().lower() == value:
_reference(claim, catalog_qid, person_pid, person_tid, summary=summary)
_reference(
claim, catalog_qid, person_pid, person_tid, summary=summary
)
return

for claim in given_predicate_claims:
if claim.getTarget() == value:
_reference(claim, catalog_qid, person_pid, person_tid, summary=summary)
_reference(
claim, catalog_qid, person_pid, person_tid, summary=summary
)


def _handle_redirect_and_dead(qid):
@@ -631,8 +663,13 @@ def _handle_redirect_and_dead(qid):


def _essential_checks(
subject, predicate, value, catalog_qid,
person_pid=None, person_tid=None, summary=None
subject,
predicate,
value,
catalog_qid,
person_pid=None,
person_tid=None,
summary=None,
):
item, data = _handle_redirect_and_dead(subject)

@@ -643,9 +680,13 @@
if not data:
LOGGER.warning('%s has no data at all', subject)
_add(
item, predicate, value,
catalog_qid, person_pid, person_tid,
summary=summary
item,
predicate,
value,
catalog_qid,
person_pid,
person_tid,
summary=summary,
)
return None, None

@@ -654,9 +695,13 @@
if not claims:
LOGGER.warning('%s has no claims', subject)
_add(
item, predicate, value,
catalog_qid, person_pid, person_tid,
summary=summary
item,
predicate,
value,
catalog_qid,
person_pid,
person_tid,
summary=summary,
)
return None, None

Expand Down Expand Up @@ -684,9 +729,7 @@ def _check_for_same_value(
value,
)
_reference(
claim,
catalog_qid, person_pid, person_tid,
summary=summary
claim, catalog_qid, person_pid, person_tid, summary=summary
)
return True
return False
@@ -728,9 +771,13 @@ def _get_works_args(catalog):


def _add(
subject_item, predicate, value,
catalog_qid, person_pid, person_tid,
summary=None
subject_item,
predicate,
value,
catalog_qid,
person_pid,
person_tid,
summary=None,
):
claim = pywikibot.Claim(REPO, predicate)
claim.setTarget(value)
@@ -774,7 +821,11 @@ def _reference(claim, catalog_qid, person_pid, person_tid, summary=None):

try:
claim.addSources(
[based_on_heuristic_reference, stated_in_reference, retrieved_reference],
[
based_on_heuristic_reference,
stated_in_reference,
retrieved_reference,
],
summary=summary,
)

Expand Down
