Skip to content

Commit

Permalink
Merge pull request #435 from Wikidata/dev
Browse files: browse the repository at this point in the history
Don't use a generator for gathering target links
  • Loading branch information
marfox committed Sep 6, 2021
2 parents 84dab46 + dc88ddc commit 878e48a
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 13 deletions.
16 changes: 9 additions & 7 deletions soweego/commons/data_gathering.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import logging
import re
from collections import defaultdict
from typing import Iterable
from typing import Iterable, Iterator, Optional

import regex
from sqlalchemy import or_
Expand All @@ -34,7 +34,9 @@
LOGGER = logging.getLogger(__name__)


def gather_target_biodata(entity, catalog):
def gather_target_biodata(
entity: str, catalog: str
) -> Optional[Iterator[tuple]]:
LOGGER.info(
'Gathering %s birth/death dates/places and gender metadata ...', catalog
)
Expand All @@ -51,6 +53,7 @@ def gather_target_biodata(entity, catalog):
raw_result = _run_query(query, catalog, entity)
if raw_result is None:
return None
# Here is the generator
result = _parse_target_biodata_query_result(raw_result)
session.commit()
except:
Expand Down Expand Up @@ -307,7 +310,7 @@ def _parse_target_biodata_query_result(result_set):
LOGGER.debug('%s: no death place available', identifier)


def gather_target_links(entity, catalog):
def gather_target_links(entity: str, catalog: str) -> Optional[list]:
LOGGER.info('Gathering %s %s links ...', catalog, entity)
link_entity = target_database.get_link_entity(catalog, entity)

Expand Down Expand Up @@ -335,6 +338,8 @@ def gather_target_links(entity, catalog):
)
return None
LOGGER.info('Got %d links from %s %s', count, catalog, entity)
# Slurp query result into a list:
# a generator here may break the DB connection
result = query.all()
session.commit()
except:
Expand All @@ -343,10 +348,7 @@ def gather_target_links(entity, catalog):
finally:
session.close()

if result is None:
return None
for row in result:
yield row.catalog_id, row.url
return result


def _get_catalog_entity(entity_type, catalog_constants):
Expand Down
10 changes: 4 additions & 6 deletions soweego/validator/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,11 +780,9 @@ def _bio_statements_generator(stmts_dict, for_catalogs=False):
yield tid, PID_PREFIX + pid, value, QID_PREFIX + qid


def _validate(
criterion, wd, target_generator, deprecate, add, reference, wd_only
):
def _validate(criterion, wd, target_data, deprecate, add, reference, wd_only):
LOGGER.info('Starting check against target %s ...', criterion)
target = _consume_target_generator(target_generator)
target = _prepare_target(target_data)

# Large loop size: total Wikidata class instances with identifiers,
# e.g., 80k musicians
Expand Down Expand Up @@ -1063,9 +1061,9 @@ def _dump_csv_output(data, out_path, log_msg_subject):
LOGGER.info("No %s, won't dump to file", log_msg_subject)


def _consume_target_generator(target_generator):
def _prepare_target(dataset):
target = defaultdict(set)
for identifier, *data in target_generator:
for identifier, *data in dataset:
if len(data) == 1: # Links
target[identifier].add(data.pop())
else: # Biographical data
Expand Down

0 comments on commit 878e48a

Please sign in to comment.