Skip to content

Commit

Permalink
Prevent uploading birth/death year when day date exists (#28)
Browse files Browse the repository at this point in the history
Introduce a check for datetime values in birth/death date statements to detect whether the new value is a less precise version of the old value, in which case it will be treated as a duplicate and not uploaded.

Note that if the two values are actually different, like 1988-02-02 and 1989, they will not be treated as duplicates, as they are not different versions of the same datetime.

Task: https://phabricator.wikimedia.org/T202400
  • Loading branch information
Vesihiisi committed Sep 13, 2018
1 parent 0418774 commit 9695dff
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 5 deletions.
2 changes: 1 addition & 1 deletion importer/Person.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def __init__(self, raw_data, repository, data_files, existing):
self.set_nationality()
self.set_labels()
self.set_ids()
# self.set_lifespan()
self.set_lifespan()
# self.set_surname()
# self.set_first_name()
# self.set_descriptions()
37 changes: 33 additions & 4 deletions importer/Uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,43 @@ def add_descriptions(self, target_item, descriptions):
self.wdstuff.add_multiple_descriptions(
descriptions_for_upload, target_item)

def is_redundant_date(self, claim, wd_item):
    """
    Check if a birth/death date claim is redundant.

    If the target WD item already has a birth/death date that is
    more precise than the one being uploaded, and both describe
    the same point in time, the new date adds nothing and should
    not be uploaded. E.g.
      item has 1999-12-01 → do not upload 1999 or 1999-12,
                            same point in time, lower precision;
                            do upload 1998 or 1999-11,
                            different point in time.

    :param claim: dict with at least "prop" (property ID) and
        "value"; for date properties the new timestamp is read
        from ``value.itis``.
    :param wd_item: target item; only its ID is used here.
    :return: True if the claim should be skipped, False otherwise.
    """
    prop = claim["prop"]
    value = claim["value"]
    if prop not in (PROPS["born"], PROPS["dead"]):
        return False

    new_date = value.itis
    # Wikibase time precision: 9 = year, 10 = month, 11 = day.
    # Compare every component the new value actually specifies, so
    # that 1999-11 is not mistaken for a coarser form of 1999-12-01.
    compared_parts = ["year"]
    if new_date.precision >= 10:
        compared_parts.append("month")
    if new_date.precision >= 11:
        compared_parts.append("day")

    # Let's check if the target item already has one...
    dates_in_item = utils.get_value_of_property(wd_item.getID(),
                                                prop, self.repo)
    for date in dates_in_item:
        if date is None:
            # "unknown value" / "no value" statements have no target.
            continue
        if date.precision <= new_date.precision:
            continue
        if all(getattr(date, part) == getattr(new_date, part)
               for part in compared_parts):
            print("Avoiding duplicate timestamp.")
            return True
    return False

def add_claims(self, wd_item, claims):
    """
    Add the given claims to a Wikidata item.

    Each claim is a dict with the keys "prop", "value" and "ref".
    A claim is only uploaded if is_redundant_date() does not flag
    it as a less precise duplicate of a date already on the item.
    Nothing is done when wd_item is falsy.

    :param wd_item: target item, or a falsy value to do nothing.
    :param claims: iterable of claim dicts.
    """
    if not wd_item:
        return
    for claim in claims:
        # Called once per claim, as before.
        # NOTE(review): presumably refreshes the item after the
        # previous upload — confirm.
        wd_item.get()
        # Upload exactly once, and only behind the redundancy check;
        # an unconditional upload here would defeat the check.
        if not self.is_redundant_date(claim, wd_item):
            self.wdstuff.addNewClaim(claim["prop"],
                                     claim["value"],
                                     wd_item,
                                     claim["ref"])

def create_new_item(self):
    """Create an item with empty initial data, using self.summary."""
    initial_data = {}
    new_item = self.wdstuff.make_new_item(initial_data, self.summary)
    return new_item
Expand Down
12 changes: 12 additions & 0 deletions importer/importer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,15 @@ def date_to_dict(datestring, dateformat):
def format_isni(st):
    """Format ISNI id by inserting space after every 4 chars.

    Whitespace already present in the input is dropped before
    grouping, so an ISNI that is already formatted comes back
    unchanged instead of being split at the wrong positions.
    """
    compact = ''.join(st.split())
    return ' '.join(compact[i:i + 4] for i in range(0, len(compact), 4))


def get_value_of_property(q_number, property_id, site):
    """
    Return the targets of all statements of a property on an item.

    Targets that are ItemPage objects are returned as their IDs;
    any other target is returned as is. An empty list is returned
    if the item does not exist or has no statements of the property.

    :param q_number: ID of the item to look up.
    :param property_id: ID of the property whose values are wanted.
    :param site: repository passed on to pywikibot.ItemPage.
    """
    page = pywikibot.ItemPage(site, q_number)
    if not page.exists():
        return []
    statements = page.claims.get(property_id)
    if not statements:
        return []
    targets = (statement.getTarget() for statement in statements)
    return [
        target.getID() if isinstance(target, pywikibot.ItemPage) else target
        for target in targets
    ]

0 comments on commit 9695dff

Please sign in to comment.