
Commit

Merge pull request #441 from Wikidata/dev
URL-decode IDs & claims deletion
marfox committed Oct 5, 2021
2 parents 3e3c6a0 + f09a6ef commit 017dfcb
Showing 2 changed files with 83 additions and 0 deletions.
73 changes: 73 additions & 0 deletions scripts/delete_claims.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Delete claims made by the soweego bot.
The required input comes from a SPARQL query like:
SELECT DISTINCT ?stmt WHERE {
?item p:P6262 ?stmt .
?stmt prov:wasDerivedFrom ?ref .
?ref pr:P887 wd:Q1266546 ;
pr:P248 wd:Q14005 .
}
Just replace the PID in `p:P6262` (Fandom article ID) with the relevant one,
and the QID in `wd:Q14005` (MusicBrainz) with the target catalog.
N.B.: we look for (based on heuristic, record linkage), (stated in, catalog)
references
"""

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
__version__ = '2.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2021, Hjfocs'

import sys

import requests

WIKIDATA_API_URL = 'https://www.wikidata.org/w/api.php'
STMT_PREFIX = 'http://www.wikidata.org/entity/statement/'


def main(args):
    if len(args) != 3:
        print(f'Usage: python {__file__} GUIDS_CSV EDIT_SUMMARY')
        return 1

    file_in, summary = args[1], args[2]
    guids = set()

    with open(file_in) as fin:
        for line in fin:
            line = line.rstrip()
            # Strip the statement URI prefix
            # (don't use `lstrip` here: it strips a character set, not a prefix)
            if line.startswith(STMT_PREFIX):
                line = line[len(STMT_PREFIX):]
            # Statement URIs don't have the dollar
            guid = line.replace('-', '$', 1)
            guids.add(guid)
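    # A sketch of the expected transformation (hypothetical statement URI):
    #   http://www.wikidata.org/entity/statement/Q42-8A2F...
    # becomes the GUID
    #   Q42$8A2F...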

    session = requests.Session()

    # Get edit token
    params = {'action': 'query', 'meta': 'tokens', 'format': 'json'}
    r = session.get(WIKIDATA_API_URL, params=params)
    token = r.json()['query']['tokens']['csrftoken']
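    # N.B.: the `bot` flag in the POST below is only honored for a session
    # authenticated as a bot account; this script assumes such a session
    # (e.g., cookies from a prior login)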

    # Fire a POST for each GUID
    for guid in guids:
        data = {
            'action': 'wbremoveclaims', 'format': 'json', 'token': token,
            'bot': True, 'claim': guid, 'summary': summary
        }
        r = session.post(WIKIDATA_API_URL, data=data)

        if r.ok:
            print(r.json())

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
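
For context, one way to produce GUIDS_CSV is to run the docstring's SPARQL query against the Wikidata Query Service and dump one statement URI per line. A minimal sketch, assuming WDQS access; the output file name is illustrative:

#!/usr/bin/env python3
"""Sketch: dump statement URIs for delete_claims.py."""
import requests

WDQS_ENDPOINT = 'https://query.wikidata.org/sparql'
QUERY = '''
SELECT DISTINCT ?stmt WHERE {
  ?item p:P6262 ?stmt .
  ?stmt prov:wasDerivedFrom ?ref .
  ?ref pr:P887 wd:Q1266546 ;
       pr:P248 wd:Q14005 .
}
'''

r = requests.get(
    WDQS_ENDPOINT,
    params={'query': QUERY},
    headers={'Accept': 'application/sparql-results+json'},
)
r.raise_for_status()

# One statement URI per line, as expected by delete_claims.py
with open('guids.csv', 'w') as fout:
    for binding in r.json()['results']['bindings']:
        fout.write(binding['stmt']['value'] + '\n')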

10 changes: 10 additions & 0 deletions soweego/commons/data_gathering.py
@@ -15,6 +15,7 @@
import re
from collections import defaultdict
from typing import Iterable, Iterator, Optional
from urllib.parse import unquote

import regex
from sqlalchemy import or_
@@ -532,6 +533,15 @@ def extract_ids_from_urls(to_be_added, ext_id_pids_to_urls):
        url, ext_id_pids_to_urls
    )
    if ext_id is not None:
        # Percent-decode IDs
        if '%' in ext_id:
            try:
                ext_id = unquote(ext_id, errors='strict')
            except UnicodeDecodeError:
                LOGGER.warning(
                    'Skipping invalid percent-encoded ID: %s', ext_id
                )
                continue
        ext_ids_to_add.append((qid, pid, ext_id, tid,))
    else:
        urls_to_add.append((qid, vocabulary.EXACT_MATCH, url, tid,))
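For intuition on the new branch above: `unquote(..., errors='strict')` decodes percent-escapes as UTF-8 and raises `UnicodeDecodeError` when the resulting bytes are not valid UTF-8, which is exactly the case the `except` clause skips. A minimal sketch with made-up IDs:

from urllib.parse import unquote

# Valid UTF-8 percent-encoding decodes cleanly
print(unquote('Mot%C3%B6rhead', errors='strict'))  # Motörhead

# %E9 alone (a Latin-1-style escape) is not valid UTF-8: strict mode raises
try:
    unquote('Caf%E9', errors='strict')
except UnicodeDecodeError:
    print('skipped invalid percent-encoded ID')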
