New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Austria monuments #64
Changes from 20 commits
ae14f15
f2d8170
9529fae
db01120
f999a84
7941bb3
86bc14f
5fa297c
4a79e5d
0867a96
c870293
ed204e3
2a64a8d
4963bf3
f7b6029
944567a
6bda01a
ddbda24
11b8cbd
84fbce5
0663c6f
b37c754
c6171b0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
from Monument import Monument | ||
import importer_utils as utils | ||
|
||
|
||
MAPPING_DIR = "mappings" | ||
|
||
|
||
class AtDe(Monument): | ||
|
||
def get_type_keyword(self): | ||
""" | ||
Extract type of monument from name. | ||
|
||
Does matching of name against list of monument | ||
types from | ||
https://www.wikidata.org/wiki/Wikidata:WikiProject_WLM/Mapping_tables/at_(de)/types | ||
""" | ||
raw_name = self.name.lower() | ||
types = self.data_files["types"]["mappings"] | ||
keywords = list(types.keys()) | ||
keywords = [x.lower() for x in keywords] | ||
return utils.get_longest_match(raw_name, keywords) | ||
|
||
def set_type(self): | ||
""" | ||
Set specific P31, if possible. | ||
|
||
Default P31 is cultural property. | ||
If extracting a type keyword succeeded, | ||
use the more specific item as P31. | ||
""" | ||
types = self.data_files["types"]["mappings"] | ||
type_match = self.get_type_keyword() | ||
if type_match: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if if len(type_match) == 1:
do stuff
else:
self.add_to_report("type", self.name, "is") since you are not making a reporting distinction between none and many matches |
||
if isinstance(type_match, list): | ||
# multiple matches - report | ||
self.add_to_report("type", self.name, "is") | ||
else: | ||
# single match - add | ||
match_item = [types[x]["items"] | ||
for x in types if x.lower() == type_match][0][0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. separate list comprehension from getting to the contents (i.e. the What is the format of types? Based on this line I'm guessing something like: {
"krankenhaus": {
items: ["Q16917"]
}
} if so why is "items" a list (rather than |
||
self.substitute_statement("is", match_item) | ||
else: | ||
# no matches - report | ||
self.add_to_report("type", self.name, "is") | ||
|
||
def update_labels(self): | ||
""" | ||
Set the labels. | ||
|
||
The data does not contain any markup, | ||
so we're adding them as is. | ||
""" | ||
self.add_label("de", self.name) | ||
|
||
def set_descriptions(self): | ||
""" | ||
Set descriptions in several languages. | ||
|
||
Base format: $type in $country | ||
If specific type in German exists, substitute for $type. | ||
If municipality name exists, insert in all languages | ||
""" | ||
type_match = self.get_type_keyword() | ||
municipality_name = self.get_municipality_name() | ||
base_desc_german_english = "{} in {}" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe do these as small dicts instead base_desc = {
"en": "{} in {}",
"de": "{} in {}",
"sv": "{} i {}"
}
place_desc = {
"en": "Austria",
"de": "Österreich",
"sv": "Österike"
}
type_desc = {
"en": "cultural property",
"de": "Denkmalgeschütztes Objekt",
"sv": "kulturarv"
} That way you should quite easily be able to loop through them in the end. |
||
base_desc_swedish = "{} i {}" | ||
place_german = "Österreich" | ||
place_english = "Austria" | ||
place_swedish = "Österike" | ||
type_german = "Denkmalgeschütztes Objekt" | ||
type_english = "cultural property" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where do the English and Swedish names come from? Wondering since I would expect the English one to be "cultural heritage property" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: rm english and swedish. |
||
type_swedish = "kulturarv" | ||
|
||
if isinstance(type_match, str): | ||
type_german = type_match.title() | ||
if municipality_name: | ||
place_german = municipality_name | ||
place_swedish = "{}, {}".format(municipality_name, place_swedish) | ||
place_english = "{}, {}".format(municipality_name, place_english) | ||
desc_german = base_desc_german_english.format( | ||
type_german, place_german) | ||
desc_english = base_desc_german_english.format( | ||
type_english, place_english) | ||
desc_swedish = base_desc_swedish.format(type_swedish, place_swedish) | ||
|
||
self.add_description("de", desc_german) | ||
self.add_description("en", desc_english) | ||
self.add_description("sv", desc_swedish) | ||
|
||
def set_street_address(self): | ||
""" | ||
Set the street address. | ||
|
||
Most of these look good. Patterns for invalid ones: | ||
* 251, gegenüber | ||
Filtering by "starts with digit" and "contains comma", | ||
as we can't rely only on "starts with digit" because | ||
10. Oktober-Straße 18 is legit. | ||
* zwischen Entensteinstraße 36 u. 38 | ||
Filtering by "zwischen". | ||
* 76 | ||
Filtering by containing only digits. | ||
""" | ||
bad_address = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it be clearer to use |
||
if self.has_non_empty_attribute("adresse"): | ||
if (utils.first_char_is_number(self.adresse) and | ||
"," in self.adresse): | ||
bad_address = self.adresse | ||
elif "zwischen" in self.adresse: | ||
bad_address = self.adresse | ||
elif self.adresse.isdigit(): | ||
bad_address = self.adresse | ||
else: | ||
address = self.adresse | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it be clearer to have this as the Also not sure |
||
self.add_statement( | ||
"located_street", address) | ||
if bad_address: | ||
self.add_to_report("adresse", self.adresse, "located_street") | ||
|
||
def set_heritage_id(self): | ||
"""Set the heritage ID, P2951.""" | ||
self.add_statement("austria_heritage_id", self.objektid) | ||
|
||
def set_monuments_all_id(self): | ||
"""Map which column in table to ID in monuments_all.""" | ||
self.monuments_all_id = self.objektid | ||
|
||
def get_municipality_name(self): | ||
"""Get municipality name based on gemeindekennzahl.""" | ||
all_codes = self.data_files["municipalities"] | ||
municip_code = str(self.gemeindekennzahl) | ||
match = [x["itemLabel"] | ||
for x in all_codes if x["value"] == municip_code] | ||
if len(match) == 1: | ||
return match[0] | ||
|
||
def set_adm_location(self): | ||
""" | ||
Set the administrative location (municipality). | ||
|
||
Using the gemeindekennzahl field, municipality ID, | ||
mapped to municipality WD items in external file. | ||
""" | ||
all_codes = self.data_files["municipalities"] | ||
if self.has_non_empty_attribute("gemeindekennzahl"): | ||
municip_code = str(self.gemeindekennzahl) | ||
match = [x["item"] | ||
for x in all_codes if x["value"] == municip_code] | ||
if len(match) == 0: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it safer to do if len(match) == 1:
add stuff
else:
report |
||
self.add_to_report("gemeindekennzahl", | ||
municip_code, "located_adm") | ||
else: | ||
self.add_statement("located_adm", match[0]) | ||
|
||
def exists_with_monument_article(self, language): | ||
return super().exists_with_monument_article("de", "artikel") | ||
|
||
def __init__(self, db_row_dict, mapping, data_files, existing, repository): | ||
Monument.__init__(self, db_row_dict, mapping, | ||
data_files, existing, repository) | ||
self.set_monuments_all_id() | ||
self.set_changed() | ||
self.wlm_source = self.create_wlm_source(self.monuments_all_id) | ||
self.update_labels() | ||
self.set_descriptions() | ||
self.set_is() | ||
self.set_type() | ||
self.set_country() | ||
self.set_image("foto") | ||
self.set_heritage() | ||
self.set_heritage_id() | ||
self.set_adm_location() | ||
self.set_street_address() | ||
self.set_coords(("lat", "lon")) | ||
self.set_commonscat() | ||
self.set_wd_item(self.find_matching_wikidata(mapping)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from CmFr import CmFr | ||
from CzCs import CzCs | ||
from AtDe import AtDe | ||
from DkBygningDa import DkBygningDa | ||
from DkFortidsDa import DkFortidsDa | ||
from EeEt import EeEt | ||
|
@@ -85,9 +86,12 @@ def create_connection(arguments): | |
"monuments_pt_(pt)": {"class": PtPt, "data_files": {}}, | ||
"monuments_ro_(ro)": {"class": RoRo, "data_files": {}}, | ||
"monuments_xk_(sq)": {"class": XkSq, "data_files": {}}, | ||
"monuments_ie_(en)": {"class": IeEn, "data_files": {"counties": "ireland_counties.json"}}, | ||
"monuments_ie_(en)": {"class": IeEn, "data_files":{"counties": "ireland_counties.json"}}, | ||
"monuments_za_(en)": {"class": ZaEn, "data_files": {}}, | ||
"monuments_cm_(fr)": {"class": CmFr, "data_files": {}}, | ||
"monuments_at_(de)": {"class": AtDe, "data_files": {"municipalities": | ||
"austria_municipalities.json", | ||
"types": "at_(de)_types.json"}}, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or changed to use "lookup_downloads" |
||
"monuments_dk-bygninger_(da)": {"class": DkBygningDa, | ||
"data_files": {}, | ||
"subclass_downloads": {"settlement": "Q486972"}}, | ||
|
@@ -102,7 +106,7 @@ def create_connection(arguments): | |
"data_files": {}}, | ||
"monuments_se-bbr_(sv)": {"class": SeBbrSv, | ||
"data_files": {"functions": "se-bbr_(sv)_functions.json", | ||
"settlements": "sweden_settlements.json"}}, | ||
"settlements": "sweden_settlements.json"}}, | ||
"monuments_ee_(et)": {"class": EeEt, | ||
"data_files": {"counties": "estonia_counties.json"}}, | ||
"monuments_se-fornmin_(sv)": | ||
|
@@ -256,7 +260,8 @@ def get_items(connection, | |
problem_reports = [] | ||
wikidata_site = utils.create_site_instance("wikidata", "wikidata") | ||
for row in database_rows: | ||
monument = class_to_use(row, mapping, data_files, existing, wikidata_site) | ||
monument = class_to_use(row, mapping, data_files, | ||
existing, wikidata_site) | ||
problem_report = monument.get_report() | ||
if table: | ||
raw_data = "<pre>" + str(row) + "</pre>\n" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -207,6 +207,44 @@ def parse_year(text): | |
return year | ||
|
||
|
||
def get_longest_string(in_list): | ||
""" | ||
Get the longest string(s) in a list. | ||
|
||
:param in_list: list of strings | ||
:return: single string if there's only one with the max length, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wouldn't it be easier to always return a list with either zero, one or many entries? That way you could always make iterate over the results (or use |
||
or a list of strings if there are several. | ||
""" | ||
if len(in_list) == 0: | ||
return None | ||
max_length = max(len(x) for x in in_list) | ||
matches = [x for x in in_list if len(x) == max_length] | ||
if len(matches) == 1: | ||
return matches[0] | ||
else: | ||
return matches | ||
|
||
|
||
def get_longest_match(word, keywords): | ||
""" | ||
Given a list of keywords, get longest keyword that overlaps with input. | ||
|
||
A naive attempt to match words in languages that use | ||
compound nouns written together. Given a string and a list of | ||
keywords, return the longest of these keywords that's | ||
contained in the input string. That way, if the keyword list | ||
contains both a simple word ("bro") and its compound ("järnvägsbro"), | ||
we only get the more specific one: | ||
* "götaälvsbron" -> "bro" | ||
* "en stor järnvägsbro" -> "järnvägsbro" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. define the parameters |
||
""" | ||
matches = [] | ||
for k in keywords: | ||
if k in word: | ||
matches.append(k) | ||
return get_longest_string(matches) | ||
|
||
|
||
def remove_characters(text, string_of_chars_to_remove): | ||
translator = str.maketrans( | ||
{key: None for key in string_of_chars_to_remove}) | ||
|
@@ -265,6 +303,11 @@ def last_char_is_vowel(text): | |
return is_vowel(get_last_char(text)) | ||
|
||
|
||
def first_char_is_number(text): | ||
"""Check if string starts with a number.""" | ||
return text[0].isdigit() | ||
|
||
|
||
def socken_to_q(socken, landskap): | ||
if last_char_is_vowel(socken) or get_last_char(socken) == "s": | ||
socken_name = socken + " socken" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
{ | ||
"country": { | ||
"item": "Q40", | ||
"name": "Austria" | ||
}, | ||
"country_code": "at", | ||
"default_is": { | ||
"item": "Q2065736", | ||
"name": "cultural property" | ||
}, | ||
"heritage": { | ||
"item": "Q1188447", | ||
"name": "Denkmalgeschutztes Objekt" | ||
}, | ||
"language": "de", | ||
"table_name": "at", | ||
"unique": { | ||
"comment": "Cultural heritage database in Austria ObjektID ", | ||
"property": "P2951" | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to convert keys() to a list since you can iterate over them anyway and the end result will give a list.
keywords = [x.lower() for x in types.keys()]