Run the line below (without the # symbol) to install pandas if not already installed

In [4]:
# !py -m pip install pandas

In [5]:
import requests
import csv
import os
import pandas as pd
import json
import re
import time
from datetime import datetime, timedelta
import math
import traceback

In [49]:
input_path = "C:\\Users\\khan32\\Documents\\factgrid_py\\test"
# input_path = "."

In [50]:
output_path = input_path

In [8]:
place_name = "Bamberg"

In [9]:
today_string = datetime.now().strftime('%Y-%m-%d')

In [10]:
# SELECT * FROM `role` r INNER JOIN url_external u on r.id = u.item_id where u.authority_id = 42 

Export the results from the query from https://vwebfile.gwdg.de/phpmyadmin
```sql
SELECT * FROM role r 
LEFT JOIN (
    select * from url_external where authority_id = 42
) u on r.id = u.item_id;
```  
from the main wiag database in the csv format.

Rename it to include the date. An example filename would be `role_2024_04_24.csv`

In [11]:
input_file = f"role_2024_06_10.csv"
input_path_file = os.path.join(input_path, input_file)
wiag_roles_df = pd.read_csv(input_path_file, names=['id', 'note', 'name', 'comment', 'gs_reg_id', 'generic_term', 'plural', 'definition', 'role_gp_fg_id', 'gender', 'lang', 'role_group_id','r_id','item_id','r_note','authority_id','role_fg_id','r_comment'])
len(wiag_roles_df)

263

### Download data from wiag
https://wiag-vokabulare.uni-goettingen.de/query/can

In [12]:
input_file = f"WIAG-Domherren-DB-Ämter_2024-06-10.csv"
input_path_file = os.path.join(input_path, input_file)
role_all_df = pd.read_csv(input_path_file, sep=';')
len(role_all_df)

2827

In [13]:
last_modified = datetime.fromtimestamp(os.path.getmtime(input_file))
now = datetime.now()
assert last_modified.day == now.day and last_modified.month == now.month, f"The file was last updated on {last_modified.strftime('%d.%m')}"

### ERROR: If you get an error when you run the line above this means that the file was not updated today. 
A few solutions: 
* update the file again by downloading it again
* change the file name to something correct
* (not recommended) continue if you are sure that you need to use old data.

In [14]:
len(role_all_df)

2827

In [15]:
role_all_df.head()

Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,date_begin,date_end,date_sort_key,GND,GSN,FactGrid
0,WIAG-Pers-CANON-19874-001,19230,Dompropst,Leitungsamt Domstift,Dignitary of a cathedral chapter,Q648232,Domstift Lübeck,676.0,,,1160,1177.0,1160150,,,Q727495
1,WIAG-Pers-CANON-19937-001,21185,Domdekan,Leitungsamt Domstift,Dignitary of a cathedral chapter,Q648232,Domstift Lübeck,676.0,,,1163,1177.0,1163150,,,Q727662
2,WIAG-Pers-CANON-18893-001,22200,Domkustos,Leitungsamt Domstift,Dignitary of a cathedral chapter,Q648232,Domstift Lübeck,676.0,,,1170,1177.0,1170150,,,Q725483
3,WIAG-Pers-CANON-18987-001,20018,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Lübeck,676.0,,,1170,,1170150,,,Q725674
4,WIAG-Pers-CANON-18979-001,20953,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Lübeck,676.0,,,1170,,1170150,,,Q725658


# Download data from factgrid

If any of the following requests to factgrid fail, try re running the cells.

In [86]:
url = 'https://database.factgrid.de/sparql'
query = (
    """SELECT ?item ?gsn WHERE {
  ?item wdt:P471 ?gsn
}
"""
)
# SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

# make request: 
r = requests.get(url, params={'query': query}, headers={"Accept": "application/json"})
data = r.json()
factgrid_institution_df = pd.json_normalize(data['results']['bindings'])

len(factgrid_institution_df)

272

In [87]:
url = 'https://database.factgrid.de/sparql'
query = (
"""
SELECT ?item ?wiagid ?label ?alternative WHERE {
  ?item wdt:P2/wdt:P3* wd:Q164535.
  #?item schema:description ?itemDesc.
  ?item rdfs:label ?label.
  OPTIONAL {?item schema:description ?itemDesc.}
  OPTIONAL {?item skos:altLabel ?alternative. }
  OPTIONAL {?item wdt:P601 ?wiagid.}
  FILTER(LANG(?label) in ("en", "de"))
}
"""
)


# version 2
# SELECT ?item ?wiagid (group_concat(DISTINCT ?label; separator=',') as ?labels) (group_concat(DISTINCT ?itemDesc; separator=',') as ?itemDescs) (group_concat(DISTINCT ?alternative ; separator=',') as ?alternatives) WHERE {
#   ?item wdt:P2/wdt:P3* wd:Q164535.
#   ?item schema:description ?itemDesc.
#   ?item rdfs:label ?label.
#   OPTIONAL {?item schema:description ?itemDesc.}
#   OPTIONAL {?item skos:altLabel ?alternative. }
#   OPTIONAL {?item wdt:P601 ?wiagid.}
#   FILTER(LANG(?label) in ("en", "de"))
# }
# GROUP BY ?item ?wiagid

# SELECT ?item ?wiagid WHERE {
#   ?item wdt:P2/wdt:P3* wd:Q164535.
#   ?item wdt:P601 ?wiagid
# }
# SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

# make request: 
r = requests.get(url, params={'query': query}, headers={"Accept": "application/json"})
data = r.json()
factgrid_diocese_df = pd.json_normalize(data['results']['bindings'])

len(factgrid_diocese_df)

1280

In [88]:
url = 'https://database.factgrid.de/sparql'
query = (
"""
SELECT ?item ?label WHERE {
  ?item wdt:P2 wd:Q257052.
  ?item rdfs:label ?label.
  FILTER(LANG(?label) in ("de"))
}
"""
)

r = requests.get(url, params={'query': query}, headers={"Accept": "application/json"})
data = r.json()
factgrid_inst_roles_df = pd.json_normalize(data['results']['bindings'])

len(factgrid_inst_roles_df)

4354

# Clean Factgrid data

In [None]:
# extract out q id
def extract_qid(df, column):
    df[column] = df[column].map(lambda x: x.strip('https://database.factgrid.de/entity/'))
 
#factgrid_df['item.value'] = factgrid_df['item.value'].map(lambda x: x.strip('https://database.factgrid.de/entity/'))

# drop irrelevant columns
def drop_type_columns(df):
    df.drop(columns=[column for column in df.columns if column.endswith('type')], inplace=True)
    df.drop(columns=[column for column in df.columns if column.endswith('xml:lang')], inplace=True)

In [83]:
extract_qid(factgrid_institution_df, 'item.value')
extract_qid(factgrid_diocese_df, 'item.value')
extract_qid(factgrid_inst_roles_df, 'item.value')

KeyError: 'item.value'

In [None]:
drop_type_columns(factgrid_institution_df)
drop_type_columns(factgrid_diocese_df)
drop_type_columns(factgrid_inst_roles_df)

In [None]:
# rename columns
factgrid_institution_df.columns = ['fg_institution_id', 'fg_gsn_id']
factgrid_diocese_df.columns = ["fg_diocese_id", "dioc_label", "dioc_alt", "dioc_wiag_id"]
factgrid_inst_roles_df.columns = ["fg_inst_role_id", "inst_role"]

In [84]:
# clean the diocese alts by removing BITECA and BETA entries 
factgrid_diocese_df['dioc_alt'] = factgrid_diocese_df['dioc_alt'].replace(['^BITECA.*', '^BETA.*'], '', regex=True)
factgrid_diocese_df
# set(factgrid_diocese_df['fg_alts'])

Unnamed: 0,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id
0,Q153261,Bistum Metz,,WIAG-Inst-DIOCGatz-062-001
1,Q153261,Diocese of Metz,,WIAG-Inst-DIOCGatz-062-001
2,Q153261,Bistum Metz,,WIAG-Inst-DIOCGatz-062-001
3,Q153261,Diocese of Metz,,WIAG-Inst-DIOCGatz-062-001
4,Q153264,Bistum Straßburg,Diözese Straßburg,WIAG-Inst-DIOCGatz-068-001
...,...,...,...,...
1275,Q395389,Archdiocese of Agrigento,Bistum Agrigent,
1276,Q395389,Erzbistum Agrigent,Diözese Agrigent,
1277,Q395389,Archdiocese of Agrigento,Diözese Agrigent,
1278,Q395390,Erzbistum Catania,,


In [85]:
factgrid_institution_df['fg_gsn_id'] = pd.to_numeric(factgrid_institution_df['fg_gsn_id'], downcast='float')

KeyError: 'fg_gsn_id'

In [25]:
fg_gp = factgrid_institution_df.groupby('fg_gsn_id').count()
duplicate_fg_entries = factgrid_institution_df[factgrid_institution_df['fg_gsn_id'].isin(list(fg_gp[fg_gp['fg_institution_id'] > 1].index))]
duplicate_fg_entries.head()

Unnamed: 0,fg_institution_id,fg_gsn_id


In [26]:
assert duplicate_fg_entries.empty, f"There are possible duplicates on factgrid\n.{duplicate_fg_entries}"
factgrid_institution_df = factgrid_institution_df[~factgrid_institution_df['fg_gsn_id'].isin(duplicate_fg_entries['fg_gsn_id'].to_list())]

In [27]:
# factgrid_institution_df.drop_duplicates(['fg_gsn_id'], inplace=True)

In [28]:
factgrid_institution_df.dtypes

fg_institution_id     object
fg_gsn_id            float32
dtype: object

In [29]:
factgrid_diocese_df.dtypes

fg_diocese_id    object
dioc_label       object
dioc_alt         object
dioc_wiag_id     object
dtype: object

In [30]:
role_all_df.dtypes

person_id            object
id                    int64
name                 object
role_group           object
role_group_en        object
role_group_fq_id     object
institution          object
institution_id      float64
diocese              object
diocese_id           object
date_begin           object
date_end             object
date_sort_key         int64
GND                  object
GSN                  object
FactGrid             object
dtype: object

In [31]:
# TODO: Find a way to query amter data directly from wiag
# url = 'https://wiag-vocab.adw-goe.de/domherr/data'
# r = requests.get(url, params={'domstift': place_name})
# data = r.json()
# person_import_df = pd.json_normalize(data['persons'])

# print(len(person_import_df))
# person_import_df.head()

In [81]:
factgrid_institution_df.head()

Unnamed: 0,item.type,item.value,gsn.type,gsn.value
0,uri,https://database.factgrid.de/entity/Q422286,literal,114
1,uri,https://database.factgrid.de/entity/Q470546,literal,119
2,uri,https://database.factgrid.de/entity/Q633292,literal,120
3,uri,https://database.factgrid.de/entity/Q633339,literal,131
4,uri,https://database.factgrid.de/entity/Q633346,literal,139


In [33]:
factgrid_diocese_df.head()

Unnamed: 0,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id
0,Q153261,Bistum Metz,,WIAG-Inst-DIOCGatz-062-001
1,Q153261,Diocese of Metz,,WIAG-Inst-DIOCGatz-062-001
2,Q153261,Bistum Metz,,WIAG-Inst-DIOCGatz-062-001
3,Q153261,Diocese of Metz,,WIAG-Inst-DIOCGatz-062-001
4,Q153264,Bistum Straßburg,Diözese Straßburg,WIAG-Inst-DIOCGatz-068-001


# Check for missing data

## Check for missing institutions

In [34]:
missing_institution_on_factgrid_df = role_all_df.merge(
    factgrid_institution_df, indicator = True, how='left', left_on='institution_id', right_on='fg_gsn_id', suffixes=('_wiag', '_institute_fg')
).loc[lambda x : x['_merge']!='both']

In [35]:
print(len(missing_institution_on_factgrid_df))
missing_institution_on_factgrid_df.head()

685


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,date_begin,date_end,date_sort_key,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,_merge
29,WIAG-Pers-EPISCGatz-03024-001,80808,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Lübeck,WIAG-Inst-DIOCGatz-014-001,1210,1230.0,1210150,,,Q652312,,,left_only
49,WIAG-Pers-CANON-19889-001,9303,Kanoniker,Amt Stift,Affiliate of a collegiate church or convent,Q648228,"Kollegiatstift St. Peter, Bardowick",324.0,,,1219,,1219150,,,Q727542,,,left_only
65,WIAG-Pers-CANON-19903-001,70200,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Verden,3487.0,,,1229,1231.0,1229150,100989101.0,,Q727571,,,left_only
77,WIAG-Pers-EPISCGatz-03475-001,80170,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Schwerin,WIAG-Inst-DIOCGatz-028-001,1240,1247.0,1240150,,,Q652425,,,left_only
82,WIAG-Pers-EPISCGatz-03232-001,98649,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Olmütz,WIAG-Inst-DIOCGatz-064-001,1245,1281.0,1245150,104280050.0,060-00594-001,Q390757,,,left_only


In [36]:
# missing_institution_on_factgrid_df[['institution', 'institution_id', 'diocese', 'diocese_id']]

## Create a csv file with persons having no institution_id, diocese_id and diocese

Note: the file generated by this cell is only relevant in case of incorrect entries on wiag.

If this cell throws an error, please check the file `simple_null_entries_amter-{today'sd date}.csv` and check the records on wiag using the person_id from there.

In [56]:
missing_institution_diocese_on_factgrid_df = missing_institution_on_factgrid_df.merge(
    factgrid_diocese_df, indicator = '_second_merge', how='left', left_on='diocese_id', right_on='dioc_wiag_id', suffixes=('_wiag', '_dioc_fg')
).loc[lambda x : x['_second_merge']!='both']
missing_institution_diocese_on_factgrid_df.drop(['_merge','_second_merge'], axis=1, inplace=True)
null_entries = missing_institution_diocese_on_factgrid_df[missing_institution_diocese_on_factgrid_df[['institution_id', 'diocese_id', 'diocese']].isna().all(axis=1)]
null_entries = null_entries[null_entries['name'] != 'Kardinal'].sort_values('person_id')

# 4312 to 7979 should be removed
null_entries = null_entries[null_entries.person_id.str.contains('[0-7][0-9]{4}-[0-9]{3}$', regex=True)]
if not null_entries.empty:
    null_entries.to_csv(os.path.join(output_path, f'simple_null_entries_amter-{today_string}.csv'), sep=';')
assert len(null_entries) == 0

## Add diocese information from factgrid

Diocese is matched by 3 queries between wiag and factgrid. They are applied in the following priority. If a higher priority query find a match, the following query does nothing:
* wiag id (wiag) => wiag id (factgrid)
* diocese name (wiag) => diocese label (factgrid)
* diocese name (wiag) => diocese alt label (factgrid)

In [57]:
# lookup for the diocese by the diocese_id, then diocese labels, then diocese alts
def join_dioceses(input_df):
    rows = []
    for _, row in input_df.iterrows():
        search_row = pd.Series({'fg_diocese_id':None, 'dioc_label':None, 'dioc_alt':None, 'dioc_wiag_id':None})

        # NOTE: in the following three code blocks only the query changes.
        # this could be refactored somehow without running the query when defining it
        if not search_row['fg_diocese_id']:
            query = factgrid_diocese_df[factgrid_diocese_df['dioc_wiag_id'] == row['diocese_id']]
            if not query.empty:
                search_row = query.head(1).squeeze(axis=0)
    
        if not search_row['fg_diocese_id']:
            query = factgrid_diocese_df[factgrid_diocese_df['dioc_label'] == row['diocese']]
            if not query.empty:
                search_row = query.head(1).squeeze(axis=0)
    
        if not search_row['fg_diocese_id']:
            query = factgrid_diocese_df[factgrid_diocese_df['dioc_alt'] == row['diocese']]
            if not query.empty:
                search_row = query.head(1).squeeze(axis=0)
        rows.append(row.combine_first(search_row))
    
    dioc_joined_df = pd.DataFrame(rows)
    dioc_joined_df = dioc_joined_df[[*input_df.columns, *factgrid_diocese_df.columns]]
    
    return dioc_joined_df
dioc_joined_df = join_dioceses(missing_institution_on_factgrid_df)
print(len(dioc_joined_df))
dioc_joined_df.head()

685


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,_merge,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id
29,WIAG-Pers-EPISCGatz-03024-001,80808,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Lübeck,WIAG-Inst-DIOCGatz-014-001,...,,,Q652312,,,left_only,Q153228,Bistum Lübeck,Diözese Lübeck,WIAG-Inst-DIOCGatz-014-001
49,WIAG-Pers-CANON-19889-001,9303,Kanoniker,Amt Stift,Affiliate of a collegiate church or convent,Q648228,"Kollegiatstift St. Peter, Bardowick",324.0,,,...,,,Q727542,,,left_only,,,,
65,WIAG-Pers-CANON-19903-001,70200,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Verden,3487.0,,,...,100989101.0,,Q727571,,,left_only,,,,
77,WIAG-Pers-EPISCGatz-03475-001,80170,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Schwerin,WIAG-Inst-DIOCGatz-028-001,...,,,Q652425,,,left_only,Q153242,Bistum Schwerin,Diözese Schwerin,WIAG-Inst-DIOCGatz-028-001
82,WIAG-Pers-EPISCGatz-03232-001,98649,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Olmütz,WIAG-Inst-DIOCGatz-064-001,...,104280050.0,060-00594-001,Q390757,,,left_only,Q153262,Bistum Olmütz,Diözese Olmütz,WIAG-Inst-DIOCGatz-064-001


In [58]:
missing_institution_diocese_on_factgrid_df = dioc_joined_df[dioc_joined_df['fg_diocese_id'].isna()]

In [59]:
# missing_institution_diocese_on_factgrid_df.drop(['_merge'], axis=1, inplace=True)

In [60]:
# missing_institution_diocese_on_factgrid_df[missing_institution_diocese_on_factgrid_df.person_id.str.contains('[89][0-9]{4}-[0-9]{3}$', regex=True)]

## Create a csv file with missing ids for institution or diocese

In [61]:
# null_entries = missing_institution_diocese_on_factgrid_df[missing_institution_diocese_on_factgrid_df['institution_id'].isnull()][missing_institution_diocese_on_factgrid_df['diocese_id'].isnull()]
null_entries = missing_institution_diocese_on_factgrid_df[missing_institution_diocese_on_factgrid_df['role_group'] != 'Kardinal'].sort_values('person_id')
null_entries = null_entries[null_entries['role_group'] != 'Kurienamt']
null_entries = null_entries[null_entries['role_group'] != 'Papst']
null_entries = null_entries.sort_values(['diocese', 'institution'])

# remove pr entries. why? TODO: ask barbara for confirmation
# null_entries = null_entries[null_entries.person_id.str.contains('[0-7][0-9]{4}-[0-9]{3}$', regex=True)]
null_entries[null_entries.columns[:-7]].to_csv(os.path.join(output_path, f'missing_amter-{today_string}.csv'), sep=';')
null_entries

Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,_merge,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id
1986,WIAG-Pers-EPISCGatz-05361-001,132324,Propst,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,"Augustinerchorherrenstift Neuwerk, Halle",3613.0,,,...,1149618302,,Q654377,,,left_only,,,,
952,WIAG-Pers-EPISCGatz-03223-001,109691,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,"Benediktiner-, dann Prämonstratenserabtei St. ...",7106.0,,,...,1062751981,305-00050-001,Q655092,,,left_only,,,,
1731,WIAG-Pers-CANON-19465-001,133498,"Vikar, Anwärter",Amt Stift,Affiliate of a collegiate church or convent,Q648228,"Benediktinerabtei St. Maria, Stade",891.0,,,...,,,Q726499,,,left_only,,,,
1431,WIAG-Pers-CANON-19513-001,18153,"Vikar, Anwärter",Amt Stift,Affiliate of a collegiate church or convent,Q648228,"Benediktinerabtei St. Maria, Stade",891.0,,,...,,,Q726609,,,left_only,,,,
982,WIAG-Pers-CANON-19943-001,100237,Providierter Kanoniker,Amt Stift,Affiliate of a collegiate church or convent,Q648228,"Benediktinerabtei, dann Kollegiatstift St. Ser...",3234.0,,,...,,,Q727685,,,left_only,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344,WIAG-Pers-CANON-19604-001,112601,Propst,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,"Kollegiatstift St. Willehad, Bremen",344.0,,,...,,,Q726822,,,left_only,,,,
2026,WIAG-Pers-EPISCGatz-20091-001,119117,Kanoniker,Amt Stift,Affiliate of a collegiate church or convent,Q648228,"Kollegiatstift St. Willehad, Bremen",344.0,,,...,137406495,,Q654018,,,left_only,,,,
1607,WIAG-Pers-CANON-19789-001,107165,Propst,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,"Minderstift St. Johannes Baptista, Lüneburg",845.0,,,...,,,Q727298,,,left_only,,,,
1229,WIAG-Pers-EPISCGatz-05574-001,100339,"Vikar, Anwärter",Amt Stift,Affiliate of a collegiate church or convent,Q648228,"Minderstift St. Johannes Baptista, Lüneburg",845.0,,,...,1150819278,305-00228-001,Q656098,,,left_only,,,,


# Create the missing institutions on factgrid here

In [74]:
# get all institutions with a kloster id 
missing_kloester_df = null_entries[
    ~(pd.isna(null_entries['institution_id']) |
    pd.isna(null_entries['institution']))
][null_entries.columns[:-7]]
len(missing_kloester_df)

474

Creates a file with the name institution_creation_<date>.csv

Fill in the empty columns of the file and then use the file on quickstatements.

In [75]:
create_institution_factgrid_df = missing_kloester_df[['institution', 'institution_id']].copy()
create_institution_factgrid_df.columns = ['Lde', 'P471']
create_institution_factgrid_df['P471'] = create_institution_factgrid_df['P471'].astype(int)
create_institution_factgrid_df.insert(0, column='qid', value='')
create_institution_factgrid_df.insert(2, column='Les', value='')
create_institution_factgrid_df.insert(2, column='Lfr', value='')
create_institution_factgrid_df.insert(2, column='Len', value='')
create_institution_factgrid_df.insert(6, column='Den', value='')
create_institution_factgrid_df.insert(6, column='Dde', value='')
create_institution_factgrid_df['P131'] = 'Q153178'
create_institution_factgrid_df.set_index('qid')
create_institution_factgrid_df.to_csv(os.path.join(output_path, f'institution_creation_{today_string}.csv'), index=False)
create_institution_factgrid_df

Unnamed: 0,qid,Lde,Len,Lfr,Les,P471,Dde,Den,P131
1986,,"Augustinerchorherrenstift Neuwerk, Halle",,,,3613,,,Q153178
952,,"Benediktiner-, dann Prämonstratenserabtei St. ...",,,,7106,,,Q153178
1731,,"Benediktinerabtei St. Maria, Stade",,,,891,,,Q153178
1431,,"Benediktinerabtei St. Maria, Stade",,,,891,,,Q153178
982,,"Benediktinerabtei, dann Kollegiatstift St. Ser...",,,,3234,,,Q153178
...,...,...,...,...,...,...,...,...,...
1344,,"Kollegiatstift St. Willehad, Bremen",,,,344,,,Q153178
2026,,"Kollegiatstift St. Willehad, Bremen",,,,344,,,Q153178
1607,,"Minderstift St. Johannes Baptista, Lüneburg",,,,845,,,Q153178
1229,,"Minderstift St. Johannes Baptista, Lüneburg",,,,845,,,Q153178


In [73]:
# for _, row in missing_kloester_df.iterrows():
#     print(row)
#     print("CREATE")
#     # Labels
#     name = row['institution']
#     name = f'"{name}"'
#     print('\t'.join(["LAST", "Lde", name]))
#     print('\t'.join(["LAST", "Len", name]))
#     print('\t'.join(["LAST", "Lfr", name]))
#     print('\t'.join(["LAST", "Les", name]))

#     # klosterdatenbank
#     kloster_id = str(int(row['institution_id']))
#     print('\t'.join(["LAST", "P471", kloster_id]))

#     # research project
#     print('\t'.join(["LAST", "P131", 'Q153178']))

#     # p1100 do we need it for institution? I don't think so.
#     # print('\t'.join(["LAST", "P1100", f'off_gsn{kloster_id}']))

#     # query data from klosterdatenbank
#     # https://api.gs.sub.uni-goettingen.de/v1/monastery/3346/json
#     # r = requests.get(f'https://api.gs.sub.uni-goettingen.de/v1/monastery/{kloster_id}', headers={"Accept": "application/json"})
#     # data = r.json()
#     # del data["literature"]
#     # del data["persons"]
#     # print(json.dumps(data, sort_keys=True, indent=4))
    
#     break

### To continue below your csv files should be empty. The check below ensures that.

In [77]:
assert len(create_institution_factgrid_df) == 0, "Create the entries on factgrid before continuing and then re run all code cells from the beginning."

AssertionError: Create the entries on factgrid before continuing and then re run all code cells from the beginning.

In [78]:
# TODO: sent ouput to file instead of stdout

# for _, row in null_entries[null_entries['diocese'].notnull()][['diocese', 'diocese_id']].iterrows():
#     print("CREATE")
#     print('\t'.join(['LAST', 'Lde', f'"{row["diocese"]}"']))
#     print('\t'.join(['LAST', 'P2', 'Q153166']))

#     print('\t'.join(['LAST', 'P131', 'Q153178']))
#     if type(row['diocese_id']) == str:
#         print('\t'.join(['LAST', 'P601', f'"{row["diocese_id"]}"']))

In [79]:
# missing_institution_diocese_on_factgrid_df.to_csv('no_institution_or_dioc_on_fg_bamberg.csv', sep=';')

# Add factgrid information

## Add institution factgrid id

In [80]:
institution_joined_df = role_all_df.merge(
    factgrid_institution_df, how='left', left_on='institution_id', right_on='fg_gsn_id', suffixes=('_wiag', '_institute_fg')
)
print(len(institution_joined_df))
institution_joined_df.head()

KeyError: 'fg_gsn_id'

In [108]:
assert len(role_all_df) == len(institution_joined_df), "There may be duplicates on factgrid"

## Add diocese factgrid id

In [109]:
dioc_joined_df = join_dioceses(institution_joined_df)
print(len(dioc_joined_df))
dioc_joined_df.head()

31472


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,date_sort_key,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id
0,WIAG-Pers-EPISCGatz-03848-001,124071,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Augsburg,3498.0,,,...,909210,118625284,059-01621-001,Q653546,Q898020,3498.0,,,,
1,WIAG-Pers-EPISCGatz-03848-001,124073,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,923150,118625284,059-01621-001,Q653546,,,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001
2,WIAG-Pers-EPISCGatz-03848-001,124075,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Kempten,60182.0,,,...,948100,118625284,059-01621-001,Q653546,Q266876,60182.0,,,,
3,WIAG-Pers-EPISCGatz-03848-001,124077,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Ottobeuren,60305.0,,,...,972150,118625284,059-01621-001,Q653546,,,,,,
4,WIAG-Pers-EPISCGatz-03858-001,124065,Ernannter Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,1077150,13805746X,050-06892-001,Q653421,,,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001


## Add role factgrid id
Note: This role does not include the institution information. ie, it adds factgrid ids for roles like 'archbishop' and not 'archbishop of trier'

The part of the script below could be used to create quickstatements for career statements.

In [110]:
print(len(wiag_roles_df))
wiag_roles_df.head()
#wiag_roles_df[wiag_roles_df['name'] == '']

263


Unnamed: 0,id,note,name,comment,gs_reg_id,generic_term,plural,definition,role_gp_fg_id,gender,lang,role_group_id,r_id,item_id,r_note,authority_id,role_fg_id,r_comment
0,1,,Titularkönig,,,,,,,männlich,de,32.0,,,,,,
1,2,,Abtbischof,,,,,,Q648236,männlich,de,33.0,,,,,,
2,3,,Administrator des Bistums,,,,,Ein Bistumsadministrator (auch: Diözesanadmini...,Q648236,männlich,de,33.0,1298102.0,3.0,,42.0,Q902170,
3,4,,Administrator des Erzbistums,,,,,,Q648236,männlich,de,33.0,1298104.0,4.0,,42.0,Q902170,
4,5,,Administrator des Fürstbischofs,,,,,Ein Bistumsadministrator (auch: Diözesanadmini...,Q648236,männlich,de,33.0,1298982.0,5.0,,42.0,Q902170,


In [111]:
joined_df = dioc_joined_df.merge(
    wiag_roles_df[['name', 'role_fg_id']], how='left', left_on='name', right_on='name', suffixes=('_wiag', '_institute_fg')
)
print(len(joined_df))
joined_df.head()

31695


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id,role_fg_id
0,WIAG-Pers-EPISCGatz-03848-001,124071,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Augsburg,3498.0,,,...,118625284,059-01621-001,Q653546,Q898020,3498.0,,,,,Q38837
1,WIAG-Pers-EPISCGatz-03848-001,124073,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,118625284,059-01621-001,Q653546,,,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001,Q38809
2,WIAG-Pers-EPISCGatz-03848-001,124075,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Kempten,60182.0,,,...,118625284,059-01621-001,Q653546,Q266876,60182.0,,,,,Q38954
3,WIAG-Pers-EPISCGatz-03848-001,124077,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Ottobeuren,60305.0,,,...,118625284,059-01621-001,Q653546,,,,,,,Q38954
4,WIAG-Pers-EPISCGatz-03858-001,124065,Ernannter Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,13805746X,050-06892-001,Q653421,,,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001,Q902184


### Ignore all Kanonikatsbewerber and Vikariatsbewerber offices


In [112]:
#
joined_df = joined_df[~joined_df['name'].isin(
    ['Vikariatsbewerber', 
     'Kanonikatsbewerber']
)]

## The output below should be empty. The cell after the next one will throw an error if it's not the case

In [113]:
missing_roles_df = joined_df[joined_df['role_fg_id'].isna()]
missing_roles_df.head()

Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id,role_fg_id
33,WIAG-Pers-CANON-13216-001,18373,Propst und Archidiakon,Leitungsamt Diözese,Head of an (arch)diocese,Q648236,"Benediktinerkloster Ansbach, später Kollegiats...",1015.0,,,...,,,Q648997,Q400534,1015.0,,,,,
50,WIAG-Pers-CANON-49220-001,109435,Domkämmerer,Leitungsamt Domstift,Dignitary of a cathedral chapter,Q648232,Domstift Speyer,3489.0,,,...,1038267439.0,,Q700918,Q898039,3489.0,,,,,
61,WIAG-Pers-EPISCGatz-02684-001,124663,Generalvikar des Erzbischofs,Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Erzbistum Salzburg,WIAG-Inst-DIOCGatz-045-001,...,140745025.0,,Q651525,,,Q153249,Erzbistum Salzburg,Bistum Salzburg,WIAG-Inst-DIOCGatz-045-001,
62,WIAG-Pers-EPISCGatz-02684-001,124665,Offizial des Erzbischofs,Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Erzbistum Salzburg,WIAG-Inst-DIOCGatz-045-001,...,140745025.0,,Q651525,,,Q153249,Erzbistum Salzburg,Bistum Salzburg,WIAG-Inst-DIOCGatz-045-001,
66,WIAG-Pers-EPISCGatz-02684-001,124673,"Domthesaurar, Anwärter",Leitungsamt Domstift,Dignitary of a cathedral chapter,Q648232,Domstift Augsburg,3498.0,,,...,140745025.0,,Q651525,Q898020,3498.0,,,,,


In [114]:
wiag_roles_df[wiag_roles_df['name'].str.contains('Domp.*')]

Unnamed: 0,id,note,name,comment,gs_reg_id,generic_term,plural,definition,role_gp_fg_id,gender,lang,role_group_id,r_id,item_id,r_note,authority_id,role_fg_id,r_comment
149,149,,"Dompropst, Anwärter",,10150.0,Dompropst,"Dompröpste, Anwärter",,Q648232,männlich,de,41.0,1298124.0,149.0,,42.0,Q902178,
150,150,,Dompropst,,10150.0,Propst,Dompröpste,,Q648232,männlich,de,41.0,1102327.0,150.0,,42.0,Q38842,
151,151,,Domprediger,,13040.0,Prediger,Domprediger,,Q648226,männlich,de,42.0,1102335.0,151.0,,42.0,Q38841,
152,152,,Dompfortner,,,Pfortner,Dompfortner,,Q648226,männlich,de,42.0,,,,,,
191,190,,Providierter Dompropst,,10150.0,Dompropst,Providierte Dompröpste,,Q648232,männlich,de,41.0,,,,,,
252,250,,Domprior,,10140.0,,Domprioren,,Q648232,männlich,de,41.0,,,,,,
262,60700,,Dompfarrer,,,,Dompfarrer,,,männlich,de,42.0,1298122.0,60700.0,,42.0,Q902177,


In [119]:
missing_roles = joined_df[joined_df['role_fg_id'].isna()]['name'].unique()
print(len(missing_roles))
missing_roles

128


array(['Propst und Archidiakon', 'Domkämmerer',
       'Generalvikar des Erzbischofs', 'Offizial des Erzbischofs',
       'Domthesaurar, Anwärter', 'Thesaurar', 'Providierter Propst',
       'Domscholaster, Anwärter', 'Kaplan', 'Dechant (Prag)',
       'Fürsterzbischof', 'Stiftsherr', 'Domprior', 'Altarist',
       'Bischöflicher Offizial', 'Gewählter Koadjutor des Bischofs',
       'Domdechant', 'Ernannter Erzbischof',
       'Providierter und konfirmierter Bischof', 'Providierter Dompropst',
       'Vizedominus', 'Generaloffizial', 'Providierter Domvikar',
       'Dekan, Anwärter', 'Lektor', 'Domdekan, Anwärter',
       'Archidiakon (Schleswig)', 'Evangelischer Administrator',
       'Bischofskandidat', 'Offizial des Bischofs',
       'Koadjutor des Fürstpropstes', 'Fürstpropst',
       'Koadjutor des Fürstabtes', 'Koadjutor des Propstes',
       'Domchorbischof', 'Apostolischer Vikar', 'Gegenbischof', 'Viztum',
       'Domküster', 'Kanonikatsanwärter', 'Vikarieanwärter',
       'Dom

### Create a csv file to be manually filled and later read to generate quickstatements 

In [136]:
rows = []
for role in missing_roles:
    role_group_fq_id = missing_roles_df[missing_roles_df['name'] == role]['role_group_fq_id'].head(1).squeeze(axis=0)
    item_id = wiag_roles_df[wiag_roles_df['name'] == role].head(1).squeeze(axis=0)['id']
    if type(item_id) == pd.Series:
        item_id = ""
    row = {
        "Lde": f'{role}',
        "Len": "",
        "Lfr": "",
        "Dde": "",
        "Den": "",
        "P2": "Q37073",
        "P131": "Q153178",
        "item_id": item_id,
    }
    if not pd.isna(role_group_fq_id):
        row["P3"] = role_group_fq_id
    rows.append(row)
create_missing_roles_df = pd.DataFrame(rows)
create_missing_roles_df.to_csv(os.path.join(output_path, f"create-missing-roles-{today_string}.csv"))
create_missing_roles_df

Unnamed: 0,Lde,Len,Lfr,Dde,Den,P2,P131,item_id,P3
0,Propst und Archidiakon,,,,,Q37073,Q153178,202,Q648236
1,Domkämmerer,,,,,Q37073,Q153178,102,Q648232
2,Generalvikar des Erzbischofs,,,,,Q37073,Q153178,44,Q648236
3,Offizial des Erzbischofs,,,,,Q37073,Q153178,84,Q648236
4,"Domthesaurar, Anwärter",,,,,Q37073,Q153178,143,Q648232
...,...,...,...,...,...,...,...,...,...
123,Pfründenbewerber,,,,,Q37073,Q153178,,
124,Präbendar,,,,,Q37073,Q153178,,
125,Dekanatsbewerber,,,,,Q37073,Q153178,,
126,Generalvikar für das Obererzstift,,,,,Q37073,Q153178,46,Q648236


In [156]:
filename = f"create-missing-roles-{today_string}.qs"
create_missing_roles_df = pd.read_csv('create-missing-roles-2024-05-28.csv', index_col=0)
ignore_cols = ['item_id']
string_cols = ['Lde', 'Len', 'Lfr', 'Dde', 'Den']
with open(filename, 'w') as file:
    for _, row in create_missing_roles_df.iterrows():
        file.write("CREATE\n")
        for col in create_missing_roles_df.columns:
            value = row[col]
            if pd.isna(value) and col in string_cols:
                value = ''
            if not pd.isna(value):
                if col in string_cols:
                    value = f'"{value}"'
                if col not in ignore_cols:
                    file.write("\t".join(["LAST", col, value + "\n"]))
print(filename)

create-missing-roles-2024-05-28.qs


In [None]:
filename = f"create-missing-roles-{today_string}.qs"
with open(filename, 'w') as file:
    for role in missing_roles:
        file.write("CREATE\n")
        file.write("\t".join(["LAST", "Lde", role + "\n"]))
        file.write("\t".join(["LAST", "P2", "Q37073\n"]))
        file.write("\t".join(["LAST", "P131", "Q153178\n"]))
        search_res = missing_roles_df[missing_roles_df['name'] == role]['role_group_fq_id'].head(1).squeeze(axis=0)
        if not pd.isna(search_res):
            file.write("\t".join(["LAST", "P3", search_res + "\n"]))
print(filename)

In [111]:
assert len(joined_df[joined_df['role_fg_id'].isna()]) == 0, "Missing roles with factgrid id in wiag database"

## Check people with missing factgrid entries or missing factgrid ids in wiag

In [112]:
joined_df[joined_df['FactGrid'].isna()]

Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id,role_fg_id
65,WIAG-Pers-CANON-80982-001,56198,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Bamberg,3492.0,Bistum Bamberg,WIAG-Inst-DIOCGatz-002-001,...,,007-01739-001,,Q400530,3492.0,Q153216,Bistum Bamberg,Diözese Bamberg,WIAG-Inst-DIOCGatz-002-001,Q38837
637,WIAG-Pers-CANON-25882-001,140432,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Würzburg,3502.0,,,...,13948941X,012-01202-001,,Q400557,3502.0,,,,,Q38837
638,WIAG-Pers-CANON-25882-001,140434,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Bamberg,3492.0,,,...,13948941X,012-01202-001,,Q400530,3492.0,,,,,Q38837
639,WIAG-Pers-CANON-25882-001,140436,Domdekan,Leitungsamt Domstift,Dignitary of a cathedral chapter,Q648232,Domstift Würzburg,3502.0,,,...,13948941X,012-01202-001,,Q400557,3502.0,,,,,Q38836
640,WIAG-Pers-CANON-25882-001,140438,Generalvikar,Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Würzburg,WIAG-Inst-DIOCGatz-033-001,...,13948941X,012-01202-001,,,,Q153247,Bistum Würzburg,Diözese Würzburg,WIAG-Inst-DIOCGatz-033-001,Q39117
1149,WIAG-Pers-CANON-48974-001,139834,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Bamberg,3492.0,,,...,,,,Q400530,3492.0,,,,,Q38837
1228,WIAG-Pers-CANON-45781-001,139446,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Bamberg,3492.0,,,...,,,,Q400530,3492.0,,,,,Q38837
1229,WIAG-Pers-CANON-45781-001,139448,Domcellerar,Leitungsamt Domstift,Dignitary of a cathedral chapter,Q648232,Domstift Bamberg,3492.0,,,...,,,,Q400530,3492.0,,,,,Q893490
1230,WIAG-Pers-CANON-45781-001,139450,Domkustos,Leitungsamt Domstift,Dignitary of a cathedral chapter,Q648232,Domstift Bamberg,3492.0,,,...,,,,Q400530,3492.0,,,,,Q902176
1231,WIAG-Pers-CANON-45781-001,139452,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Augsburg,3498.0,,,...,,,,Q898020,3498.0,,,,,Q38837


In [113]:
joined_df[joined_df['FactGrid'].isna()]['person_id'].unique()

array(['WIAG-Pers-CANON-80982-001', 'WIAG-Pers-CANON-25882-001',
       'WIAG-Pers-CANON-48974-001', 'WIAG-Pers-CANON-45781-001',
       'WIAG-Pers-CANON-48993-001', 'WIAG-Pers-CANON-24731-001',
       'WIAG-Pers-CANON-25122-001', 'WIAG-Pers-CANON-49126-001',
       'WIAG-Pers-CANON-14941-001', 'WIAG-Pers-CANON-14944-001',
       'WIAG-Pers-CANON-15005-001', 'WIAG-Pers-CANON-26451-001',
       'WIAG-Pers-CANON-15070-001'], dtype=object)

## (Optinally) generate the quickstatements for creating the persons here

In [114]:
joined_df[joined_df['FactGrid'].isna()].to_csv(os.path.join(output_path, f'missing_factgrid-{today_string}_Bamberg.csv', sep=';'))

## The cell below will throw an error if there are entries on Factgrid but not on Wiag

In [115]:
assert joined_df[joined_df['FactGrid'].isna()].empty, "There are missing persons on factgrid"

AssertionError: There are missing persons on factgrid

## WARNING: the code section below ignores all entries absent on factgrid

In [116]:
joined_df = joined_df[~joined_df['FactGrid'].isna()]

In [117]:
factgrid_inst_roles_df

Unnamed: 0,fg_inst_role_id,inst_role
0,Q172316,Generalvikar der Diözese Trento
1,Q172317,Generalvikar der Diözese Utrecht
2,Q172318,Generalvikar der Diözese Vienna
3,Q172319,Generalvikar der Diözese Lausanne
4,Q172320,Generalvikar der Diözese Pomesanien
...,...,...
4338,Q452118,Schauspieler/Schauspielerin des Künstlertheaters
4339,Q452119,Schauspieler/in des Belvárosi-Theaters
4340,Q452120,Schauspieler/Schauspielerin des Pester Theaters
4341,Q452121,Schauspieler/Schauspielerin des Theaters der U...


## Add factgrid ids for roles
Note: this role has information of the institution as well

In [118]:
# add factgrid ids for roles
found = 0
data_dict = {}
not_found = []
dupl = {}
for i, (name, inst, dioc) in joined_df[['name', 'institution', 'diocese']].iterrows():
    if name == "Kardinal":
        # this is okay
        # manually add qid for kardinals Q254893
        data_dict[i] = "Q254893"
        continue
    search_res = pd.DataFrame()
    if pd.isna(inst):
    # if name in ["Bischof", "Fürstbischof", "Ernannter Bischof", "Erzbischof"]:
        if pd.isna(dioc):
            print(i, name, inst, dioc)
        if name not in ["Archidiakon", "Koadjutor"]:
            dioc = dioc.lstrip('Bistum').lstrip('Erzbistum').lstrip('Patriarchat').lstrip()
        if name == "Fürstbischof" and dioc in ["Passau", "Straßburg"]:
            name = "Bischof"    
        search_res = factgrid_inst_roles_df[factgrid_inst_roles_df['inst_role'].str.contains(f"^{name}.*{dioc}")]
        if name == "Erzbischof" and dioc == "Salzburg":
            # will be merged in later
            search_res = factgrid_inst_roles_df[factgrid_inst_roles_df['fg_inst_role_id'] == 'Q172567']
    else:
        name = name.replace('Domkanoniker', 'Domherr')
        search_res = factgrid_inst_roles_df[factgrid_inst_roles_df['inst_role'] == f"{name} {inst}"]
    if len(search_res) == 1:
        found += 1
        data_dict[i] = search_res['fg_inst_role_id'].values[0]
    elif len(search_res) >= 2:
        # print("+" * 10)
        print(name, inst, dioc)
        print(search_res)
        print()
        dupl[i] = (name, inst, dioc, search_res)
    elif len(search_res) == 0:
        not_found.append((name, inst, dioc))
        print(name, inst, dioc)
print("===Summary of roles===")
print("Found:", found, "Duplicates:", len(dupl), "Not found:", len(not_found))

Propst Kollegiatstift St. Nikolaus, Spalt nan
Domsenior Domstift Würzburg nan
Cellerar Domstift Eichstätt nan
Propst Benediktinerkloster, später Kollegiatstift St. Cyriakus, Wiesensteig nan
Domscholaster Domstift Eichstätt nan
Domcellerar Domstift Augsburg nan
Kapitularvikar nan Würzburg
Propst Kollegiatstift Unsere Liebe Frau, Eichstätt nan
===Summary of roles===
Found: 2742 Duplicates: 0 Not found: 8


In [119]:
final_joined_df = joined_df.merge(pd.Series(data_dict).rename('fg_inst_role_id'), left_index=True, right_index=True)
print(len(final_joined_df))
final_joined_df.head()

2743


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GSN,FactGrid,fg_institution_id,fg_gsn_id,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id,role_fg_id,fg_inst_role_id
0,WIAG-Pers-EPISCGatz-05072-001,5154,Erzbischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Erzbistum Trier,WIAG-Inst-DIOCGatz-030-001,...,062-01186-001,Q653844,,,Q153244,Erzbistum Trier,Bistum Trier,WIAG-Inst-DIOCGatz-030-001,Q172539,Q172572
1,WIAG-Pers-EPISCGatz-04050-001,3770,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Eichstätt,WIAG-Inst-DIOCGatz-006-001,...,059-00710-001,Q652691,,,Q153220,Bistum Eichstätt,Diözese Eichstätt,WIAG-Inst-DIOCGatz-006-001,Q38809,Q172630
2,WIAG-Pers-CANON-24965-001,22325,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Würzburg,3502.0,,,...,,Q728753,Q400557,3502.0,,,,,Q38837,Q390632
3,WIAG-Pers-CANON-24965-001,31418,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Bamberg,3492.0,,,...,,Q728753,Q400530,3492.0,,,,,Q38837,Q400601
4,WIAG-Pers-EPISCGatz-03869-001,3586,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Bamberg,WIAG-Inst-DIOCGatz-002-001,...,006-00007-001,Q652702,,,Q153216,Bistum Bamberg,Diözese Bamberg,WIAG-Inst-DIOCGatz-002-001,Q38809,Q195266


## Parse begin and end date from the wiag data

In [137]:
# https://database.factgrid.de/query/embed.html#SELECT%20%3FPropertyLabel%20%3FProperty%20%3FPropertyDescription%20%3Freciprocal%20%3FreciprocalLabel%20%3Fexample%20%3Fuseful_statements%20%3Fwd%20WHERE%20%7B%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22.%20%7D%0A%20%20%3FProperty%20wdt%3AP8%20wd%3AQ77483.%0A%20%20OPTIONAL%20%7B%20%3FProperty%20wdt%3AP364%20%3Fexample.%20%7D%0A%20%20OPTIONAL%20%7B%20%3FProperty%20wdt%3AP86%20%3Freciprocal.%20%7D%0A%20%20OPTIONAL%20%7B%20%3FProperty%20wdt%3AP343%20%3Fwd.%20%7D%0A%20%20OPTIONAL%20%7B%20%3FProperty%20wdt%3AP310%20%3Fuseful_statements.%20%7D%0A%7D%0AORDER%20BY%20%3FPropertyLabel

def format_datetime(entry: datetime, resolution):
    julian_ending = "/J" if entry.year < 1582 else ""
    ret_val =  f"+{entry.isoformat()}Z/{resolution}" + julian_ending
    if resolution <= 9:
        ret_val = ret_val.replace(f"{entry.year}-01-01", f"{entry.year}-00-00", 1)
    return ret_val

def date_parsing(date_string: str, end=False, only_date=False):
    return_property = "P106" if only_date else "P50" if end else "P49"
    qualifier = None
    entry = None
    resolution = 7
    string_precision_qualifier_clause = ("P73" if only_date else "P788" if end else "P787") + f'\t"{date_string}"'
    exact_precision_qualifier = ("P73" if only_date else "P786" if end else "P785")

    if pd.isna(date_string) or date_string == '?':
        return tuple()
    
    if matches := re.match(r'frühestens (\d{3,4})', date_string):
        return_property = "P41" if only_date else "P1125" if end else "P1126"
        entry = datetime(int(matches.group(1)), 1, 1)
        resolution = 9
    
    elif matches := re.match(r'(kurz )?vor (\d{3,4})', date_string):
        return_property = "P43" if only_date else "P1123" if end else "P1124"
        if matches.group(1):
            qualifier = string_precision_qualifier_clause
        entry = datetime(int(matches.group(2)), 1, 1)
        resolution = 9
        
    elif matches := re.match(r'(kurz )?nach (\d{3,4})', date_string):
        return_property = "P41" if only_date else "P1125" if end else "P1126"
        if matches.group(1):
            qualifier = string_precision_qualifier_clause
        entry = datetime(int(matches.group(2)), 1, 1)
        resolution = 9
        
    elif matches := re.match(r'(\d{1,2})\. Jahrhundert', date_string):
        century = int(matches.group(1))
        entry = datetime(100 * (century), 1, 1)
    
    elif matches := re.match(r'(\d)\. Hälfte (des )?(\d{1,2})\. (Jhs\.|Jahrhunderts|Jahrhundert)', date_string):
        half = int(matches.group(1))
        year = int(matches.group(3)) - 1
        latest_year   = year * 100 + (half * 50)
        earliest_year = latest_year - 50 + 1
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        qualifier = string_precision_qualifier_clause
    
    elif matches := re.match(r'(\w+) Viertel des (\d{1,2})\. Jhs\.', date_string):
        number_map = {
            "erstes":  1,
            "zweites": 2,
            "drittes": 3,
            "viertes": 4,
        }
        quarter = number_map[matches.group(1)]
        year    = int(matches.group(2)) - 1
        latest_year   = year * 100 + (quarter * 25)
        earliest_year = latest_year - 25 + 1
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        qualifier = string_precision_qualifier_clause

    elif matches := re.match(r'frühes (\d{1,2})\. Jh\.', date_string):
        year = int(matches.group(1)) - 1
        latest_year   = year * 100 + 20
        earliest_year = latest_year - 20 + 1
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        qualifier = string_precision_qualifier_clause

    elif matches := re.match(r'spätes (\d{1,2})\. Jh\.', date_string):
        year = int(matches.group(1))
        latest_year   = year * 100
        earliest_year = latest_year - 20 + 1
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        qualifier = string_precision_qualifier_clause

    elif matches := re.match(r'(Anfang|Mitte|Ende) (\d{1,2})\. Jh\.', date_string):
        number_map = {
            "Anfang":  1,
            "Mitte": 2,
            "Ende": 3,
        }
        third = number_map[matches.group(1)]
        year = int(matches.group(2)) - 1
        latest_year   = year * 100 + (third * 33)
        earliest_year = latest_year - 33 + 1
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        qualifier = string_precision_qualifier_clause

    elif matches := re.match(r'(ca\.|um) (\d{3,4})', date_string):
        year = int(matches.group(2))
        latest_year   = year + 5
        earliest_year = latest_year - 10
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        resolution = 9
        qualifier = exact_precision_qualifier + "\tQ10"

    elif matches := re.match(r'(\d{3,4})er Jahre', date_string):
        year = int(matches.group(1))
        latest_year   = year + 10
        earliest_year = latest_year - 10 + 1
        entry = datetime(year, 1, 1)
        resolution = 8
    
    elif matches := re.match(r'Wende zum (\d{1,2})\. Jh\.', date_string):
        year = int(matches.group(1)) - 1
        latest_year   = year * 100 + 10
        earliest_year = latest_year - 20 + 1
        entry = datetime(latest_year, 1, 1)
        qualifier = string_precision_qualifier_clause

    elif matches := re.match(r'Anfang der (\d{3,4})er Jahre', date_string):
        year = int(matches.group(1))
        latest_year   = year + 3
        earliest_year = year
        entry = datetime(year, 1, 1)
        resolution = 8
        qualifier = string_precision_qualifier_clause

    elif matches := re.match(r'\((\d{3,4}) \?\) (\d{3,4})', date_string):
        year1 = int(matches.group(1))
        year2 = int(matches.group(2))
        entry = datetime(year2, 1, 1)
        resolution = 9
        qualifier = string_precision_qualifier_clause
    
    elif matches := re.match(r'(\d{3,4})/(\d{3,4})', date_string):
        year1 = int(matches.group(1))
        year2 = int(matches.group(2))

        if year2 - year1 == 1:
            # check for consecutive years
            qualifier = exact_precision_qualifier + "\tQ912616"
        entry = datetime(year1, 1, 1)
        resolution = 9

    elif matches := re.match(r'(\d{3,4})\?', date_string):
        year = int(matches.group(1))
        entry = datetime(year, 1, 1)
        resolution = 9
        qualifier = exact_precision_qualifier + f'\t"{date_string}"'
    
    elif matches := re.match(r'(\d{3,4})', date_string):
        year = int(matches.group(1))
        entry = datetime(year, 1, 1)
        resolution = 9
    else:
        print(date_string)
        raise Exception("Couldn't parse date")
        
    if qualifier:
        return (return_property, format_datetime(entry, resolution), qualifier)
    else:
        return (return_property, format_datetime(entry, resolution))

tests = {
    "1205": "+1205-00-00T00:00:00Z/9/J",
    "12. Jahrhundert": "+1200-00-00T00:00:00Z/7/J",
    "1. Hälfte des 12. Jhs.": "+1125-00-00T00:00:00Z/7/J",
    "2. Hälfte des 12. Jhs.": "+1175-00-00T00:00:00Z/7/J",
    "erstes Viertel des 12. Jhs.": "+1113-00-00T00:00:00Z/7/J",
    "zweites Viertel des 12. Jhs.": "+1138-00-00T00:00:00Z/7/J",
    "drittes Viertel des 12. Jhs.": "+1163-00-00T00:00:00Z/7/J",
    "viertes Viertel des 12. Jhs.": "+1188-00-00T00:00:00Z/7/J",
    "frühes 12. Jh.": "+1110-00-00T00:00:00Z/7/J",
    "spätes 12. Jh.": "+1190-00-00T00:00:00Z/7/J",
    "Anfang 12. Jh.": "+1117-00-00T00:00:00Z/7/J",
    "Mitte 12. Jh.": "+1150-00-00T00:00:00Z/7/J",
    "Ende 12. Jh.": "+1183-00-00T00:00:00Z/7/J",
    "ca. 1050": "+1050-00-00T00:00:00Z/9/J",
    "um 1050": "+1050-00-00T00:00:00Z/9/J",
    "1230er Jahre": "+1230-00-00T00:00:00Z/8/J",
    "Wende zum 12. Jh.": "+1110-00-00T00:00:00Z/7/J",
    "Anfang der 1480er Jahre": "+1480-00-00T00:00:00Z/8/J",
    "frühestens 1342": "+1342-00-00T00:00:00Z/9/J",
    "vor 1230": "+1230-00-00T00:00:00Z/9/J",
    "nach 1230": "+1230-00-00T00:00:00Z/9/J",
    "kurz vor 1200": "+1200-00-00T00:00:00Z/9/J",
    "kurz nach 1200": "+1200-00-00T00:00:00Z/9/J",
    "1164/1165": "+1164-00-00T00:00:00Z/9/J",
    "1164/1177": "+1164-00-00T00:00:00Z/9/J",
}

for key, value in tests.items():
    retval = date_parsing(key)[1]
    assert retval == value, f"{key}: Returned {retval} instead of {value}"


## Reconcile office data with factgrid

In [138]:
#https://database.factgrid.de/wiki/Special:EntityData/Q515.json

## Generate quickstatements for offices

In [139]:
with open(f'quickstatments_{today_string}.qs', 'w') as file:
    for _, row in final_joined_df.iterrows():
        try:
            date_clauses = ()
            if pd.isna(row['date_begin']):
                date_clauses = date_parsing(row['date_begin'], only_date=True)
            elif pd.isna(row['date_end']):
                date_clauses = date_parsing(row['date_begin'], only_date=True)
            else:
                date_clauses = (*date_parsing(row['date_begin']), *date_parsing(row['date_end'], end=True))
            file.write('\t'.join([
                row['FactGrid'], 
                'P165', 
                row['fg_inst_role_id'],
                'S601', 
                '"' + row['person_id'] + '"',
                *date_clauses,
            ]) + '\n')
        except Exception as e:
            print(traceback.format_exc())
            print(row)

In [132]:
final_joined_df[final_joined_df['name'] == 'Kanoniker'][final_joined_df['person_id'] == 'WIAG-Pers-EPISCGatz-10347-001']

  final_joined_df[final_joined_df['name'] == 'Kanoniker'][final_joined_df['person_id'] == 'WIAG-Pers-EPISCGatz-10347-001']


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GSN,FactGrid,fg_institution_id,fg_gsn_id,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id,role_fg_id,fg_inst_role_id
2434,WIAG-Pers-EPISCGatz-10347-001,137596,Kanoniker,Amt Stift,Affiliate of a collegiate church or convent,Q648228,"Kollegiatstift St. Viktor, Mainz",3249.0,,,...,082-01197-001,Q654987,Q400416,3249.0,,,,,Q38823,Q907996
2436,WIAG-Pers-EPISCGatz-10347-001,137600,Kanoniker,Amt Stift,Affiliate of a collegiate church or convent,Q648228,"Kollegiatstift St. Peter und St. Alexander, As...",1016.0,,,...,082-01197-001,Q654987,Q400401,1016.0,,,,,Q38823,Q400574
2439,WIAG-Pers-EPISCGatz-10347-001,137606,Kanoniker,Amt Stift,Affiliate of a collegiate church or convent,Q648228,"Benediktinerabtei, dann Kollegiatstift St. Alb...",3241.0,,,...,082-01197-001,Q654987,Q400526,3241.0,,,,,Q38823,Q907994


In [45]:
# all_roles = set()

# def add_possible_list(a_set: set, element):
#     if type(element) == list:
#         # add all items in list
#         for item in element:
#             a_set.add(item)
#     else:
#         # simply add the given element
#         a_set.add(element)

# for summary in missing_on_factgrid_df['summary_offices'].tolist():
#     print(summary)
#     offices = re.split(r'(\d+),', summary)
#     offices = [office.lstrip() for office in offices]
#     new_offices = offices
#     for index, office in enumerate(offices):
#         if re.match('\d+', office):
#             new_offices[index - 1] += office
#             new_offices.pop(index)
#     print(new_offices)
#     for office in new_offices:
#         for office_name in re.match(r'\w+(, \w+)*', office).group().split(','):
#             office_name = office_name.strip()
#             print(office_name)
#             all_roles.add(office_name)
#     print()

# print('#'*10)
# print(all_roles)

# url = 'https://database.factgrid.de/sparql'
# query = (
#     f"""SELECT DISTINCT ?item ?label
# WHERE
# {{
#   ?item wdt:P2 wd:Q37073;
#         rdfs:label ?label.
#   FILTER(LANG(?label) = "de").
#   FILTER REGEX (?label, "({'|'.join(list(all_roles))})$").
# }}
# ORDER BY ?label  
# """
# )
# print(query)
# # SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

# # make request: 
# r = requests.get(url, params={'query': query}, headers={"Accept": "application/json"})
# data = r.json()
# factgrid_roles_df = pd.json_normalize(data['results']['bindings'])

# len(factgrid_roles_df)

# list(all_roles)