Run the line below (without the # symbol) to install pandas if not already installed

In [1]:
# !py -m pip install pandas

In [1]:
import requests
import csv
import os
import pandas as pd
import json
import re
import time
from datetime import datetime, timedelta
import math
import traceback

In [2]:
# input_path = "C:\\Users\\khan32\\Documents\\factgrid_py"
input_path = "."

In [3]:
output_path = input_path

In [4]:
place_name = "Bamberg"

In [5]:
today_string = datetime.now().strftime('%Y-%m-%d')

In [6]:
# SELECT * FROM `role` r INNER JOIN url_external u on r.id = u.item_id where u.authority_id = 42 

Export the results from the query 
```sql
SELECT * FROM role r INNER JOIN url_external u on r.id = u.item_id where u.authority_id = 42
```  
from the main wiag database in the csv format.

Rename it to include the date. An example filename would be `role_2024_04_24.csv`

In [7]:
input_file = f"role_2024_05_10.csv"
input_path_file = os.path.join(input_path, input_file)
wiag_roles_df = pd.read_csv(input_path_file, names=['id', 'note', 'name', 'comment', 'gs_reg_id', 'generic_term', 'plural', 'definition', 'role_gp_fg_id', 'gender', 'lang', 'role_group_id','r_id','item_id','r_note','authority_id','role_fg_id','r_comment'])
len(wiag_roles_df)

102

### Download data from wiag
https://wiag-vokabulare.uni-goettingen.de/query/can

In [8]:
input_file = f"WIAG-Domherren-DB-Ämter_2024-05-28.csv"
input_path_file = os.path.join(input_path, input_file)
role_all_df = pd.read_csv(input_path_file, sep=';')
len(role_all_df)

31454

In [9]:
last_modified = datetime.fromtimestamp(os.path.getmtime(input_file))
now = datetime.now()
assert last_modified.day == now.day and last_modified.month == now.month, f"The file was last updated on {last_modified.strftime('%d.%m')}"

### ERROR: If you get an error when you run the line above this means that the file was not updated today. 
A few solutions: 
* update the file again by downloading it again
* change the file name to something correct
* (not recommended) continue if you are sure that you need to use old data.

In [10]:
len(role_all_df)

31454

In [11]:
role_all_df.head()

Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,date_begin,date_end,date_sort_key,GND,GSN,FactGrid
0,WIAG-Pers-EPISCGatz-03848-001,124071,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Augsburg,3498.0,,,um 909,,909210,118625284,059-01621-001,Q653546
1,WIAG-Pers-EPISCGatz-03848-001,124073,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,923,973,923150,118625284,059-01621-001,Q653546
2,WIAG-Pers-EPISCGatz-03848-001,124075,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Kempten,60182.0,,,vor 948?,952?,948100,118625284,059-01621-001,Q653546
3,WIAG-Pers-EPISCGatz-03848-001,124077,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Ottobeuren,60305.0,,,972,973,972150,118625284,059-01621-001,Q653546
4,WIAG-Pers-EPISCGatz-03858-001,124065,Ernannter Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,1077,1096,1077150,13805746X,050-06892-001,Q653421


# Download data from factgrid

If any of the following requests to factgrid fail, try re running the cells.

In [53]:
url = 'https://database.factgrid.de/sparql'
query = (
    """SELECT ?item ?gsn WHERE {
  ?item wdt:P471 ?gsn
}
"""
)
# SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

# make request: 
r = requests.get(url, params={'query': query}, headers={"Accept": "application/json"})
data = r.json()
factgrid_institution_df = pd.json_normalize(data['results']['bindings'])

len(factgrid_institution_df)

271

In [54]:
url = 'https://database.factgrid.de/sparql'
query = (
"""
SELECT ?item ?wiagid ?label ?alternative WHERE {
  ?item wdt:P2/wdt:P3* wd:Q164535.
  #?item schema:description ?itemDesc.
  ?item rdfs:label ?label.
  OPTIONAL {?item schema:description ?itemDesc.}
  OPTIONAL {?item skos:altLabel ?alternative. }
  OPTIONAL {?item wdt:P601 ?wiagid.}
  FILTER(LANG(?label) in ("en", "de"))
}
"""
)


# version 2
# SELECT ?item ?wiagid (group_concat(DISTINCT ?label; separator=',') as ?labels) (group_concat(DISTINCT ?itemDesc; separator=',') as ?itemDescs) (group_concat(DISTINCT ?alternative ; separator=',') as ?alternatives) WHERE {
#   ?item wdt:P2/wdt:P3* wd:Q164535.
#   ?item schema:description ?itemDesc.
#   ?item rdfs:label ?label.
#   OPTIONAL {?item schema:description ?itemDesc.}
#   OPTIONAL {?item skos:altLabel ?alternative. }
#   OPTIONAL {?item wdt:P601 ?wiagid.}
#   FILTER(LANG(?label) in ("en", "de"))
# }
# GROUP BY ?item ?wiagid

# SELECT ?item ?wiagid WHERE {
#   ?item wdt:P2/wdt:P3* wd:Q164535.
#   ?item wdt:P601 ?wiagid
# }
# SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

# make request: 
r = requests.get(url, params={'query': query}, headers={"Accept": "application/json"})
data = r.json()
factgrid_diocese_df = pd.json_normalize(data['results']['bindings'])

len(factgrid_diocese_df)

1280

In [55]:
url = 'https://database.factgrid.de/sparql'
query = (
"""
SELECT ?item ?label WHERE {
  ?item wdt:P2 wd:Q257052.
  ?item rdfs:label ?label.
  FILTER(LANG(?label) in ("de"))
}
"""
)

r = requests.get(url, params={'query': query}, headers={"Accept": "application/json"})
data = r.json()
factgrid_inst_roles_df = pd.json_normalize(data['results']['bindings'])

len(factgrid_inst_roles_df)

4351

# Clean Factgrid data

In [56]:
# extract out q id
def extract_qid(df, column):
    df[column] = df[column].map(lambda x: x.strip('https://database.factgrid.de/entity/'))
 
#factgrid_df['item.value'] = factgrid_df['item.value'].map(lambda x: x.strip('https://database.factgrid.de/entity/'))

# drop irrelevant columns
def drop_type_columns(df):
    df.drop(columns=[column for column in df.columns if column.endswith('type')], inplace=True)
    df.drop(columns=[column for column in df.columns if column.endswith('xml:lang')], inplace=True)

In [57]:
extract_qid(factgrid_institution_df, 'item.value')
extract_qid(factgrid_diocese_df, 'item.value')
extract_qid(factgrid_inst_roles_df, 'item.value')

In [58]:
drop_type_columns(factgrid_institution_df)
drop_type_columns(factgrid_diocese_df)
drop_type_columns(factgrid_inst_roles_df)

In [59]:
# rename columns
factgrid_institution_df.columns = ['fg_institution_id', 'fg_gsn_id']
factgrid_diocese_df.columns = ["fg_diocese_id", "dioc_label", "dioc_alt", "dioc_wiag_id"]
factgrid_inst_roles_df.columns = ["fg_inst_role_id", "inst_role"]

In [60]:
# clean the diocese alts by removing BITECA and BETA entries 
factgrid_diocese_df['dioc_alt'] = factgrid_diocese_df['dioc_alt'].replace(['^BITECA.*', '^BETA.*'], '', regex=True)
factgrid_diocese_df
# set(factgrid_diocese_df['fg_alts'])

Unnamed: 0,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id
0,Q153221,Bistum Freising,Bishopric of Freising,WIAG-Inst-DIOCGatz-007-001
1,Q153221,Diocese of Freising,Bishopric of Freising,WIAG-Inst-DIOCGatz-007-001
2,Q153221,Bistum Freising,Bishopric of Freising,WIAG-Inst-DIOCGatz-007-001
3,Q153221,Diocese of Freising,Bishopric of Freising,WIAG-Inst-DIOCGatz-007-001
4,Q153221,Bistum Freising,dioecesis Frisingensis,WIAG-Inst-DIOCGatz-007-001
...,...,...,...,...
1275,Q395389,Archdiocese of Agrigento,Bistum Agrigent,
1276,Q395389,Erzbistum Agrigent,Diözese Agrigent,
1277,Q395389,Archdiocese of Agrigento,Diözese Agrigent,
1278,Q395390,Erzbistum Catania,,


In [61]:
factgrid_institution_df['fg_gsn_id'] = pd.to_numeric(factgrid_institution_df['fg_gsn_id'], downcast='float')

In [62]:
fg_gp = factgrid_institution_df.groupby('fg_gsn_id').count()
duplicate_fg_entries = factgrid_institution_df[factgrid_institution_df['fg_gsn_id'].isin(list(fg_gp[fg_gp['fg_institution_id'] > 1].index))]
assert duplicate_fg_entries.empty, f"There are possible duplicates on factgrid\n.{duplicate_fg_entries}"

In [63]:
# factgrid_institution_df.drop_duplicates(['fg_gsn_id'], inplace=True)

In [64]:
factgrid_institution_df.dtypes

fg_institution_id     object
fg_gsn_id            float32
dtype: object

In [65]:
factgrid_diocese_df.dtypes

fg_diocese_id    object
dioc_label       object
dioc_alt         object
dioc_wiag_id     object
dtype: object

In [66]:
role_all_df.dtypes

person_id            object
id                    int64
name                 object
role_group           object
role_group_en        object
role_group_fq_id     object
institution          object
institution_id      float64
diocese              object
diocese_id           object
date_begin           object
date_end             object
date_sort_key         int64
GND                  object
GSN                  object
FactGrid             object
dtype: object

In [67]:
# TODO: Find a way to query amter data directly from wiag
# url = 'https://wiag-vocab.adw-goe.de/domherr/data'
# r = requests.get(url, params={'domstift': place_name})
# data = r.json()
# person_import_df = pd.json_normalize(data['persons'])

# print(len(person_import_df))
# person_import_df.head()

In [68]:
factgrid_institution_df.head()

Unnamed: 0,fg_institution_id,fg_gsn_id
0,Q422286,114.0
1,Q470546,119.0
2,Q633292,120.0
3,Q633339,131.0
4,Q633346,139.0


In [69]:
factgrid_diocese_df.head()

Unnamed: 0,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id
0,Q153221,Bistum Freising,Bishopric of Freising,WIAG-Inst-DIOCGatz-007-001
1,Q153221,Diocese of Freising,Bishopric of Freising,WIAG-Inst-DIOCGatz-007-001
2,Q153221,Bistum Freising,Bishopric of Freising,WIAG-Inst-DIOCGatz-007-001
3,Q153221,Diocese of Freising,Bishopric of Freising,WIAG-Inst-DIOCGatz-007-001
4,Q153221,Bistum Freising,dioecesis Frisingensis,WIAG-Inst-DIOCGatz-007-001


# Check for missing data

## Check for missing institutions

In [70]:
missing_institution_on_factgrid_df = role_all_df.merge(
    factgrid_institution_df, indicator = True, how='left', left_on='institution_id', right_on='fg_gsn_id', suffixes=('_wiag', '_institute_fg')
).loc[lambda x : x['_merge']!='both']

In [71]:
print(len(missing_institution_on_factgrid_df))
missing_institution_on_factgrid_df.head()

8073


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,date_begin,date_end,date_sort_key,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,_merge
1,WIAG-Pers-EPISCGatz-03848-001,124073,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,923,973,923150,118625284,059-01621-001,Q653546,,,left_only
3,WIAG-Pers-EPISCGatz-03848-001,124077,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Ottobeuren,60305.0,,,972,973,972150,118625284,059-01621-001,Q653546,,,left_only
4,WIAG-Pers-EPISCGatz-03858-001,124065,Ernannter Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,1077,1096,1077150,13805746X,050-06892-001,Q653421,,,left_only
7,WIAG-Pers-EPISCGatz-04027-001,110917,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Chur,WIAG-Inst-DIOCGatz-051-001,1079/1080,1088,1079150,,710-00332-001,Q653192,,,left_only
9,WIAG-Pers-EPISCGatz-02518-001,100965,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,1184,1202,1184150,138018383,059-01616-001,Q649639,,,left_only


In [72]:
# missing_institution_on_factgrid_df[['institution', 'institution_id', 'diocese', 'diocese_id']]

## Create a csv file with persons having no institution_id, diocese_id and diocese

In [73]:
missing_institution_diocese_on_factgrid_df = missing_institution_on_factgrid_df.merge(
    factgrid_diocese_df, indicator = '_second_merge', how='left', left_on='diocese_id', right_on='dioc_wiag_id', suffixes=('_wiag', '_dioc_fg')
).loc[lambda x : x['_second_merge']!='both']
missing_institution_diocese_on_factgrid_df.drop(['_merge','_second_merge'], axis=1, inplace=True)
null_entries = missing_institution_diocese_on_factgrid_df[missing_institution_diocese_on_factgrid_df[['institution_id', 'diocese_id', 'diocese']].isna().all(axis=1)]
null_entries = null_entries[null_entries['name'] != 'Kardinal'].sort_values('person_id')

# 4312 to 7979 should be removed
null_entries = null_entries[null_entries.person_id.str.contains('[0-7][0-9]{4}-[0-9]{3}$', regex=True)]
null_entries.to_csv(f'simple_null_entries_amter-{today_string}.csv', sep=';')
assert len(null_entries) == 0

## Add diocese information from factgrid

Diocese is matched by 3 queries between wiag and factgrid. They are applied in the following priority. If a higher priority query find a match, the following query does nothing:
* wiag id (wiag) => wiag id (factgrid)
* diocese name (wiag) => diocese label (factgrid)
* diocese name (wiag) => diocese alt label (factgrid)

In [74]:
# lookup for the diocese by the diocese_id, then diocese labels, then diocese alts
def join_dioceses(input_df):
    rows = []
    for _, row in input_df.iterrows():
        search_row = pd.Series({'fg_diocese_id':None, 'dioc_label':None, 'dioc_alt':None, 'dioc_wiag_id':None})

        # NOTE: in the following three code blocks only the query changes.
        # this could be refactored somehow without running the query when defining it
        if not search_row['fg_diocese_id']:
            query = factgrid_diocese_df[factgrid_diocese_df['dioc_wiag_id'] == row['diocese_id']]
            if not query.empty:
                search_row = query.head(1).squeeze(axis=0)
    
        if not search_row['fg_diocese_id']:
            query = factgrid_diocese_df[factgrid_diocese_df['dioc_label'] == row['diocese']]
            if not query.empty:
                search_row = query.head(1).squeeze(axis=0)
    
        if not search_row['fg_diocese_id']:
            query = factgrid_diocese_df[factgrid_diocese_df['dioc_alt'] == row['diocese']]
            if not query.empty:
                search_row = query.head(1).squeeze(axis=0)
        rows.append(row.combine_first(search_row))
    
    dioc_joined_df = pd.DataFrame(rows)
    dioc_joined_df = dioc_joined_df[[*input_df.columns, *factgrid_diocese_df.columns]]
    
    return dioc_joined_df
dioc_joined_df = join_dioceses(missing_institution_on_factgrid_df)
print(len(dioc_joined_df))
dioc_joined_df.head()

8073


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,_merge,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id
1,WIAG-Pers-EPISCGatz-03848-001,124073,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,118625284,059-01621-001,Q653546,,,left_only,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001
3,WIAG-Pers-EPISCGatz-03848-001,124077,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Ottobeuren,60305.0,,,...,118625284,059-01621-001,Q653546,,,left_only,,,,
4,WIAG-Pers-EPISCGatz-03858-001,124065,Ernannter Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,13805746X,050-06892-001,Q653421,,,left_only,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001
7,WIAG-Pers-EPISCGatz-04027-001,110917,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Chur,WIAG-Inst-DIOCGatz-051-001,...,,710-00332-001,Q653192,,,left_only,Q256664,Bistum Chur,Diözese Chur,
9,WIAG-Pers-EPISCGatz-02518-001,100965,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,138018383,059-01616-001,Q649639,,,left_only,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001


In [75]:
missing_institution_diocese_on_factgrid_df = dioc_joined_df[dioc_joined_df['fg_diocese_id'].isna()]

In [76]:
# missing_institution_diocese_on_factgrid_df.drop(['_merge'], axis=1, inplace=True)

In [77]:
# missing_institution_diocese_on_factgrid_df[missing_institution_diocese_on_factgrid_df.person_id.str.contains('[89][0-9]{4}-[0-9]{3}$', regex=True)]

## Create a csv file with missing ids for institution or diocese

In [78]:
# null_entries = missing_institution_diocese_on_factgrid_df[missing_institution_diocese_on_factgrid_df['institution_id'].isnull()][missing_institution_diocese_on_factgrid_df['diocese_id'].isnull()]
null_entries = missing_institution_diocese_on_factgrid_df[missing_institution_diocese_on_factgrid_df['name'] != 'Kardinal'].sort_values('person_id')
null_entries = null_entries[null_entries['role_group'] != 'Kurienamt']
null_entries = null_entries[null_entries['role_group'] != 'Papst']
null_entries = null_entries.sort_values(['diocese', 'institution'])

# 4312 to 7979 should be removed
null_entries = null_entries[null_entries.person_id.str.contains('[0-7][0-9]{4}-[0-9]{3}$', regex=True)]
null_entries.to_csv(f'missing_amter-{today_string}_Bamberg.csv', sep=';')
null_entries

Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,_merge,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id
16164,WIAG-Pers-CANON-13163-001,44352,Archidiakon,Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Aire,WIAG-Inst-DIOCGatz-120-001,...,,,Q721905,,,left_only,,,,
11767,WIAG-Pers-CANON-50000-001,142306,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Cambrai,WIAG-Inst_DIOCGatz-148-001,...,1204464561,712-00081-001,,,,left_only,,,,
11039,WIAG-Pers-EPISCGatz-03056-001,142310,Designierter Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Cambrai,WIAG-Inst_DIOCGatz-148-001,...,118557742,712-00022-001,Q653884,,,left_only,,,,
7786,WIAG-Pers-EPISCGatz-03690-001,142328,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Cambrai,WIAG-Inst_DIOCGatz-148-001,...,100964850,036-00934-001,Q656370,,,left_only,,,,
9992,WIAG-Pers-CANON-51313-001,45314,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Coventry,WIAG-Inst-DIOCGatz-127-001,...,1147233063,,,,,left_only,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26218,WIAG-Pers-CANON-50664-001,65810,Archidiakon,Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,,,...,,,,,,left_only,,,,
26335,WIAG-Pers-CANON-50723-001,65549,Archidiakon,Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,,,...,1025032772,,,,,left_only,,,,
5061,WIAG-Pers-EPISCGatz-10386-001,80161,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,,,...,,077-00422-001,Q655184,,,left_only,,,,
23341,WIAG-Pers-EPISCGatz-10695-001,6871,Apostolischer Vikar,,,,,,,,...,,027-06083-001,Q656245,,,left_only,,,,


# (Optionally) create the entries here

In [79]:
# TODO: sent ouput to file instead of stdout

# for _, row in null_entries[null_entries['diocese'].notnull()][['diocese', 'diocese_id']].iterrows():
#     print("CREATE")
#     print('\t'.join(['LAST', 'Lde', f'"{row["diocese"]}"']))
#     print('\t'.join(['LAST', 'P2', 'Q153166']))

#     print('\t'.join(['LAST', 'P131', 'Q153178']))
#     if type(row['diocese_id']) == str:
#         print('\t'.join(['LAST', 'P601', f'"{row["diocese_id"]}"']))

In [80]:
# missing_institution_diocese_on_factgrid_df.to_csv('no_institution_or_dioc_on_fg_bamberg.csv', sep=';')

### To continue below your csv files should be empty. The check below ensures that.

In [81]:
assert len(null_entries) == 0, "Create the entries on factgrid before continuing."

AssertionError: Create the entries on factgrid before continuing.

# Add factgrid information

## Add institution factgrid id

In [82]:
institution_joined_df = role_all_df.merge(
    factgrid_institution_df, how='left', left_on='institution_id', right_on='fg_gsn_id', suffixes=('_wiag', '_institute_fg')
)
print(len(institution_joined_df))
institution_joined_df.head()

31454


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,date_begin,date_end,date_sort_key,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id
0,WIAG-Pers-EPISCGatz-03848-001,124071,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Augsburg,3498.0,,,um 909,,909210,118625284,059-01621-001,Q653546,Q898020,3498.0
1,WIAG-Pers-EPISCGatz-03848-001,124073,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,923,973,923150,118625284,059-01621-001,Q653546,,
2,WIAG-Pers-EPISCGatz-03848-001,124075,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Kempten,60182.0,,,vor 948?,952?,948100,118625284,059-01621-001,Q653546,Q266876,60182.0
3,WIAG-Pers-EPISCGatz-03848-001,124077,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Ottobeuren,60305.0,,,972,973,972150,118625284,059-01621-001,Q653546,,
4,WIAG-Pers-EPISCGatz-03858-001,124065,Ernannter Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,1077,1096,1077150,13805746X,050-06892-001,Q653421,,


In [83]:
assert len(role_all_df) == len(institution_joined_df), f"There may be duplicates on factgrid {len(role_all_df)} {len(institution_joined_df)}"

## Add diocese factgrid id

In [84]:
dioc_joined_df = join_dioceses(institution_joined_df)
print(len(dioc_joined_df))
dioc_joined_df.head()

31454


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,date_sort_key,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id
0,WIAG-Pers-EPISCGatz-03848-001,124071,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Augsburg,3498.0,,,...,909210,118625284,059-01621-001,Q653546,Q898020,3498.0,,,,
1,WIAG-Pers-EPISCGatz-03848-001,124073,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,923150,118625284,059-01621-001,Q653546,,,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001
2,WIAG-Pers-EPISCGatz-03848-001,124075,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Kempten,60182.0,,,...,948100,118625284,059-01621-001,Q653546,Q266876,60182.0,,,,
3,WIAG-Pers-EPISCGatz-03848-001,124077,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Ottobeuren,60305.0,,,...,972150,118625284,059-01621-001,Q653546,,,,,,
4,WIAG-Pers-EPISCGatz-03858-001,124065,Ernannter Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,1077150,13805746X,050-06892-001,Q653421,,,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001


## Add role factgrid id
Note: This role does not include the institution information. ie, it adds factgrid ids for roles like 'archbishop' and not 'archbishop of trier'

The part of the script below could be used to create quickstatements for career statements.

In [85]:
print(len(wiag_roles_df))
wiag_roles_df.head()
#wiag_roles_df[wiag_roles_df['name'] == '']

102


Unnamed: 0,id,note,name,comment,gs_reg_id,generic_term,plural,definition,role_gp_fg_id,gender,lang,role_group_id,r_id,item_id,r_note,authority_id,role_fg_id,r_comment
0,3,,Administrator des Bistums,,,,,Ein Bistumsadministrator (auch: Diözesanadmini...,Q648236,männlich,de,33.0,1298102,3,,42,Q902170,
1,4,,Administrator des Erzbistums,,,,,,Q648236,männlich,de,33.0,1298104,4,,42,Q902170,
2,5,,Administrator des Fürstbischofs,,,,,Ein Bistumsadministrator (auch: Diözesanadmini...,Q648236,männlich,de,33.0,1298982,5,,42,Q902170,
3,9,,Apostolischer Administrator,,,,,Ist ein Bischofsstuhl vakant oder bestehen ern...,Q648236,männlich,de,33.0,1298980,9,,42,Q38958,
4,13,,Bischofsadministrator,,,,,Ein Bistumsadministrator (auch: Diözesanadmini...,Q648236,männlich,de,33.0,1298106,13,,42,Q902170,


In [86]:
joined_df = dioc_joined_df.merge(
    wiag_roles_df[['name', 'role_fg_id']], how='left', left_on='name', right_on='name', suffixes=('_wiag', '_institute_fg')
)
print(len(joined_df))
joined_df.head()

31637


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id,role_fg_id
0,WIAG-Pers-EPISCGatz-03848-001,124071,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Augsburg,3498.0,,,...,118625284,059-01621-001,Q653546,Q898020,3498.0,,,,,Q38837
1,WIAG-Pers-EPISCGatz-03848-001,124073,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,118625284,059-01621-001,Q653546,,,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001,Q38809
2,WIAG-Pers-EPISCGatz-03848-001,124075,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Kempten,60182.0,,,...,118625284,059-01621-001,Q653546,Q266876,60182.0,,,,,Q38954
3,WIAG-Pers-EPISCGatz-03848-001,124077,Abt,Oberstes Leitungsamt Kloster,Monastery member with a leadership position,Q648233,Benediktinerkloster Ottobeuren,60305.0,,,...,118625284,059-01621-001,Q653546,,,,,,,Q38954
4,WIAG-Pers-EPISCGatz-03858-001,124065,Ernannter Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Augsburg,WIAG-Inst-DIOCGatz-001-001,...,13805746X,050-06892-001,Q653421,,,Q153179,Bistum Augsburg,Diözese Augsburg,WIAG-Inst-DIOCGatz-001-001,Q902184


### Ignore all Kanonikatsbewerber and Vikariatsbewerber offices


In [87]:
joined_df = joined_df[~joined_df['name'].isin(
    ['Vikariatsbewerber', 
     'Kanonikatsbewerber']
)]

## The output below should be empty. The cell after the next one will throw an error if it's not the case

In [99]:
missing_roles_df = joined_df[joined_df['role_fg_id'].isna()]
missing_roles_df.head()

Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GND,GSN,FactGrid,fg_institution_id,fg_gsn_id,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id,role_fg_id
33,WIAG-Pers-CANON-13216-001,18373,Propst und Archidiakon,Leitungsamt Diözese,Head of an (arch)diocese,Q648236,"Benediktinerkloster Ansbach, später Kollegiats...",1015.0,,,...,,,Q648997,Q400534,1015.0,,,,,
50,WIAG-Pers-CANON-49220-001,109435,Domkämmerer,Leitungsamt Domstift,Dignitary of a cathedral chapter,Q648232,Domstift Speyer,3489.0,,,...,1038267439.0,,Q700918,Q898039,3489.0,,,,,
61,WIAG-Pers-EPISCGatz-02684-001,124663,Generalvikar des Erzbischofs,Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Erzbistum Salzburg,WIAG-Inst-DIOCGatz-045-001,...,140745025.0,,Q651525,,,Q153249,Erzbistum Salzburg,Bistum Salzburg,WIAG-Inst-DIOCGatz-045-001,
62,WIAG-Pers-EPISCGatz-02684-001,124665,Offizial des Erzbischofs,Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Erzbistum Salzburg,WIAG-Inst-DIOCGatz-045-001,...,140745025.0,,Q651525,,,Q153249,Erzbistum Salzburg,Bistum Salzburg,WIAG-Inst-DIOCGatz-045-001,
66,WIAG-Pers-EPISCGatz-02684-001,124673,"Domthesaurar, Anwärter",Leitungsamt Domstift,Dignitary of a cathedral chapter,Q648232,Domstift Augsburg,3498.0,,,...,140745025.0,,Q651525,Q898020,3498.0,,,,,


In [97]:
wiag_roles_df[wiag_roles_df['name'].str.contains('Anwärter')]

Unnamed: 0,id,note,name,comment,gs_reg_id,generic_term,plural,definition,role_gp_fg_id,gender,lang,role_group_id,r_id,item_id,r_note,authority_id,role_fg_id,r_comment
27,98,,"Domkantor, Anwärter",,10220.0,Domkantor,"Domkantoren, Anwärter",,Q648232,männlich,de,41.0,1298116,98,,42,Q902174,
29,100,,"Domkanoniker, Anwärter",,11000.0,Domkanoniker,"Domkanoniker, Anwärter",,Q648226,männlich,de,42.0,1299244,100,,42,Q902172,
33,105,,"Domherr, Anwärter",,11000.0,Domherr,"Domherren, Anwärter",,Q648226,männlich,de,42.0,1298112,105,,42,Q902172,
52,141,,"Domvikar, Anwärter",,13300.0,Domvikar,"Domvikare, Anwärter",,Q648226,männlich,de,42.0,1298132,141,,42,Q902182,
58,149,,"Dompropst, Anwärter",,10150.0,Dompropst,"Dompröpste, Anwärter",,Q648232,männlich,de,41.0,1298124,149,,42,Q902178,
73,178,in FactGrid zwei Einträge: Kanonikatsbewerber ...,"Kanoniker, Anwärter",,11000.0,Kanoniker,"Kanoniker, Anwärter",,Q648228,männlich,de,51.0,1298142,178,,42,Q902189,
74,178,in FactGrid zwei Einträge: Kanonikatsbewerber ...,"Kanoniker, Anwärter",,11000.0,Kanoniker,"Kanoniker, Anwärter",,Q648228,männlich,de,51.0,1298144,178,,42,Q902188,
80,201,,"Propst, Anwärter",,10150.0,Propst,"Pröpste, Anwärterinnen",,Q648233,männlich,de,39.0,1298146,201,,42,Q902190,
88,220,mehrdeutiger Begriff; in FactGrid zwei Einträg...,"Vikar, Anwärter",,13300.0,Vikar,"Vikare, Anwärter",,Q648228,männlich,de,51.0,1298156,220,,42,Q902195,
89,220,mehrdeutiger Begriff; in FactGrid zwei Einträg...,"Vikar, Anwärter",,13300.0,Vikar,"Vikare, Anwärter",,Q648228,männlich,de,51.0,1298158,220,,42,Q902196,


In [89]:
missing_roles = joined_df[joined_df['role_fg_id'].isna()]['name'].unique()
print(len(missing_roles))
missing_roles

129


array(['Propst und Archidiakon', 'Domkämmerer',
       'Generalvikar des Erzbischofs', 'Offizial des Erzbischofs',
       'Domthesaurar, Anwärter', 'Thesaurar', 'Providierter Propst',
       'Domscholaster, Anwärter', 'Kaplan', 'Dechant (Prag)',
       'Fürsterzbischof', 'Stiftsherr', 'Domprior', 'Altarist',
       'Bischöflicher Offizial', 'Gewählter Koadjutor des Bischofs',
       'Domdechant', 'Ernannter Erzbischof',
       'Providierter und konfirmierter Bischof', 'Providierter Dompropst',
       'Vizedominus', 'Generaloffizial', 'Providierter Domvikar',
       'Dekan, Anwärter', 'Lektor', 'Domdekan, Anwärter',
       'Archidiakon (Schleswig)', 'Evangelischer Administrator',
       'Bischofskandidat', 'Offizial des Bischofs',
       'Koadjutor des Fürstpropstes', 'Fürstpropst',
       'Koadjutor des Fürstabtes', 'Koadjutor des Propstes',
       'Domchorbischof', 'Apostolischer Vikar', 'Gegenbischof', 'Viztum',
       'Domküster', 'Kanonikatsanwärter', 'Vikarieanwärter',
       'Dom

In [109]:
wiag_roles_df[wiag_roles_df['name'].isin(missing_roles)]

Unnamed: 0,id,note,name,comment,gs_reg_id,generic_term,plural,definition,role_gp_fg_id,gender,lang,role_group_id,r_id,item_id,r_note,authority_id,role_fg_id,r_comment


In [121]:
filename = f"create-missing-roles-{today_string}.qs"
with open(filename, 'w') as file:
    for role in missing_roles:
        file.write("CREATE\n")
        file.write("\t".join(["LAST", "Lde", role + "\n"]))
        file.write("\t".join(["LAST", "P2", "Q37073\n"]))
        file.write("\t".join(["LAST", "P131", "Q153178\n"]))
        search_res = missing_roles_df[missing_roles_df['name'] == role]['role_group_fq_id'].head(1).squeeze(axis=0)
        if not pd.isna(search_res):
            file.write("\t".join(["LAST", "P3", search_res + "\n"]))
print(filename)

create-missing-roles-2024-05-28.qs


### Run the file generated with the file name above

In [123]:
# [role for role in missing_roles if role.endswith('Anwärter')]

In [49]:
assert len(joined_df[joined_df['role_fg_id'].isna()]) == 0, "Missing roles with factgrid id in wiag database"

AssertionError: Missing roles with factgrid id in wiag database

## Check people with missing factgrid entries or missing factgrid ids in wiag

In [None]:
joined_df[joined_df['FactGrid'].isna()].head()

In [None]:
joined_df[joined_df['FactGrid'].isna()]['person_id'].unique()

## (Optionally) generate the quickstatements for creating the persons here

In [106]:
joined_df[joined_df['FactGrid'].isna()].to_csv(f'missing_factgrid-{today_string}_Bamberg.csv', sep=';')

## The cell below will throw an error if there are entries on Factgrid but not on Wiag

In [107]:
assert joined_df[joined_df['FactGrid'].isna()].empty, "There are missing persons on factgrid"

AssertionError: There are missing persons on factgrid

In [108]:
joined_df = joined_df[~joined_df['FactGrid'].isna()]
assert False

In [109]:
factgrid_inst_roles_df

Unnamed: 0,fg_inst_role_id,inst_role
0,Q171935,Doktor der medizinischen Fakultät von Nancy
1,Q171937,Doktor der medizinischen Fakultät von Douai
2,Q171938,Doktor der medizinischen Fakultät von Caen
3,Q171939,Doktor der medizinischen Fakultät von Bourges
4,Q171940,Doktor der medizinischen Fakultät von Bordeaux
...,...,...
4338,Q449139,Handelssekretariat Handelsvertretung Karatschi...
4339,Q449140,Handelssekretär Botschaft Conakry Guinea
4340,Q449141,Handelsrat Außenhandelsbüro Rio de Janeiro Bra...
4341,Q449142,Zweigstellenleiter Botschaft Aden Südjemen


## Add factgrid ids for roles
Note: this role has information of the institution as well

In [112]:
# add factgrid ids for roles
found = 0
data_dict = {}
not_found = []
dupl = {}
for i, (name, inst, dioc) in joined_df[['name', 'institution', 'diocese']].iterrows():
    if name == "Kardinal":
        # this is okay
        # manually add qid for kardinals Q254893
        data_dict[i] = "Q254893"
        continue
    search_res = pd.DataFrame()
    if pd.isna(inst):
    # if name in ["Bischof", "Fürstbischof", "Ernannter Bischof", "Erzbischof"]:
        if pd.isna(dioc):
            print(i, name, inst, dioc)
        if name not in ["Archidiakon", "Koadjutor"]:
            dioc = dioc.lstrip('Bistum').lstrip('Erzbistum').lstrip('Patriarchat').lstrip()
        if name == "Fürstbischof" and dioc in ["Passau", "Straßburg"]:
            name = "Bischof"    
        search_res = factgrid_inst_roles_df[factgrid_inst_roles_df['inst_role'].str.contains(f"^{name}.*{dioc}")]
        if name == "Erzbischof" and dioc == "Salzburg":
            # will be merged in later
            search_res = factgrid_inst_roles_df[factgrid_inst_roles_df['fg_inst_role_id'] == 'Q172567']
    else:
        name = name.replace('Domkanoniker', 'Domherr')
        search_res = factgrid_inst_roles_df[factgrid_inst_roles_df['inst_role'] == f"{name} {inst}"]
    if len(search_res) == 1:
        found += 1
        data_dict[i] = search_res['fg_inst_role_id'].values[0]
    elif len(search_res) >= 2:
        # print("+" * 10)
        print(name, inst, dioc)
        print(search_res)
        print()
        dupl[i] = (name, inst, dioc, search_res)
    elif len(search_res) == 0:
        not_found.append((name, inst, dioc))
        print(name, inst, dioc)
print("===Summary of roles===")
print("Found:", found, "Duplicates:", len(dupl), "Not found:", len(not_found))

Propst Kollegiatstift St. Nikolaus, Spalt nan
Domsenior Domstift Würzburg nan
Cellerar Domstift Eichstätt nan
Propst Benediktinerkloster, später Kollegiatstift St. Cyriakus, Wiesensteig nan
Domscholaster Domstift Eichstätt nan
Domcellerar Domstift Augsburg nan
Kapitularvikar nan Würzburg
Propst Kollegiatstift Unsere Liebe Frau, Eichstätt nan
===Summary of roles===
Found: 2743 Duplicates: 0 Not found: 8


In [113]:
final_joined_df = joined_df.merge(pd.Series(data_dict).rename('fg_inst_role_id'), left_index=True, right_index=True)
print(len(final_joined_df))
final_joined_df.head()

2744


Unnamed: 0,person_id,id,name,role_group,role_group_en,role_group_fq_id,institution,institution_id,diocese,diocese_id,...,GSN,FactGrid,fg_institution_id,fg_gsn_id,fg_diocese_id,dioc_label,dioc_alt,dioc_wiag_id,role_fg_id,fg_inst_role_id
0,WIAG-Pers-EPISCGatz-05072-001,5154,Erzbischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Erzbistum Trier,WIAG-Inst-DIOCGatz-030-001,...,062-01186-001,Q653844,,,Q153244,Erzbistum Trier,Bistum Trier,WIAG-Inst-DIOCGatz-030-001,Q172539,Q172572
1,WIAG-Pers-EPISCGatz-04050-001,3770,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Eichstätt,WIAG-Inst-DIOCGatz-006-001,...,059-00710-001,Q652691,,,Q153220,Bistum Eichstätt,Diözese Eichstätt,WIAG-Inst-DIOCGatz-006-001,Q38809,Q172630
2,WIAG-Pers-CANON-24965-001,22325,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Würzburg,3502.0,,,...,,Q728753,Q400557,3502.0,,,,,Q38837,Q390632
3,WIAG-Pers-CANON-24965-001,31418,Domherr,Amt Domstift,Cleric of a cathedral chapter,Q648226,Domstift Bamberg,3492.0,,,...,,Q728753,Q400530,3492.0,,,,,Q38837,Q400601
4,WIAG-Pers-EPISCGatz-03869-001,3586,Bischof,Oberstes Leitungsamt Diözese,Head of an (arch)diocese,Q648236,,,Bistum Bamberg,WIAG-Inst-DIOCGatz-002-001,...,006-00007-001,Q652702,,,Q153216,Bistum Bamberg,Diözese Bamberg,WIAG-Inst-DIOCGatz-002-001,Q38809,Q195266


## Parse begin and end date from the wiag data

In [115]:
# https://database.factgrid.de/query/embed.html#SELECT%20%3FPropertyLabel%20%3FProperty%20%3FPropertyDescription%20%3Freciprocal%20%3FreciprocalLabel%20%3Fexample%20%3Fuseful_statements%20%3Fwd%20WHERE%20%7B%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22.%20%7D%0A%20%20%3FProperty%20wdt%3AP8%20wd%3AQ77483.%0A%20%20OPTIONAL%20%7B%20%3FProperty%20wdt%3AP364%20%3Fexample.%20%7D%0A%20%20OPTIONAL%20%7B%20%3FProperty%20wdt%3AP86%20%3Freciprocal.%20%7D%0A%20%20OPTIONAL%20%7B%20%3FProperty%20wdt%3AP343%20%3Fwd.%20%7D%0A%20%20OPTIONAL%20%7B%20%3FProperty%20wdt%3AP310%20%3Fuseful_statements.%20%7D%0A%7D%0AORDER%20BY%20%3FPropertyLabel

def format_datetime(entry: datetime, resolution):
    julian_ending = "/J" if entry.year < 1582 else ""
    return f"+{entry.isoformat()}Z/{resolution}" + julian_ending

def date_parsing(date_string: str, end=False):
    return_property = "P50" if end else "P49"
    qualifier = None
    entry = None
    resolution = 7

    if pd.isna(date_string) or date_string == '?':
        return tuple()
    
    if matches := re.match(r'frühestens (\d{3,4})', date_string):
        return_property = "P1125" if end else "P1126"
        entry = datetime(int(matches.group(1)), 1, 1)
        resolution = 9
    
    elif matches := re.match(r'(kurz )?vor (\d{3,4})', date_string):
        return_property = "P1123" if end else "P1124"
        if matches.group(1):
            qualifier = ("P788 " if end else "P787 ") + f'"{date_string}"'
        entry = datetime(int(matches.group(2)), 1, 1)
        resolution = 9
        
    elif matches := re.match(r'(kurz )?nach (\d{3,4})', date_string):
        return_property = "P1125" if end else "P1126"
        if matches.group(1):
            qualifier = ("P788 " if end else "P787 ") + f'"{date_string}"'
        entry = datetime(int(matches.group(2)), 1, 1)
        resolution = 9
        
    elif matches := re.match(r'(\d{1,2})\. Jahrhundert', date_string):
        century = int(matches.group(1))
        entry = datetime(100 * (century), 1, 1)
    
    elif matches := re.match(r'(\d)\. Hälfte (des )?(\d{1,2})\. (Jhs\.|Jahrhunderts|Jahrhundert)', date_string):
        half = int(matches.group(1))
        year = int(matches.group(3)) - 1
        latest_year   = year * 100 + (half * 50)
        earliest_year = latest_year - 50 + 1
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        qualifier = ("P788 " if end else "P787 ") + f'"{date_string}"'
    
    elif matches := re.match(r'(\w+) Viertel des (\d{1,2})\. Jhs\.', date_string):
        number_map = {
            "erstes":  1,
            "zweites": 2,
            "drittes": 3,
            "viertes": 4,
        }
        quarter = number_map[matches.group(1)]
        year    = int(matches.group(2)) - 1
        latest_year   = year * 100 + (quarter * 25)
        earliest_year = latest_year - 25 + 1
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        qualifier = ("P788 " if end else "P787 ") + f'"{date_string}"'

    elif matches := re.match(r'frühes (\d{1,2})\. Jh\.', date_string):
        year = int(matches.group(1)) - 1
        latest_year   = year * 100 + 20
        earliest_year = latest_year - 20 + 1
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        qualifier = ("P788 " if end else "P787 ") + f'"{date_string}"'

    elif matches := re.match(r'spätes (\d{1,2})\. Jh\.', date_string):
        year = int(matches.group(1))
        latest_year   = year * 100
        earliest_year = latest_year - 20 + 1
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        qualifier = ("P788 " if end else "P787 ") + f'"{date_string}"'

    elif matches := re.match(r'(Anfang|Mitte|Ende) (\d{1,2})\. Jh\.', date_string):
        number_map = {
            "Anfang":  1,
            "Mitte": 2,
            "Ende": 3,
        }
        third = number_map[matches.group(1)]
        year = int(matches.group(2)) - 1
        latest_year   = year * 100 + (third * 33)
        earliest_year = latest_year - 33 + 1
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        qualifier = ("P788 " if end else "P787 ") + f'"{date_string}"'

    elif matches := re.match(r'(ca\.|um) (\d{3,4})', date_string):
        year = int(matches.group(2))
        latest_year   = year + 5
        earliest_year = latest_year - 10
        entry = datetime((earliest_year + latest_year) // 2, 1, 1)
        resolution = 9
        qualifier = ("P785 " if end else "P786 ") + "Q10"

    elif matches := re.match(r'(\d{3,4})er Jahre', date_string):
        year = int(matches.group(1))
        latest_year   = year + 10
        earliest_year = latest_year - 10 + 1
        entry = datetime(year, 1, 1)
        resolution = 8
    
    elif matches := re.match(r'Wende zum (\d{1,2})\. Jh\.', date_string):
        year = int(matches.group(1)) - 1
        latest_year   = year * 100 + 10
        earliest_year = latest_year - 20 + 1
        entry = datetime(latest_year, 1, 1)
        qualifier = ("P788 " if end else "P787 ") + f'"{date_string}"'

    elif matches := re.match(r'Anfang der (\d{3,4})er Jahre', date_string):
        year = int(matches.group(1))
        latest_year   = year + 3
        earliest_year = year
        entry = datetime(year, 1, 1)
        resolution = 8
        qualifier = ("P788 " if end else "P787 ") + f'"{date_string}"'

    elif matches := re.match(r'\((\d{3,4}) \?\) (\d{3,4})', date_string):
        year1 = int(matches.group(1))
        year2 = int(matches.group(2))
        entry = datetime(year2, 1, 1)
        resolution = 9
        qualifier = ("P788 " if end else "P787 ") + f'"{date_string}"'
    
    elif matches := re.match(r'(\d{3,4})/(\d{3,4})', date_string):
        year1 = int(matches.group(1))
        year2 = int(matches.group(2))

        if year2 - year1 == 1:
            # check for consecutive years
            qualifier = "P786 = Q912616" if end else "P785 = Q912616"
        entry = datetime(year1, 1, 1)
        resolution = 9

    elif matches := re.match(r'(\d{3,4})\?', date_string):
        year = int(matches.group(1))
        entry = datetime(year, 1, 1)
        resolution = 9
        qualifier = ("P786 " if end else "P785 ") + f'"{date_string}"'
    
    elif matches := re.match(r'(\d{3,4})', date_string):
        year = int(matches.group(1))
        entry = datetime(year, 1, 1)
        resolution = 9
    else:
        print(date_string)
        raise Exception("Couldn't parse date")
        
    if qualifier:
        return (return_property, format_datetime(entry, resolution), qualifier)
    else:
        return (return_property, format_datetime(entry, resolution))

tests = {
    "1205": "+1205-01-01T00:00:00Z/9/J",
    "12. Jahrhundert": "+1200-01-01T00:00:00Z/7/J",
    "1. Hälfte des 12. Jhs.": "+1125-01-01T00:00:00Z/7/J",
    "2. Hälfte des 12. Jhs.": "+1175-01-01T00:00:00Z/7/J",
    "erstes Viertel des 12. Jhs.": "+1113-01-01T00:00:00Z/7/J",
    "zweites Viertel des 12. Jhs.": "+1138-01-01T00:00:00Z/7/J",
    "drittes Viertel des 12. Jhs.": "+1163-01-01T00:00:00Z/7/J",
    "viertes Viertel des 12. Jhs.": "+1188-01-01T00:00:00Z/7/J",
    "frühes 12. Jh.": "+1110-01-01T00:00:00Z/7/J",
    "spätes 12. Jh.": "+1190-01-01T00:00:00Z/7/J",
    "Anfang 12. Jh.": "+1117-01-01T00:00:00Z/7/J",
    "Mitte 12. Jh.": "+1150-01-01T00:00:00Z/7/J",
    "Ende 12. Jh.": "+1183-01-01T00:00:00Z/7/J",
    "ca. 1050": "+1050-01-01T00:00:00Z/9/J",
    "um 1050": "+1050-01-01T00:00:00Z/9/J",
    "1230er Jahre": "+1230-01-01T00:00:00Z/8/J",
    "Wende zum 12. Jh.": "+1110-01-01T00:00:00Z/7/J",
    "Anfang der 1480er Jahre": "+1480-01-01T00:00:00Z/8/J",
    "frühestens 1342": "+1342-01-01T00:00:00Z/9/J",
    "vor 1230": "+1230-01-01T00:00:00Z/9/J",
    "nach 1230": "+1230-01-01T00:00:00Z/9/J",
    "kurz vor 1200": "+1200-01-01T00:00:00Z/9/J",
    "kurz nach 1200": "+1200-01-01T00:00:00Z/9/J",
    "1164/1165": "+1164-01-01T00:00:00Z/9/J",
    "1164/1177": "+1164-01-01T00:00:00Z/9/J",
}

for key, value in tests.items():
    retval = date_parsing(key)[1]
    assert retval == value, f"{key}: Returned {retval} instead of {value}"


## Reconcile office data with factgrid

In [None]:
#https://database.factgrid.de/wiki/Special:EntityData/Q515.json

## Generate quickstatements for offices

In [122]:
with open(f'quickstatments_{today_string}.qs', 'w') as file:
    for _, row in final_joined_df.iterrows():
        try:
            date_begin_parsed = date_parsing(row['date_begin'])
            date_end_parsed   = date_parsing(row['date_end'], end=True)
            file.write('\t'.join([
                row['FactGrid'], 
                'P165', 
                row['fg_inst_role_id'],
                'S601', 
                '"' + row['person_id'] + '"',
                *date_begin_parsed, 
                *date_end_parsed, 
            ]) + '\n')
        except Exception as e:
            print(traceback.format_exc())
            print(row)

In [45]:
# all_roles = set()

# def add_possible_list(a_set: set, element):
#     if type(element) == list:
#         # add all items in list
#         for item in element:
#             a_set.add(item)
#     else:
#         # simply add the given element
#         a_set.add(element)

# for summary in missing_on_factgrid_df['summary_offices'].tolist():
#     print(summary)
#     offices = re.split(r'(\d+),', summary)
#     offices = [office.lstrip() for office in offices]
#     new_offices = offices
#     for index, office in enumerate(offices):
#         if re.match('\d+', office):
#             new_offices[index - 1] += office
#             new_offices.pop(index)
#     print(new_offices)
#     for office in new_offices:
#         for office_name in re.match(r'\w+(, \w+)*', office).group().split(','):
#             office_name = office_name.strip()
#             print(office_name)
#             all_roles.add(office_name)
#     print()

# print('#'*10)
# print(all_roles)

# url = 'https://database.factgrid.de/sparql'
# query = (
#     f"""SELECT DISTINCT ?item ?label
# WHERE
# {{
#   ?item wdt:P2 wd:Q37073;
#         rdfs:label ?label.
#   FILTER(LANG(?label) = "de").
#   FILTER REGEX (?label, "({'|'.join(list(all_roles))})$").
# }}
# ORDER BY ?label  
# """
# )
# print(query)
# # SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

# # make request: 
# r = requests.get(url, params={'query': query}, headers={"Accept": "application/json"})
# data = r.json()
# factgrid_roles_df = pd.json_normalize(data['results']['bindings'])

# len(factgrid_roles_df)

# list(all_roles)