In [2]:
import requests
import csv
import os
import pandas as pd
import json
import re
from datetime import datetime
# ISO-style date stamp (YYYY-MM-DD) for today, used to tag output file names.
today_string = format(datetime.now(), '%Y-%m-%d')

# Get data from wiag

```sql
SELECT i.id_public, u.value
FROM `item_corpus` i 
INNER JOIN (
    select item_id, value FROM url_external WHERE authority_id=200
) u on i.item_id = u.item_id
INNER JOIN item ON item.id = i.item_id
where item.merge_status != 'parent'
and item.merge_status != 'orphan'
and item.edit_status = 'online';
```

Mapping WIAG-ID → GSN for all records that are online.
(The query returns the valid public WIAG-ID thanks to the `CASE WHEN` expression, which picks the ID by corpus in the order `epc`, `can`, `dreg-can`.)

```sql
select distinct i.id,
CASE WHEN (t_id.epc IS NOT NULL) THEN t_id.epc
WHEN (t_id.can IS NOT NULL) THEN t_id.can
WHEN (t_id.dreg_can IS NOT NULL) THEN t_id.dreg_can
END as id_public,
uext.value as gsn
from item as i
join url_external as uext on uext.item_id = i.id and uext.authority_id = 200
join (select distinct ic.item_id as item_id,
ic_ii.id_public as epc,
ic_iii.id_public as can,
ic_iv.id_public as dreg_can
from item_corpus as ic
left join item_corpus as ic_ii on ic_ii.item_id = ic.item_id and ic_ii.corpus_id = 'epc'
left join item_corpus as ic_iii on ic_iii.item_id = ic.item_id and ic_iii.corpus_id = 'can'
left join item_corpus as ic_iv on ic_iv.item_id = ic.item_id and ic_iv.corpus_id = 'dreg-can'
where ic.corpus_id in ('epc', 'can', 'dreg-can'))
as t_id on t_id.item_id = i.id
where i.is_online = 1;
```

In [3]:
# Load the WIAG export; column names are supplied explicitly instead of
# being read from the file.
ic_df = pd.read_csv(
    'item_2024-05-21.csv',
    header=None,
    names=["id", "wiag_id", "pd_id"],
)
ic_df

Unnamed: 0,id,wiag_id,pd_id
0,13008,WIAG-Pers-EPISCGatz-21120-001,006-00056-001
1,13009,WIAG-Pers-EPISCGatz-21119-001,070-01102-001
2,13038,WIAG-Pers-EPISCGatz-21018-001,084-00204-001
3,13051,WIAG-Pers-EPISCGatz-21004-001,006-00074-001
4,13052,WIAG-Pers-EPISCGatz-21002-001,006-00070-001
...,...,...,...
13384,16003,WIAG-Pers-EPISCGatz-05523-001,069-01866-001
13385,16223,WIAG-Pers-EPISCGatz-05395-001,035-00067-001
13386,56399,WIAG-Pers-EPISCGatz-05526-001,061-05513-001
13387,56169,WIAG-Pers-EPISCGatz-02918-001,084-00243-001


In [6]:
# Sanity check: WIAG rows that have no GSN (pd_id) at all.
ic_df.loc[ic_df['pd_id'].isna()]

Unnamed: 0,id,wiag_id,pd_id


In [11]:
# WIAG-IDs that end in a five-digit number starting with 8 or 9,
# followed by a three-digit suffix.
ic_df.loc[ic_df['wiag_id'].str.contains('[89][0-9]{4}-[0-9]{3}$', regex=True)]

Unnamed: 0,id,wiag_id,pd_id
1427,19551,WIAG-Pers-CANON-80031-001,003-00683-001
1428,19552,WIAG-Pers-CANON-80030-001,003-00664-001
1429,19553,WIAG-Pers-CANON-80029-001,003-00635-001
1430,19554,WIAG-Pers-CANON-80028-001,003-00634-001
1431,19555,WIAG-Pers-CANON-80027-001,003-00630-001
...,...,...,...
12737,57357,WIAG-Pers-CANON-91670-001,085-00129-001
12738,57365,WIAG-Pers-CANON-91671-001,085-00255-001
12739,57367,WIAG-Pers-CANON-91672-001,085-00240-001
12740,57369,WIAG-Pers-CANON-91673-001,085-00225-001


In [22]:
# Look up a single WIAG-ID in the WIAG export.
ic_df.loc[ic_df['wiag_id'].eq('WIAG-Pers-CANON-80018-001')]

Unnamed: 0,id,wiag_id,pd_id
1439,19563,WIAG-Pers-CANON-80018-001,003-00368-001


# Get data from personendatenbank

```sql
SELECT persons.wiag, MIN(gsn.nummer)
FROM items
INNER JOIN persons ON persons.item_id = items.id AND persons.deleted=0 AND items.deleted=0 AND items.status = "online"
INNER JOIN gsn ON gsn.item_id = items.id AND gsn.deleted=0
WHERE persons.wiag IS NOT NULL AND persons.wiag != ''
GROUP BY persons.wiag
```


In [12]:
# TODO: the "Ebene" (level) problem still needs to be fixed.
# Load the Personendatenbank export; column names are supplied explicitly
# instead of being read from the file.
pr_df = pd.read_csv(
    'pr_2024-05-15_1.csv',
    header=None,
    names=["wiag_id", "pd_id"],
)
pr_df

Unnamed: 0,wiag_id,pd_id
0,WIAG-Pers-CANON-10014-001,054-02832-001
1,WIAG-Pers-CANON-10032-001,032-02373-001
2,WIAG-Pers-CANON-10047-001,029-03106-001
3,WIAG-Pers-CANON-10054-001,029-03206-001
4,WIAG-Pers-CANON-10071-001,029-01071-001
...,...,...
11003,WIAG-Pers-EPISCGatz-21735-001,061-06324-001
11004,WIAG-Pers-EPISCGatz-21750-001,052-02662-001
11005,WIAG-Pers-EPISCGatz-21751-001,077-00515-001
11006,WIAG-Pers-EPISCGatz-21766-001,073-01566-001


In [13]:
# Sanity check: Personendatenbank rows without a GSN (pd_id).
pr_df.loc[pr_df['pd_id'].isna()]

Unnamed: 0,wiag_id,pd_id


In [14]:
# Count the non-null pd_id values per WIAG-ID and show every WIAG-ID
# that has more than one.
pr_df_gp = pr_df.groupby('wiag_id').count()
pr_df_gp.loc[pr_df_gp['pd_id'].gt(1)]

Unnamed: 0_level_0,pd_id
wiag_id,Unnamed: 1_level_1


In [15]:
# Inner-join both exports on the GSN (pd_id). The WIAG-ID columns of the
# two sources are told apart by the '_wiag' / '_pd' suffixes.
joined_df = pd.merge(
    ic_df,
    pr_df,
    how='inner',
    on='pd_id',
    suffixes=('_wiag', '_pd'),
)
joined_df

Unnamed: 0,id,wiag_id_wiag,pd_id,wiag_id_pd
0,13008,WIAG-Pers-EPISCGatz-21120-001,006-00056-001,WIAG-Pers-EPISCGatz-21120-001
1,13038,WIAG-Pers-EPISCGatz-21018-001,084-00204-001,WIAG-Pers-EPISCGatz-21018-001
2,13051,WIAG-Pers-EPISCGatz-21004-001,006-00074-001,WIAG-Pers-EPISCGatz-21004-001
3,13052,WIAG-Pers-EPISCGatz-21002-001,006-00070-001,WIAG-Pers-EPISCGatz-21002-001
4,13053,WIAG-Pers-EPISCGatz-21001-001,006-00069-001,WIAG-Pers-EPISCGatz-21001-001
...,...,...,...,...
10342,13759,WIAG-Pers-EPISCGatz-02708-001,046-02506-001,WIAG-Pers-EPISCGatz-02708-001
10343,13853,WIAG-Pers-EPISCGatz-02799-001,060-02179-001,WIAG-Pers-EPISCGatz-02799-001
10344,16223,WIAG-Pers-EPISCGatz-05395-001,035-00067-001,WIAG-Pers-EPISCGatz-05395-001
10345,56169,WIAG-Pers-EPISCGatz-02918-001,084-00243-001,WIAG-Pers-EPISCGatz-02918-001


In [16]:
# Keep only the joined rows in which the two sources disagree on the WIAG-ID.
unequal_df = joined_df.loc[joined_df['wiag_id_wiag'].ne(joined_df['wiag_id_pd'])]
unequal_df

Unnamed: 0,id,wiag_id_wiag,pd_id,wiag_id_pd
797,17819,WIAG-Pers-EPISCGatz-21305-001,046-02872-001,WIAG-Pers-EPISCGatz-03210-001
832,19563,WIAG-Pers-CANON-80018-001,003-00368-001,WIAG-Pers-EPISCGatz-02835-001
837,19568,WIAG-Pers-CANON-90233-001,003-00269-001,WIAG-Pers-EPISCGatz-03473-001
841,19572,WIAG-Pers-CANON-80009-001,003-00183-001,WIAG-Pers-EPISCGatz-21702-001
873,19604,WIAG-Pers-CANON-80038-001,003-00935-001,WIAG-Pers-EPISCGatz-04133-001
...,...,...,...,...
10315,52530,WIAG-Pers-CANON-49000-001,006-00114-001,WIAG-Pers-CANON-84854-001
10316,60834,WIAG-Pers-CANON-48993-001,021-00051-001,WIAG-Pers-CANON-81197-001
10317,60108,WIAG-Pers-CANON-49017-001,052-00332-001,WIAG-Pers-CANON-83821-001
10319,60674,WIAG-Pers-CANON-49003-001,054-00254-001,WIAG-Pers-CANON-83918-001


In [17]:
# Among the mismatching rows that have no WIAG-ID on the WIAG side, show
# those whose Personendatenbank WIAG-ID ends in a five-digit number
# starting with 8 or 9 (plus the three-digit suffix).
empty_wiag = unequal_df.loc[unequal_df['wiag_id_wiag'].isna()]
diff_df = empty_wiag.loc[
    empty_wiag['wiag_id_pd'].str.contains('[89][0-9]{4}-[0-9]{3}$', regex=True)
]
diff_df

Unnamed: 0,id,wiag_id_wiag,pd_id,wiag_id_pd


In [18]:
# Mismatching rows that have no WIAG-ID on the WIAG side.
unequal_df.loc[unequal_df['wiag_id_wiag'].isna()]

Unnamed: 0,id,wiag_id_wiag,pd_id,wiag_id_pd


In [19]:
# Drop the rows that have no WIAG-ID on the WIAG side.
unequal_df = unequal_df[unequal_df['wiag_id_wiag'].notna()]
unequal_df

Unnamed: 0,id,wiag_id_wiag,pd_id,wiag_id_pd
797,17819,WIAG-Pers-EPISCGatz-21305-001,046-02872-001,WIAG-Pers-EPISCGatz-03210-001
832,19563,WIAG-Pers-CANON-80018-001,003-00368-001,WIAG-Pers-EPISCGatz-02835-001
837,19568,WIAG-Pers-CANON-90233-001,003-00269-001,WIAG-Pers-EPISCGatz-03473-001
841,19572,WIAG-Pers-CANON-80009-001,003-00183-001,WIAG-Pers-EPISCGatz-21702-001
873,19604,WIAG-Pers-CANON-80038-001,003-00935-001,WIAG-Pers-EPISCGatz-04133-001
...,...,...,...,...
10315,52530,WIAG-Pers-CANON-49000-001,006-00114-001,WIAG-Pers-CANON-84854-001
10316,60834,WIAG-Pers-CANON-48993-001,021-00051-001,WIAG-Pers-CANON-81197-001
10317,60108,WIAG-Pers-CANON-49017-001,052-00332-001,WIAG-Pers-CANON-83821-001
10319,60674,WIAG-Pers-CANON-49003-001,054-00254-001,WIAG-Pers-CANON-83918-001


In [20]:
# Add clickable links for manual checking of every mismatching row:
#   pd_link           -> the person in the Personendatenbank (by GSN)
#   current_wiag_link -> the WIAG-ID currently stored in WIAG
#   pd_wiag_link      -> the WIAG-ID recorded in the Personendatenbank
#
# `assign` returns a new frame, so nothing is written into a filtered slice
# (which would trigger a SettingWithCopyWarning). The vectorised string
# concatenation replaces the row-wise `apply`, which is slow and returns a
# DataFrame instead of a Series when the frame is empty.
PD_GSN_BASE_URL = "http://personendatenbank.germania-sacra.de/index/gsn/"
WIAG_ID_BASE_URL = "https://wiag-vocab.adw-goe.de/id/"

unequal_df = unequal_df.assign(
    pd_link=PD_GSN_BASE_URL + unequal_df["pd_id"],
    current_wiag_link=WIAG_ID_BASE_URL + unequal_df["wiag_id_wiag"],
    pd_wiag_link=WIAG_ID_BASE_URL + unequal_df["wiag_id_pd"],
)
unequal_df

Unnamed: 0,id,wiag_id_wiag,pd_id,wiag_id_pd,pd_link,current_wiag_link,pd_wiag_link
797,17819,WIAG-Pers-EPISCGatz-21305-001,046-02872-001,WIAG-Pers-EPISCGatz-03210-001,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-EPI...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-EPI...
832,19563,WIAG-Pers-CANON-80018-001,003-00368-001,WIAG-Pers-EPISCGatz-02835-001,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-EPI...
837,19568,WIAG-Pers-CANON-90233-001,003-00269-001,WIAG-Pers-EPISCGatz-03473-001,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-EPI...
841,19572,WIAG-Pers-CANON-80009-001,003-00183-001,WIAG-Pers-EPISCGatz-21702-001,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-EPI...
873,19604,WIAG-Pers-CANON-80038-001,003-00935-001,WIAG-Pers-EPISCGatz-04133-001,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-EPI...
...,...,...,...,...,...,...,...
10315,52530,WIAG-Pers-CANON-49000-001,006-00114-001,WIAG-Pers-CANON-84854-001,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
10316,60834,WIAG-Pers-CANON-48993-001,021-00051-001,WIAG-Pers-CANON-81197-001,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
10317,60108,WIAG-Pers-CANON-49017-001,052-00332-001,WIAG-Pers-CANON-83821-001,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
10319,60674,WIAG-Pers-CANON-49003-001,054-00254-001,WIAG-Pers-CANON-83918-001,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...


In [21]:
# Export only the three link columns (plus the index) as a
# semicolon-separated file, stamped with today's date.
unequal_df.to_csv(
    f'unequal_check_{today_string}.csv',
    sep=';',
    columns=['pd_link', 'current_wiag_link', 'pd_wiag_link'],
)

In [70]:
# TODO: https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CANON-12751-001 does not seem to exist on WIAG?
# TODO: check whether this is related to the "fertig" (finished) status.

In [None]:
# SQL used to inspect a single public WIAG-ID directly in the WIAG database.
# Raw SQL is not valid Python, so the query is kept as a string: the cell then
# runs cleanly under "Restart & Run All" and the text can be copied into a
# SQL client.
SINGLE_ID_CHECK_SQL = """
SELECT *
FROM `item_corpus` i INNER JOIN (
    select item_id, value FROM url_external WHERE authority_id=200
) u on i.item_id = u.item_id
INNER JOIN item ON item.id = i.item_id
where item.merge_status != 'parent'
and item.merge_status != 'orphan'
and item.edit_status = 'online'
and i.id_public = 'WIAG-Pers-CANON-80038-001'
"""

In [15]:
# SELECT *  FROM url_external WHERE authority_id=200

In [16]:

# select id_in_source from item 
# where id_in_source is not NULL 
# and id_in_source!=''
# and id in 
# (SELECT item_id FROM url_external WHERE authority_id=200)

In [17]:
# select * from person where id in
# (select id_in_source from item 
# where id_in_source is not NULL 
# and id_in_source!=''
# and id in 
# (SELECT item_id FROM url_external WHERE authority_id=200))

In [18]:
# SELECT * FROM `item_corpus` 
# where item_id in  (SELECT item_id FROM url_external WHERE authority_id=200)

In [19]:
# SELECT i.id_public, i.item_id, u.value 
# FROM `item_corpus` i 
# INNER JOIN (
#     select item_id, value FROM url_external WHERE authority_id=200
# ) u on i.item_id = u.item_id

In [None]:
# example complex query
# http://personendatenbank.germania-sacra.de/api/v1.0/person?
# query[0][field]=person.vorname&
# query[0][value]=b*&
# query[0][operator]=like&
# query[0][connector]=or&
# query[1][field]=person.familienname&
# query[1][value]=b*&
# query[1][operator]=like&
# format=json-ld
