In [135]:
import requests
import csv
import os
import pandas as pd
import json
import re
from datetime import datetime
# ISO date (YYYY-MM-DD) of this run; used as suffix of the generated file names.
today_string = f"{datetime.now():%Y-%m-%d}"

# Get data from wiag

```sql
-- Export of online WIAG items together with their public WIAG ID and the
-- value stored in url_external for authority 200 (aliased as gsn).
-- The notebook reads the result as the columns id, wiag_id, pd_id.
-- NOTE(review): presumably authority_id = 200 is the Germania Sacra
-- personendatenbank (GSN) — confirm against the authority table.
SELECT DISTINCT 
  i.id,
  -- Public ID priority: 'epc' first, then 'can', then 'dreg-can'.
  CASE 
    WHEN t_id.epc IS NOT NULL THEN t_id.epc
    WHEN t_id.epc IS NULL AND t_id.can IS NOT NULL THEN t_id.can
    WHEN t_id.epc IS NULL AND t_id.can IS NULL AND t_id.dreg_can IS NOT NULL THEN t_id.dreg_can
  END AS id_public,
  uext.value AS gsn
FROM 
  item AS i
JOIN 
  url_external AS uext ON uext.item_id = i.id AND uext.authority_id = 200
-- Inner join: items without any item_name_role row are dropped.
JOIN 
  item_name_role AS inr ON inr.item_id_name = i.id
JOIN 
  -- Per item: its public IDs in the three corpora side by side
  -- (NULL where the item has no entry in that corpus).
  (SELECT DISTINCT 
      ic.item_id AS item_id,
      ic_ii.id_public AS epc,
      ic_iii.id_public AS can,
      ic_iv.id_public AS dreg_can
   FROM 
      item_corpus AS ic
   LEFT JOIN 
      item_corpus AS ic_ii ON ic_ii.item_id = ic.item_id AND ic_ii.corpus_id = 'epc'
   LEFT JOIN 
      item_corpus AS ic_iii ON ic_iii.item_id = ic.item_id AND ic_iii.corpus_id = 'can'
   LEFT JOIN 
      item_corpus AS ic_iv ON ic_iv.item_id = ic.item_id AND ic_iv.corpus_id = 'dreg-can'
   WHERE 
      ic.corpus_id IN ('epc', 'can', 'dreg-can')
  ) AS t_id ON t_id.item_id = i.id
WHERE 
  i.is_online = 1;
```

In [136]:
# Load the WIAG export: internal item id, public WIAG ID and GSN (pd_id).
# Column names are supplied here via `names`.
orig_ic_df = pd.read_csv(
    'item_2024-05-22.csv',
    names=["id", "wiag_id", "pd_id"],
)
# `orig_ic_df` keeps the unfiltered export; `ic_df` is the working frame
# that later cells re-bind to filtered versions.
ic_df = orig_ic_df
ic_df

Unnamed: 0,id,wiag_id,pd_id
0,13008,WIAG-Pers-EPISCGatz-21120-001,006-00056-001
1,13009,WIAG-Pers-EPISCGatz-21119-001,070-01102-001
2,13038,WIAG-Pers-EPISCGatz-21018-001,084-00204-001
3,13051,WIAG-Pers-EPISCGatz-21004-001,006-00074-001
4,13052,WIAG-Pers-EPISCGatz-21002-001,006-00070-001
...,...,...,...
11009,16003,WIAG-Pers-EPISCGatz-05523-001,069-01866-001
11010,16223,WIAG-Pers-EPISCGatz-05395-001,035-00067-001
11011,56399,WIAG-Pers-EPISCGatz-05526-001,061-05513-001
11012,56169,WIAG-Pers-EPISCGatz-02918-001,084-00243-001


## Check data from wiag and remove identical records

### The output of the following cell should be empty

In [137]:
# WIAG rows without a GSN (pd_id); expected to be empty.
ic_df.loc[ic_df['pd_id'].isnull()]

Unnamed: 0,id,wiag_id,pd_id


# Identical records in wiag

The output lists the GSNs (`pd_id`) that are assigned to more than one WIAG record.

In [138]:
# Count WIAG rows per GSN and collect every GSN that occurs more than once.
gp_df = ic_df.groupby('pd_id').count()
duplicate_wiag_gsns = gp_df.index[gp_df['id'].gt(1)].to_list()
duplicate_wiag_gsns

['046-02872-001']

In [139]:
# Keep only WIAG rows whose GSN is unique; the duplicated ones are handled
# separately further down.
has_duplicated_gsn = ic_df['pd_id'].isin(duplicate_wiag_gsns)
ic_df = ic_df.loc[~has_duplicated_gsn]

In [140]:
# ic_df['pr_imported'] = ic_df.wiag_id.str.contains('[89][0-9]{4}-[0-9]{3}$', regex=True)
# ic_df = ic_df.sort_values(by=['pr_imported'])
# lower_df = ic_df.groupby('pd_id')[['wiag_id', 'pd_id', 'pr_imported']].head()
# lower_df

In [141]:
# ic_df.groupby('pd_id', as_index=False)['pr_imported'].min()

In [142]:
# selection = ic_df.groupby('pd_id')['pr_imported'].min()
# selection = selection.index.tolist()
# lower_df = ic_df[ic_df.pd_id.isin(selection)]
# lower_df

In [143]:
# ic_df[ic_df.wiag_id.str.contains('[89][0-9]{4}-[0-9]{3}$', regex=True)]

# Get data from personendatenbank

```sql
-- Export from personendatenbank: for every online, non-deleted person that has
-- a WIAG ID, the WIAG ID, the person id, and the id and number of a GSN row.
-- The notebook reads the result as the columns wiag_id, id, gsn_table_id, pd_id.
-- NOTE(review): the SELECT list and the HAVING clause use gsn.id without
-- aggregation while grouping by persons.wiag. This only runs when MySQL's
-- ONLY_FULL_GROUP_BY mode is disabled, and the row picked per group is then
-- not defined; groups whose picked row is not the one with MIN(gsn.id) would
-- be filtered out. Confirm that no person is lost (see the checking query).
SELECT persons.wiag, persons.id, gsn.id, gsn.nummer 
FROM items 
INNER JOIN persons ON persons.item_id = items.id AND persons.deleted=0 AND items.deleted=0 AND items.status = "online" 
INNER JOIN gsn ON gsn.item_id = items.id AND gsn.deleted=0 
WHERE persons.wiag IS NOT NULL AND persons.wiag != '' 
group by persons.wiag 
having gsn.id=min(gsn.id)
```

<!--
old version
```sql
SELECT persons.wiag, MIN(gsn.nummer)
FROM items
INNER JOIN persons ON persons.item_id = items.id AND persons.deleted=0 AND items.deleted=0 AND items.status = "online"
INNER JOIN gsn ON gsn.item_id = items.id AND gsn.deleted=0
WHERE persons.wiag IS NOT NULL AND persons.wiag != ''
GROUP BY persons.wiag
``` -->


### (Optional) Checking Query 

```sql
-- Lists all GSN rows for every WIAG ID that yields more than one
-- (person, gsn) combination in the export join, so these cases can be
-- inspected by hand.
select DISTINCT persons.wiag, gsn.id, gsn.nummer
from persons
inner join gsn ON gsn.item_id = persons.item_id
where persons.wiag in (
    -- WIAG IDs with more than one joined row (same filters as the export query)
    SELECT persons.wiag
    FROM items
    INNER JOIN persons ON persons.item_id = items.id AND persons.deleted=0 AND items.deleted=0 AND items.status = "online"
    INNER JOIN gsn ON gsn.item_id = items.id AND gsn.deleted=0
    WHERE persons.wiag IS NOT NULL AND persons.wiag != ''
    GROUP BY persons.wiag
    having count(*) > 1
)
order by persons.wiag
```

In [204]:
# Load the personendatenbank export: WIAG ID stored there, person id,
# id of the gsn table row and the GSN itself (pd_id).
pr_df = pd.read_csv(
    'pr_2024-05-22_1.csv',
    names=["wiag_id", "id", "gsn_table_id", "pd_id"],
)
pr_df

Unnamed: 0,wiag_id,id,gsn_table_id,pd_id
0,WIAG-Pers-CANON-10014-001,226042,192200,054-02832-001
1,WIAG-Pers-CANON-10032-001,129079,162893,032-02373-001
2,WIAG-Pers-CANON-10047-001,116590,212367,029-03106-001
3,WIAG-Pers-CANON-10054-001,116690,212467,029-03206-001
4,WIAG-Pers-CANON-10071-001,114555,95890,029-01071-001
...,...,...,...,...
10969,WIAG-Pers-EPISCGatz-21735-001,252575,261488,061-06324-001
10970,WIAG-Pers-EPISCGatz-21750-001,219379,189218,052-02662-001
10971,WIAG-Pers-EPISCGatz-21751-001,303871,328143,077-00515-001
10972,WIAG-Pers-EPISCGatz-21766-001,297929,314418,073-01566-001


## Check if there are any problems with the data

In [205]:
# personendatenbank rows without a GSN; expected to be empty.
pr_df.loc[pr_df['pd_id'].isnull()]

Unnamed: 0,wiag_id,id,gsn_table_id,pd_id


In [206]:
# WIAG IDs that appear on more than one personendatenbank row; expected to be empty.
pr_df_gp = pr_df.groupby('wiag_id').count()
pr_df_gp.loc[pr_df_gp['pd_id'].gt(1)]

Unnamed: 0_level_0,id,gsn_table_id,pd_id
wiag_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


# Compare records

In [207]:
# Pair WIAG and personendatenbank rows that share the same GSN (inner join).
# Columns present on both sides get the suffix _wiag / _pd.
joined_df = pd.merge(
    ic_df,
    pr_df,
    on='pd_id',
    suffixes=('_wiag', '_pd'),
)
joined_df

Unnamed: 0,id_wiag,wiag_id_wiag,pd_id,wiag_id_pd,id_pd,gsn_table_id
0,13008,WIAG-Pers-EPISCGatz-21120-001,006-00056-001,WIAG-Pers-EPISCGatz-21120-001,334162,390898
1,13009,WIAG-Pers-EPISCGatz-21119-001,070-01102-001,WIAG-Pers-EPISCGatz-21119-001,321113,363804
2,13038,WIAG-Pers-EPISCGatz-21018-001,084-00204-001,WIAG-Pers-EPISCGatz-21018-001,331642,382803
3,13051,WIAG-Pers-EPISCGatz-21004-001,006-00074-001,WIAG-Pers-EPISCGatz-21004-001,300667,323238
4,13052,WIAG-Pers-EPISCGatz-21002-001,006-00070-001,WIAG-Pers-EPISCGatz-21002-001,307523,338553
...,...,...,...,...,...,...
10967,16003,WIAG-Pers-EPISCGatz-05523-001,069-01866-001,WIAG-Pers-EPISCGatz-05523-001,330540,380833
10968,16223,WIAG-Pers-EPISCGatz-05395-001,035-00067-001,WIAG-Pers-EPISCGatz-05395-001,137029,103870
10969,56399,WIAG-Pers-EPISCGatz-05526-001,061-05513-001,WIAG-Pers-EPISCGatz-05526-001,335234,393051
10970,56169,WIAG-Pers-EPISCGatz-02918-001,084-00243-001,WIAG-Pers-EPISCGatz-02918-001,331575,382842


In [208]:
# Rows where the WIAG ID known to WIAG differs from the one stored in
# personendatenbank for the same GSN.
wiag_ids_differ = joined_df['wiag_id_wiag'].ne(joined_df['wiag_id_pd'])
unequal_df = joined_df[wiag_ids_differ]
unequal_df

Unnamed: 0,id_wiag,wiag_id_wiag,pd_id,wiag_id_pd,id_pd,gsn_table_id
10664,40223,WIAG-Pers-CANON-25769-001,052-01335-001,WIAG-Pers-CANON-83849-001,218052,129050
10665,39332,WIAG-Pers-CANON-24827-001,054-00315-001,WIAG-Pers-CANON-83926-001,223525,131017
10666,59977,WIAG-Pers-CANON-25221-001,054-03033-001,WIAG-Pers-CANON-84133-001,226243,252547
10667,59981,WIAG-Pers-CANON-25328-001,052-00807-001,WIAG-Pers-CANON-86196-001,307716,339279
10668,40850,WIAG-Pers-CANON-26504-001,054-02931-001,WIAG-Pers-CANON-84125-001,226141,192299
...,...,...,...,...,...,...
10914,52530,WIAG-Pers-CANON-49000-001,006-00114-001,WIAG-Pers-CANON-84854-001,285180,293702
10915,60834,WIAG-Pers-CANON-48993-001,021-00051-001,WIAG-Pers-CANON-81197-001,75438,62478
10916,60108,WIAG-Pers-CANON-49017-001,052-00332-001,WIAG-Pers-CANON-83821-001,217049,128053
10919,60674,WIAG-Pers-CANON-49003-001,054-00254-001,WIAG-Pers-CANON-83918-001,223464,130958


In [209]:
# Among the mismatches without a WIAG ID on the WIAG side, show those whose
# personendatenbank WIAG ID ends in a number block starting with 8 or 9.
wiag_id_missing = unequal_df['wiag_id_wiag'].isna()
empty_wiag = unequal_df[wiag_id_missing]
ends_with_8x_or_9x = empty_wiag.wiag_id_pd.str.contains('[89][0-9]{4}-[0-9]{3}$', regex=True)
diff_df = empty_wiag[ends_with_8x_or_9x]
diff_df

Unnamed: 0,id_wiag,wiag_id_wiag,pd_id,wiag_id_pd,id_pd,gsn_table_id


In [210]:
# Mismatching rows that have no WIAG ID on the WIAG side.
unequal_df.loc[unequal_df['wiag_id_wiag'].isnull()]

Unnamed: 0,id_wiag,wiag_id_wiag,pd_id,wiag_id_pd,id_pd,gsn_table_id


In [211]:
# Continue only with mismatches that have a WIAG ID on the WIAG side.
unequal_df = unequal_df[unequal_df['wiag_id_wiag'].notna()]
unequal_df

Unnamed: 0,id_wiag,wiag_id_wiag,pd_id,wiag_id_pd,id_pd,gsn_table_id
10664,40223,WIAG-Pers-CANON-25769-001,052-01335-001,WIAG-Pers-CANON-83849-001,218052,129050
10665,39332,WIAG-Pers-CANON-24827-001,054-00315-001,WIAG-Pers-CANON-83926-001,223525,131017
10666,59977,WIAG-Pers-CANON-25221-001,054-03033-001,WIAG-Pers-CANON-84133-001,226243,252547
10667,59981,WIAG-Pers-CANON-25328-001,052-00807-001,WIAG-Pers-CANON-86196-001,307716,339279
10668,40850,WIAG-Pers-CANON-26504-001,054-02931-001,WIAG-Pers-CANON-84125-001,226141,192299
...,...,...,...,...,...,...
10914,52530,WIAG-Pers-CANON-49000-001,006-00114-001,WIAG-Pers-CANON-84854-001,285180,293702
10915,60834,WIAG-Pers-CANON-48993-001,021-00051-001,WIAG-Pers-CANON-81197-001,75438,62478
10916,60108,WIAG-Pers-CANON-49017-001,052-00332-001,WIAG-Pers-CANON-83821-001,217049,128053
10919,60674,WIAG-Pers-CANON-49003-001,054-00254-001,WIAG-Pers-CANON-83918-001,223464,130958


In [212]:
# Build one link per inconsistent record and system so the mismatches can be
# reviewed by hand:
#   pd_link           - the person in personendatenbank (by GSN)
#   current_wiag_link - the WIAG entry currently carrying that GSN
#   pd_wiag_link      - the WIAG entry personendatenbank points to
# Vectorised string concatenation is used instead of a row-wise
# `apply(..., axis=1)`: on an empty frame `apply` returns a DataFrame rather
# than a Series, so the column assignment would fail exactly when there are
# no inconsistencies to report.
links_df = unequal_df.copy()
links_df['pd_link'] = "http://personendatenbank.germania-sacra.de/index/gsn/" + links_df["pd_id"]
links_df['current_wiag_link'] = "https://wiag-vocab.adw-goe.de/id/" + links_df["wiag_id_wiag"]
links_df['pd_wiag_link'] = "https://wiag-vocab.adw-goe.de/id/" + links_df["wiag_id_pd"]
links_df

Unnamed: 0,id_wiag,wiag_id_wiag,pd_id,wiag_id_pd,id_pd,gsn_table_id,pd_link,current_wiag_link,pd_wiag_link
10664,40223,WIAG-Pers-CANON-25769-001,052-01335-001,WIAG-Pers-CANON-83849-001,218052,129050,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
10665,39332,WIAG-Pers-CANON-24827-001,054-00315-001,WIAG-Pers-CANON-83926-001,223525,131017,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
10666,59977,WIAG-Pers-CANON-25221-001,054-03033-001,WIAG-Pers-CANON-84133-001,226243,252547,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
10667,59981,WIAG-Pers-CANON-25328-001,052-00807-001,WIAG-Pers-CANON-86196-001,307716,339279,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
10668,40850,WIAG-Pers-CANON-26504-001,054-02931-001,WIAG-Pers-CANON-84125-001,226141,192299,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
...,...,...,...,...,...,...,...,...,...
10914,52530,WIAG-Pers-CANON-49000-001,006-00114-001,WIAG-Pers-CANON-84854-001,285180,293702,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
10915,60834,WIAG-Pers-CANON-48993-001,021-00051-001,WIAG-Pers-CANON-81197-001,75438,62478,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
10916,60108,WIAG-Pers-CANON-49017-001,052-00332-001,WIAG-Pers-CANON-83821-001,217049,128053,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...
10919,60674,WIAG-Pers-CANON-49003-001,054-00254-001,WIAG-Pers-CANON-83918-001,223464,130958,http://personendatenbank.germania-sacra.de/ind...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...,https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CAN...


In [213]:
# Write only the three link columns (plus the index) to a dated,
# semicolon-separated CSV file.
links_df.to_csv(
    f'inconsistent_data_links_{today_string}.csv',
    sep=';',
    columns=['pd_link', 'current_wiag_link', 'pd_wiag_link'],
)

## Generate sql file to run on personendatenbank

In [218]:
# Generate the SQL script that aligns personendatenbank with WIAG: for every
# inconsistent record, persons.wiag is overwritten with the WIAG ID that WIAG
# itself uses for that GSN. The GSN is appended as a trailing SQL comment.
#
# The lock is taken on `persons`, the table that is actually updated. While
# LOCK TABLES is in effect, MySQL lets the session access only the tables
# named in that statement, so locking `gsn` alone makes every UPDATE on
# `persons` fail.
update_statements = [
    f"""
    UPDATE persons
    SET wiag = '{row['wiag_id_wiag']}'
    WHERE id = {row['id_pd']}; -- id: {row['pd_id']}
"""
    for _, row in unequal_df.iterrows()
]
# Join once instead of growing the string inside the loop.
query = "LOCK TABLES persons WRITE;\n" + "".join(update_statements) + "\nUNLOCK TABLES;"
# Explicit encoding so the script does not depend on the platform default.
with open(f'update_statements_{today_string}.sql', 'w', encoding='utf-8') as file:
    file.write(query)

# Work on people with more than one entry in wiag with the same gsn id

In [219]:
# All WIAG rows (from the unfiltered export) whose GSN occurs more than once.
gsn_is_duplicated = orig_ic_df['pd_id'].isin(duplicate_wiag_gsns)
dupl_ppl = orig_ic_df.loc[gsn_is_duplicated]
dupl_ppl

Unnamed: 0,id,wiag_id,pd_id
1393,17819,WIAG-Pers-EPISCGatz-21305-001,046-02872-001
9961,14702,WIAG-Pers-EPISCGatz-03210-001,046-02872-001


In [220]:
# Pair the duplicated WIAG rows with the personendatenbank rows of the same GSN.
dupl_join_df = pd.merge(
    dupl_ppl,
    pr_df,
    on='pd_id',
    suffixes=('_wiag', '_pd'),
)
dupl_join_df

Unnamed: 0,id_wiag,wiag_id_wiag,pd_id,wiag_id_pd,id_pd,gsn_table_id
0,17819,WIAG-Pers-EPISCGatz-21305-001,046-02872-001,WIAG-Pers-EPISCGatz-03210-001,334165,390910
1,14702,WIAG-Pers-EPISCGatz-03210-001,046-02872-001,WIAG-Pers-EPISCGatz-03210-001,334165,390910


## Manually fix the following entries on personendatenbank if the code below produces any output

In [221]:
# For every GSN that occurs more than once in WIAG, look up the WIAG ID that
# personendatenbank stores for it and check that this ID is one of the
# duplicated WIAG entries. Anything printed here has to be fixed by hand.
for gsn in duplicate_wiag_gsns:
    pd_wiag_ids = pr_df.loc[pr_df['pd_id'] == gsn, 'wiag_id']

    if pd_wiag_ids.empty:
        # The GSN is not part of the personendatenbank export; taking the
        # first element would raise an IndexError, so report it instead.
        print(f"GSN {gsn} is duplicated in WIAG but missing from the personendatenbank export")
        continue

    # As before, only the first personendatenbank row for this GSN is checked.
    pd_wiag_id = pd_wiag_ids.iloc[0]

    if not dupl_ppl['wiag_id'].eq(pd_wiag_id).any():
        # The WIAG ID stored in personendatenbank is none of the duplicated
        # WIAG entries: show the WIAG rows for this GSN.
        print(dupl_ppl[dupl_ppl['pd_id'] == gsn])

In [96]:
### https://wiag-vocab.adw-goe.de/id/WIAG-Pers-CANON-12751-001 doesn't exist on wiag???
### check for fertig status

In [53]:
# SELECT *  FROM url_external WHERE authority_id=200

In [54]:
# select id_in_source from item 
# where id_in_source is not NULL 
# and id_in_source!=''
# and id in 
# (SELECT item_id FROM url_external WHERE authority_id=200)

In [55]:
# select * from person where id in
# (select id_in_source from item 
# where id_in_source is not NULL 
# and id_in_source!=''
# and id in 
# (SELECT item_id FROM url_external WHERE authority_id=200))

In [56]:
# SELECT * FROM `item_corpus` 
# where item_id in  (SELECT item_id FROM url_external WHERE authority_id=200)

In [57]:
# SELECT i.id_public, i.item_id, u.value 
# FROM `item_corpus` i 
# INNER JOIN (
#     select item_id, value FROM url_external WHERE authority_id=200
# ) u on i.item_id = u.item_id

In [58]:
# example complex query
# http://personendatenbank.germania-sacra.de/api/v1.0/person?
# query[0][field]=person.vorname&
# query[0][value]=b*&
# query[0][operator]=like&
# query[0][connector]=or&
# query[1][field]=person.familienname&
# query[1][value]=b*&
# query[1][operator]=like&
# format=json-ld
