In [1]:
import requests
import pandas as pd
from tqdm import tqdm
from requests.exceptions import RequestException

def extract_viaf_id(jsonld_data):
    """
    Returns the 'identifier' of the first schema:Person node in a JSON-LD document.

    Args:
        jsonld_data: Parsed JSON-LD payload. Expected to be a dict whose '@graph'
            entry holds the list of nodes (a single node dict is also accepted).

    Returns:
        The value stored under 'identifier' on the first node whose '@type' is,
        or includes, "schema:Person". Returns 'NA' when the payload does not have
        the expected shape, no such node exists, or the node has no 'identifier'.
    """
    if not isinstance(jsonld_data, dict):
        return 'NA'

    graph = jsonld_data.get('@graph')
    if isinstance(graph, dict):
        # A graph holding a single node may be serialized as a bare object.
        graph = [graph]
    elif not isinstance(graph, list):
        return 'NA'

    for item in graph:
        # Skip anything that is not a node object (e.g. plain strings).
        if not isinstance(item, dict):
            continue
        node_type = item.get('@type')
        # JSON-LD allows '@type' to be a single string or a list of strings.
        node_types = node_type if isinstance(node_type, list) else [node_type]
        if "schema:Person" in node_types:
            return item.get('identifier', 'NA')
    return 'NA'

def extract_wkp_id(jsonld_data):
    """
    Returns the Wikidata ID found in the first wikidata.org 'sameAs' link of a JSON-LD document.

    Args:
        jsonld_data: Parsed JSON-LD payload. Expected to be a dict whose '@graph'
            entry holds the list of nodes (a single node dict is also accepted).

    Returns:
        The last path segment of the first 'sameAs' URL containing 'wikidata.org',
        or 'NA' when the payload does not have the expected shape or no such
        link exists.
    """
    if not isinstance(jsonld_data, dict):
        return 'NA'

    graph_data = jsonld_data.get('@graph', [])
    if isinstance(graph_data, dict):
        # A graph holding a single node may be serialized as a bare object.
        graph_data = [graph_data]
    elif not isinstance(graph_data, list):
        return 'NA'

    for entry in graph_data:
        if not isinstance(entry, dict):
            continue
        same_as = entry.get('sameAs')
        if isinstance(same_as, (str, dict)):
            # A single-valued 'sameAs' may be a scalar; iterating a bare string
            # would walk its characters and never match.
            same_as = [same_as]
        elif not isinstance(same_as, list):
            continue

        for link in same_as:
            if isinstance(link, dict):
                # Node reference form: {"@id": "<url>"}.
                link = link.get('@id')
            if isinstance(link, str) and 'wikidata.org' in link:
                # Drop a trailing slash so the ID is not returned as ''.
                return link.rstrip('/').split('/')[-1]
    return 'NA'

def _fetch_viaf_ids(session, id_number, timeout):
    """Looks up one ERRR source ID on VIAF and returns (viaf_id, wkp_id), using 'NA' for anything unavailable."""
    jsonld_url = f'https://viaf.org/viaf/sourceID/ERRR|{id_number}/viaf.jsonld'
    try:
        response = session.get(jsonld_url, timeout=timeout)
        if response.status_code != 200:
            return 'NA', 'NA'
        jsonld_data = response.json()
    except (RequestException, ValueError):
        # ValueError covers a 200 response whose body is not valid JSON; not every
        # requests version raises a RequestException subclass for that.
        return 'NA', 'NA'

    # The extractors expect a JSON object; any other payload yields no IDs.
    if not isinstance(jsonld_data, dict):
        return 'NA', 'NA'
    return extract_viaf_id(jsonld_data), extract_wkp_id(jsonld_data)


def update_person_records(parquet_file, tsv_file, output_file, timeout=30):
    """
    Updates person records by adding VIAF and Wikidata IDs if missing, only for new rows.

    IDs present in the parquet file's 'id' column but absent from the TSV file's
    'rara_id' column are treated as new. Each new ID is looked up on VIAF (source
    code ERRR) and appended to the existing records with 'viaf_id' and 'wkp_id'
    columns; a failed or empty lookup leaves 'NA' in both. Existing rows are
    written back unchanged. Prints "Finished!" when done.

    Args:
        parquet_file (str): Path to the parquet file with new IDs.
        tsv_file (str): Path to the authority person links file to update.
        output_file (str): Path for saving the updated file.
        timeout (float): Seconds to wait for each VIAF request before giving up
            on that record.
    """
    nle_persons_df = pd.read_parquet(parquet_file)
    person_df = pd.read_csv(tsv_file, sep='\t', na_filter=False)

    known_ids = set(person_df['rara_id'])
    is_new = ~nle_persons_df['id'].isin(known_ids)
    new_rows = (
        nle_persons_df.loc[is_new, ['id']]
        .rename(columns={'id': 'rara_id'})
        .reset_index(drop=True)
    )

    viaf_ids = []
    wkp_ids = []
    # One session for the whole run so connections to viaf.org are reused.
    with requests.Session() as session:
        for rara_id in tqdm(new_rows['rara_id'], total=len(new_rows), desc="Processing New Records"):
            # The VIAF lookup uses the ID with its leading 'a' character(s) removed.
            id_number = rara_id.lstrip('a')
            viaf_id, wkp_id = _fetch_viaf_ids(session, id_number, timeout)
            viaf_ids.append(viaf_id)
            wkp_ids.append(wkp_id)

    new_rows['viaf_id'] = viaf_ids
    new_rows['wkp_id'] = wkp_ids

    final_df = pd.concat([person_df, new_rows], ignore_index=True)
    final_df.to_csv(output_file, sep='\t', index=False)
    print("Finished!")

In [2]:
# Look up VIAF/Wikidata IDs for person IDs that are in the parquet file but not
# yet in the TSV links file, then write the combined table to a new TSV.
update_person_records(
    'test_nle_person.parquet',  # parquet_file: source of candidate person IDs
    'test_persons.tsv',         # tsv_file: existing authority person links
    'persons_updated.tsv',      # output_file: where the merged result is saved
)

Processing New Records: 100%|██████████████████████████████████████████████████████████| 14/14 [00:10<00:00,  1.40it/s]

Finished!



