In [1]:
import pandas as pd
from pathlib import Path
import requests
from tqdm import tqdm
import os

In [2]:
# Lookup table that normalises raw gender strings to 'male', 'female' or
# 'non-binary'. Keys mapped to None are values that carry no usable gender
# (language codes, occupations, placeholders); Series.map() turns them into
# missing values. The name suggests the strings come from MARC field 375 $a
# -- presumably; confirm against the data source.
MAPPING_375a = {
    'mees': 'male',
    'naine': 'female',
    'male': 'male',
    'Males': 'male',
    'Females': 'female',
    'female': 'female',
    'Women': 'female',
    'Men': 'male',
    'males': 'male',
    'Male': 'male',
    'sugu': None,
    'Female': 'female',
    'females': 'female',
    'male; Males': 'male',
    'men': 'male',
    'eng': None,
    'ajaloolane': None,
    'Females; naine': 'female',
    'Non-binary people': 'non-binary',
    'meed': 'male',
    'kirjanik': None,
    'Males; Armenians': 'male',
    'not known': None,
    ' mees': 'male',
    'mee': 'male',
    '2': None,
    'Authors; Males': 'male',
    'women': 'female',
    'saksa': None,
    'Transgender people; Males; Transgender men': 'non-binary',
    'mees; male': 'male',
    'fre': None,
    'kunstnik': None,
    'Women; naine': 'female',
    'põlisrahvaste uurija': None,
    'Males; Males': 'male',
    'tõlkija': None,
    'male; Architects': 'male',
    'vene': None,
    'Males; eng': 'male',
    'Males $2 lcdgt; Females': None,
    'mees0': 'male',
    'naine; female': 'female',
    'majandusteadlane': None,
    'filosoofiadoktor': None,
    'rus': None,
    'Males; Fantasy fiction; Authors': 'male',
    'lit': None,
    'transgender woman; male; female': 'non-binary',
    'Music teachers; Males': 'male',
    'persoon; mees': 'male',
    'Tokyo, Jaapan; mees': 'male',
    'õppejõud; mees': 'male',
    'professor': None,
    'mwwa': 'male',
    'male; Men': 'male',
    'Law teachers': None,
    'Femles': 'female',
    'nane': 'female',
    'lcdgt': None,
    'female; Women': 'female',
    'persoon; Males': 'male',
    'Composition teachers (Music); Females': 'female',
    'mees; mees': 'male',
    'mes': 'male',
    'mess': 'male',
    'aine': 'female',
}

In [13]:
# Absolute path of the kernel's current working directory; the project root is
# taken to be its parent directory.
current_notebook_path = Path('.').resolve()
project_root = current_notebook_path.parent

# Input: curated persons table (parquet).
curated_persons = project_root.joinpath("data", "curated", "nle_persons.parquet")

# The gender table is read from and written back to the same TSV file, so
# persons_gender and output_file deliberately point at the same location.
persons_gender = project_root.joinpath("config", "persons", "persons_gender.tsv")
output_file = project_root.joinpath("config", "persons", "persons_gender.tsv")

In [11]:
# Load the curated persons table from its parquet file via the pyarrow engine.
df_persons = pd.read_parquet(
    curated_persons,
    engine='pyarrow',
)

In [14]:
# Read the existing gender table (tab-separated) and replace every missing
# cell with the literal string 'NA'.
df_persons_gender = pd.read_csv(persons_gender, sep='\t').fillna('NA')

In [15]:
# Persons whose id does not yet appear in the gender table's rara_id column.
is_new_person = ~df_persons['id'].isin(df_persons_gender['rara_id'])

# .copy() makes new_ids an independent frame. Without it new_ids is a slice of
# df_persons and the assignment below triggers pandas' SettingWithCopyWarning
# (the write may or may not reach the intended object).
# The original row labels of df_persons are kept on purpose.
new_ids = df_persons[is_new_person].copy()

# Normalise the raw gender strings; anything not listed in MAPPING_375a (or
# mapped to None there) becomes a missing value.
new_ids['gender'] = new_ids['gender'].map(MAPPING_375a)

In [16]:
def extract_gender(data):
    """Derive a gender label from a parsed JSON-LD document.

    Looks at the 'gender' entry of the first element of data['@graph'] and
    checks which known identifier it contains (the identifiers look like
    Wikidata item ids -- presumably; confirm against the VIAF payload).

    Args:
        data: Parsed JSON document; expected to be a dict with an '@graph' list.

    Returns:
        "male", "female" or "non-binary" when the matching identifier is found;
        "NA" when the gender entry is absent/empty, matches none of the known
        identifiers, or the document does not have the expected structure.
    """
    # Checked in this order; the first identifier found wins.
    known_genders = (
        ("Q6581097", "male"),
        ("Q6581072", "female"),
        ("Q432848", "non-binary"),
    )

    try:
        first_node = data.get('@graph', [])[0]
        gender_info = first_node.get('gender', None)

        if not gender_info:
            return "NA"

        for identifier, label in known_genders:
            if identifier in gender_info:
                return label
    except Exception:
        # Any structural surprise (no graph, wrong types, ...) counts as unknown.
        return "NA"

    return "NA"

In [17]:
# Seconds to wait on a single VIAF request before giving up on that person.
timeout_duration = 10

# Fill in genders that the mapping could not supply by asking VIAF, one HTTP
# request per person. The lookup is best-effort: a person is skipped (gender
# left missing) when the request fails, the status is not 200, or the body is
# not valid JSON.
# One Session is shared so the connection to viaf.org is reused across requests.
with requests.Session() as session:
    for index, row in tqdm(new_ids.iterrows(), total=len(new_ids), desc="Processing Records"):
        viaf_id = row['viaf_id']

        # Nothing to look up without a VIAF id, and nothing to do when the
        # gender is already known.
        if pd.isna(viaf_id) or viaf_id == 'NA' or pd.notna(new_ids.at[index, 'gender']):
            continue

        try:
            response = session.get(f'https://viaf.org/viaf/{viaf_id}/viaf.jsonld', timeout=timeout_duration)
        except requests.exceptions.RequestException:
            # Covers read timeouts as before, plus connect timeouts and
            # connection errors, which previously aborted the whole run.
            continue

        if response.status_code != 200:
            continue

        try:
            data = response.json()
        except ValueError:
            # Body was not valid JSON.
            continue

        new_ids.at[index, 'gender'] = extract_gender(data)

        # Checkpoint the resolved row. `index` is a row *label* coming from
        # iterrows(), so it must be used with .loc -- .iloc would treat it as
        # a position and pick the wrong row or raise IndexError.
        # output_file is the same TSV that df_persons_gender was read from, so
        # the row is reshaped to that table's columns before appending;
        # columns the row does not have are written as 'NA'.
        # File-system errors are intentionally not swallowed here.
        checkpoint_row = (
            new_ids.loc[[index], ['id', 'gender']]
            .rename(columns={'id': 'rara_id'})
            .reindex(columns=df_persons_gender.columns, fill_value='NA')
        )
        checkpoint_row.to_csv(output_file, mode='a', sep='\t', index=False, header=not os.path.exists(output_file))

Processing Records: 100%|██████████████████████████████████████████████████████████| 2856/2856 [04:20<00:00, 10.95it/s]


In [18]:
# Keep only the two gender-table columns for the new persons, rename id to
# rara_id, and write 'NA' wherever a value is still missing.
new_ids = (
    new_ids[['id', 'gender']]
    .rename(columns={'id': 'rara_id'})
    .fillna('NA')
)

# Append the new persons below the previously loaded table and overwrite the TSV.
df_updated = pd.concat([df_persons_gender, new_ids], ignore_index=True)
df_updated.to_csv(output_file, sep='\t', index=False)