# Script 1 to clean and enrich the C-CLAMP metadata file (before processing in OpenRefine)
This script has to be available within the same folder as ...
1) `C-CLAMP_metadata_gender.txt`
2) `auteurs(in).csv`
3) `Alle_personen_adb_DBNL.csv`
4) `Alle_personen_DBNL_beroepen.csv`
5) `Alle_personen_DBNL_organisaties.csv`

In [1]:
# Import the necessary packages
import pandas as pd
import numpy as np
import re
from itertools import chain

## Step 1: Process the original metadata file

In [3]:
# Read the original metadata file with the authors
metadata_df = pd.read_csv("C-CLAMP_metadata_gender.txt", sep="\t", encoding='utf-8')

In [5]:
## Create a list of all the authors
# Use str.split to split the strings and expand to create separate rows
df_cleaned = metadata_df['Author'].str.split(';', expand=True).stack()

# Reset index to clean up
df_cleaned = df_cleaned.reset_index(drop=True)

# Drop missing values
df_cleaned = df_cleaned.dropna()

# Remove duplicates
df_cleaned = df_cleaned.drop_duplicates()

# Sort the values
df_cleaned = df_cleaned.sort_values().reset_index(drop=True)

# Create a new DataFrame for the authors
author_df = pd.DataFrame({'Author': df_cleaned})
print(author_df)

                      Author
0             't Haakstertje
1                   A. Aerts
2           A. Agnes Sneller
3                 A. Alberts
4                    A. Ampe
...                      ...
10519           van der Veur
10520    zuster Maria Jozefa
10521            Ágnes Roboz
10522  Émile O.J.J.G. Lousse
10523        Émile Verhaeren

[10524 rows x 1 columns]


In [7]:
# Save the author list
author_df.to_csv("author_list.txt", index=False, sep="\t", encoding='utf-8')

In [13]:
## Create a list of all authors and their available metadata
# Select Author, DOB, POB, DOD, POD and Link
metadata_original_df = metadata_df[['Author', 'DOB', 'POB', 'DOD', 'POD', 'Link']]

# Flatten the DataFrame
flattened_data = []

for index, row in metadata_original_df.iterrows():
    authors = str(row['Author']).split('; ')
    dobs = str(row['DOB']).split('; ')
    pobs = str(row['POB']).split('; ')
    dods = str(row['DOD']).split('; ')
    pods = str(row['POD']).split('; ')
    links = str(row['Link']).split(' ; ')

    # Define the number of splits based on the number of authors
    splits = len(authors)

    # Define a function that fills down values that cannot be split or fills in blanks in case of too little values
    def fill_down(list):
        if len(list) == 1 and splits > 1:
            return list * splits
        else:
            return list + [np.nan] * (splits - len(list))

    authors = fill_down(authors)
    dobs = fill_down(dobs)
    pobs = fill_down(pobs)
    dods = fill_down(dods)
    pods = fill_down(pods)
    links = fill_down(links)

    for author, dob, pob, dod, pod, link in zip(authors, dobs, pobs, dods, pods, links):
        flattened_data.append({
            'Author': author,
            'DOB': dob,
            'POB': pob,
            'DOD': dod,
            'POD': pod,
            'Link': link
        })

flattened_df = pd.DataFrame(flattened_data)

# Extract the identifiers
for link in flattened_df:
    flattened_df['identifier'] = flattened_df['Link'].str.extract(r'.*=(\w*)')

# Lowercase all identifiers
flattened_df['identifier'] = flattened_df['identifier'].str.lower()

# Replace 'NA' or 'nan' with NaN
flattened_df.replace(['NA', 'nan'], np.nan, inplace=True)

# Correct all updated identifiers
flattened_df = flattened_df.replace({'identifier': {'haas021': 'draa001', 'domm015': 'domm004', 'rooy001': 'rooi003',
                                                                      'roes008': 'roos006', 'domm014': 'domm004', 'buur017': 'buur001',
                                                                      'merc034': 'merc005'}})

# Remove duplicates based on identifier column and remove rows with empty identifiers
flattened_df = flattened_df.drop_duplicates(subset='identifier')
flattened_df = flattened_df.dropna(subset=['identifier'])

print(flattened_df)

                   Author              DOB         POB              DOD  \
1            Jan Engelman      7 juni 1900     Utrecht    20 maart 1972   
2             Willem Maas    28 april 1897     Utrecht       6 mei 1950   
3            Joep Nicolas   6 oktober 1897         NaN     25 juli 1972   
5           Albert Helman  7 november 1903  Paramaribo     10 juli 1996   
6      Willem Nieuwenhuis             1886         NaN             1935   
...                   ...              ...         ...              ...   
62933        A. De Geyter       20ste eeuw         NaN              NaN   
62934            J. Hoing       20ste eeuw         NaN              NaN   
62935       Flor Kielbaey       20ste eeuw         NaN              NaN   
62936  Hendrik Imberechts    13 april 1922         NaN  3 februari 2012   
62937            H. Aerts       20ste eeuw         NaN              NaN   

             POD                                               Link identifier  
1      Amsterdam  

In [15]:
# Save the author list with the metadata and the identifiers
flattened_df.to_csv("author_metadata_original.txt", index=False, sep="\t", encoding='utf-8')

In [17]:
# Create a separate DataFrame only with the author names, identifiers and links
author_identifier_df = flattened_df[['Author', 'Link', 'identifier']]
author_identifier_df.to_csv("author_identifier_list.txt", index=False, sep="\t", encoding='utf-8')

## Step 2: Process the metadata file extracted from the openly available DBNL database

In [27]:
# Read the metadata file from DBNL
dbnl_df = pd.read_csv("auteurs(in).csv", sep=",", encoding='utf-8')

In [29]:
## Filter and clean dbnl_df
# Filter dbnl_df
dbnl_df = dbnl_df[['identifier', 'author_given_name', 'author_family_name', 'alternate_names', 'occupation', 'birthPlace', 'birthDate', 'deathPlace', 'deathDate', 'gender']]

# Lowercase all identifiers
dbnl_df['identifier'] = dbnl_df['identifier'].str.lower()

# Merge the values in occupation per identifier, set '/' as separator
dbnl_df['occupation'] = dbnl_df.groupby('identifier')['occupation'].transform(lambda x: '/'.join(x.dropna()))

# Create a function to remove duplicates in the occupation values and apply it to the occupation column
def remove_duplicates(text):
    items = text.split('/')
    unique_items = list(set(items))
    return '/'.join(unique_items)

dbnl_df['occupation'] = dbnl_df['occupation'].apply(remove_duplicates)

# Remove duplicates based on the identifiers
dbnl_df = dbnl_df.drop_duplicates(subset='identifier')

# Rename columns
dbnl_df = dbnl_df.rename(columns = {'author_given_name': 'givenName', 'author_family_name': 'familyName', 'alternate_names': 'alternateNames'})

# Replace #SEP# in the alternateNames column with '/'
dbnl_df['alternateNames'] = dbnl_df['alternateNames'].str.replace(r'#SEP#', '/', regex=True).str.strip()

## Step 3: Merge both metadata files

In [31]:
## Merge authors_identifier_df and dbnl_df on the identifier column
# Keep all rows from authors_identifier_df
authors_metadata_merge_df = pd.merge(flattened_df, dbnl_df, on='identifier', how='left')

# Replace empty cells, 'NA' or 'nan' with NaN
authors_metadata_merge_df.replace(['', 'NA', 'nan'], np.nan, inplace=True)

# Trim leading and trailing spaces
authors_metadata_merge_df = authors_metadata_merge_df.map(lambda x: x.strip() if isinstance(x, str) else x)

In [35]:
# Save the merged metadata file
authors_metadata_merge_df.to_csv("author_metadata_merge_list.txt", index=False, sep='\t', encoding='utf-8')
authors_metadata_merge_df

Unnamed: 0,Author,DOB,POB,DOD,POD,Link,identifier,givenName,familyName,alternateNames,occupation,birthPlace,birthDate,deathPlace,deathDate,gender
0,Jan Engelman,7 juni 1900,Utrecht,20 maart 1972,Amsterdam,https://www.dbnl.org/auteurs/auteur.php?id=eng...,enge016,Johannes Aloysius Antonius,Engelman,Jan Engelman,redacteur/journalist/vertaler,Utrecht,6/7/1900,Amsterdam,3/20/1972,male
1,Willem Maas,28 april 1897,Utrecht,6 mei 1950,Utrecht,https://www.dbnl.org/auteurs/auteur.php?id=maa...,maas024,Willem Arnoldus,Maas,Willem Maas,illustrator/tekenaar/beeldend kunstenaar,Utrecht,1897-04-28,Utrecht,5/6/1950,male
2,Joep Nicolas,6 oktober 1897,,25 juli 1972,Steyl,https://www.dbnl.org/auteurs/auteur.php?id=nic...,nico008,Josephus Antonius Hubertus Franciscus,Nicolas,Joep Nicolas,schilder/ambachtsman,Roermond,1897-10-06,Steyl,7/25/1972,male
3,Albert Helman,7 november 1903,Paramaribo,10 juli 1996,Amsterdam,https://www.dbnl.org/auteurs/auteur.php?id=hel...,helm003,Albert,Helman,Albert Helman/Beckmesser/Brandaris/Floris Kapt...,musicus/journalist/politicus/redacteur/staatsm...,Paramaribo,11/7/1903,Amsterdam,7/10/1996,male
4,Willem Nieuwenhuis,1886,,1935,,https://www.dbnl.org/auteurs/auteur.php?id=nie...,nieu047,Willem,Nieuwenhuis,Willem Nieuwenhuis,,,1886,,1935,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8748,A. De Geyter,20ste eeuw,,,,https://www.dbnl.org/auteurs/auteur.php?id=gey...,geyt003,A.,De Geyter,A. De Geyter,,,19xx,,,male
8749,J. Hoing,20ste eeuw,,,,https://www.dbnl.org/auteurs/auteur.php?id=hoi...,hoin002,J.,Hoing,J. Hoing,,,19xx,,,male
8750,Flor Kielbaey,20ste eeuw,,,,https://www.dbnl.org/auteurs/auteur.php?id=kie...,kiel012,Flor,Kielbaey,Flor Kielbaey,vertaler,,19xx,,,male
8751,Hendrik Imberechts,13 april 1922,,3 februari 2012,,https://www.dbnl.org/auteurs/auteur.php?id=imb...,imbe003,Hendrik,Imberechts,Hendrik Imberechts,rk-geestelijke/kloosterling,Muizen,4/13/1922,Leuven,2/3/2012,male


## Step 4: Process the metadata files delivered by the Dutch KB, i.e., taken from the DBNL's internal database

In [37]:
# Read the metadata files from KB
kb_metadata_df = pd.read_csv("Alle_personen_adb_DBNL.csv", sep=";", encoding='utf-8')
kb_professions_df = pd.read_csv("Alle_personen_DBNL_beroepen.csv", sep=";", encoding='utf-8')
kb_organisations_df = pd.read_csv("Alle_personen_DBNL_organisaties.csv", sep=";", encoding='utf-8')

  kb_metadata_df = pd.read_csv("Alle_personen_adb_DBNL.csv", sep=";", encoding='utf-8')


In [39]:
## Filter and clean kb_metadata_df
# Filter kb_metadata_df
kb_metadata_df = kb_metadata_df[['pers_id', 'geb_datum', 'jaar_geboren', 'geb_plaats', 'geb_land_code', 'overl_datum', 'jaar_overlijden',
                                  'overl_plaats', 'overl_land_code', 'periode', 'secundair', 'taalkunde', 'jeugdliteratuur', 'nonfictie',
                                  'suriname', 'zuidafrika', 'buitenland', 'fries', 'limburg', 'taalcode', 'taalcode_2']]
print(kb_metadata_df.head())

# Rename columns
kb_metadata_df = kb_metadata_df.rename(columns = {'pers_id': 'identifier', 'geb_land_code': 'birthCountry', 'overl_land_code': 'deathCountry'})

# Rename numerical values in period (periode) column
kb_metadata_df['periode'] = kb_metadata_df['periode'].astype(str)

period_mapping = {
    "1.0": "Middeleeuwen", "2.0": "Periode 1550-1700", "3.0": "Middeleeuwen/Gouden Eeuw", "4.0": "Achttiende Eeuw", "6.0": "17e/18e Eeuw",
    "8.0": "Negentiende Eeuw", "12.0": "18e/19e Eeuw", "16.0": "Twintigste Eeuw", "24.0": "19e/20e Eeuw", "32.0": "Eenentwintigste Eeuw",
    "48.0": "20e/21e Eeuw", "64.0": "Voor 400"}

kb_metadata_df['period'] = kb_metadata_df['periode'].map(period_mapping)

# Merge the date of birth columns (geb_datum and jaar_geboren) into one, idem for the date of death columns
kb_metadata_df['geboortedatum'] = (kb_metadata_df['geb_datum'].fillna('') + ' ' + kb_metadata_df['jaar_geboren'].fillna('').astype(str)).str.strip()
kb_metadata_df['sterfdatum'] = (kb_metadata_df['overl_datum'].fillna('') + ' ' + kb_metadata_df['jaar_overlijden'].fillna('').astype(str)).str.strip()

# Merge the language code columns into one
kb_metadata_df['language'] = (kb_metadata_df['taalcode'].fillna('') + '/' + kb_metadata_df['taalcode_2'].fillna(''))
kb_metadata_df['language'] = kb_metadata_df['taalcode'].str.strip('/').str.strip()

# Create a new column in which all categories are listed per writer
categories = ['secundair', 'taalkunde', 'jeugdliteratuur', 'nonfictie', 'suriname', 'zuidafrika', 'buitenland', 'fries', 'limburg']

kb_metadata_df['category'] = kb_metadata_df.apply(
    lambda row: '/'.join([category for category in categories if row[category] == 'WAAR']), axis=1
)

# Filter the DataFrame
kb_metadata_filtered_df = kb_metadata_df[['identifier', 'geb_plaats', 'birthCountry', 'overl_plaats', 'deathCountry', 'period', 'geboortedatum', 'sterfdatum',
                                          'language', 'category']]

# Lowercase all identifiers
kb_metadata_filtered_df['identifier'] = kb_metadata_filtered_df['identifier'].str.lower()

   pers_id geb_datum   jaar_geboren geb_plaats geb_land_code overl_datum  \
0  _dam002       NaN  ?(20ste eeuw)        NaN           NaN         NaN   
1  _lum001       NaN  ?(20ste eeuw)        NaN           NaN         NaN   
2  _mol002       NaN  ?(20ste eeuw)        NaN           NaN         NaN   
3  _out001       NaN  ?(20ste eeuw)        NaN           NaN         NaN   
4  _tam001       NaN   ?(19de eeuw)        NaN           NaN         NaN   

  jaar_overlijden overl_plaats overl_land_code  periode  ... taalkunde  \
0             NaN          NaN             NaN     48.0  ...    ONWAAR   
1             NaN          NaN             NaN     48.0  ...    ONWAAR   
2             NaN          NaN             NaN     16.0  ...    ONWAAR   
3             NaN          NaN             NaN     16.0  ...    ONWAAR   
4             NaN          NaN             NaN     24.0  ...    ONWAAR   

  jeugdliteratuur nonfictie suriname zuidafrika buitenland   fries limburg  \
0            WAAR   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kb_metadata_filtered_df['identifier'] = kb_metadata_filtered_df['identifier'].str.lower()


In [41]:
## Filter and clean kb_professions_df
# Filter kb_professions_df
kb_professions_df = kb_professions_df[['pers_id', 'omschrijving']]

# Rename columns
kb_professions_df = kb_professions_df.rename(columns = {'pers_id': 'identifier', 'omschrijving': 'profession'})

# Lowercase all identifiers
kb_professions_df['identifier'] = kb_professions_df['identifier'].str.lower()

# Merge the values in profession per identifier, set '/' as separator
kb_professions_df['profession'] = kb_professions_df.groupby('identifier')['profession'].transform(lambda x: '/'.join(x.dropna()))

# Remove duplicates in the profession values with the remove_duplicates function defined above
kb_professions_df['profession'] = kb_professions_df['profession'].apply(remove_duplicates)

# Remove duplicates based on the identifiers
kb_professions_df = kb_professions_df.drop_duplicates(subset='identifier')

In [43]:
## Filter and clean kb_organisations_df
kb_organisations_df = kb_organisations_df[['pers_id', 'Gestandariseerde naam']]

# Rename columns
kb_organisations_df = kb_organisations_df.rename(columns = {'pers_id': 'identifier', 'Gestandariseerde naam': 'organisation'})

# Remove spaces before/after '/' in the organisation column
kb_organisations_df['organisation'] = kb_organisations_df['organisation'].str.replace(r'\s/\s', '/', regex=True).str.strip()

# Lowercase all identifiers
kb_organisations_df['identifier'] = kb_organisations_df['identifier'].str.lower()

# Merge the values in organisation per identifier, set '/' as separator
kb_organisations_df['organisation'] = kb_organisations_df.groupby('identifier')['organisation'].transform(lambda x: '/'.join(x.dropna()))

# Remove duplicates in the organisation values with the remove_duplicates function defined above
kb_organisations_df['organisation'] = kb_organisations_df['organisation'].apply(remove_duplicates)

# Remove duplicates based on the identifiers
kb_organisations_df = kb_organisations_df.drop_duplicates(subset='identifier')

## Step 5: Merge the KB files with the previously created metadata file

In [47]:
## Merge all files based on the identifier column
authors_metadata_merge2_df = pd.merge(authors_metadata_merge_df, kb_metadata_filtered_df, on='identifier', how='left')
authors_metadata_merge2_df = pd.merge(authors_metadata_merge2_df, kb_professions_df, on='identifier', how='left')
authors_metadata_merge2_df = pd.merge(authors_metadata_merge2_df, kb_organisations_df, on='identifier', how='left')

# Replace empty celles, 'NA' or 'nan' with NaN
authors_metadata_merge2_df.replace(['', 'NA', 'nan'], np.nan, inplace=True)

# Trim leading and trailing spaces
authors_metadata_merge2_df = authors_metadata_merge2_df.map(lambda x: x.strip() if isinstance(x, str) else x)

In [49]:
# Save the merged metadata file
authors_metadata_merge2_df.to_csv("author_metadata_merge2_list.txt", index=False, sep='\t', encoding='utf-8')
authors_metadata_merge2_df

Unnamed: 0,Author,DOB,POB,DOD,POD,Link,identifier,givenName,familyName,alternateNames,...,birthCountry,overl_plaats,deathCountry,period,geboortedatum,sterfdatum,language,category,profession,organisation
0,Jan Engelman,7 juni 1900,Utrecht,20 maart 1972,Amsterdam,https://www.dbnl.org/auteurs/auteur.php?id=eng...,enge016,Johannes Aloysius Antonius,Engelman,Jan Engelman,...,,Amsterdam,,Twintigste Eeuw,7 juni 1900,20 maart 1972,nl,secundair/nonfictie,redacteur/journalist/vertaler,
1,Willem Maas,28 april 1897,Utrecht,6 mei 1950,Utrecht,https://www.dbnl.org/auteurs/auteur.php?id=maa...,maas024,Willem Arnoldus,Maas,Willem Maas,...,,Utrecht,,Twintigste Eeuw,28 april 1897,6 mei 1950,,,illustrator/tekenaar/beeldend kunstenaar,
2,Joep Nicolas,6 oktober 1897,,25 juli 1972,Steyl,https://www.dbnl.org/auteurs/auteur.php?id=nic...,nico008,Josephus Antonius Hubertus Franciscus,Nicolas,Joep Nicolas,...,,Steyl,,Twintigste Eeuw,6 oktober 1897,25 juli 1972,,nonfictie,schilder/ambachtsman,
3,Albert Helman,7 november 1903,Paramaribo,10 juli 1996,Amsterdam,https://www.dbnl.org/auteurs/auteur.php?id=hel...,helm003,Albert,Helman,Albert Helman/Beckmesser/Brandaris/Floris Kapt...,...,Suriname,Amsterdam,,Twintigste Eeuw,7 november 1903,10 juli 1996,nl,jeugdliteratuur/nonfictie/suriname/buitenland,musicus/journalist/redacteur/staatsman/jongleu...,
4,Willem Nieuwenhuis,1886,,1935,,https://www.dbnl.org/auteurs/auteur.php?id=nie...,nieu047,Willem,Nieuwenhuis,Willem Nieuwenhuis,...,,,,Twintigste Eeuw,1886,1935,nl,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8748,A. De Geyter,20ste eeuw,,,,https://www.dbnl.org/auteurs/auteur.php?id=gey...,geyt003,A.,De Geyter,A. De Geyter,...,,,,Twintigste Eeuw,?(20ste eeuw),,,secundair,,
8749,J. Hoing,20ste eeuw,,,,https://www.dbnl.org/auteurs/auteur.php?id=hoi...,hoin002,J.,Hoing,J. Hoing,...,,,,Twintigste Eeuw,?(20ste eeuw),,,secundair/nonfictie,,
8750,Flor Kielbaey,20ste eeuw,,,,https://www.dbnl.org/auteurs/auteur.php?id=kie...,kiel012,Flor,Kielbaey,Flor Kielbaey,...,,,,Twintigste Eeuw,?(20ste eeuw),,,secundair,vertaler,
8751,Hendrik Imberechts,13 april 1922,,3 februari 2012,,https://www.dbnl.org/auteurs/auteur.php?id=imb...,imbe003,Hendrik,Imberechts,Hendrik Imberechts,...,,Leuven,,Twintigste Eeuw,13 april 1922,3 februari 2012,nl,secundair/nonfictie,rk-geestelijke/kloosterling,


## Step 6: (Semi)Manual cleaning using OpenRefine
See Appendix in Schockaert (2025) for a detailed breakdown of the steps taken.

Clean-up using Python continues in C-CLAMP_metadata_cleanup_B.