# Getting metadata for the corpus from gallica.bnf.fr

## Get IDs for the files in the corpus

In [1]:
import csv

# Build `ids`: one [century, identifier] pair per data row of records.csv.
# Column 0 is used as the century and column 2 as the Gallica URL; the
# identifier is the last path segment of that URL once the site prefix has
# been removed.
ids = []
with open('./metadata/records.csv') as records_file:
    rows = csv.reader(records_file, delimiter=",")
    next(rows)  # skip the header row
    for row in rows:
        ark_path = row[2].replace('https://gallica.bnf.fr/', '')
        ids.append([row[0], ark_path.split('/')[-1]])
len(ids)

13524

## Download the metadata for all records from the Gallica API, in XML format, into the corpus folder

In [2]:
import pathlib
import urllib
# `import urllib` alone does not load the `request` submodule; without this
# explicit import `urllib.request` only resolves if some other package
# happened to import it first.
import urllib.request
from tqdm import tqdm

# Download the OAI record of every [century, identifier] pair in `ids` to
# ./all_metadata/<century>/<identifier>.xml. Records whose download fails are
# collected in `errors` (position in `ids`, century, identifier) and written
# to ./all_metadata/all_metadata_errors.csv afterwards.
errors = []

output_root = pathlib.Path('./all_metadata')
output_root.mkdir(parents=True, exist_ok=True)

for i, (century, identifier) in enumerate(tqdm(ids)):
    if not identifier:
        # Rows without an identifier cannot be requested from the API.
        continue

    # Create the per-century folder on demand so that a missing directory is
    # not silently recorded as a download error for every record.
    century_dir = output_root / str(century)
    century_dir.mkdir(parents=True, exist_ok=True)

    filename = str(century_dir / (identifier + '.xml'))
    url = 'https://gallica.bnf.fr/services/OAIRecord?ark=' + identifier
    try:
        urllib.request.urlretrieve(url, filename)
    except Exception:
        # Best-effort bulk download: note the failure and carry on with the
        # remaining records instead of aborting the whole run.
        errors.append([i, century, identifier])

with open(output_root / 'all_metadata_errors.csv', 'w', newline="", encoding="utf-8") as f:
    write = csv.writer(f)
    write.writerow(["id", "century", "identifier"])
    write.writerows(errors)

100%|██████████| 13524/13524 [1:14:40<00:00,  3.02it/s]  


## Open the metadata XML files and convert them to CSV

In [8]:
import xmltodict

# Column names of the generated CSV files, in the order the row values are
# assembled below: record header fields, Dublin Core fields, then the extra
# fields found directly under <results>.
metadata_headers = ['identifier', 'datestamp', 'setSpec',
            '@xmlns:dc', '@xmlns:oai_dc', '@xmlns:xsi', '@xsi:schemaLocation', 'dc:identifier',
            'dc:title', 'dc:creator', 'dc:publisher', 'dc:date', 'dc:format', 'dc:language',
            'dc:relation', 'dc:type', 'dc:source', 'dc:rights', 'dc:description',
            'provenance', 'sdewey', 'dewey', 'source', 'typedoc', 'nqamoyen',
            'mode_indexation', 'title', 'date', 'first_indexation_date', 'streamable']

centuries = ['18', '19', '20']

# For every century folder, parse each downloaded XML record and write one
# CSV row per record to ./all_metadata/all_metadata_<century>.csv.
for century in centuries:

    # Sorted so the row order of the CSV is reproducible; the order in which
    # rglob yields files depends on the file system.
    paths = sorted(pathlib.Path('./all_metadata/' + century + '/').rglob('*.xml'))

    metadata_body = []
    for path in paths:
        # Hand the raw bytes to the parser so the encoding declared in the
        # XML prolog is honoured, instead of decoding with the platform's
        # default text encoding first.
        d = xmltodict.parse(path.read_bytes())

        results = d['results']
        record = results['notice']['record']
        record_header = record['header']

        # Fields accessed with [...] are required (a missing one raises
        # KeyError); fields accessed with .get() are optional and yield None.
        header = [record_header['identifier'],
                  record_header.get('datestamp'),
                  record_header.get('setSpec')]

        dc = record['metadata']['oai_dc:dc']
        metadata = [dc.get('@xmlns:dc'),
                    dc['@xmlns:oai_dc'],
                    dc['@xmlns:xsi'],
                    dc['@xsi:schemaLocation'],
                    dc['dc:identifier'],
                    dc['dc:title'],
                    dc.get('dc:creator'),
                    dc.get('dc:publisher'),
                    dc.get('dc:date'),
                    dc.get('dc:format'),
                    dc.get('dc:language'),
                    dc.get('dc:relation'),
                    str(dc.get('dc:type')),    # parsed as a mapping; stored as its string form
                    dc.get('dc:source'),
                    str(dc.get('dc:rights')),  # parsed as a mapping; stored as its string form
                    dc.get('dc:description')]

        extra_metadata = [results['provenance'],
                          results.get('sdewey'),
                          results.get('dewey'),
                          results['source'],
                          results['typedoc'],
                          results['nqamoyen'],
                          results.get('mode_indexation'),
                          results['title'],
                          # <date> is parsed as a mapping; keep only its text content.
                          str(results['date'].get('#text')),
                          results['first_indexation_date'],
                          results['streamable']]

        metadata_body.append(header + metadata + extra_metadata)

    # Explicit UTF-8 so accented characters in the metadata are written the
    # same way on every platform.
    with open('./all_metadata/all_metadata_' + century + '.csv', 'w', newline="", encoding="utf-8") as f:
        write = csv.writer(f)
        write.writerow(metadata_headers)
        write.writerows(metadata_body)