# Getting metadata for the corpus from gallica.bnf.fr

## Get IDs for the files in the corpus

In [1]:
import csv
# Collect the record identifiers: one per line of the ID file, kept verbatim
# apart from the removed line break (no rewriting of '_' separators).
ids = []
with open('./corpus_metadata/ids_for_19_full.csv') as id_file:
    ids.extend(line.replace('\n', '') for line in id_file)

## Download the metadata of all records from the Gallica API in XML format into the corpus folder

In [2]:
import pathlib
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded, and the resulting AttributeError would be
# swallowed by the best-effort `except` below.
import urllib.request
from tqdm import tqdm

# Fetch one OAI record per identifier and store it as <identifier>.xml.
# Failures are reported and skipped so a single bad record does not abort
# the whole run.
output_dir = pathlib.Path('./corpus_metadata/19_full/')
# Without the target folder every download would fail when the file is opened.
output_dir.mkdir(parents=True, exist_ok=True)

for i in tqdm(ids):
    # The part after the last '_' of an ID is what is sent as the `ark`
    # query parameter and used as the file name.
    identifier = i.split('_')[-1]
    if not identifier:
        # Blank lines in the ID file (or IDs ending in '_') give nothing to fetch.
        continue
    filename = './corpus_metadata/19_full/' + identifier + '.xml'
    url = 'https://gallica.bnf.fr/services/OAIRecord?ark=' + identifier
    try:
        urllib.request.urlretrieve(url, filename)
    except Exception as error:
        # Best effort: keep going, but show which ID failed and why.
        print(i, error)

100%|██████████| 842/842 [06:14<00:00,  2.25it/s]


## Open metadata XML-files and convert them to CSV

In [3]:
import xmltodict

# Column names of the output CSV, in the same order as the values of each row
# built below: OAI header fields, Dublin Core fields, then the fields that sit
# directly under the `results` element.
metadata_headers = ['identifier', 'datestamp', 'setSpec',
            '@xmlns:dc', '@xmlns:oai_dc', '@xmlns:xsi', '@xsi:schemaLocation', 'dc:identifier',
            'dc:title', 'dc:creator', 'dc:publisher', 'dc:date', 'dc:format', 'dc:language',
            'dc:relation', 'dc:type', 'dc:source', 'dc:rights', 'dc:description',
            'provenance', 'sdewey', 'dewey', 'source', 'typedoc', 'nqamoyen',
            'mode_indexation', 'title', 'date', 'first_indexation_date', 'streamable']


def _first_text(node):
    """Return the text of the first element of an xmltodict node as a string.

    xmltodict gives a list for repeated elements, a dict (text under
    '#text') for an element with attributes, a plain string for an element
    without attributes, and the node is None when the element is absent.
    All of these shapes are accepted; a missing value becomes 'None', the
    same as `str(None)` elsewhere in this conversion.
    """
    if isinstance(node, list):
        node = node[0] if node else None
    if isinstance(node, dict):
        node = node.get('#text')
    return str(node)


# Sorted so that the row order of the CSV is reproducible between runs
# instead of depending on the order the filesystem lists the files in.
paths = sorted(pathlib.Path('./corpus_metadata/19_full/').rglob('*.xml'))

metadata_body = []
for path in paths:
    # Decode explicitly as UTF-8 rather than with the platform default.
    # NOTE(review): assumes the downloaded records are UTF-8 encoded - confirm.
    with open(path, 'r', encoding='utf-8') as file:
        data = file.read()
    d = xmltodict.parse(data)

    # Hoist the nested lookups that every field below shares.
    results = d['results']
    record = results['notice']['record']
    record_header = record['header']
    dc = record['metadata']['oai_dc:dc']

    header = [record_header['identifier'],
              record_header['datestamp'],
              record_header['setSpec']]

    # Subscripted fields are required (KeyError if absent); `.get` fields are
    # optional and end up as empty cells in the CSV when missing.
    metadata = [dc['@xmlns:dc'],
                dc['@xmlns:oai_dc'],
                dc['@xmlns:xsi'],
                dc['@xsi:schemaLocation'],
                dc['dc:identifier'],
                dc['dc:title'],
                dc.get('dc:creator'),
                dc.get('dc:publisher'),
                dc.get('dc:date'),
                dc.get('dc:format'),
                dc.get('dc:language'),
                dc.get('dc:relation'),
                str(dc.get('dc:type')),  # whole parsed node, stringified
                dc.get('dc:source'),
                _first_text(dc.get('dc:rights')),  # text of the first rights statement
                dc.get('dc:description')]

    extra_metadata = [results['provenance'],
                      results.get('sdewey'),
                      results.get('dewey'),
                      results['source'],
                      results['typedoc'],
                      results['nqamoyen'],
                      results.get('mode_indexation'),
                      results['title'],
                      _first_text(results['date']),  # text only, attributes dropped
                      results['first_indexation_date'],
                      results['streamable']]

    metadata_body.append(header + metadata + extra_metadata)

# newline="" is what the csv module requires of the file object; UTF-8 keeps
# non-ASCII metadata writable regardless of the platform's default encoding.
with open('./corpus_metadata/corpus_metadata_19_full.csv', 'w', newline="", encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(metadata_headers)
    writer.writerows(metadata_body)