# Getting metadata for the corpus from gallica.bnf.fr

## Get IDs for the files in the corpus

In [None]:
import csv
# Collect the corpus identifiers: every line of the listing file becomes one
# entry, with its newline characters removed.
ids = []
with open('./corpus_metadata/ids_for_19.csv') as id_listing:
    ids.extend(line.replace('\n', '') for line in id_listing)

## Download the metadata of every record from the Gallica API as XML files into the corpus metadata folder

In [None]:
import pathlib
import urllib
from tqdm import tqdm

# `import urllib` alone does not load the `request` submodule, so import it
# explicitly instead of relying on another package having done so.
import urllib.request

# Download the OAI record of every identifier and store it as
# ./corpus_metadata/19/<identifier>.xml.
download_dir = pathlib.Path('./corpus_metadata/19/')
# urlretrieve does not create missing folders.
download_dir.mkdir(parents=True, exist_ok=True)

for i in tqdm(ids):
    # Only the part after the last underscore is used as the ark identifier.
    identifier = i.split('_')[-1]
    if not identifier:
        # Nothing to request for entries that end with an underscore or are empty.
        continue
    filename = './corpus_metadata/19/' + identifier + '.xml'
    url = 'https://gallica.bnf.fr/services/OAIRecord?ark=' + identifier
    try:
        urllib.request.urlretrieve(url, filename)
    except Exception as err:
        # Best effort: report the failing entry together with the reason and
        # carry on with the remaining identifiers.
        print(i, err)

## Parse the downloaded metadata XML files and convert them to CSV

In [None]:
#d['results']['notice']['record']
import xmltodict
# Column names of the output CSV, in the same order as the values collected
# for every record below: OAI header fields, Dublin Core fields, then the
# extra fields found directly under <results>.
metadata_headers = ['identifier', 'datestamp', 'setSpec',
            '@xmlns:dc', '@xmlns:oai_dc', '@xmlns:xsi', '@xsi:schemaLocation', 'dc:identifier',
            'dc:title', 'dc:creator', 'dc:publisher', 'dc:date', 'dc:format', 'dc:language',
            'dc:relation', 'dc:type', 'dc:source', 'dc:rights', 'dc:description',
            'provenance', 'sdewey', 'dewey', 'source', 'typedoc', 'nqamoyen',
            'mode_indexation', 'title', 'date', 'first_indexation_date', 'streamable']


def _first_text(node):
    """Return the text of `node`, or of its first item when `node` is a list.

    xmltodict represents an element without attributes as a plain string, an
    element with attributes as a dict holding the text under '#text', and a
    repeated element as a list of those. Returns None for None or an empty list.
    """
    if isinstance(node, list):
        node = node[0] if node else None
    if isinstance(node, dict):
        return node.get('#text')
    return node


# Sorted so the row order of the CSV is reproducible between runs.
paths = sorted(pathlib.Path('./corpus_metadata/19/').rglob('*.xml'))

metadata_body = []
for path in paths:
    # Hand the raw bytes to the parser so the encoding is taken from the XML
    # document itself rather than from the platform's default text encoding.
    d = xmltodict.parse(path.read_bytes())

    results = d['results']
    record = results['notice']['record']
    record_header = record['header']
    dc = record['metadata']['oai_dc:dc']

    header = [record_header['identifier'],
              record_header['datestamp'],
              record_header['setSpec']]

    # Fields read with [] are required (a missing one raises KeyError);
    # fields read with .get() are optional and become None when absent.
    metadata = [dc['@xmlns:dc'],
                dc['@xmlns:oai_dc'],
                dc['@xmlns:xsi'],
                dc['@xsi:schemaLocation'],
                dc['dc:identifier'],
                dc['dc:title'],
                dc.get('dc:creator'),
                dc.get('dc:publisher'),
                dc.get('dc:date'),
                dc.get('dc:format'),
                dc.get('dc:language'),
                dc.get('dc:relation'),
                str(dc.get('dc:type')),  # may be a nested structure; stored as its string form
                dc.get('dc:source'),
                # Text of the first dc:rights element; no longer crashes when the
                # element is missing, occurs only once or carries no attributes.
                str(_first_text(dc.get('dc:rights'))),
                dc.get('dc:description')]

    extra_metadata = [results['provenance'],
                      results.get('sdewey'),
                      results.get('dewey'),
                      results['source'],
                      results['typedoc'],
                      results['nqamoyen'],
                      results.get('mode_indexation'),
                      results['title'],
                      str(_first_text(results['date'])),  # text of <date>, with or without attributes
                      results['first_indexation_date'],
                      results['streamable']]

    metadata_body.append(header + metadata + extra_metadata)

# Explicit UTF-8 so accented characters are written identically on every platform.
with open('./corpus_metadata/corpus_metadata_19.csv', 'w', newline="", encoding='utf-8') as f:
    write = csv.writer(f)
    write.writerow(metadata_headers)
    write.writerows(metadata_body)


In [None]:
def request_gallica(query, keywords, filters, startRecord=1, timeout=60, retries=1):
    """Send one search request to the Gallica SRU endpoint and parse the answer.

    Args:
        query: value sent as the `query` parameter.
        keywords: value sent as the `keywords` parameter.
        filters: value sent as the `filter` parameter.
        startRecord: value sent as the `startRecord` parameter (default 1).
            At most 50 records are requested per call (`maximumRecords`).
        timeout: seconds to wait for the server on each attempt, so a stalled
            connection cannot block the harvest forever.
        retries: number of additional attempts after a failed one. The default
            of 1 keeps the previous "try twice" behaviour.

    Returns:
        The response text parsed by BeautifulSoup with the 'lxml' parser.

    Raises:
        requests.RequestException: if the final attempt fails as well
            (connection problem, timeout, or an HTTP error status).
    """
    url = 'https://gallica.bnf.fr/SRU'
    params = {'operation': 'searchRetrieve',
              'exactSearch': False,
              'version': '1.2',
              'query': query,
              'collapsing': True,
              'keywords': keywords,
              'startRecord': startRecord,
              'maximumRecords': 50,
              'filter': filters,
              }
    # Negative values are treated as "no retry" so at least one attempt is made.
    for attempts_left in range(max(retries, 0), -1, -1):
        try:
            r = requests.get(url, params=params, timeout=timeout)
            r.raise_for_status()
            break
        except requests.RequestException:
            # The service sometimes answers with error 500, so try again
            # unless this was the last allowed attempt.
            if attempts_left == 0:
                raise
    return BeautifulSoup(r.text, 'lxml')

def merge_pages_of_records(soup, link, author, century=None):
    """Turn every record of one parsed SRU response into a row of the records table.

    Args:
        soup: parsed SRU response; its "srw:record" elements are processed.
        link: search link the records belong to; copied into every row.
        author: author the search belongs to; copied into every row.
        century: century copied into every row. When omitted, the module-level
            variable `century` is used, so existing three-argument calls keep
            working; passing it explicitly removes that hidden dependency.

    Returns:
        A list with one row per record:
        [century, link, identifier, author, title, contributors, rights],
        where contributors and rights are lists of element texts.

    Raises:
        NameError: if `century` is omitted, no module-level `century` exists
            and the response contains at least one record.
    """
    records = []
    for record in soup.find_all("srw:record"):
        if century is None:
            # Backward-compatible fallback to the caller's global variable,
            # resolved only when a row actually has to be built.
            try:
                century = globals()['century']
            except KeyError:
                raise NameError("name 'century' is not defined") from None
        identifier = record.find('dc:identifier').text
        title = record.find('dc:title').text
        contributors = [s.text for s in record.find_all('dc:contributor')]
        rights = [s.text for s in record.find_all('dc:rights')]
        records.append([century, link, identifier, author, title, contributors, rights])
    return records

# Number of records per result page; must match the `maximumRecords` value
# that request_gallica sends.
PAGE_SIZE = 50

records = []  # one row per document found
errors = []   # one row per author whose search could not be harvested completely

# NOTE: keep the loop variable named `century`: merge_pages_of_records, as it
# is called here, takes the century from the module-level variable `century`.
# tqdm wraps `authors` directly (not the enumerate object) so it can show a total.
for i, (century, author, link) in enumerate(tqdm(authors)):
    if link.startswith('https://gallica.bnf.fr/ark:'):
        # Direct link to a single document: no search needed, the link doubles
        # as identifier and the author name as title.
        records.append([century, link, link, author, author, [], []])
        continue

    # Search link: pull the query/filter/keywords values out of its parameters.
    parts_of_link = link.split('&')
    query = [s.replace('query=', '') for s in parts_of_link if s.startswith('query')]
    filters = [s.replace('filter=', '') for s in parts_of_link if s.startswith('filter')]
    keywords = [s.replace('keywords=', '') for s in parts_of_link if s.startswith('keywords')]

    # Set before the try block so the error row always reports the page that
    # failed for THIS author (never an undefined or stale value).
    start_record = 1
    try:
        soup = request_gallica(query, keywords, filters)
        number_of_records = int(soup.find("srw:numberofrecords").text)
        records.extend(merge_pages_of_records(soup, link, author))
        # Remaining pages start at 51, 101, ... up to the reported total.
        # (Previously the first follow-up request started at 101, so records
        # 51-100 were never fetched.)
        for start_record in range(PAGE_SIZE + 1, number_of_records + 1, PAGE_SIZE):
            soup = request_gallica(query, keywords, filters, startRecord=start_record)
            records.extend(merge_pages_of_records(soup, link, author))
    except Exception:
        # Best effort: remember where the harvest broke off and continue with
        # the next author. Rows collected before the failure are kept.
        errors.append([i, century, link, author, start_record])

In [None]:
# Show how many error rows and how many record rows were collected.
error_count, record_count = len(errors), len(records)
print(error_count, record_count)

In [None]:
# Count the rows whose link column (index 1) starts with the Gallica ark
# address, then display the total.
ark_prefix = 'https://gallica.bnf.fr/ark'
num = sum(1 for rec in records if rec[1].startswith(ark_prefix))
num

In [None]:
def _write_csv(path, column_names, rows):
    """Write `column_names` as the header line followed by `rows` to `path`.

    The file is written as UTF-8 so accented characters come out the same on
    every platform instead of depending on the locale's default encoding.
    """
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(column_names)
        writer.writerows(rows)


# open() does not create missing folders, so make sure the target exists.
pathlib.Path("./metadata").mkdir(parents=True, exist_ok=True)

# Save the harvested records, the failed searches and the author list.
_write_csv("./metadata/records.csv",
           ['century', 'link', 'identifier', 'author', 'title', 'contributors', 'rights'],
           records)
_write_csv("./metadata/errors.csv",
           ['index', 'century', 'link', 'author', 'startRecord'],
           errors)
_write_csv("./metadata/authors.csv",
           ['century', 'author', 'link'],
           authors)