# Getting OCR-ed books from gallica.bnf.fr

The whole digital collection for various centuries is at https://gallica.bnf.fr/html/und/litteratures/les-classiques-de-la-litterature-acces-par-periode?mode=desktop. Our focus is on the following collections:

1. https://gallica.bnf.fr/html/und/litteratures/les-classiques-de-la-litterature-du-xviiie-siecle?mode=desktop
2. https://gallica.bnf.fr/html/und/litteratures/les-classiques-de-la-litterature-du-xixe-siecle?mode=desktop
3. https://gallica.bnf.fr/html/und/litteratures/les-classiques-de-la-litterature-du-xxe-siecle?mode=desktop

Our task is to download all OCR-ed books from that collection for three centuries. The problem is that the APIs (https://api.bnf.fr/api-gallica-de-recherche#/recherche and https://api.bnf.fr/fr/api-document-de-gallica#/texte%20brut/get__ark__texteBrut) do not provide search over collections.

## Get all authors from the collection and the links to their pages

In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import urllib.request
import csv
import pathlib


urls = ['https://gallica.bnf.fr/html/und/litteratures/les-classiques-de-la-litterature-du-xviiie-siecle?mode=desktop',
        'https://gallica.bnf.fr/html/und/litteratures/les-classiques-de-la-litterature-du-xixe-siecle?mode=desktop',
        'https://gallica.bnf.fr/html/und/litteratures/les-classiques-de-la-litterature-du-xxe-siecle?mode=desktop']
authors = []
for century, url in enumerate(urls):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    for link in soup.find_all("div", {"class": "titre"}):
        try:
            authors.append( [str(18 + century), link.a.get('title').strip(), link.a.get('href') ] )
        except Exception:
            pass
len(authors)

243

## Get all links to all books of all authors, titles of books, contributors and license

In [2]:
def request_gallica(query, keywords, filters, startRecord=1):
    url='https://gallica.bnf.fr/SRU'
    params = {'operation': 'searchRetrieve',
              'exactSearch': False,
              'version': '1.2',
              'query': query,
              'collapsing': True,
              'keywords': keywords,
              'startRecord': startRecord,
              'maximumRecords': 50,
              'filter': filters,
              }
    try:
        r = requests.get(url, params)
        r.raise_for_status()
    except Exception: # due to error 500 sometimes
        r = requests.get(url, params)
        r.raise_for_status()
    return BeautifulSoup(r.text, 'lxml')

def merge_pages_of_records(soup, link, author):
    records = []
    for record in soup.find_all("srw:record"):
        identifier = record.find('dc:identifier').text
        title = record.find('dc:title').text
        contributors = [s.text for s in record.find_all('dc:contributor')]
        rights = [s.text for s in record.find_all('dc:rights')]
        records.append([century, link, identifier, author, title, contributors, rights])
    return records

records = []
errors = []
for i, (century, author, link) in tqdm(enumerate(authors)):
    if link.startswith('https://gallica.bnf.fr/ark:'):
        records.append([century, link, link, author, author, [], []])
    else:
        parts_of_link = [str(s) for s in link.split('&')]
        query = [s.replace('query=', '') for s in parts_of_link if s.startswith('query')]
        filters = [s.replace('filter=', '') for s in parts_of_link if s.startswith('filter')]
        keywords = [s.replace('keywords=', '') for s in parts_of_link if s.startswith('keywords')]
        try:
            soup = request_gallica(query, keywords, filters)
            number_of_records = int(soup.find("srw:numberofrecords").text)
            records.extend( merge_pages_of_records(soup, link, author) )
            pages = number_of_records // 50
            if pages>0 and number_of_records!=50:
                sRecord = 51
                for page in range(1,pages+1):
                    sRecord += 50 
                    soup = request_gallica(query, keywords, filters, startRecord=sRecord)
                    records.extend(merge_pages_of_records(soup, link, author))
        except Exception:
            errors.append([i, century, link, author, sRecord])

243it [03:50,  1.06it/s]


## Saving metadata and problematic cases

In [3]:
print( len(errors), len(records) )

3 13524


In [4]:
num = 0
for rec in records:
    if rec[1].startswith('https://gallica.bnf.fr/ark'):
        num += 1
num

2

In [5]:
with open("./metadata/records.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(['century', 'link', 'identifier', 'author', 'title', 'contributors', 'rights']) #'link', 
    writer.writerows(records)

with open("./metadata/errors.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(['index', 'century', 'link', 'author', 'startRecord']) # , 'sRecord'
    writer.writerows(errors)

with open("./metadata/authors.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(['century', 'author', 'link']) # , 'sRecord'
    writer.writerows(authors)

## Download all OCR-ed books into /data folder

In [6]:
errors2 = []
for rec in tqdm(records):
    [century, link, url, author, title, contributors, license] = rec
    if url.startswith('https://gallica.bnf.fr/ark'): 
        filename = url.replace('https://gallica.bnf.fr/', '').replace('/', '|')
        url = url + '.texteBrut'
        pathlib.Path('./data/' + century + '/' + author).mkdir(parents=True, exist_ok=True)
        try:
            urllib.request.urlretrieve(url, './data/' + century + '/' + author + '/' + filename + '.html')
        except Exception:
            errors2.append(rec)

with open("./metadata/errors2.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(['century', 'link', 'identifier', 'author', 'title', 'contributors', 'rights']) #'link', 
    writer.writerows(errors2)

100%|██████████| 13524/13524 [16:04:46<00:00,  4.28s/it]   
