In [None]:
!pip install xmltodict

In [1]:
import requests
import xmltodict


## Accessing a resource in python

In [2]:
import requests

# Perform a HTTP GET request on the desired URL
response = requests.get('https://www.google.com')

In [3]:
# Response object; status code 200 means the request succeeded
response

<Response [200]>

### Checking Response

In [4]:
# The code of the response can be accessed
response.status_code

200

In [5]:
# We will always get a response object, even if the request was not successful (assuming the server exists)
# Here we generate an error if the request failed. If the request was a success, nothing is raised
response.raise_for_status()

In [6]:
# And here we check directly if the status is one of the good ones
response.ok

True

In [7]:
# Failure case, resource not found
response = requests.get('https://www.google.com/totally-not-existing-path-that-google-will-not-answer')
print(response.ok)
print(response.status_code)
print(response.reason)

False
404
Not Found


In [8]:
# raise_for_status() raises requests.HTTPError when the status code is a 4xx/5xx error.
# The exception is caught and printed so that "Run All" can continue past this cell.
try:
    response.raise_for_status()
except requests.HTTPError as error:
    print(f"HTTPError: {error}")

HTTPError: 404 Client Error: Not Found for url: https://www.google.com/totally-not-existing-path-that-google-will-not-answer

### Accessing content

In [9]:
response = requests.get('https://www.google.com')
# The content of the answer can directly be accessed with .content
# Here we display the source code of the google webpage
response.content

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="fr"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/logos/doodles/2020/thank-you-public-health-workers-and-to-researchers-in-the-scientific-community-6753651837108753-law.gif" itemprop="image"><meta content="Merci \xe0 tous les personnels de sant\xe9 et \xe0 tous les chercheurs" property="twitter:title"><meta content="Merci \xe0 tous les personnels de sant\xe9 et \xe0 tous les chercheurs #GoogleDoodle" property="twitter:description"><meta content="Merci \xe0 tous les personnels de sant\xe9 et \xe0 tous les chercheurs #GoogleDoodle" property="og:description"><meta content="summary_large_image" property="twitter:card"><meta content="@GoogleDoodles" property="twitter:site"><meta content="https://www.google.com/logos/doodles/2020/thank-you-public-health-workers-and-to-researchers-in-the-scientific-community-6753651837108753.3-2xa.gif" property="twitter:image"><meta conten

In [10]:
# Access the full text representation of https://www.e-rara.ch/oec/vitruviana/content/titleinfo/6116002
response = requests.get('https://www.e-rara.ch/oec/download/fulltext/plain/6116002')
response.status_code

200

In [11]:
# By default the body of the response is exposed as raw bytes (binary format)
response.content[1000:2000]  # Showing just 1000 bytes

b'se rattache \xc3\xa0 son souvenir. Deux \xc3\xa9crivains pourtant semblent avoir enregistr\xc3\xa9 son nom dans leurs \xc3\xa9crits, comme pour indiquer qu\xe2\x80\x99il v\xc3\xa9cut : Pline, qui le cite au nombre des auteurs dont il s\xe2\x80\x99est servi; Frontin, qui le nomme comme ayant \xc3\xa9t\xc3\xa9 r\xc3\xa9put\xc3\xa9 l\xe2\x80\x99inventeur du module quinaire dans les aqueducs. On ne peut donc savoir sur la vie de Vitruve que ce qu\xe2\x80\x99il en a dit lui-m\xc3\xaame : or, il nous apprend qu\xe2\x80\x99il occupa un rang assez important dans les arm\xc3\xa9es de J. C\xc3\xa9sar, aupr\xc3\xa8s duquel il jouissait d\xe2\x80\x99une certaine consid\xc3\xa9ration ; que, de concert avec M. Aurelius, P. Numidius et L. Cornelius, il fut employ\xc3\xa9 \xc3\xa0 la construction des machines de guerre ; qu\xe2\x80\x99il \xc3\xa9leva la basilique de Fano; que, gr\xc3\xa2ce \xc3\xa0 la recommandation de la s\xc5\x93ur d\xe2\x80\x99Auguste, il dut \xc3\xa0 cet empereur, auquel il d\xc

In [12]:
# Getting a nicely encoded version
response.text[1000:2000]

'oir enregistré son nom dans leurs écrits, comme pour indiquer qu’il vécut : Pline, qui le cite au nombre des auteurs dont il s’est servi; Frontin, qui le nomme comme ayant été réputé l’inventeur du module quinaire dans les aqueducs. On ne peut donc savoir sur la vie de Vitruve que ce qu’il en a dit lui-même : or, il nous apprend qu’il occupa un rang assez important dans les armées de J. César, auprès duquel il jouissait d’une certaine considération ; que, de concert avec M. Aurelius, P. Numidius et L. Cornelius, il fut employé à la construction des machines de guerre ; qu’il éleva la basilique de Fano; que, grâce à la recommandation de la sœur d’Auguste, il dut à cet empereur, auquel il dédie son ouvrage, des gratifications qui mettaient ses vieux jours à l’abri du besoin *. Sa taille était peu avantageuse ; il avait une constitution maladive : mais il ne désespérait pas de racheter par ses connaissances ce qui lui manquait du côté des qualités physiques Il n’eut point de maîtres, ou 

In [13]:
# Saving it to a file
with open('e-rara_text.txt' , 'w', encoding='utf-8') as f:
    f.write(response.text)

### Downloading a file directly

In [14]:
GTA_LOGO_URL = 'https://upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Gta_logo.jpg/240px-Gta_logo.jpg'

In [15]:
response = requests.get(GTA_LOGO_URL)
response.raise_for_status()
# Writing in binary mode 'wb'
with open('gta_logo.jpg', 'wb') as f:
    f.write(response.content)

NOTE FOR COMPLETENESS: in this situation the file is fully downloaded in memory and then written to disk. For large files, this might not be appropriate and streaming should be used (see https://stackoverflow.com/a/34503421/2911687).

### Getting structured data

In [16]:
# Getting a IIIF Presentation manifest
response = requests.get('https://www.e-rara.ch/i3f/v20/6116002/manifest')

In [17]:
# We know IIIF manifest are JSON encoded, so we can parse the raw response to convert it to a nice python dict
import json
data_as_python_dict = json.loads(response.content)
# Accessing the metadata as stipulated by the Presentation API
data_as_python_dict['metadata']

[{'value': "L' architecture de Vitruve / trad. nouvelle par Ch.-L. Maufras",
  'label': 'Titel'},
 {'value': 'Vitruvius [ca. v1. Jh]; Maufras, Ch. L.',
  'label': 'Autor, Beteiligte'},
 {'value': 'Paris : Panckoucke, 1847', 'label': 'Impressum'},
 {'value': '2 tomes :  : Ill.', 'label': 'Umfang'},
 {'value': 'Französisch ; Latein', 'label': 'Sprache'},
 {'value': 'Stiftung Bibliothek Werner Oechslin, Einsiedeln, A04c ; app. 846',
  'label': 'Besitzende Institution'},
 {'value': 'Bibliothèque latine-française. Seconde série', 'label': 'Serie'},
 {'value': 'https://doi.org/10.3931/e-rara-19457', 'label': 'DOI'}]

In [18]:
# Shortcut version, equivalent to manual json parsing above
data_as_python_dict = response.json()
# Accessing the info about the first page of the first sequence
data_as_python_dict['sequences'][0]['canvases'][0]

{'height': 3714,
 'width': 2460,
 'images': [{'resource': {'service': {'profile': 'http://iiif.io/api/image/2/level2.json',
     '@context': 'http://iiif.io/api/image/2/context.json',
     '@id': 'https://www.e-rara.ch/oec/i3f/v20/6116005'},
    'format': 'image/jpeg',
    'height': 1509,
    'width': 1000,
    '@id': 'https://www.e-rara.ch/oec/download/webcache/1000/6116005',
    '@type': 'dctypes:Image'},
   'on': 'https://www.e-rara.ch/oec/i3f/v20/6116002/canvas/6116005',
   'motivation': 'sc:painting',
   '@id': 'https://www.e-rara.ch/oec/i3f/v20/6116002/annotation/6116005',
   '@type': 'oa:Annotation'}],
 'label': '[1]',
 '@id': 'https://www.e-rara.ch/oec/i3f/v20/6116002/canvas/6116005',
 '@type': 'sc:Canvas'}

### Request with parameters

Here we access the OAI endpoint of e-rara to get the information about a specific record. Notice the structure of the URL: the parameters follow the `?` as `key=value` pairs separated by `&`.

https://www.e-rara.ch/oec/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=6116002

In [19]:
response = requests.get('https://www.e-rara.ch/oec/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=6116002')
response.content

b'<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:mets="http://www.loc.gov/METS/" xmlns:mods="http://www.loc.gov/mods/v3" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:epicur="urn:nbn:de:1111-2004033116" xmlns:vl="http://visuallibrary.net/vl" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:marcxml="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2020-04-06T09:43:33Z</responseDate><request verb="GetRecord" metadataPrefix="oai_dc" identifier="6116002">https://www.e-rara.ch/oec/oai/</request><GetRecord><record><header><identifier>oai:www.e-rara.ch/oec:6116002</identifier><datestamp>2013-11-13T12:26:07Z</datestamp><setSpec>oec</setSpec><setSpec>book</setSpec></header><metadata><oai_dc:dc xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://www.ope

In [20]:
# Format of OAI is xml so we can parse it as well
parsed_response = xmltodict.parse(response.content)
parsed_response

OrderedDict([('OAI-PMH',
              OrderedDict([('@xmlns', 'http://www.openarchives.org/OAI/2.0/'),
                           ('@xmlns:mets', 'http://www.loc.gov/METS/'),
                           ('@xmlns:mods', 'http://www.loc.gov/mods/v3'),
                           ('@xmlns:oai_dc',
                            'http://www.openarchives.org/OAI/2.0/oai_dc/'),
                           ('@xmlns:epicur', 'urn:nbn:de:1111-2004033116'),
                           ('@xmlns:vl', 'http://visuallibrary.net/vl'),
                           ('@xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
                           ('@xmlns:oai',
                            'http://www.openarchives.org/OAI/2.0/'),
                           ('@xmlns:marcxml',
                            'http://www.loc.gov/MARC21/slim'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('@xsi:schemaLocation',
               

In [21]:
parsed_response['OAI-PMH']['GetRecord']['record']['metadata']['oai_dc:dc']

OrderedDict([('@xsi:schemaLocation',
              'http://purl.org/dc/elements/1.1/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd'),
             ('dc:title', "L' architecture de Vitruve"),
             ('dc:creator', 'Vitruvius'),
             ('dc:description',
              ['trad. nouvelle par Ch.-L. Maufras',
               'Lat.-franz. Paralleltext',
               'Originaltitel: De architectura libri decem Vitruvius Pollio']),
             ('dc:publisher', 'Panckoucke'),
             ('dc:contributor', 'Maufras, Ch. L.'),
             ('dc:date', '1847'),
             ('dc:type', ['Text', 'Book']),
             ('dc:format', ['2 tomes :', 'Ill.']),
             ('dc:identifier',
              ['doi:10.3931/e-rara-19457',
               'https://www.e-rara.ch/oec/doi/10.3931/e-rara-19457',
               'system:004370068']),
             ('dc:relation',
              'vignette : https://www.e-rara.ch/oec/titlepage/doi/10.3931/e-rara-19457/128'),
             ('dc:language', '

NOTE: In the case of e-rara, the METS format is much more complete and has links for PDF, OCR and images downloads.

https://www.e-rara.ch/oec/oai?verb=GetRecord&metadataPrefix=mets&identifier=6116002

# Playing with the BNF APIs

Gallica (the BNF online repository) has multiple APIs for accessing its data (all documented in French unfortunately, google translate might be your friend if you want to have a deeper look):

- A search API in their collection, giving us directly records in OAI format (http://api.bnf.fr/api-gallica-de-recherche):

https://gallica.bnf.fr/SRU?version=1.2&operation=searchRetrieve&suggest=0&query=%28dc.creator+all+%22Vitruve%22%29

- A pagination API to know if the text was OCRized, if the table of content was detected, etc...:

https://gallica.bnf.fr/services/Pagination?ark=bpt6k5839083t
   

## Search with API

Queries are done in CQL (https://www.loc.gov/standards/sru/cql/ from the Library of Congress). Some examples are below

In [22]:
def request_and_parse(xml_url, timeout=30):
    """Download an XML document and parse it into a python dict.

    Args:
        xml_url: URL of the XML resource to fetch.
        timeout: Number of seconds to wait for the server before giving up.
            Without a timeout, `requests` can wait indefinitely on an
            unresponsive server.

    Returns:
        The document as parsed by `xmltodict.parse`.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status code.
        requests.Timeout: If the server does not answer within `timeout` seconds.
    """
    print(f"Fetching {xml_url}")
    result = requests.get(xml_url, timeout=timeout)
    # Turn HTTP error statuses into exceptions instead of parsing an error page
    result.raise_for_status()
    return xmltodict.parse(result.content)

SRU_BASEURL = 'https://gallica.bnf.fr/SRU?version=1.2&operation=searchRetrieve&suggest=0&query='

In [23]:
# Example queries
# Query all documents related to "Architecture" (code 72 in Dewey classification)
# Note: that only brings 585 documents, which is much lower than all architecture documents in Gallica...
query = '(sdewey all "72")'
# Query all documents where Vitruve is an author
# (this assignment overwrites the previous one, so it is the value `query` holds once the cell has run)
query = '(dc.creator all "Vitruve")'

In [25]:
from urllib.parse import quote_plus
# Do a simple search
# Since query has spaces and maybe strange characters in it, we make it url-safe with the function quote_plus
# That will for instance convert spaces to '+'
request_and_parse(SRU_BASEURL + quote_plus(query))

Fetching https://gallica.bnf.fr/SRU?version=1.2&operation=searchRetrieve&suggest=0&query=%28dc.creator+all+%22Vitruve%22%29


OrderedDict([('srw:searchRetrieveResponse',
              OrderedDict([('@xmlns:ns7',
                            'http://gallica.bnf.fr/namespaces/gallica/'),
                           ('@xmlns:oai_dc',
                            'http://www.openarchives.org/OAI/2.0/oai_dc/'),
                           ('@xmlns:onix_dc',
                            'http://bibnum.bnf.fr/NS/onix_dc/'),
                           ('@xmlns:srw', 'http://www.loc.gov/zing/srw/'),
                           ('@xmlns:onix',
                            'http://www.editeur.org/onix/2.1/reference/'),
                           ('@xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
                           ('srw:version', '1.2'),
                           ('srw:echoedSearchRetrieveRequest',
                            OrderedDict([('srw:query',
                                          '(dc.creator all "Vitruve")'),
                                         ('srw:version', '1.2')])),
                           

Write a function that, given a query, fetches all the records matching that query and parses them into dictionaries.

First try to parse the records from a single request.

Then, you will realize that if your query has many results you need to request them in "blocks" (first 50 records, then the next 50, etc.), as the server will not return all of them at once (to avoid overly large responses). The additional `maximumRecords` and `startRecord` HTTP parameters can be added to the request URL to specify how many records are wanted and at which record to start. For instance, with `maximumRecords=50&startRecord=50` the response will contain results number 50 to 100.

In [26]:
def search_gallica(query, max_records=None):
    """Fetch the records matching a CQL query from the Gallica SRU API.

    The total number of hits is queried first, then the results are fetched
    page by page (50 records per request) and flattened into a single list.

    Args:
        query: CQL query string, not URL-encoded
            (e.g. '(dc.creator all "Vitruve")').
        max_records: Optional upper bound on the number of records returned.
            A falsy value (None or 0) means "no limit".

    Returns:
        A list of dicts, one per record: the content of the record's
        'oai_dc:dc' element, plus the record's 'srw:extraRecordData' stored
        under the key of the same name.
    """
    NUM_RESULTS_PER_QUERY = 50
    nb_total_records = get_nb_total_records(query)
    if max_records:
        nb_total_records = min(max_records, nb_total_records)

    all_records = []
    # NOTE(review): the SRU specification numbers records from 1; the offsets
    # below start at 0 as before - confirm how Gallica interprets startRecord=0.
    for offset in range(0, nb_total_records, NUM_RESULTS_PER_QUERY):
        # Fetch a page of results
        full_xml_response = request_and_parse(
            SRU_BASEURL + quote_plus(query)
            + f"&maximumRecords={NUM_RESULTS_PER_QUERY}&startRecord={offset}"
        )
        # An empty <srw:records/> element is parsed as None, hence the fallbacks
        records_container = full_xml_response['srw:searchRetrieveResponse'].get('srw:records') or {}
        records = records_container.get('srw:record') or []
        # xmltodict returns a single mapping (not a list) when the page holds
        # exactly one record; iterating over it would yield its keys
        if not isinstance(records, list):
            records = [records]
        # Parse results
        for r in records:
            tmp_record = dict(r['srw:recordData']['oai_dc:dc'])
            tmp_record['srw:extraRecordData'] = dict(r['srw:extraRecordData'])
            all_records.append(tmp_record)

    # Pages are always requested at full size, so the last one may overshoot max_records
    if max_records:
        all_records = all_records[:max_records]
    return all_records
    
def get_nb_total_records(query):
    """Return the number of records the SRU search reports for `query`.

    A request asking for zero records is sent, and the 'srw:numberOfRecords'
    field of the parsed response is converted to an int. When a key is
    missing from the parsed response (KeyError), 0 is returned.
    """
    count_url = SRU_BASEURL + quote_plus(query) + "&maximumRecords=0"
    parsed = request_and_parse(count_url)
    try:
        raw_count = parsed['srw:searchRetrieveResponse']['srw:numberOfRecords']
    except KeyError:
        return 0
    return int(raw_count)

In [27]:
query = '(dc.creator all "Vitruve")'
records = search_gallica(query)
# Sanity checks on the search result
assert isinstance(records, list)
# NOTE(review): 62 is presumably the number of hits when the notebook was written;
# the result comes from a live server and may change - confirm before relying on it
assert len(records) == 62

Fetching https://gallica.bnf.fr/SRU?version=1.2&operation=searchRetrieve&suggest=0&query=%28dc.creator+all+%22Vitruve%22%29&maximumRecords=0
Fetching https://gallica.bnf.fr/SRU?version=1.2&operation=searchRetrieve&suggest=0&query=%28dc.creator+all+%22Vitruve%22%29&maximumRecords=50&startRecord=0
Fetching https://gallica.bnf.fr/SRU?version=1.2&operation=searchRetrieve&suggest=0&query=%28dc.creator+all+%22Vitruve%22%29&maximumRecords=50&startRecord=50


## Get Pagination info from API

Write a small function that checks if the full text is present from the pagination info

In [28]:
def get_pagination_info(key):
    """Fetch and parse the Gallica Pagination service response for the ark `key`."""
    pagination_url = f"https://gallica.bnf.fr/services/Pagination?ark={key}"
    return request_and_parse(pagination_url)

def has_full_text(pagination_info):
    """Tell whether a parsed Pagination response flags the document as having content.

    Reads pagination_info['livre']['structure']['hasContent'] and returns True
    only when that value is exactly the string 'true'.
    """
    structure = pagination_info['livre']['structure']
    content_flag = structure['hasContent']
    return content_flag == 'true'

# Manual check on two documents: each printed label states the expected answer
pag_info_1 = get_pagination_info('btv1b21000411')
print("Should be False : ", has_full_text(pag_info_1))
pag_info_2 = get_pagination_info('bpt6k5839083t')
print("Should be True : ", has_full_text(pag_info_2))

Fetching https://gallica.bnf.fr/services/Pagination?ark=btv1b21000411
Should be False :  False
Fetching https://gallica.bnf.fr/services/Pagination?ark=bpt6k5839083t
Should be True :  True


## Download the OCRized text

Given the url of a gallica resource (for instance https://gallica.bnf.fr/ark:/12148/bpt6k236629), one can access the OCRized text (if it exists) by appending `.texteBrut` (https://gallica.bnf.fr/ark:/12148/bpt6k236629.texteBrut).

It is also always possible to download the images or the PDF directly. Since this resource has 588 pages, one can download the PDF of pages 1 to 588 with:

https://gallica.bnf.fr/ark:/12148/bpt6k236629/f1n588.pdf

Or for a smaller file, just pages 1 to 10:

https://gallica.bnf.fr/ark:/12148/bpt6k236629/f1n10.pdf

Here, we will try to download the OCRized text and save it to a text file. By default, the resource of the text (for instance https://gallica.bnf.fr/ark:/12148/bpt6k236629.texteBrut, open the developer view and inspector to look at the html structure) is in HTML and with an annoying header at the top, so we will be able to play a bit with HTML parsing to convert it to proper text.

In [29]:
from bs4 import BeautifulSoup

def get_html_text(identifier, timeout=30):
    """Download the raw HTML of the OCRized text of a Gallica resource.

    Args:
        identifier: URL of the resource; '.texteBrut' is appended to it.
        timeout: Number of seconds to wait for the server before giving up.

    Returns:
        The body of the HTTP response, as bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status code.
        requests.Timeout: If the server does not answer within `timeout` seconds.
    """
    url = identifier+'.texteBrut'
    print(f"Fetching {url}")
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than returning the HTML of an error page as if it were the text
    response.raise_for_status()
    return response.content

def get_full_text(raw_html):
    """Extract the text that follows the header of a '.texteBrut' HTML page.

    Args:
        raw_html: The HTML document to parse.

    Returns:
        str: The text of every direct child of <body> from the first <hr>
        on (the <hr> itself included), joined with newlines. An empty string
        when no <hr> is found among those children.
    """
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed;
    # results may differ between machines - consider pinning one after checking
    # it yields the same output on these pages.
    soup = BeautifulSoup(raw_html)
    html_body = soup.find('body')
    if html_body is None:
        # Empty input or a fragment without <body>: scan the top level of the document instead
        html_body = soup

    # Iterates over the children of the html body
    # Discard the blocks before the first <hr> (horizontal row)  (use .name to know what kind of tag it is)
    # Concatenate the text of the elements after (use .get_text() )
    passed_header = False
    texts = []
    for child in html_body.children:
        if child.name == 'hr':
            passed_header = True
        if passed_header:
            texts.append(child.get_text())
    return '\n'.join(texts)

In [30]:
html_text = get_html_text('https://gallica.bnf.fr/ark:/12148/bpt6k236629')

Fetching https://gallica.bnf.fr/ark:/12148/bpt6k236629.texteBrut


In [31]:
parsed_text = get_full_text(html_text)
assert isinstance(parsed_text, str)
# Yes the OCR output is really far from being great, probably old OCR software, we can do much better now
print(parsed_text[:1000])


~(9~ a~~g 
Ï'Et.A 
BIBLIOTtIÈQUE 
Mm-FRA~!SE traductions nouvelles 
DES AUTEURS LATINE 
AVEC LE TEXTE EN REGARD 
DEPUIS ADRIEN ~C8Qt)'A CBEeOtBE DE TOBBS 
pubfiëe 
PAR C. L.F PANCKOUCKE N 
orxrcrxx ne ce Lecrnn n'nwxene 
VITRUVE j! 
traduction )joav<!)!f 
( Mvee de Membretn~eo a~mrea pour t tm<eMig<-n<M- du «~ PAR M.CH.L.MAUFRAS 
Membre de la Société (tea antiquaires de Normandie 
de )a Société t'anÉenne du Calvados 
de la &ooete pbur la conservation et la description des moniirnent~ hist~xni.~ 
professeuraucoUegeRnNin 
TOME PREMŒR 
PARIS 
C. L. F. PANCEOUCKE. EDITEUR B[;E DES POITEVINS, 14 
18~7 


SECONDE SÉRIE 
DEt.t 
B~LIOTHÈQUE 
Hït~-FM~~ttS)' D"UfS ADK.EM JUSQU'A SRf:GO..iE DE TOU.!S 
p.M.. 
PARC. L. F. PANCKO~CKH 

mpKntmuEPANCKfn'r.tix, 
ruedMroitc" 

LARCHtTECTURE 
0); 
VITRtJVE 
"rI\.AUIiGl'JO."o! t'oOfJ\' }o:l.l~ 
PAttt.m.-t. Mms 
de la Soci~tl; des antiquaires (le NOI"mandie. dl' la Soci~té liulleenne du C.1~ad.. 
--t~ et la desCl'iption des mOIHunell1s 11istoriq1Jes, 
P'e

In [32]:
import os
OUTPUT_FOLDER = 'gallica_output'
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# For every record, try to get the full text and save it to a file
# Note that many documents are actually not OCRized
for record in records:
    try:
        key = record['srw:extraRecordData']['uri']
        pagination_info = get_pagination_info(key)
        if has_full_text(pagination_info):
            print(f'{key} has full text')
            # NOTE(review): 'dc:identifier' is assumed to be a single URL string here;
            # a record with several identifiers would presumably hold a list - confirm
            html_text = get_html_text(record['dc:identifier'])
            full_text = get_full_text(html_text)
            with open(os.path.join(OUTPUT_FOLDER, f'{key}.txt'), 'w', encoding='utf-8') as f:
                print(f'Saving {key}.txt')
                f.write(full_text)
    except Exception as e:
        # Best effort: report the failure and move on to the next record.
        # .get() keeps the report itself from raising (and aborting the loop)
        # when the record has no 'dc:identifier'.
        print(f'Failed for {record.get("dc:identifier")}, {e}')

Fetching https://gallica.bnf.fr/services/Pagination?ark=btv1b21000411
Fetching https://gallica.bnf.fr/services/Pagination?ark=bpt6k5839083t
bpt6k5839083t has full text
Fetching https://gallica.bnf.fr/ark:/12148/bpt6k5839083t.texteBrut
Saving bpt6k5839083t.txt
Fetching https://gallica.bnf.fr/services/Pagination?ark=bpt6k85630d
Fetching https://gallica.bnf.fr/services/Pagination?ark=btv1b8457382h
Fetching https://gallica.bnf.fr/services/Pagination?ark=bpt6k85660b
Fetching https://gallica.bnf.fr/services/Pagination?ark=bpt6k61029198
bpt6k61029198 has full text
Fetching https://gallica.bnf.fr/ark:/12148/bpt6k61029198.texteBrut
Saving bpt6k61029198.txt
Fetching https://gallica.bnf.fr/services/Pagination?ark=bpt6k5834144h
bpt6k5834144h has full text
Fetching https://gallica.bnf.fr/ark:/12148/bpt6k5834144h.texteBrut
Saving bpt6k5834144h.txt
Fetching https://gallica.bnf.fr/services/Pagination?ark=btv1b73002165
Fetching https://gallica.bnf.fr/services/Pagination?ark=bpt6k236629
bpt6k236629 has 

KeyboardInterrupt: 