In [None]:
import requests
import xmltodict


## Accessing a resource in python

In [None]:
import requests

# Perform an HTTP GET request on the desired URL
response = requests.get('https://www.google.com')

In [None]:
# Response object, with status code 200, which means the request succeeded
response

### Checking Response

In [None]:
# The code of the response can be accessed
response.status_code

In [None]:
# We will always get a response object, even if the request was not successful (assuming the server exists)
# Here we generate an error if the request failed. If the request was a success, nothing is raised
response.raise_for_status()

In [None]:
# And here we check directly if the status is one of the good ones
response.ok

In [None]:
# Failure case, resource not found
response = requests.get('https://www.google.com/totally-not-existing-path-that-google-will-not-answer')
print(response.ok)
print(response.status_code)
print(response.reason)

In [None]:
response.raise_for_status()

### Accessing content

In [None]:
response = requests.get('https://www.google.com')
# The content of the answer can directly be accessed with .content
# Here we display the source code of the google webpage
response.content

In [None]:
# Access the full text representation of https://www.e-rara.ch/oec/vitruviana/content/titleinfo/6116002
response = requests.get('https://www.e-rara.ch/oec/download/fulltext/plain/6116002')
response.status_code

In [None]:
# By default the response content is raw bytes (binary format)
response.content[1000:2000]  # Showing just 1000 characters

In [None]:
# Getting a nicely encoded version
response.text[1000:2000]

In [None]:
# Saving it to a file
with open('e-rara_text.txt' , 'w', encoding='utf-8') as f:
    f.write(response.text)

### Downloading a file directly

In [None]:
GTA_LOGO_URL = 'https://upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Gta_logo.jpg/240px-Gta_logo.jpg'

In [None]:
response = requests.get(GTA_LOGO_URL)
response.raise_for_status()
# Writing in binary mode 'wb'
with open('gta_logo.jpg', 'wb') as f:
    f.write(response.content)

NOTE FOR COMPLETENESS: in this situation the file is fully downloaded in memory and then written to disk. For large files, this might not be appropriate and streaming should be used (see https://stackoverflow.com/a/34503421/2911687).

### Getting structured data

In [None]:
# Getting a IIIF Presentation manifest
response = requests.get('https://www.e-rara.ch/i3f/v20/6116002/manifest')

In [None]:
# We know IIIF manifests are JSON-encoded, so we can parse the raw response to convert it to a nice python dict
import json
data_as_python_dict = json.loads(response.content)
# Accessing the metadata as stipulated by the Presentation API
data_as_python_dict['metadata']

In [None]:
# Shortcut version, equivalent to manual json parsing above
data_as_python_dict = response.json()
# Accessing the info about the first page of the first sequence
data_as_python_dict['sequences'][0]['canvases'][0]

### Request with parameters

Accessing OAI endpoint of e-rara, getting the information about a specific record, notice the structure of the url

https://www.e-rara.ch/oec/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=6116002

In [None]:
response = requests.get('https://www.e-rara.ch/oec/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=6116002')
response.content

In [None]:
# Format of OAI is xml so we can parse it as well
parsed_response = xmltodict.parse(response.content)
parsed_response

In [None]:
parsed_response['OAI-PMH']['GetRecord']['record']['metadata']['oai_dc:dc']

NOTE: In the case of e-rara, the METS format is much more complete and has links for PDF, OCR and images downloads.

https://www.e-rara.ch/oec/oai?verb=GetRecord&metadataPrefix=mets&identifier=6116002

# Playing with the BNF APIs

Gallica (the BNF online repository) has multiple APIs for accessing its data (all documented in French unfortunately, google translate might be your friend if you want to have a deeper look):

- A search API in their collection, giving us directly records in OAI format (http://api.bnf.fr/api-gallica-de-recherche):

https://gallica.bnf.fr/SRU?version=1.2&operation=searchRetrieve&suggest=0&query=%28dc.creator+all+%22Vitruve%22%29

- A pagination API to know if the text was OCRized, if the table of contents was detected, etc.:

https://gallica.bnf.fr/services/Pagination?ark=bpt6k5839083t
   

## Search with API

Queries are done in CQL (https://www.loc.gov/standards/sru/cql/ from the Library of Congress). Some examples are below

In [None]:
def request_and_parse(xml_url, timeout=60):
    """Fetch an XML document over HTTP and parse it into a python dict.

    Parameters
    ----------
    xml_url : str
        URL of the XML resource to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without it a stalled server would
        block the notebook forever, since requests never times out by default.

    Returns
    -------
    dict
        The result of ``xmltodict.parse`` applied to the response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    print(f"Fetching {xml_url}")
    result = requests.get(xml_url, timeout=timeout)
    # Stop here on 4xx/5xx rather than trying to parse an error page as XML
    result.raise_for_status()
    result_parsed = xmltodict.parse(result.content)
    return result_parsed

SRU_BASEURL = 'https://gallica.bnf.fr/SRU?version=1.2&operation=searchRetrieve&suggest=0&query='

In [None]:
# Example queries
# Query all documents related to "Architecture" (code 72 in Dewey classification)
# Note: that only brings 585 documents, which is much lower than all architecture documents in Gallica...
query = '(sdewey all "72")'
# Query all documents where Vitruve is an author
query = '(dc.creator all "Vitruve")'

In [None]:
from urllib.parse import quote_plus
# Do a simple search
# Since query has spaces and maybe strange characters in it, we make it url-safe with the function quote_plus
# That will for instance convert spaces to '+'
request_and_parse(SRU_BASEURL + quote_plus(query))

Write a function that, from a query, will fetch all records corresponding to that query and parse them to dictionaries.

First try to parse the records from a single request.

Then, you will realize that if your query has many results you need to make requests in "blocks" (first 50 records, then the next 50, etc.), as the server will not give you all of them at once (to avoid overly large responses). The additional `maximumRecords` and `startRecord` HTTP parameters can be added to the request URL to specify how many records are wanted and the position of the first one (`startRecord` is 1-based, so the very first record is number 1). For instance with `maximumRecords=50&startRecord=50` the response will contain the results number 50 to 99.

In [None]:
def search_gallica(query, max_records=None):
    """Exercise placeholder: fetch the records matching ``query``.

    As written, this stub ignores both ``query`` and ``max_records`` and
    returns an empty list; the body is meant to be filled in.
    """
    # Your code here
    return []

In [None]:
query = '(dc.creator all "Vitruve")'
records = search_gallica(query)
assert isinstance(records, list)
assert len(records) == 62

## Get Pagination info from API

Write a small function that checks if the full text is present from the pagination info

In [None]:
def get_pagination_info(key):
    """Request the Gallica Pagination service for ``key`` and return the parsed XML."""
    pagination_url = f"https://gallica.bnf.fr/services/Pagination?ark={key}"
    return request_and_parse(pagination_url)

def has_full_text(pagination_info):
    """Exercise placeholder: tell whether ``pagination_info`` reports a full text.

    As written, this stub ignores ``pagination_info`` and always returns
    ``False``; the body is meant to be filled in.
    """
    # Your code here
    return False

pag_info_1 = get_pagination_info('btv1b21000411')
print("Should be False : ", has_full_text(pag_info_1))
pag_info_2 = get_pagination_info('bpt6k5839083t')
print("Should be True : ", has_full_text(pag_info_2))

## Download the OCRized text

Given the url of a gallica resource (for instance https://gallica.bnf.fr/ark:/12148/bpt6k236629), one can access the OCRized text (if it exists) by appending `.texteBrut` (https://gallica.bnf.fr/ark:/12148/bpt6k236629.texteBrut).

It is also always possible to download the images or the PDF directly. Since this resource has 588 pages, one can download the PDF of pages 1 to 588 with:

https://gallica.bnf.fr/ark:/12148/bpt6k236629/f1n588.pdf

Or for a smaller file, just pages 1 to 10:

https://gallica.bnf.fr/ark:/12148/bpt6k236629/f1n10.pdf

Here, we will try to download the OCRized text and save it to a text file. By default, the resource of the text (for instance https://gallica.bnf.fr/ark:/12148/bpt6k236629.texteBrut, open the developer view and inspector to look at the html structure) is in HTML and with an annoying header at the top, so we will be able to play a bit with HTML parsing to convert it to proper text.

In [None]:
from bs4 import BeautifulSoup

def get_html_text(identifier, timeout=60):
    """Download the raw HTML of the ``.texteBrut`` view of a resource.

    Parameters
    ----------
    identifier : str
        URL of the resource; ``'.texteBrut'`` is appended to it.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled server cannot block forever.

    Returns
    -------
    bytes
        The undecoded body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    url = identifier + '.texteBrut'
    print(f"Fetching {url}")
    data = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx: otherwise the HTML of an error page would be
    # returned and treated downstream as if it were the document text
    data.raise_for_status()
    return data.content

def get_full_text(raw_html):
    """Exercise placeholder: extract the plain text from a ``.texteBrut`` HTML page.

    Parameters
    ----------
    raw_html : bytes or str
        HTML markup handed to BeautifulSoup.

    Returns
    -------
    str
        As written, always the empty string; the extraction step is meant
        to be filled in following the hints below.
    """
    # Name the parser explicitly: otherwise BeautifulSoup emits a warning and
    # picks whichever parser happens to be installed, so the resulting tree
    # can differ from one machine to another
    soup = BeautifulSoup(raw_html, 'html.parser')
    html_body = soup.find('body')

    # Iterates over the children of the html body
    # Discard the blocks before the first <hr> (horizontal rule)  (use .name to know what kind of tag it is)
    # Concatenate the text of the elements after (use .get_text() )

    return ''

In [None]:
html_text = get_html_text('https://gallica.bnf.fr/ark:/12148/bpt6k236629')

In [None]:
parsed_text = get_full_text(html_text)
assert isinstance(parsed_text, str)
# Yes the OCR output is really far from being great, probably old OCR software, we can do much better now
print(parsed_text[:1000])

In [None]:
import os
OUTPUT_FOLDER = 'gallica_output'
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# For every record, try to get the full text and save it to a file
# Note that many documents are actually not OCRized
for record in records:
    try:
        key = record['srw:extraRecordData']['uri']
        pagination_info = get_pagination_info(key)
        if has_full_text(pagination_info):
            print(f'{key} has full text')
            html_text = get_html_text(record['dc:identifier'])
            full_text = get_full_text(html_text)
            with open(os.path.join(OUTPUT_FOLDER, f'{key}.txt'), 'w', encoding='utf-8') as f:
                print(f'Saving {key}.txt')
                f.write(full_text)
    except Exception as e:
        # Best effort: report the failure and move on to the next record.
        # The identifier is looked up defensively because a missing
        # 'dc:identifier' may be the very reason we ended up here; indexing it
        # again would raise inside the handler and abort the whole loop.
        identifier = record.get('dc:identifier', '<unknown>') if isinstance(record, dict) else '<unknown>'
        print(f'Failed for {identifier}, {e}')