##1 - Combobulate

In [None]:
#Code snippet 1
#Connect to Google Drive: mount it at /gdrive so that the file paths used
#later in this notebook (which all start with /gdrive/) can be resolved.
#google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
#Code snippet 2
#Folder under the /gdrive mount point that holds the class data. The trailing
#slash matters: the file name below is appended to it directly.
data_directory = '/gdrive/MyDrive/rbs_digital_approaches_2022/2022_data_class/'
#Full path to the CSV of metadata for the digitised books
data_file = f'{data_directory}MS_digitised_books_2021-01-09.csv'

##2 - Look at one particular row
There's other code we'll have to write in class before we get to this snippet, so it won't do anything on its own.

In [None]:
#Code snippet 3
#Show only the row(s) of pre_1801_behn whose 'BL record ID' is 14816272.
#pre_1801_behn has to exist already, so this cell fails if it is run before
#the code that builds it.
pre_1801_behn.loc[pre_1801_behn['BL record ID'].eq(14816272)]

##3 - Look at catalogue record and book view in our browser

In [None]:
#Representative BL catalogue record page:
http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL01014816272

#BL book viewer page for that book:
http://access.bl.uk/item/viewer/ark:/81055/vdc_00000002ABE8

##4 - Actually retrieve information from the web, version 1

In [None]:
#Code snippet 4
#1. Import packages: requests (and related) for retrieving the contents of the 
#web page; BeautifulSoup for parsing the HTML; re [Regular Expressions] for 
#defining and searching for a pattern of text to identify vdc_ numbers

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import re

#2. Define a function accepting rec_id as an argument
def retrieve_vdc_simple(rec_id) :
  """Look up a BL catalogue record and return the vdc_ number(s) on its page.

  rec_id is the record ID; it is converted with str() and appended to
  'BLL010' in the catalogue URL.

  Returns 'None found' if the page has no input element with id "getit1_0".
  Otherwise returns the distinct vdc_ numbers found in that element's value
  attribute, each prefixed with "vdc_", in order of first appearance and
  joined with a pipe character (an empty string if there are none).

  Errors raised by requests (for example once the retries are used up) are
  not caught here.
  """
  #2.a. Create a pattern for retrieving URLs that can tolerate some intermittent
  #server problems by retrying failed requests.
  #urllib3 1.26 renamed method_whitelist to allowed_methods and urllib3 2.0
  #removed the old name, so try the new keyword first and fall back to the
  #old one only if this installation doesn't recognise it.
  retry_options = {'total': 3, 'status_forcelist': [429, 500, 502, 503, 504]}
  try :
    retry_strategy = Retry(allowed_methods=["GET"], **retry_options)
  except TypeError :
    retry_strategy = Retry(method_whitelist=["GET"], **retry_options)
  adapter = HTTPAdapter(max_retries=retry_strategy)

  #Tell the web page that we're using Firefox for Windows.
  #This is... not veracious.
  headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
  }

  #2.b Construct and retrieve a URL that takes the basic form of the permalink to
  #a BL catalogue record with the rec_id parameter added to the end (as a string)
  rec_url = 'http://explore.bl.uk/primo_library/libweb/action/display.do?frbrVersion=4&tabs=moreTab&ct=display&fn=search&doc=BLL010' + str(rec_id) + \
  '&vid=BLVU1&lang=en_US&institution=BL'

  #Note that the actual connection to the resource is given the variable name
  #http. In other code you'll see online, you'll often see requests.get(url).
  #This is more or less what's happening here; without a retry strategy, though,
  #a scraping script can be quite brittle, failing at the first server error it
  #encounters. The "with" block closes the session once we have the response,
  #so repeated calls don't leave connections open.
  with requests.Session() as http :
    http.mount("https://", adapter)
    http.mount("http://", adapter)
    rec_r = http.get(rec_url, headers=headers)

  #2.c Pass the content of the retrieved result to BeautifulSoup for parsing
  rec_soup = BeautifulSoup(rec_r.content, 'html.parser')

  #2.d Construct a regular expression pattern to find vdc_ numbers and hold
  #on to them using a capture pattern
  viewer_pattern = re.compile(r'vdc_([A-Za-z0-9\.]+)')

  #Find the input element with id "getit1_0" using BeautifulSoup's find() method
  view_button = rec_soup.find('input', id='getit1_0')

  #There are some ghost records in the metadata file: if there's not actually a
  #view_button on the page we retrieve, return 'None found' and move on
  if view_button is None :
    return 'None found'

  #findall() returns a list of all matches of the regular expression. Using
  #.get() means a button without a value attribute gives an empty list rather
  #than a KeyError.
  vdc_list = viewer_pattern.findall(view_button.get('value', ''))
  print(vdc_list)

  #The vdc_number can appear multiple times in the value attribute, but
  #sometimes there are actually multiple vdc_ numbers for a record: there could
  #be multiple volumes, or there could be more than one scan of the same volume
  #(e.g., one by Google, one by the British Library). We want all of the
  #vdc_ numbers, but we only need each one once.
  #
  #dict.fromkeys() drops the duplicates while keeping the order in which the
  #numbers first appeared, so the same page always produces the same string
  #(set() would give the items back in an unpredictable order).
  vdc_distinct_list = list(dict.fromkeys(vdc_list))

  #2.e Prepend "vdc_" to each number and join them with a pipe character. With
  #just one number there is nothing to join, so the result is simply that
  #number with its prefix.
  return '|'.join('vdc_' + vdc_item for vdc_item in vdc_distinct_list)

##5 - Actually retrieve information from the web, version 2

In [None]:
#Code snippet 5
#This function defines an http request with a retry strategy. It accepts a URL
#as an argument, requests the URL using our defined http connection, and returns
#the response to that request
def create_http(url) :
  """Request url with GET through a retrying session and return the response.

  Up to 3 retries are allowed, and responses with status 429, 500, 502, 503
  or 504 are retried. The request is sent with a Firefox-for-Windows
  User-Agent header. Errors raised by requests are not caught here.
  """
  #urllib3 1.26 renamed method_whitelist to allowed_methods and urllib3 2.0
  #removed the old name, so try the new keyword first and fall back to the
  #old one only if this installation doesn't recognise it.
  retry_options = {'total': 3, 'status_forcelist': [429, 500, 502, 503, 504]}
  try :
    retry_strategy = Retry(allowed_methods=["GET"], **retry_options)
  except TypeError :
    retry_strategy = Retry(method_whitelist=["GET"], **retry_options)
  adapter = HTTPAdapter(max_retries=retry_strategy)

  headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
  }

  #The "with" block closes the session as soon as the response has been
  #received; this function is called once per record, so leaving sessions
  #open would pile up connections over a long scrape.
  with requests.Session() as http :
    http.mount("https://", adapter)
    http.mount("http://", adapter)
    r = http.get(url, headers=headers)

  return r

#This function receives a record id as an argument, constructs a URL to feed to
#the create_http function, then processes the information that the create_http
#function returns
def retrieve_vdc(rec_id) :
  """Look up a BL catalogue record and return the vdc_ number(s) on its page.

  The page is fetched with create_http(). rec_id is converted with str() and
  appended to 'BLL010' in the catalogue URL.

  Returns 'None found' if the page has no input element with id "getit1_0".
  Otherwise returns the distinct vdc_ numbers found in that element's value
  attribute, each prefixed with "vdc_", in order of first appearance and
  joined with a pipe character (an empty string if there are none).
  """
  #Construct a URL incorporating the rec_id parameter
  rec_url = 'http://explore.bl.uk/primo_library/libweb/action/display.do?frbrVersion=4&tabs=moreTab&ct=display&fn=search&doc=BLL010' + \
  str(rec_id) + '&vid=BLVU1&lang=en_US&institution=BL'

  #Pass the URL we just constructed to the create_http function:
  #we are calling a function from inside another function
  rec_r = create_http(rec_url)

  #Pass the content of the response to BeautifulSoup for parsing
  rec_soup = BeautifulSoup(rec_r.content, 'html.parser')

  #Pattern that finds vdc_ numbers and captures the part after "vdc_"
  viewer_pattern = re.compile(r'vdc_([A-Za-z0-9\.]+)')

  #Ghost records have no view button on their page: report that and move on
  view_button = rec_soup.find('input', id='getit1_0')
  if view_button is None :
    return 'None found'

  #Using .get() means a button without a value attribute gives an empty list
  #of matches rather than a KeyError
  vdc_list = viewer_pattern.findall(view_button.get('value', ''))
  print(vdc_list)

  #Keep each vdc_ number once. dict.fromkeys() preserves the order in which
  #the numbers first appeared, so the same page always produces the same
  #string (set() would give the items back in an unpredictable order).
  vdc_distinct_list = list(dict.fromkeys(vdc_list))

  #Prepend "vdc_" to each number and join them with a pipe character; a single
  #number comes back on its own, with no pipe
  return '|'.join('vdc_' + vdc_item for vdc_item in vdc_distinct_list)

##6 - Split a row with multiple values in one column into two rows
Like #2, above, this is only going to work after we've written some other code.

In [None]:
#Code snippet 6
shakespeare_quarto = (
    shakespeare_quarto
      #Replace the vdc column with the list we get by splitting each of its
      #values on the pipe character
      .assign(vdc=lambda frame: frame['vdc'].str.split('|'))
      #Give every item of each list a row of its own
      .explode('vdc')
      #Exploding repeats the index of the original row for each new row, so
      #throw the old indices away and number the rows afresh
      .reset_index(drop=True)
)
shakespeare_quarto

##7 - Retrieve more information from a different file on the British Library's servers

In [None]:
#Code snippet 7
import json

def confirm_shelfmark(vdc_num) :
  """Fetch the IIIF manifest for vdc_num and return its "Identifier" value.

  vdc_num is a string that is inserted into the manifest URL, which is
  printed before the request is made.

  Returns the value of the last item in the manifest's 'metadata' list whose
  label is "Identifier" (an empty string if no item has that label), or
  'Not found' if the response is not JSON of the expected shape.
  """
  #Construct a URL to retrieve the IIIF manifest
  iiif_manifest_url = 'https://api.bl.uk/metadata/iiif/ark:/81055/' + vdc_num + '/manifest.json'
  print(iiif_manifest_url)
  #Call the create_http function to retrieve the manifest
  manifest_r = create_http(iiif_manifest_url)

  #Check to see if we succeeded in retrieving a manifest: there are some ghost
  #records in the metadata file which will return errors if we attempt to
  #retrieve them.
  try :
    #Parse the JSON from our retrieved request
    manifest_json = json.loads(manifest_r.text)
    #Get the 'metadata' object in the JSON
    metadata = manifest_json['metadata']
    confirmed_shelfmark = ''
    #Iterate through the objects in the metadata object, looking for one that
    #has the label "Identifier", then updating the confirmed_shelfmark variable
    #with the value of that object
    for metadata_item in metadata :
      if metadata_item['label'] == 'Identifier' :
        confirmed_shelfmark = metadata_item['value']
    return confirmed_shelfmark
  #If we stumbled on a bad record, return a simple message. We only catch the
  #errors a bad record can cause: a response that isn't JSON (ValueError), a
  #missing key (KeyError), or JSON of an unexpected shape (TypeError). A bare
  #"except" would also swallow the interrupt we send when we try to stop a
  #long-running scrape.
  except (ValueError, KeyError, TypeError) :
    return ('Not found')

##8 - Splitting rows with multi-value columns again

In [None]:
pre_1801_behn = (
    pre_1801_behn
      #Turn each pipe-separated vdc value into a list of vdc_ numbers
      .assign(vdc=lambda frame: frame['vdc'].str.split('|'))
      #One row per vdc_ number
      .explode('vdc')
      #Renumber the rows from scratch, discarding the old indices
      .reset_index(drop=True)
)
pre_1801_behn

##9 - A different version of our scrape of the IIIF manifest to confirm shelfmark and get license terms


In [None]:
def check_manifest(vdc_num) :
  """Fetch the IIIF manifest for vdc_num; return its shelfmark and license.

  vdc_num is a string that is inserted into the manifest URL, which is
  printed before the request is made.

  Returns a two-item list [confirmed_shelfmark, license], where:
    - confirmed_shelfmark is the value of the last 'metadata' item whose
      label is "Identifier" (an empty string if no item has that label);
    - license is 'Public Domain' if the manifest's 'license' value contains
      "creativecommons", 'Google Books' if it contains "google", and the raw
      'license' value otherwise.
  Returns the string 'Not found' instead if the response is not JSON of the
  expected shape.
  """
  iiif_manifest_url = 'https://api.bl.uk/metadata/iiif/ark:/81055/' + vdc_num + '/manifest.json'
  print(iiif_manifest_url)
  manifest_r = create_http(iiif_manifest_url)
  try :
    manifest_json = json.loads(manifest_r.text)
    metadata = manifest_json['metadata']
    confirmed_shelfmark = ''
    for metadata_item in metadata :
      if metadata_item['label'] == 'Identifier' :
        confirmed_shelfmark = metadata_item['value']
    #Look for the license object in the JSON, too
    license_terms = manifest_json['license']
    #Check to see if the value of the license object includes "creativecommons",
    #"google", or... something else. (The result is called license_label
    #rather than license because Python already provides a built-in named
    #license in interactive sessions.)
    if license_terms.find('creativecommons') != -1 :
      license_label = 'Public Domain'
    elif license_terms.find('google') != -1 :
      license_label = 'Google Books'
    else :
      license_label = license_terms
    #Return a list including the confirmed shelfmark and the license
    return [confirmed_shelfmark, license_label]
  #Only catch the errors a bad record can cause: a response that isn't JSON
  #(ValueError), a missing key (KeyError), JSON of an unexpected shape
  #(TypeError), or a license value that isn't a string (AttributeError). A
  #bare "except" would also swallow the interrupt we send when we try to stop
  #a long-running scrape.
  except (ValueError, KeyError, TypeError, AttributeError) :
    return ('Not found')

##10 - Constructing links to catalogue record and book viewer


In [None]:
def add_links(vdc_num) :
  """Build the book viewer and IIIF manifest URLs for one vdc_ number.

  vdc_num is the identifier as a string, e.g. 'vdc_00000002ABE8'.

  Returns a two-item list: [viewer_link, iiif_manifest].
  """
  viewer_link = 'http://access.bl.uk/item/viewer/ark:/81055/' + vdc_num
  #The slash before manifest.json is required: without it the file name runs
  #straight on from the identifier and the URL points at nothing. This is the
  #same URL form that the manifest-retrieving functions in this notebook use.
  iiif_manifest = 'https://api.bl.uk/metadata/iiif/ark:/81055/' + vdc_num + \
    '/manifest.json'
  return([viewer_link, iiif_manifest])