# Scraping a web page without an API
This notebook provides code snippets for "scraping" information from a web site that doesn't offer an API--that is, a web site that was designed for a user to consult and interact with in their web browser. These snippets are for use in our class session. The `reference` folder includes a notebook with all of this code (and more) supported by a prose discussion of what's going on that may be useful for later review. This notebook doesn't include all the code examples in the longer notebook, but maintains the numbering of the code cells from that notebook so you can readily find your place in it.

## Connect to Google Drive and import some packages

In [None]:
#Code cell 1
#Connect to Google Drive and mount it at /gdrive, so that later cells can read
#and write files under /gdrive/MyDrive/
from google.colab import drive
drive.mount('/gdrive')

In [None]:
#Code cell 2
#Import packages for working with the British Library Labs' metadata file
import numpy as np
import pandas as pd

## Have a look at British Library dataset of metadata for digitized printed books

In [None]:
#Code cell 3
#Set data folder (a path inside the Google Drive mounted at /gdrive). Note the
#trailing slash: file names are appended directly to this string.
data_directory = '/gdrive/MyDrive/rbs_digital_approaches_2023/2023_data_class/'

#Load BL metadata (source: https://data.bl.uk/bl_labs_datasets/#3) from a CSV
#file into a pandas DataFrame
bl_digitized = pd.read_csv(data_directory + 'MS_digitised_books_2021-01-09.csv')

#Inspect the DataFrame to get a list of columns, a count of how many rows have
#data in each column, and the datatype of each column
bl_digitized.info()

### Get usable publication dates

In [None]:
#Code cell 4
#Drop every row that has no value in the 'Date of publication' column, keeping
#only the books for which we have a publication date. The result is stored as
#an independent copy in a new DataFrame.
bl_digitized_w_dates = bl_digitized.dropna(subset=['Date of publication']).copy()
bl_digitized_w_dates

In [None]:
#Code cell 5
#Try to turn those dates into numbers, working on the DataFrame we just made
#(the one with no missing dates). This is going to produce an error, because
#not every value in the 'Date of publication' column is a plain number.
bl_digitized_w_dates['Date of publication'] = bl_digitized_w_dates['Date of publication'].astype(int)

In [None]:
#Code cell 6
#Filter (using pandas' .loc[] indexer) to show only the rows whose
#'Date of publication' value isn't entirely numeric, i.e. the rows for which
#str.isnumeric() gives False
bl_digitized_w_dates.loc[
    bl_digitized_w_dates['Date of publication'].str.isnumeric().eq(False)
]

In [None]:
#Code cell 7
#Create a 'use_date' column from the 'Date of publication' column: take the
#first run of four digits found in each value and store it as an integer.
bl_digitized_w_dates['use_date'] = (
    bl_digitized_w_dates['Date of publication']
    .str.extract(r'([0-9]{4})', expand=False)
    .astype(int)
)
bl_digitized_w_dates

## Identify records of interest
Note that we're not throwing anything away: you can access the full set of records at any time using the `bl_digitized_w_dates` or `bl_digitized` DataFrames. See the full notebook in today's `reference` folder for more examples of ways to select a subset of records.

In [None]:
#Code cell 8
#Build a separate DataFrame holding only the rows dated before 1801
pre_1801 = bl_digitized_w_dates[bl_digitized_w_dates['use_date'].lt(1801)].copy()
#Display those rows ordered by date, then by name (the sorted result is shown
#but not stored)
pre_1801.sort_values(by=['use_date', 'Name'])

In [None]:
#Code cell 14
#Create a DataFrame of works by Aphra Behn using str.startswith(). Not all rows
#have a value in the 'Name' column, so we need to ignore any rows where that
#column is 'nan': na=False counts a missing name as "doesn't start with 'Behn'".
#(See the reference notebook for more examples.) reset_index() renumbers the
#rows from 0 and keeps the old row labels in a new 'index' column.
pre_1801_behn = pre_1801.loc[pre_1801['Name'].str.startswith('Behn', na=False)].copy().reset_index()
pre_1801_behn

## A very quick introduction to HTML
This cell produces a *very* simple HTML document right here in our notebook.

In [None]:
#Code cell 15
%%html
<html>
  <head>
    <!--Information about the page goes here, normally along with links to scripts,
    stylesheets, etc. This simple HTML puts the styling information "inline" in the header.-->
    <style type="text/css">
      body { width: 40%;}
      h1 { color: #496fad;
         }
      div { margin-bottom: 1em; }
      .maintext { font-family: serif;
                  font-size: 13pt;
                }
      /* Italic is a font *style*, not a font family, so it is set with the
         font-style property. */
      .blockquote { font-style: italic;
                    margin: 0em 3em 1em 3em;
                    padding: 0.5em;
                    background-color: #dedede;
                  }
      form { margin-top: 2em; }
      form label { font-weight: bold;
                   font-size: 11pt;
                 }
      #comment { width: 100%; }
      #submitbutton { float: right;
                      font-weight: bold;
                      font-size: 10pt;
                      background-color: #9dbaf5;
                      padding: 10px;
                      border: none;
                      border-radius: 10px;
                    }
    </style>
  </head>
  <body>
    <!--The actual content of the page that you end up seeing.-->
    <h1>A very simple HTML page</h1>
    <div class="maintext">A content <code>div</code>. This element has a
    <code>class</code> attribute that identifies it for one set of visual
    styling rules.</div>
    <div class="blockquote">This is another content <code>div</code>, with a
    different <code>class</code> attribute for very different styling. </div>
    <div class="maintext">Note that the elements in the form below have
      <code>id</code> attributes that <em>can</em> be used for visual styling,
      but also identify those elements for functional purposes.</div>
    <form id="feedback" action="/post_comment.php">
      <label for="comment">Tell us what you think!</label><br />
      <!--A textarea needs separate opening and closing tags; it can't be
      written as a self-closing element.-->
      <textarea rows="5" id="comment"
      placeholder="This form doesn't actually do anything..."></textarea>
      <input type="submit" id="submitbutton" value="Submit comment" />
    </form>
  </body>
</html>

## Have a look at a record in the British Library's online catalogue
We'll take a look at a catalogue record and the underlying HTML using Developer Tools in our web browser:
http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL01014912206

## Scraping functions


In [None]:
#Code cell 17
#Import packages
import requests
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

#This function defines an http request with a retry strategy. It accepts a URL
#as an argument, requests the URL using our defined http connection, and returns
#the response to that request. The optional timeout argument is the number of
#seconds to wait for the server before giving up (30 unless you say otherwise),
#so that a server that never answers can't leave the notebook hanging.
def create_http(url, timeout=30) :
  #Retry up to three times, but only for GET requests that come back with one
  #of the status codes listed here
  retry_strategy = Retry(
      total=3,
      status_forcelist=[429, 500, 502, 503, 504],
      allowed_methods=["GET"]
  )
  adapter = HTTPAdapter(max_retries=retry_strategy)
  #Send the User-Agent header that a desktop Firefox browser would send
  headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
  }

  #Open the session in a "with" block so that its connections are released as
  #soon as we have our response. The body of the response has already been
  #downloaded by then, so r.content and r.text still work afterwards.
  with requests.Session() as http :
    #Use the retry strategy for both secure and plain http addresses
    http.mount("https://", adapter)
    http.mount("http://", adapter)
    r = http.get(url, headers=headers, timeout=timeout)

  return r

In [None]:
#Code cell 17a
#Request one catalogue record using the create_http function from code cell 17,
#parse the HTML of the response with BeautifulSoup, and print the whole parsed
#page in an indented, readable form.
from bs4 import BeautifulSoup
test_request = create_http('http://explore.bl.uk/primo_library/libweb/action/display.do?frbrVersion=4&tabs=moreTab&ct=display&fn=search&doc=BLL01014912206')
soup = BeautifulSoup(test_request.content, 'html.parser')
print(soup.prettify())
#Uncomment the next two lines to print only the element whose class is
#'EXLSummary EXLResult' instead of the whole page
# important_bits = soup.find(class_='EXLSummary EXLResult')
# print(important_bits.prettify())


In [None]:
#Code cell 18
from bs4 import BeautifulSoup
import re
#This function looks up one record in the British Library's online catalogue
#and returns the "vdc_" number(s) found in that record's "Go" button. rec_id is
#the record id to look up (it is added to the end of 'BLL010' to build the
#URL). The function returns:
#  - a single string such as 'vdc_XXXX' if one distinct number is found
#  - a pipe-delimited string such as 'vdc_XXXX|vdc_YYYY' if several are found,
#    in the order they appear in the page
#  - the string 'None found' if the record has no "Go" button, or the button
#    doesn't contain a vdc_ number
def retrieve_vdc(rec_id) :
  #Construct a URL incorporating the rec_id parameter
  rec_url = 'http://explore.bl.uk/primo_library/libweb/action/display.do?frbrVersion=4&tabs=moreTab&ct=display&fn=search&doc=BLL010' + \
    str(rec_id) + '&vid=BLVU1&lang=en_US&institution=BL'

  #pass the URL we just constructed to the create_http function we defined in
  #code cell 17: we are calling a function from inside another function
  rec_r = create_http(rec_url)

  #Pass the content of the response to BeautifulSoup for parsing
  rec_soup = BeautifulSoup(rec_r.content, 'html.parser')

  #See comments in code cell 16 in full notebook

  #Compile a regular expression that captures whatever follows "vdc_"
  viewer_pattern = re.compile(r'vdc_([A-Za-z0-9\.]+)')

  #Locate the relevant "Go" button (find() gives us None if there isn't one)
  view_button = rec_soup.find('input', id='getit1_0')
  if view_button is None :
    return 'None found'

  #Find the vdc_ number(s). If the button has no 'value' attribute at all, we
  #search an empty string and so find nothing.
  vdc_list = viewer_pattern.findall(view_button.get('value', ''))
  print(vdc_list)

  #Get rid of duplicates. Unlike set(), dict.fromkeys() keeps the numbers in
  #the order we first saw them, so we get the same result on every run.
  vdc_distinct_list = list(dict.fromkeys(vdc_list))
  if not vdc_distinct_list :
    return 'None found'

  #Put "vdc_" back on the front of each number. If there's more than one,
  #delimit them with a pipe character (joining a single item adds no pipe).
  return '|'.join('vdc_' + vdc_item for vdc_item in vdc_distinct_list)

In [None]:
#Code cell 18a
#Test out our function with a known record id
newtest = retrieve_vdc(14816272)
newtest

## Actually scraping
We'll use the record ids from the dataset the British Library provided to get the VDC number from each record associated with Aphra Behn, then use that information to do a further scraping run to gather more information about the scans. We'll determine which scans are marked as public domain and download the title pages of public domain scans.

### Getting VDC numbers for records of interest

In [None]:
#Code cell 19
#Look up the VDC number(s) for each Behn record by passing every value in the
#'BL record ID' column to the retrieve_vdc function. This is a case in which
#several volumes were scanned twice: once by Google, then again by the British
#Library itself. We created this DataFrame earlier, in code cell 14.
pre_1801_behn['vdc'] = pre_1801_behn['BL record ID'].apply(retrieve_vdc)
pre_1801_behn

### Getting one row for each scan

In [None]:
#Code cell 25
#A 'vdc' value can hold several VDC numbers separated by pipe characters.
#First split each value into a list of individual VDC numbers...
pre_1801_behn['vdc'] = pre_1801_behn['vdc'].str.split('|')
#...then give every VDC number a row of its own and renumber the rows from 0
pre_1801_behn = pre_1801_behn.explode('vdc').reset_index(drop=True)
pre_1801_behn

### Looking at IIIF manifest (in JSON format)
Let's have a look at a IIIF manifest for a minute: https://api.bl.uk/metadata/iiif/ark:/81055/vdc_00000002C83E/manifest.json

In [None]:
#Code cell 26
import json
#This function requests the IIIF manifest (a JSON document) for one scan and
#pulls three pieces of information out of it. vdc_num is the scan's VDC number
#as a string, e.g. 'vdc_00000002C83E'. The function returns:
#  - a list of [confirmed shelfmark, license, link to title page image] if the
#    manifest could be read. The license is 'Public Domain', 'Google Books',
#    or the manifest's own license text if it matches neither.
#  - the string 'Not found' (NOT a list) if the response isn't valid JSON or is
#    missing any of the pieces we look for. Code that calls this function needs
#    to be ready for either kind of result.
def check_manifest(vdc_num) :
  iiif_manifest_url = 'https://api.bl.uk/metadata/iiif/ark:/81055/' + vdc_num + '/manifest.json'
  print(iiif_manifest_url)
  manifest_r = create_http(iiif_manifest_url)
  #Only catch the errors that mean "this isn't the manifest we expected":
  #text that isn't JSON (ValueError), a missing key or list item (LookupError),
  #or a value of an unexpected type (TypeError, AttributeError). Anything else,
  #such as interrupting the notebook, is left alone.
  try :
    manifest_json = json.loads(manifest_r.text)
    #Look through the metadata for the item labelled 'Identifier'. If there's
    #more than one, the last one wins; if there's none, we keep an empty string.
    confirmed_shelfmark = ''
    for metadata_item in manifest_json['metadata'] :
      if metadata_item['label'] == 'Identifier' :
        confirmed_shelfmark = metadata_item['value']
    #Look for the license object in the JSON, too
    license_terms = manifest_json['license']
    #Check to see if the value of the license object includes "creativecommons",
    #"google", or... something else.
    if license_terms.find('creativecommons') != -1 :
      license_label = 'Public Domain'
    elif license_terms.find('google') != -1 :
      license_label = 'Google Books'
    else :
      license_label = license_terms
    #Find the link to the book's title page (used as a thumbnail)
    title_page_link = manifest_json['thumbnail']['@id']
  except (ValueError, LookupError, TypeError, AttributeError) :
    return 'Not found'
  #Return a list including the confirmed shelfmark, the license, and the link
  #to the title page
  return [confirmed_shelfmark, license_label, title_page_link]

In [None]:
#Code cell 27
#This one's a little tricky: we're adding three columns to the DataFrame at
#once by passing a list of column names. Each row's values come from calling
#the check_manifest function on that row's VDC number. check_manifest normally
#returns a list of three values, but it returns a single string ('Not found')
#when it can't read a manifest. In that case we repeat the string three times,
#so that every row still supplies exactly one value per column.
pre_1801_behn[['confirmed_shelfmark', 'license', 'title_page']] = [
    result if isinstance(result, list) else [result] * 3
    for result in pre_1801_behn['vdc'].apply(check_manifest)
]
pre_1801_behn

In [None]:
#Code cell 28
#This function builds two links for a scan from its VDC number (a string such
#as 'vdc_00000002C83E'). It returns a list of two strings: the address of the
#scan in the British Library's book viewer, and the address of the scan's IIIF
#manifest.
def add_links(vdc_num) :
  #Both links contain the same identifier, so build it once
  ark_id = 'ark:/81055/' + vdc_num
  viewer_link = 'http://access.bl.uk/item/viewer/' + ark_id
  #The slash before "manifest.json" matters: without it the file name runs
  #straight into the VDC number and the link is broken
  iiif_manifest = 'https://api.bl.uk/metadata/iiif/' + ark_id + '/manifest.json'
  return [viewer_link, iiif_manifest]

#Add a 'book_viewer' column and an 'iiif_manifest' column, filled in from the
#pair of links that add_links returns for each row's VDC number
pre_1801_behn[['book_viewer', 'iiif_manifest']] = pre_1801_behn['vdc'].apply(add_links).tolist()

In [None]:
#Code cell 30
#Keep only the scans of Behn's works whose license is 'Public Domain' (stored
#as an independent copy), then display a few columns to see what we have
pre_1801_behn_public_domain = pre_1801_behn[pre_1801_behn['license'].eq('Public Domain')].copy()
pre_1801_behn_public_domain[['Title', 'confirmed_shelfmark', 'book_viewer', 'license', 'title_page']]

In [None]:
#Code cell 31
#Import package
import os

#Set the output directory (written out just once, so there's only one place to
#change it) and create it if it doesn't exist yet. exist_ok=True means it's not
#an error if the directory is already there.
output_dir = '/gdrive/MyDrive/rbs_digital_approaches_2023/output/behn_titlepages/'
os.makedirs(output_dir, exist_ok=True)

#Iterate through the rows of the dataframe. Retrieve the title page files using
#the create_http function and save them to our output directory
for _, row in pre_1801_behn_public_domain.iterrows() :
  vdc = row['vdc']
  get_tp = create_http(row['title_page'])
  #If the server answered with an error, skip this one: otherwise we'd save the
  #error page to a file with a .jpg name as though it were an image
  if not get_tp.ok :
    print('Could not retrieve title page for ' + vdc + ' (HTTP status ' + str(get_tp.status_code) + ')')
    continue
  #'wb' = write bytes: an image is binary data, not text
  with open(output_dir + vdc + '_tp.jpg', 'wb') as file :
    print('Saving ' + vdc + '_tp.jpg...')
    file.write(get_tp.content)
