# Code snippets for Monday module 3: Mapping MARC records
We'll mostly write code as we go, but this notebook provides some snippets for things that are, variously:

* Too tedious to type;
* Really easy to get wrong;
* Important for making something work, but not really the focus of the unit (i.e., you need this code, but it's not something you need to puzzle through or to debug if you make a mistake).

There is a full write-up of this exercise in the course reader, with complete code and extended explanations of what's going on. This notebook is just to have on hand during Monday afternoon's unit.

## Combobulate

In [None]:
#Code cell 1
#Connect to Google Drive.
#The mount point '/gdrive' is the root of the path stored in source_directory
#(code cell 5), so this cell has to run before any cell that reads the course
#data files.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
#Code cell 2
#Install the Pymarc package for reading MARC records
!pip install pymarc

In [None]:
#Code cell 3
#Import packages
from pymarc import MARCReader
import re
import numpy as np
import pandas as pd

In [None]:
#Code cell 4
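#Information from https://www.loc.gov/marc/countries/, turned into a Python
#dictionary with a regular expression replacement in BBEdit.
#Keys that begin with a hyphen (e.g., "-uk", "-ur") are codes that the Library
#of Congress list marks as discontinued. The hyphen is part of the key here, so
#a discontinued code read from a record (where it has no hyphen) will only be
#found if "-" is prepended before the lookup.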
marc_country_codes = {
    "aa": "Albania",
"abc": "Alberta",
"-ac": "Ashmore and Cartier Islands",
"aca": "Australian Capital Territory",
"ae": "Algeria",
"af": "Afghanistan",
"ag": "Argentina",
"-ai": "Anguilla",
"ai": "Armenia (Republic)",
"-air": "Armenian S.S.R.",
"aj": "Azerbaijan",
"-ajr": "Azerbaijan S.S.R.",
"aku": "Alaska",
"alu": "Alabama",
"am": "Anguilla",
"an": "Andorra",
"ao": "Angola",
"aq": "Antigua and Barbuda",
"aru": "Arkansas",
"as": "American Samoa",
"at": "Australia",
"au": "Austria",
"aw": "Aruba",
"ay": "Antarctica",
"azu": "Arizona",
"ba": "Bahrain",
"bb": "Barbados",
"bcc": "British Columbia",
"bd": "Burundi",
"be": "Belgium",
"bf": "Bahamas",
"bg": "Bangladesh",
"bh": "Belize",
"bi": "British Indian Ocean Territory",
"bl": "Brazil",
"bm": "Bermuda Islands",
"bn": "Bosnia and Herzegovina",
"bo": "Bolivia",
"bp": "Solomon Islands",
"br": "Burma",
"bs": "Botswana",
"bt": "Bhutan",
"bu": "Bulgaria",
"bv": "Bouvet Island",
"bw": "Belarus",
"-bwr": "Byelorussian S.S.R.",
"bx": "Brunei",
"ca": "Caribbean Netherlands",
"cau": "California",
"cb": "Cambodia",
"cc": "China",
"cd": "Chad",
"ce": "Sri Lanka",
"cf": "Congo (Brazzaville)",
"cg": "Congo (Democratic Republic)",
"ch": "China (Republic : 1949- )",
"ci": "Croatia",
"cj": "Cayman Islands",
"ck": "Colombia",
"cl": "Chile",
"cm": "Cameroon",
"-cn": "Canada",
"co": "Curaçao",
"cou": "Colorado",
"-cp": "Canton and Enderbury Islands",
"cq": "Comoros",
"cr": "Costa Rica",
"-cs": "Czechoslovakia",
"ctu": "Connecticut",
"cu": "Cuba",
"cv": "Cabo Verde",
"cw": "Cook Islands",
"cx": "Central African Republic",
"cy": "Cyprus",
"-cz": "Canal Zone",
"dcu": "District of Columbia",
"deu": "Delaware",
"dk": "Denmark",
"dm": "Benin",
"dq": "Dominica",
"dr": "Dominican Republic",
"ea": "Eritrea",
"ec": "Ecuador",
"eg": "Equatorial Guinea",
"em": "Timor-Leste",
"enk": "England",
"er": "Estonia",
"-err": "Estonia",
"es": "El Salvador",
"et": "Ethiopia",
"fa": "Faroe Islands",
"fg": "French Guiana",
"fi": "Finland",
"fj": "Fiji",
"fk": "Falkland Islands",
"flu": "Florida",
"fm": "Micronesia (Federated States)",
"fp": "French Polynesia",
"fr": "France",
"fs": "Terres australes et antarctiques françaises",
"ft": "Djibouti",
"gau": "Georgia",
"gb": "Kiribati",
"gd": "Grenada",
"-ge": "Germany (East)",
"gg": "Guernsey",
"gh": "Ghana",
"gi": "Gibraltar",
"gl": "Greenland",
"gm": "Gambia",
"-gn": "Gilbert and Ellice Islands",
"go": "Gabon",
"gp": "Guadeloupe",
"gr": "Greece",
"gs": "Georgia (Republic)",
"-gsr": "Georgian S.S.R.",
"gt": "Guatemala",
"gu": "Guam",
"gv": "Guinea",
"gw": "Germany",
"gy": "Guyana",
"gz": "Gaza Strip",
"hiu": "Hawaii",
"-hk": "Hong Kong",
"hm": "Heard and McDonald Islands",
"ho": "Honduras",
"ht": "Haiti",
"hu": "Hungary",
"iau": "Iowa",
"ic": "Iceland",
"idu": "Idaho",
"ie": "Ireland",
"ii": "India",
"ilu": "Illinois",
"im": "Isle of Man",
"inu": "Indiana",
"io": "Indonesia",
"iq": "Iraq",
"ir": "Iran",
"is": "Israel",
"it": "Italy",
"-iu": "Israel-Syria Demilitarized Zones",
"iv": "Côte d'Ivoire",
"-iw": "Israel-Jordan Demilitarized Zones",
"iy": "Iraq-Saudi Arabia Neutral Zone",
"ja": "Japan",
"je": "Jersey",
"ji": "Johnston Atoll",
"jm": "Jamaica",
"-jn": "Jan Mayen",
"jo": "Jordan",
"ke": "Kenya",
"kg": "Kyrgyzstan",
"-kgr": "Kirghiz S.S.R.",
"kn": "Korea (North)",
"ko": "Korea (South)",
"ksu": "Kansas",
"ku": "Kuwait",
"kv": "Kosovo",
"kyu": "Kentucky",
"kz": "Kazakhstan",
"-kzr": "Kazakh S.S.R.",
"lau": "Louisiana",
"lb": "Liberia",
"le": "Lebanon",
"lh": "Liechtenstein",
"li": "Lithuania",
"-lir": "Lithuania",
"-ln": "Central and Southern Line Islands",
"lo": "Lesotho",
"ls": "Laos",
"lu": "Luxembourg",
"lv": "Latvia",
"-lvr": "Latvia",
"ly": "Libya",
"mau": "Massachusetts",
"mbc": "Manitoba",
"mc": "Monaco",
"mdu": "Maryland",
"meu": "Maine",
"mf": "Mauritius",
"mg": "Madagascar",
"-mh": "Macao",
"miu": "Michigan",
"mj": "Montserrat",
"mk": "Oman",
"ml": "Mali",
"mm": "Malta",
"mnu": "Minnesota",
"mo": "Montenegro",
"mou": "Missouri",
"mp": "Mongolia",
"mq": "Martinique",
"mr": "Morocco",
"msu": "Mississippi",
"mtu": "Montana",
"mu": "Mauritania",
"mv": "Moldova",
"-mvr": "Moldavian S.S.R.",
"mw": "Malawi",
"mx": "Mexico",
"my": "Malaysia",
"mz": "Mozambique",
"-na": "Netherlands Antilles",
"nbu": "Nebraska",
"ncu": "North Carolina",
"ndu": "North Dakota",
"ne": "Netherlands",
"nfc": "Newfoundland and Labrador",
"ng": "Niger",
"nhu": "New Hampshire",
"nik": "Northern Ireland",
"nju": "New Jersey",
"nkc": "New Brunswick",
"nl": "New Caledonia",
"-nm": "Northern Mariana Islands",
"nmu": "New Mexico",
"nn": "Vanuatu",
"no": "Norway",
"np": "Nepal",
"nq": "Nicaragua",
"nr": "Nigeria",
"nsc": "Nova Scotia",
"ntc": "Northwest Territories",
"nu": "Nauru",
"nuc": "Nunavut",
"nvu": "Nevada",
"nw": "Northern Mariana Islands",
"nx": "Norfolk Island",
"nyu": "New York (State)",
"nz": "New Zealand",
"ohu": "Ohio",
"oku": "Oklahoma",
"onc": "Ontario",
"oru": "Oregon",
"ot": "Mayotte",
"pau": "Pennsylvania",
"pc": "Pitcairn Island",
"pe": "Peru",
"pf": "Paracel Islands",
"pg": "Guinea-Bissau",
"ph": "Philippines",
"pic": "Prince Edward Island",
"pk": "Pakistan",
"pl": "Poland",
"pn": "Panama",
"po": "Portugal",
"pp": "Papua New Guinea",
"pr": "Puerto Rico",
"-pt": "Portuguese Timor",
"pw": "Palau",
"py": "Paraguay",
"qa": "Qatar",
"qea": "Queensland",
"quc": "Québec (Province)",
"rb": "Serbia",
"re": "Réunion",
"rh": "Zimbabwe",
"riu": "Rhode Island",
"rm": "Romania",
"ru": "Russia (Federation)",
"-rur": "Russian S.F.S.R.",
"rw": "Rwanda",
"-ry": "Ryukyu Islands, Southern",
"sa": "South Africa",
"-sb": "Svalbard",
"sc": "Saint-Barthélemy",
"scu": "South Carolina",
"sd": "South Sudan",
"sdu": "South Dakota",
"se": "Seychelles",
"sf": "Sao Tome and Principe",
"sg": "Senegal",
"sh": "Spanish North Africa",
"si": "Singapore",
"sj": "Sudan",
"-sk": "Sikkim",
"sl": "Sierra Leone",
"sm": "San Marino",
"sn": "Sint Maarten",
"snc": "Saskatchewan",
"so": "Somalia",
"sp": "Spain",
"sq": "Eswatini",
"sr": "Surinam",
"ss": "Western Sahara",
"st": "Saint-Martin",
"stk": "Scotland",
"su": "Saudi Arabia",
"-sv": "Swan Islands",
"sw": "Sweden",
"sx": "Namibia",
"sy": "Syria",
"sz": "Switzerland",
"ta": "Tajikistan",
"-tar": "Tajik S.S.R.",
"tc": "Turks and Caicos Islands",
"tg": "Togo",
"th": "Thailand",
"ti": "Tunisia",
"tk": "Turkmenistan",
"-tkr": "Turkmen S.S.R.",
"tl": "Tokelau",
"tma": "Tasmania",
"tnu": "Tennessee",
"to": "Tonga",
"tr": "Trinidad and Tobago",
"ts": "United Arab Emirates",
"-tt": "Trust Territory of the Pacific Islands",
"tu": "Turkey",
"tv": "Tuvalu",
"txu": "Texas",
"tz": "Tanzania",
"ua": "Egypt",
"uc": "United States Misc. Caribbean Islands",
"ug": "Uganda",
"-ui": "United Kingdom Misc. Islands",
"-uik": "United Kingdom Misc. Islands",
"-uk": "United Kingdom",
"un": "Ukraine",
"-unr": "Ukraine",
"up": "United States Misc. Pacific Islands",
"-ur": "Soviet Union",
"-us": "United States",
"utu": "Utah",
"uv": "Burkina Faso",
"uy": "Uruguay",
"uz": "Uzbekistan",
"-uzr": "Uzbek S.S.R.",
"vau": "Virginia",
"vb": "British Virgin Islands",
"vc": "Vatican City",
"ve": "Venezuela",
"vi": "Virgin Islands of the United States",
"vm": "Vietnam",
"-vn": "Vietnam, North",
"vp": "Various places",
"vra": "Victoria",
"-vs": "Vietnam, South",
"vtu": "Vermont",
"wau": "Washington (State)",
"-wb": "West Berlin",
"wea": "Western Australia",
"wf": "Wallis and Futuna",
"wiu": "Wisconsin",
"wj": "West Bank of the Jordan River",
"wk": "Wake Island",
"wlk": "Wales",
"ws": "Samoa",
"wvu": "West Virginia",
"wyu": "Wyoming",
"xa": "Christmas Island (Indian Ocean)",
"xb": "Cocos (Keeling) Islands",
"xc": "Maldives",
"xd": "Saint Kitts-Nevis",
"xe": "Marshall Islands",
"xf": "Midway Islands",
"xga": "Coral Sea Islands Territory",
"xh": "Niue",
"-xi": "Saint Kitts-Nevis-Anguilla",
"xj": "Saint Helena",
"xk": "Saint Lucia",
"xl": "Saint Pierre and Miquelon",
"xm": "Saint Vincent and the Grenadines",
"xn": "North Macedonia",
"xna": "New South Wales",
"xo": "Slovakia",
"xoa": "Northern Territory",
"xp": "Spratly Island",
"xr": "Czech Republic",
"xra": "South Australia",
"xs": "South Georgia and the South Sandwich Islands",
"xv": "Slovenia",
"xx": "No place, unknown, or undetermined",
"xxc": "Canada",
"xxk": "United Kingdom",
"-xxr": "Soviet Union",
"xxu": "United States",
"ye": "Yemen",
"ykc": "Yukon Territory",
"-ys": "Yemen (People's Democratic Republic)",
"-yu": "Serbia and Montenegro",
"za": "Zambia"
}

In [None]:
#Code cell 5
#Create a variable with the path to our data folder.
#The path starts at the '/gdrive' mount point set up in code cell 1 and ends
#with a slash, because later cells append a file name directly to it.
source_directory = '/gdrive/MyDrive/rbs_digital_approaches_2023/2023_data_class/'

## Read MARC records

In [None]:
#Code cell 10

import unicodedata

#Define a couple of regular expressions for stripping away punctuation that's
#included in the MARC fields:
#1) One or more spaces and/or colons at the end of the string
field_punctuation = re.compile(r'[\s\:]+$')
#2) Opening and closing square brackets, question mark, period, and comma
other_punctuation = re.compile(r'[\[\]\?\.,]')

#Collect one dictionary per record in an ordinary Python list. We turn the list
#into a dataframe (basically a spreadsheet) once, after the loop: adding rows to
#a dataframe one at a time copies the whole dataframe on every pass.
rows = []

#Open the file of MARC records in binary mode. The "with" statement makes sure
#the file is closed again when we're done, even if something goes wrong.
with open(source_directory + '2023_d1_estc_pilgrims_progress.mrc', 'rb') as marc_file :
  reader = MARCReader(marc_file)

  #Loop through the records in the file
  for record in reader :
    #MARCReader hands back None for a record it can't parse: skip those
    if record is None :
      continue

    #Get the ESTC number from the 001 field
    estc_num = record['001'].data

    #The 008 field is a fixed-width string: publication date and country code
    #are read from set character positions
    fixed_field = record['008'].data

    #Get the publication date from the 008 field
    pub_year = fixed_field[7:11]

    #Get field 260 and its subfields a (place) and b (publisher) once, without
    #assuming any of them is present. (There seems to have been a change in
    #PyMarc that made some of my code break. Syntax for testing for presence of
    #a subfield is from here:
    #https://groups.google.com/g/pymarc/c/f5A8m0976jY/m/s9dR9yc2AwAJ )
    field_260 = record.get('260')
    if field_260 is None :
      raw_city = None
      publisher = None
    else :
      raw_city = field_260.get('a')
      publisher = field_260.get('b')

    #Get the publication city from MARC field 260|a using our
    #unicodedata.normalize trick. A missing city becomes an empty string.
    pub_city = unicodedata.normalize('NFC', raw_city or '')

    #Get rid of punctuation that's included in accord with cataloging rules
    #using the field_punctuation regular expression defined above
    stripped_city = field_punctuation.sub('', pub_city)

    #If the publication city has "i.e." in it
    ie_position = stripped_city.find('i.e.')
    if ie_position != -1 :
      #Only keep the string starting 5 characters ahead of the i in "i.e."
      stripped_city = stripped_city[ie_position + 5:]

    #Remove any other punctuation (like square brackets) from the publication
    #city, using the other_punctuation regular expression defined above
    stripped_city = other_punctuation.sub('', stripped_city)

    #Get the country code from MARC field 008, stripping any white space from
    #the right: some country codes are three characters long, others are only
    #two, and would bring white space with them
    country_code = fixed_field[15:18].rstrip()

    #Use the country code as the key to get the corresponding value from the
    #marc_country_codes dictionary from code cell 4. Discontinued codes are
    #stored in that dictionary with a leading hyphen, so try that spelling
    #second. Anything still unrecognized gets the "xx" (unknown place) label
    #rather than stopping the whole loop.
    country = marc_country_codes.get(country_code)
    if country is None :
      country = marc_country_codes.get('-' + country_code)
    if country is None :
      country = marc_country_codes['xx']

    #Combine the city name (stripped of punctuation and white space) and the country
    orig_place = stripped_city + ' ' + country

    #Get the imprint statement. Start from an empty string for every record, so
    #that a record with no 260 field, or a 260 field with no subfield b, can't
    #end up with the imprint left over from the previous record.
    imprint = ''
    if publisher is not None :
      imprint = (raw_city or '').rstrip(' :') + ': '
      imprint += publisher.rstrip(',')
      imprint += ' (' + str(pub_year) + ')'

    #Create a dictionary of the information we've extracted from this record
    #and add it to our list
    rows.append({'estc_num': estc_num,
                 'pub_year': pub_year,
                 'imprint': imprint,
                 'orig_place': orig_place})

#Build the dataframe from the collected rows, allowing pandas to create an index
#number for each row. Naming the columns here means the dataframe has the
#expected columns even if no records were read.
bib_records = pd.DataFrame(rows, columns=['estc_num', 'pub_year', 'imprint', 'orig_place'])

#Print the dataframe to screen
bib_records

## Regularize place names

In [None]:
#Code cell 11
#Gather each distinct value from the orig_place column of bib_records, put the
#values in sorted order, and store them as a one-column DataFrame named places
unique_places = bib_records['orig_place'].unique().tolist()
unique_places.sort()
places = pd.DataFrame(unique_places)
places.columns = ['orig_place']
places

In [None]:
#Code cell 12
#This is a Python dictionary with the original place names serving as keys and
#the regularized place names serving as the values.
#Each key has the form built in code cell 10: the cleaned-up city name, a
#space, then the country name looked up from the MARC country code. A key has
#to match an orig_place value exactly to be used in code cell 13.
#Different spellings of one place share a single value (see the two
#Gainsborough entries and the three Boston entries), which is what lets them
#collapse into one point on the map.
regularized_places = {
  'Bath England': 'Bath GB',
  'Argraphwŷd yn y Mwŷthig England': 'Shrewsbury GB',
  'Birmingham England': 'Birmingham GB',
  'Boston United States': 'Boston MA USA',
  'Boston NE United States': 'Boston MA USA',
  'Boston in New-England United States': 'Boston MA USA',
  'Bristol England': 'Bristol GB',
  'Caerfyrddin Wales': 'Carmarthen GB',
  'Caerlleon England': 'Chester GB',
  'Coventry England': 'Coventry GB',
  'Dublin Ireland': 'Dublin IE',
  'Edinburgh England': 'Edinburgh GB',
  'Edinburgh Scotland': 'Edinburgh GB',
  'Ephrata in Pennsylvania United States': 'Ephrata PA USA',
  'Gainsborough England': 'Gainsborough Lincolnshire GB',
  'Gainsbrough England': 'Gainsborough Lincolnshire GB',
  'Germantaun Pa United States': 'Germantown Philadelphia PA USA',
  'Germanton Pa United States': 'Germantown Philadelphia PA USA',
  'Glasgow Scotland': 'Glasgow GB',
  'Liverpool England': 'Liverpool GB',
  'London England': 'London GB',
  'Manchester England': 'Manchester GB',
  'New-York United States': 'New York NY USA',
  'Newcastle England': 'Newcastle upon Tyne GB',
  'Newcastle upon Tyne England': 'Newcastle upon Tyne GB',
  'Nottingham England': 'Nottingham GB',
  'Paisley Scotland': 'Paisley GB',
  'Philadelphia United States': 'Philadelphia PA USA',
  'Preston England': 'Preston GB',
  'Shrewsbury England': 'Shrewsbury GB',
  'Vepery India': 'Vepery India',
  'Wolverhampton England': 'Wolverhampton GB',
  'Worcester United States': 'Worcester MA USA',
  'Worcester Mass United States': 'Worcester MA USA',
  'York England': 'York GB'
}

In [None]:
#Code cell 13
#Look up every original place name in the regularized_places dictionary and
#store the result in a new regularized_place column. A place name that has no
#entry in the dictionary ends up as a missing value (NaN) in that column.
places = places.assign(
    regularized_place=places['orig_place'].map(regularized_places)
)
places

In [None]:
#Code cell 14
#Attach the regularized place name to each bibliographic record by matching
#the orig_place column of bib_records against the orig_place column of places.
#The result keeps every row of places (a "right" merge).
regularized_bib_records = bib_records.merge(places, how='right', on='orig_place')
regularized_bib_records

In [None]:
#Code cell 15
#Pull the unique values (i.e., the distinct place names) out of the
#regularized_place column of regularized_bib_records...
unique_regularized = regularized_bib_records['regularized_place'].unique().tolist()
#...and put them in a new one-column dataframe with a proper column heading.
distinct_places = pd.DataFrame(unique_regularized)
distinct_places.columns = ['regularized_place']
distinct_places

## Retrieve latitude/longitude coordinates from GeoNames API

In [None]:
#Code cell 16
#Import more packages
import requests
import urllib
import json
import time

#Function to handle requesting information from GeoNames.
def get_coords(placename) :
  """Look up a place name in GeoNames and return its coordinates.

  Sends one search request for placename, asking for only the best match,
  prints progress to the screen, waits two seconds, and returns the result.

  Parameters:
    placename: the place to search for, as text (e.g. 'London GB').

  Returns:
    A (latitude, longitude) tuple of floats.

  Raises:
    ValueError: placename is missing or not text, or no GeoNames username
      has been filled in below.
    RuntimeError: GeoNames answered, but without any search results (for
      example because it rejected the username).
    IndexError: GeoNames found no match for placename.
    requests.HTTPError: the server answered with an HTTP error status.
  """
  #Imported here so the function doesn't depend on how urllib was imported
  #elsewhere in the notebook
  from urllib.parse import quote, urlencode

  #I'll let you know this username in our class session. I don't really want to
  #push it to GitHub. You can also set up your own free account and get a username
  #of your own. Running this cell without a value for the api_username variable
  #will result in an error.
  api_username = ''

  #A place that didn't get a regularized name shows up here as a missing value
  #(NaN) rather than text. Say so plainly instead of failing further down.
  if not isinstance(placename, str) or not placename.strip() :
    raise ValueError('Cannot look up a place without a name: got ' + repr(placename))
  if not api_username :
    raise ValueError('No GeoNames username set: fill in the api_username variable in get_coords')

  #Build the query URL. urlencode escapes every character that isn't safe in a
  #URL (not just spaces); quote_via=quote writes a space as %20.
  query_url = 'http://api.geonames.org/search?'
  query_url += urlencode({'q': placename, 'maxRows': 1, 'type': 'json'}, quote_via=quote)
  #Let's see what the query URL looks like--just because. The username is added
  #to the request separately, below, so that it doesn't get printed to (and
  #saved with) the notebook's output.
  print(query_url)

  #Use the requests module to retrieve the information from our query_url. The
  #timeout keeps the notebook from hanging forever if the server never answers.
  r = requests.get(query_url, params={'username': api_username}, timeout=30)
  #Printing the place we're searching for to screen so we know something's
  #happening...
  print(placename)

  #Stop here if the server reported an HTTP error
  r.raise_for_status()

  #Parse the response from the GeoNames server as json
  response = r.json()

  #No 'geonames' list means the search didn't run. GeoNames appears to explain
  #why in a 'status' entry; show that message if it's there, otherwise show the
  #whole response.
  if 'geonames' not in response :
    status = response.get('status', {}) if isinstance(response, dict) else {}
    raise RuntimeError('GeoNames did not return search results: '
                       + str(status.get('message', response)))
  #An empty list means the search ran but found nothing
  if not response['geonames'] :
    raise IndexError('GeoNames found no match for ' + repr(placename))

  #Get the latitude and longitude values from the first (best) match
  best_match = response['geonames'][0]
  lat = float(best_match['lat'])
  lng = float(best_match['lng'])

  #Print the coordinates, so we know something's happening
  print('... ' + str(lat) + ',' + str(lng))

  #Pause for two seconds to avoid hammering the GeoNames server. It's only polite.
  time.sleep(2)

  #The output of the function: this is what we'll get back for the value in each
  #row of the distinct_places dataframe
  return (lat, lng)

#Create a coords column in distinct_places by running get_coords once for each
#value in the existing regularized_place column and keeping what it returns.
distinct_places['coords'] = distinct_places['regularized_place'].map(get_coords)

#Show our updated dataframe
distinct_places

### In case of emergency...

In [None]:
#If you're unable to get the coordinates from GeoNames for any reason, they're
#available here. Run this cell, then carry on.
#The two lists below are parallel: the first coordinate pair belongs to the
#first place name, the second to the second, and so on. Each pair is
#(latitude, longitude), the same order get_coords returns.
#Running this cell replaces distinct_places with a dataframe built from these
#lists, with the same two column names the GeoNames cell produces.
emergency_placenames = ['London GB', 'Dublin IE', 'Gainsborough Lincolnshire GB', 'Glasgow GB',
 'Edinburgh GB', 'Shrewsbury GB', 'Carmarthen GB', 'Nottingham GB', 'Wolverhampton GB',
 'Birmingham GB', 'Coventry GB', 'Manchester GB', 'Preston GB', 'York GB',
 'Vepery India', 'Bath GB', 'Newcastle upon Tyne GB', 'Bristol GB', 'New York NY USA',
 'Worcester MA USA', 'Boston MA USA', 'Philadelphia PA USA', 'Ephrata PA USA',
 'Germantown Philadelphia PA USA', 'Chester GB', 'Paisley GB', 'Liverpool GB']
emergency_coordinates = [(51.50853, -0.12574), (53.33306, -6.24889),
 (53.38333, -0.76667), (55.86515, -4.25763), (55.95206, -3.19648),
 (52.71009, -2.75208), (51.85552, -4.30535), (52.9536, -1.15047),
 (52.58547, -2.12296), (52.48142, -1.89983), (52.40656, -1.51217),
 (53.45, -2.23333), (53.76282, -2.70452), (53.95763, -1.08271),
 (13.08472, 80.2675), (51.3751, -2.36172), (54.97328, -1.61396),
 (51.45523, -2.59665), (40.71427, -74.00597), (42.26259, -71.80229),
 (42.35843, -71.05977), (39.95238, -75.16362), (40.17982, -76.17884),
 (40.04344, -75.18018), (53.1905, -2.89189), (55.83173, -4.43254),
 (53.41058, -2.97794)]
#Pair each place name with its coordinates
zipped = list(zip(emergency_placenames, emergency_coordinates))

distinct_places = pd.DataFrame(zipped, columns=['regularized_place', 'coords'])
distinct_places

In [None]:
#Code cell 17
#Add coordinates by matching the regularized_place column of our records
#against the regularized_place column of distinct_places.
#This cell overwrites regularized_bib_records, so if it gets run a second time
#the records already have a coords column. Merging again would then produce
#coords_x and coords_y columns instead of coords, and the cells below would
#fail. Dropping any existing coords column first makes the cell safe to re-run
#(errors='ignore' means "don't complain if there's no such column yet").
regularized_bib_records = pd.merge(
    left = regularized_bib_records.drop(columns=['coords'], errors='ignore'),
    right = distinct_places,
    on = 'regularized_place',
    how = 'right'
)
regularized_bib_records

In [None]:
#Code cell 18
#Turn the pub_year values into integers, then put the records in order by
#date. Rows keep the index labels they already had.
regularized_bib_records = (
    regularized_bib_records
    .astype({'pub_year': int})
    .sort_values(by=['pub_year'])
)
regularized_bib_records

## Visualize map

In [None]:
#Code cell 19
#Install Bokeh visualization library and xyzservices for adding custom map tiles.
!pip install bokeh==2.4.3
!pip install xyzservices

### Convert coordinate systems


In [None]:
#Code cell 20
#Convert latitude/longitude coordinates to Mercator coordinates used by Bokeh.

#This cell adapts code from a blog post by Craig Dickson showing a function for
# converting from lat/long to Mercator that he attributes to Nadine Amersi-Belton:
#https://towardsdatascience.com/creating-an-interactive-map-in-python-using-bokeh-and-pandas-f84414536a06

# Define function to switch from lat/long to mercator coordinates
def x_coord(x, y):
    """Convert one latitude/longitude pair to Mercator coordinates.

    Despite the parameter names, the first argument is the latitude and the
    second is the longitude, both in degrees.

    Parameters:
        x: latitude in degrees.
        y: longitude in degrees.

    Returns:
        A (mercator_x, mercator_y) tuple, scaled by r_major.
    """
    lat = x
    lon = y

    #Math. My son confirmed my dim memory that this was Trigonometry, and then looked at me pityingly.
    r_major = 6378137.000
    merc_x = r_major * np.radians(lon)
    #The earlier version worked out a scale factor as merc_x/lon. That factor is
    #always r_major * pi/180, but computing it by division gave 0/0 (not a
    #number) for any place at longitude 0. Multiplying by r_major directly
    #gives the same answer everywhere else and a real answer at longitude 0.
    merc_y = r_major * np.log(np.tan(np.pi/4.0 + np.radians(lat)/2.0))
    return (merc_x, merc_y)

# Work out the mercator pair for each record's (latitude, longitude) pair
mercators = [x_coord(lat, lng) for lat, lng in regularized_bib_records['coords']]
# Keep the pairs together in a mercator column...
regularized_bib_records['mercator'] = mercators
# ...and also store each half of the pair in a column of its own
regularized_bib_records['mercator_x'] = [pair[0] for pair in mercators]
regularized_bib_records['mercator_y'] = [pair[1] for pair in mercators]
regularized_bib_records

In [None]:
#Code cell 21
#Rather than being one monolithic library, Bokeh is set up as a collection
#of more narrowly-focused modules. This means that Bokeh imports can
#be pretty scary-looking.
from bokeh.plotting import figure, show, curdoc
from bokeh.tile_providers import get_provider
from bokeh.io import output_notebook, reset_output
from bokeh.layouts import layout, row, widgetbox
from bokeh.models import Column, ColumnDataSource, CDSView, CustomJS, \
CustomJSFilter, HoverTool, RangeSlider, Row, Slider

#Import the xyzservices library for choosing tiles other than those
#provided by default by bokeh
import xyzservices.providers as xyz

#A command to make sure bokeh works in a Jupyter/Colab notebook
output_notebook()


### Set initial map view

In [None]:
#Code cell 22
#Figure out the initial map view: the farthest points east, west, south, and north,
#then padded a bit. I honestly don't remember how I came up with 500,000.
#So I'm guessing "trial and error."
def pad_coords(mercator_float, direction=None, padding=500000) :
  """Round a Mercator coordinate and push it outward by a fixed margin.

  Parameters:
    mercator_float: the coordinate to pad.
    direction: -1 to move the value down (use for a minimum), 1 to move it up
      (use for a maximum). If left as None, the value is moved away from
      zero: down if it is negative, up otherwise.
    padding: how far to move the value.

  Returns:
    The rounded coordinate, moved by padding in the chosen direction.
  """
  rounded = round(mercator_float)
  if direction is None :
    direction = -1 if rounded < 0 else 1
  return rounded + direction * padding

#Get the minimum and maximum values from the mercator_x and mercator_y columns, then process
#those values using the pad_coords function. Minimums are always padded downward
#and maximums upward. Padding by sign alone moves a positive minimum (like our
#mercator_y minimum: every place here is north of the equator) inward, which
#cuts the southernmost point out of the initial view.
min_x = pad_coords(regularized_bib_records['mercator_x'].min(), direction=-1)
max_x = pad_coords(regularized_bib_records['mercator_x'].max(), direction=1)

min_y = pad_coords(regularized_bib_records['mercator_y'].min(), direction=-1)
max_y = pad_coords(regularized_bib_records['mercator_y'].max(), direction=1)

#Save the minima and maxima as variables to use in constructing our map view
x_vals = (min_x, max_x)
y_vals = (min_y, max_y)

#Get minimum and maximum publication years for a date filter slider.
earliest_date = regularized_bib_records['pub_year'].min()
latest_date = regularized_bib_records['pub_year'].max()


### Use historic map tiles

In [None]:
#Code cell 23
#Use tiles from an historic map from the David Rumsey Collection (free account needed—we're just using mine.)
#NOTE(review): the tile URL below includes the account's key in its "key"
#query parameter, so the key is shared with anyone who gets this notebook.
#Confirm that's acceptable, or swap in a key of your own.
#{z}, {x}, and {y} are left in the URL as placeholders; presumably the mapping
#library fills them in for each tile it requests.
from xyzservices.lib import TileProvider
davidrumsey = TileProvider(
    name="David Rumsey",
    url="https://maps.georeferencer.com/georeferences/97a85ab4-5916-5335-a98d-4746ee461d95/2019-10-22T01:14:35.433817Z/map/{z}/{x}/{y}.png?key=SZKdmpxUvt19TydshGDy",
    attribution="(C) David Rumsey",
    )

## Code snippet 9 - Draw map

In [None]:
# Code cell 24
#Builds the interactive map: a year-range slider on top of a tiled map with
#one circle per record, where the slider decides which circles are shown.
#NOTE(review): this cell is written for the bokeh version installed in code
#cell 19 (2.4.3); arguments such as plot_width and filters may not be accepted
#by other releases -- confirm before changing the pinned version.

#Let bokeh know the data to use
source = ColumnDataSource(regularized_bib_records)

#This is a bridge between Python and JavaScript: it says what to
#do whenever the value of our slider changes, namely, announce that changed value.
#The text in red after code= is actually JavaScript that we're wedging
#in to what's otherwise Python scripting. (The three single quotes designate a
#multiline string.)
callback = CustomJS(args=dict(source=source), code='''
    source.change.emit();
''')

#Define a slider to filter our records on the map by publication year.
#It starts out spanning the full range from earliest_date to latest_date
#(both worked out in code cell 22).
slider = RangeSlider(start=earliest_date, end=latest_date, value=(earliest_date, latest_date),
                           step=1, title="Publication Year", max_width=500)

#More Python to JavaScript bridging: whenever the value of the slider changes,
#perform the callback defined above.
slider.js_on_change('value', callback)

#More Python to JavaScript bridging. Define a filter that will determine
#which records bokeh will display on our map: the JavaScript builds a list with
#one true/false entry per row, true when that row's pub_year falls within the
#two ends of the slider (inclusive).
pub_year_filter = CustomJSFilter(args=dict(source1=source, slider=slider), code='''
const indices = [];

// iterate through rows of data source and see if each satisfies some constraint
for (let i = 0; i < source1.get_length(); i++){
    if (source1.data['pub_year'][i] >= slider.value[0] && source1.data['pub_year'][i] <= slider.value[1]) {
        indices.push(true);
    } else {
        indices.push(false);
    }
}
return indices;
''')

#Define a view of our map that employs the pub_year_filter we just defined.
view = CDSView(source=source, filters=[pub_year_filter])

#Construct the actual map with some parameters: how wide it is, its initial viewport
#(x_vals and y_vals, the padded Mercator bounds from code cell 22), its x- and
#y-axes, and title.
#NOTE(review): the name "map" hides Python's built-in map() function for the
#rest of the notebook session; consider a different name if later cells need
#the built-in.
map = figure(plot_width=1000, x_range=x_vals, y_range=y_vals,
           x_axis_type='mercator', y_axis_type='mercator', title='Bunyan\'s Pilgrim\'s Progress')
#Add a blue circle to the map at each mercator_x/mercator_y pair in our data
map.circle(x='mercator_x', y='mercator_y', size=10, fill_color="blue",
         fill_alpha=0.8, source=source, view=view)

#Choose our map tiles. Lots of options available. The commented-out line is an
#alternative: a modern gray basemap instead of the historic map.
# tile_provider = get_provider(xyz.Esri.WorldGrayCanvas)
tile_provider = get_provider(davidrumsey)
#Use those tiles for our map
map.add_tile(tile_provider)

#Make some information appear when we hover on one of our circles: the values
#of the estc_num and imprint columns for that record
map.add_tools(HoverTool(
    tooltips=[
              ('ESTC', '@estc_num'),
              ('Imprint', '@imprint')
    ]))

#Define a layout that includes our slider and map.
#NOTE(review): this reuses the name "layout", replacing the layout function
#imported from bokeh.layouts in code cell 21 until that cell is run again.
layout = Column(slider, map)
#Show the layout we just defined
show(layout)
