# Harvest details of images related to Victorian places from the SLV catalogue

This notebook harvests a sample of images that relate to Victorian places from the SLV catalogue. The steps are:

- load a dataset of placenames downloaded from VicNames
- search for each placename in the catalogue subject headings, eg: 'Mordialloc (Vic.)', downloading a maximum of 50 results
- get the IIIF identifiers for the images from the MARC records and IIIF manifests

The results of this are:

- a CSV file containing all of the results
- a JSON file organised so that each placename can be used as a key to return a list of images

The JSON file is used in the [my place](https://slv.wraggelabs.com/myplace/) app.

In [1]:
import requests
import time
import pandas as pd
import re
from requests_cache import CachedSession
from tqdm.auto import tqdm
from pathlib import Path
import json


# Shared HTTP session used by every harvesting function below; requests_cache
# stores responses so repeated requests can be answered from the cache.
# NOTE(review): confirm that `timeout=60` is applied as the HTTP request timeout
# rather than being consumed as a cache-backend option.
sess = CachedSession(timeout=60)
# Register tqdm with pandas so that `progress_apply` is available.
tqdm.pandas()

In [2]:
def _first_value(values):
    """Return the first element of a list-like metadata field, or None if it is missing or empty."""
    return values[0] if values else None


def get_records(placename):
    """
    Search the catalogue for images with the subject heading '<placename> (Vic.)'.

    Only the first page of results (up to 50 records) is requested, and only
    records whose rights statement includes "No copyright restrictions apply."
    are kept.

    Parameters:
        placename: the place name to insert into the subject search.

    Returns:
        A list of dicts with the keys "placename", "alma_id", "title" and "date".
        "title" and "date" are None when the catalogue record does not supply them.
    """
    params = {
        "inst": "61SLV_INST",
        "vid": "61SLV_INST:SLV",
        "limit": 50,
        "offset": 0,
        "qInclude": "facet_rtype,exact,images|,|facet_tlevel,exact,online_resources",
        "q": f'sub,exact,"{placename} (Vic.)"'
    }
    response = sess.get("https://find.slv.vic.gov.au/primaws/rest/pub/pnxs", params=params)
    data = response.json()
    records = []
    # Not every record carries every display field, so look fields up
    # defensively instead of letting one incomplete record abort the harvest.
    for item in data.get("docs", []):
        pnx = item.get("pnx", {})
        display = pnx.get("display", {})
        if "No copyright restrictions apply." not in display.get("lds30", []):
            continue
        alma_id = _first_value(pnx.get("control", {}).get("sourcerecordid"))
        if alma_id is None:
            # Without an identifier the MARC record can't be retrieved later.
            continue
        records.append(
            {
                "placename": placename,
                "alma_id": alma_id,
                "title": _first_value(display.get("title")),
                "date": _first_value(display.get("creationdate")),
            }
        )
    return records

In [3]:
# Load the place names
# The "Place Name" column of this file supplies the names searched for in the catalogue.
df_places = pd.read_csv("places.csv")

In [4]:
# Search the catalogue once for each distinct place name (converted to title
# case) and pool the matching records into a single list.
records = []
unique_placenames = list(df_places["Place Name"].unique())
for placename in tqdm(unique_placenames):
    records.extend(get_records(placename.title()))

  0%|          | 0/5232 [00:00<?, ?it/s]

In [5]:
# Total number of catalogue records collected across all place names.
len(records)

9510

In [6]:
# Convert the list of record dicts into a DataFrame, one row per record.
df = pd.DataFrame(records)

In [7]:
# Display the harvested records.
df

Unnamed: 0,placename,alma_id,title,date
0,Abbotsford,9938854853607636,"Aerial view of Abbotsford, Victoria.",1 April 1954
1,Abbotsford,9938854843607636,"Aerial view of Abbotsford, Victoria.",19 February 1953
2,Abbotsford,9917156513607636,[Clifton Hill and surrounds from Studley Park]...,[ca. 1882]
3,Abbotsford,9916578523607636,Abbotsford & Johnston St. Bridge from Studley ...,[ca. 1870-ca. 1880]
4,Abbotsford,9917112033607636,[St Helliers Homestead and old Johnston St. Br...,[ca. 1863-ca.1870]
...,...,...,...,...
9505,Yarram,9934974853607636,"Anzac Day march, Yarram, Victoria.",1927?
9506,Yarram,9921622533607636,"Shire Hall & Bank Of Australasia, Yarram.",[between 1905 and 1915?]
9507,Yarram,9939667739207636,Exterior of ANZ Bank in Yarram.,1952/1953
9508,Yarram,9937105033607636,"Court House, Yarram.",[1952?]


In [8]:
def extract_years(row):
    """
    Extract a start and end year from the free-text date of a record.

    Four-digit years beginning with 18, 19 or 20 are pulled out of `row["date"]`.

    Parameters:
        row: a mapping (such as a DataFrame row) with a "date" entry.

    Returns:
        A two-item list of year strings, [start, end]:
        - one year found: that year is used for both start and end
        - two years found: returned in the order they appear
        - more than two found: the date is printed for review and the
          earliest and latest years are returned
        Returns None if no year is found.
    """
    date = str(row["date"])
    years = re.findall(r"\b(?:18|19|20)\d{2}\b", date)
    if not years:
        return None
    if len(years) > 2:
        # Unusual value: log it, then reduce to exactly two values so the result
        # still fits the start/end columns. All matches are four digits, so
        # string comparison orders them chronologically.
        print(date)
        return [min(years), max(years)]
    if len(years) == 1:
        return years * 2
    return years

In [9]:
# Add start_year and end_year columns from the years found in each record's date.
df[["start_year", "end_year"]] = df.apply(extract_years, axis=1, result_type="expand")

In [10]:
def get_image_id(alma_id):
    """
    Look up the IE image identifier for a catalogue record.

    The record's MARC text is fetched and searched for a `$e` subfield whose
    value starts with "IE" followed by digits. These identifiers are used to
    construct IIIF manifest urls.

    Returns the identifier, or an empty string if none is present.
    """
    marc_text = get_marc_record(alma_id)
    found = re.search(r"\$e(IE\d+)", marc_text)
    return found.group(1) if found else ""


def get_iiif_ids(image_id, max_retries=1):
    """
    Extract the image @ids from an IIIF manifest.

    Parameters:
        image_id: the IE identifier used to build the manifest url. An empty
            value returns "" without making a request.
        max_retries: how many times to retry with a fresh session after an
            unexpected (non-OK, non-401) HTTP status.

    Returns:
        The IIIF image service @ids of all JPEG canvases joined with "|", or an
        empty string if the manifest could not be retrieved or has no JPEG images.
    """
    global sess
    if not image_id:
        return ""
    manifest_url = f"https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/{image_id}/manifest.json"
    try:
        response = sess.get(manifest_url)
    except requests.exceptions.Timeout:
        print(f"timeout: {manifest_url}")
        return ""
    if response.status_code == 401:
        return ""
    result = ""
    if response.ok:
        image_ids = []
        try:
            manifest = response.json()
        except requests.JSONDecodeError:
            print(manifest_url)
        else:
            # There can be multiple images in a record
            # So we loop through the canvases to get each one.
            for canvas in manifest["sequences"][0]["canvases"]:
                resource = canvas["images"][0]["resource"]
                if resource["format"] == "image/jpeg":
                    image_ids.append(resource["service"]["@id"])
        result = "|".join(image_ids)
    else:
        print(f"{response.status_code}: {manifest_url}")
        if max_retries > 0:
            print("Restarting session")
            sess = CachedSession(timeout=60, headers={"User-Agent": "GLAM Workbench notebook / glam-workbench.net / tim@timsherratt.au"})
            # Keep the retry's result (it was previously discarded) and cap the
            # number of attempts so a persistent error can't recurse forever.
            result = get_iiif_ids(image_id, max_retries - 1)
    # Pause between live requests; cached responses need no delay.
    if not response.from_cache:
        time.sleep(2)
    return result

def get_marc_record(alma_id):
    """
    Fetch the source (MARC) record for a catalogue item and return the
    response body as text.
    """
    source_url = f"https://find.slv.vic.gov.au/primaws/rest/pub/sourceRecord?docId=alma{alma_id}&vid=61SLV_INST:SLV"
    return sess.get(source_url).text


def get_marc_value(marc, tag, subfield):
    """
    Look up one subfield value in the text version of an item's MARC record.

    The first line starting with `tag` followed by a tab is located, then the
    first `$<subfield>` on that line is read up to the next `$`.

    Returns the value with leading/trailing spaces, full stops and commas
    removed, or None if the tag or the subfield is not present.
    """
    line_match = re.search(rf"^{tag}\t.+", marc, re.M)
    if line_match is None:
        return None
    value_match = re.search(rf"\${subfield}([^\$]+)", line_match.group(0))
    if value_match is None:
        return None
    return value_match.group(1).strip(" .,")


In [11]:
# Get the Rosetta id from the MARC record
# (one lookup per row, with a progress bar; rows without an id get an empty string)
df["ie_id"] = df["alma_id"].progress_apply(get_image_id)

  0%|          | 0/9510 [00:00<?, ?it/s]

In [12]:
# Get the IIIF id from the manifest
# (multiple image ids for one record are stored as a single "|"-separated string)
df["image_id"] = df["ie_id"].progress_apply(get_iiif_ids)

  0%|          | 0/9510 [00:00<?, ?it/s]

400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE22128402/manifest.json
Restarting session
400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE22308453/manifest.json
Restarting session
400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE22444583/manifest.json
Restarting session


In [13]:
# Save only the rows for which at least one IIIF image id was found.
df.loc[df["image_id"] != ""].to_csv("place_images.csv", index=False)

In [14]:
# Build a dictionary keyed by placename; each value is the list of image
# records for that place. Rows without an IIIF image id are left out.
data = {}
has_image = df["image_id"] != ""
for placename, images in df.loc[has_image].groupby("placename"):
    image_details = images[["alma_id", "title", "start_year", "end_year", "ie_id", "image_id"]]
    data[placename] = image_details.to_dict(orient="records")

In [15]:
# Write the placename-keyed dictionary to a JSON file.
# The cell output is the value returned by write_text (the number of characters written).
Path("place_images.json").write_text(json.dumps(data))

2107327