# Download newspaper records from the SLV catalogue

This notebook downloads the results of a [search for Victorian newspapers](https://find.slv.vic.gov.au/discovery/search?query=lds03,exact,Australia--Victoria&tab=searchProfile&search_scope=slv_local&vid=61SLV_INST:SLV) from the State Library of Victoria's catalogue. The search looks for records where 'Australia--Victoria' appears in the 'Place newspaper published' field (`lds03`). It gets the results data from a Primo JSON API and paginates through the data by updating the `offset` parameter. The results are saved in a newline-delimited JSON (NDJSON) file.

The newspaper data is linked to place information in the [processing newspaper data](process_newspapers.ipynb) notebook.

In [1]:
import requests
import time
import pandas as pd
import re
from requests_cache import CachedSession
from tqdm.auto import tqdm
from pathlib import Path
import json


# Shared caching session used by the catalogue and image download cells.
# NOTE(review): `timeout=60` is given to the CachedSession constructor, not to
# the individual `sess.get(...)` calls — confirm which timeout it controls.
sess = CachedSession(timeout=60)
# Register tqdm's pandas integration so `progress_apply` is available on Series.
tqdm.pandas()

In [None]:
# Harvest the `display` section of every matching newspaper record by paging
# through the Primo search API until a page comes back with no documents.
records = []
params = {
    "inst": "61SLV_INST",
    "vid": "61SLV_INST:SLV",
    "limit": 50,
    "offset": 0,
    "qInclude": "facet_genre,exact,Newspapers",
    "facet": "genre,include,Newspapers",
    "q": "lds03,exact,Australia--Victoria",
}

while True:
    response = sess.get("https://find.slv.vic.gov.au/primaws/rest/pub/pnxs", params=params)
    # Fail loudly on an HTTP error rather than trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    if not data["docs"]:
        # An empty page means the previous page was the last one.
        break
    records.extend(item["pnx"]["display"] for item in data["docs"])
    # Advance by the page size actually requested, so the two values cannot drift apart.
    params["offset"] += params["limit"]
    print(len(records))
    # Only pause between requests that really hit the server; cached pages are free.
    if not response.from_cache:
        time.sleep(5)

In [191]:
# Serialise the harvested records as NDJSON: one JSON document per line.
ndjson_path = Path("newspapers.ndjson")
with ndjson_path.open("w") as out_file:
    out_file.writelines(json.dumps(entry) + "\n" for entry in records)

In [228]:
def get_records(placename):
    """
    Search the SLV catalogue for online images of a place that carry no
    copyright restrictions.

    Queries the Primo API for records whose subject is exactly
    `"<placename> (Vic.)"`, filtered to the `images` resource type and the
    `online_resources` facet, and keeps the records whose `lds30` display
    field contains the statement "No copyright restrictions apply.".

    Only one request is made, so at most the first 50 results (`limit`)
    are examined for each place.

    Parameters:
        placename: place name inserted into the subject query.

    Returns:
        A list of dicts with the keys `placename`, `alma_id`, `title` and
        `date`. `title` and `date` are `None` when the record has no such
        display field.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    records = []
    params = {
        "inst": "61SLV_INST",
        "vid": "61SLV_INST:SLV",
        "limit": 50,
        "offset": 0,
        "qInclude": "facet_rtype,exact,images|,|facet_tlevel,exact,online_resources",
        "q": f'sub,exact,"{placename} (Vic.)"'
    }
    response = sess.get("https://find.slv.vic.gov.au/primaws/rest/pub/pnxs", params=params)
    # Surface HTTP failures directly instead of as a confusing JSON decode error.
    response.raise_for_status()
    data = response.json()
    for item in data.get("docs", []):
        display = item["pnx"]["display"]
        # A record without any `lds30` statement is skipped rather than raising KeyError.
        if "No copyright restrictions apply." not in display.get("lds30", []):
            continue
        records.append({
            "placename": placename,
            "alma_id": item["pnx"]["control"]["sourcerecordid"][0],
            # Display fields are lists; take the first value, or None if absent/empty.
            "title": (display.get("title") or [None])[0],
            "date": (display.get("creationdate") or [None])[0],
        })
    return records


    

In [229]:
# Load the list of places; its "Place Name" column supplies the names searched for below.
df_places = pd.read_csv("places.csv")

In [230]:
# Collect image records for every distinct place name in the places list.
# No delay is inserted between requests.
records = []
unique_places = df_places["Place Name"].unique()
for placename in tqdm(list(unique_places)):
    # Names are title-cased before being placed in the subject query.
    records.extend(get_records(placename.title()))

  0%|          | 0/5232 [00:00<?, ?it/s]

In [231]:
# Total number of image records collected across all places.
len(records)

9510

In [232]:
# One row per harvested record, with the keys built by get_records
# (placename, alma_id, title, date) as columns.
df = pd.DataFrame(records)

In [233]:
# Preview the harvested records.
df

Unnamed: 0,placename,alma_id,title,date
0,Abbotsford,9938854853607636,"Aerial view of Abbotsford, Victoria.",1 April 1954
1,Abbotsford,9938854843607636,"Aerial view of Abbotsford, Victoria.",19 February 1953
2,Abbotsford,9917156513607636,[Clifton Hill and surrounds from Studley Park]...,[ca. 1882]
3,Abbotsford,9916578523607636,Abbotsford & Johnston St. Bridge from Studley ...,[ca. 1870-ca. 1880]
4,Abbotsford,9917112033607636,[St Helliers Homestead and old Johnston St. Br...,[ca. 1863-ca.1870]
...,...,...,...,...
9505,Yarram,9934974853607636,"Anzac Day march, Yarram, Victoria.",1927?
9506,Yarram,9921622533607636,"Shire Hall & Bank Of Australasia, Yarram.",[between 1905 and 1915?]
9507,Yarram,9939667739207636,Exterior of ANZ Bank in Yarram.,1952/1953
9508,Yarram,9937105033607636,"Court House, Yarram.",[1952?]


In [241]:
def extract_years(row):
    """
    Pull a start year and an end year out of a record's free-text `date` value.

    Every standalone four-digit number beginning with 18, 19 or 20 is treated
    as a year.

    Parameters:
        row: mapping-like object (e.g. a DataFrame row) with a `date` entry.

    Returns:
        A two-item list `[start, end]` of year strings, or `None` when the
        date contains no recognisable year.

        - one year found: that year is used for both start and end;
        - two years found: returned in the order they appear;
        - more than two: the date is printed for review and the earliest
          and latest years are returned.
    """
    years = re.findall(r"\b(?:18|19|20)\d{2}\b", str(row["date"]))
    if not years:
        return None
    if len(years) > 2:
        # Flag unusual dates for manual checking, but still return exactly two
        # values so the result fits the two-column expansion used by the caller.
        print(row["date"])
        return [min(years), max(years)]
    if len(years) == 1:
        return years * 2
    return years

In [242]:
# Apply extract_years to every row; `result_type="expand"` spreads each returned
# two-item list across the new start_year / end_year columns.
# NOTE(review): this adds columns to `df` in place.
df[["start_year", "end_year"]] = df.apply(extract_years, axis=1, result_type="expand")

In [244]:
# Look up image identifiers for each record's `ie_id`, showing a progress bar.
# NOTE(review): neither `get_iiif_ids` nor the `ie_id` column is defined in the
# cells visible here — confirm where they come from before a Restart & Run All.
df["image_id"] = df["ie_id"].progress_apply(get_iiif_ids)

  0%|          | 0/9510 [00:00<?, ?it/s]

400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE22128402/manifest.json
Restarting session
400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE22308453/manifest.json
Restarting session
400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE22444583/manifest.json
Restarting session
400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE425714/manifest.json
Restarting session
400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE893049/manifest.json
Restarting session
400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE889797/manifest.json
Restarting session
400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE1874285/manifest.json
Restarting session
400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE863918/manifest.json
Restarting session
400: https://rosetta.slv.vic.gov.au/delivery/iiif/presentation/2.1/IE20609374/manifest.json
Restarting session


In [247]:
# Export only the rows that have a non-empty image identifier.
has_image = df["image_id"] != ""
df.loc[has_image].to_csv("place_images.csv", index=False)

In [250]:
# Build a {placename: [record, ...]} mapping from the rows that have images.
image_columns = ["alma_id", "title", "start_year", "end_year", "ie_id", "image_id"]
rows_with_images = df.loc[df["image_id"] != ""]
data = {}
for placename, images in rows_with_images.groupby("placename"):
    place_rows = images[image_columns]
    data[placename] = place_rows.to_dict(orient="records")

In [251]:
# Save the per-place image lists as JSON. `write_text` returns the number of
# characters written, which is the value displayed as this cell's output.
Path("place_images.json").write_text(json.dumps(data))

2105862

In [127]:
# Save the full DataFrame (without the index) to CSV.
df.to_csv("search_results.csv", index=False)

In [126]:
# NOTE(review): this cell's execution count (126) is lower than those of the
# cells above it, and its saved output includes an `ie_id` column — it appears
# to come from an earlier run. Re-run or remove before sharing.
df

Unnamed: 0,placename,alma_id,title,date,ie_id
0,Abbotsford,9939670318507636,[Abbotsford. National Bank Victoria St],"[25-12-63 [Dec. 25, 1963]]",IE5835936
1,Abbotsford,9939656411407636,[Abbotsford. National Bank Victoria St],"[25-12-63 [Dec. 25, 1963]]",IE5652363
2,Abbotsford,9938854853607636,"[Aerial view of Abbotsford, Victoria.]",[1 April 1954],IE21718575
3,Abbotsford,9938854843607636,"[Aerial view of Abbotsford, Victoria.]",[19 February 1953],IE21717415
4,Abbotsford,9922695953607636,"[[Exterior view of Quasi Arc Pty Ltd, Victoria...",[[ca. 1940 - ca. 1959].],IE1338947
...,...,...,...,...,...
10327,Yarram,9934974853607636,"[Anzac Day march, Yarram, Victoria.]",[1927?],IE1876833
10328,Yarram,9921622533607636,"[Shire Hall & Bank Of Australasia, Yarram.]",[[between 1905 and 1915?]],IE1476915
10329,Yarram,9939667739207636,[Exterior of ANZ Bank in Yarram.],[1952/1953],IE20331127
10330,Yarram,9937105033607636,"[Court House, Yarram.]",[[1952?]],IE20333287


In [56]:
# Download a JPEG (fitted within 3000x3000) for every image identifier.
# Make sure the output directory exists before writing into it.
image_dir = Path("images")
image_dir.mkdir(parents=True, exist_ok=True)

for ids in df["image_id"].to_list():
    # A record can hold several identifiers separated by "|".
    for image_id in ids.split("|"):
        if not image_id:
            continue
        # Use the last path segment as the file name; ":" is replaced with "_".
        filename = image_id.split("/")[-1].replace(":", "_")
        response = sess.get(f"{image_id}/full/!3000,3000/0/default.jpg")
        if not response.ok:
            # Don't save an error page as if it were image data.
            print(f"{response.status_code}: {response.url}")
            continue
        (image_dir / filename).write_bytes(response.content)

In [76]:
# Exploratory request to the catalogue's discovery search page for Mordialloc images.
# The saved output shows that an HTML page is returned. The result is only
# displayed; it is not assigned to a variable.
requests.get("https://find.slv.vic.gov.au/discovery/search?query=sub,exact,%22Mordialloc%20(Vic.)%22&tab=searchProfile&search_scope=slv_local&vid=61SLV_INST:SLV&facet=rtype,include,images&facet=tlevel,include,online_resources&offset=0")

CachedResponse(_content=b'<!DOCTYPE html><html id="primoExploreRoot" lang="en"><head><meta charset="utf-8"><meta name="viewport" content="initial-scale=1,width=device-width,shrink-to-fit=no"><meta http-equiv="X-UA-Compatible" content="IE=edge"/><meta name="description" content=""><meta name="keywords" content=""><meta id="ogTitle" property="og:title" content=""><meta id="ogDescription" property="og:description" content=""><meta id="ogImage" property="og:image" content=""><meta id="ogType" property="og:type" content="website"><meta id="ogURL" property="og:url" content=""><meta name="robots" content="noindex"><base href="/discovery/"><title id="primoExploreTitle"></title><link id="sharedCustomerFavIcon" rel="apple-touch-icon-precomposed" href=""><link id="viewCustomerFavIcon" rel="icon" href=""><link id="viewHomeScreenCustomerIconAndroid" rel="icon" href=""><link id="viewHomeScreenCustomerIconIos" rel="apple-touch-icon" href=""><link id="canonicalFullDisplay" rel="canonical" href=""/><li

In [79]:
# Exploratory POST to the Primo `edelivery` endpoint with a list of record ids.
# NOTE(review): the result is only displayed, not assigned — a later
# `response.json()` will not see this response.
requests.post("https://find.slv.vic.gov.au/primaws/rest/pub/edelivery?vid=61SLV_INST:SLV&acTriggered=false&blendFacetsSeparately=false&citationTrailFilterByAvailability=true&disableCache=false&getMore=0&inst=61SLV_INST&isCDSearch=false&lang=en&limit=10&newspapersActive=true&newspapersSearch=false&offset=0&otbRanking=false&pcAvailability=true&q=sub,exact,%22Mordialloc+(Vic.)%22&qExclude=&qInclude=facet_rtype,exact,images%7C,%7Cfacet_tlevel,exact,online_resources&rapido=false&refEntryActive=false&rtaLinks=true&scope=slv_local&searchInFulltextUserSelection=true&skipDelivery=Y&sort=rank&tab=searchProfile&vid=61SLV_INST:SLV&lang=en", json=[{"recId":"alma9924826523607636","sharedDigitalCandidates":None},{"recId":"alma9917679643607636","sharedDigitalCandidates":None},{"recId":"alma9916957413607636","sharedDigitalCandidates":None},{"recId":"alma9916539203607636","sharedDigitalCandidates":None},{"recId":"alma9917336833607636","sharedDigitalCandidates":None},{"recId":"alma9917335133607636","sharedDigitalCandidates":None},{"recId":"alma9917319233607636","sharedDigitalCandidates":None},{"recId":"alma9917312613607636","sharedDigitalCandidates":None},{"recId":"alma9917064413607636","sharedDigitalCandidates":None},{"recId":"alma9916574203607636","sharedDigitalCandidates":None}])

<Response [200]>

In [80]:
# NOTE(review): the `requests.get` / `requests.post` calls in the cells above do
# not assign their results, so `response` here is whatever was last bound to
# that name elsewhere — presumably the reason for the JSONDecodeError in the
# saved output; confirm, then assign the intended response or remove this cell.
response.json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)