# Download items from contentdm collection

In [1]:
import requests, os
import pandas as pd
from pathlib import Path

from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

In [2]:
csv_path = Path("../data/mccray/changed_data/McCray,1940s,5126-NUMS,V1.csv")
df = pd.read_csv(csv_path)

print(list(df))
print(len(df))

df = df[df['CONTENTdm number'].notnull()]
print(len(df)) # two content dm numbers null !!!

['CONTENTdm number', 'Original Transcript']
5126
5124


In [None]:
# base_url= 'https://cdm17173.contentdm.oclc.org'
base_url = 'http://cdm17173.contentdm.oclc.org/'
collection = 'p17173coll38'
# item_id = df['CONTENTdm number']

id_col = 'CONTENTdm number'
tr_col = 'Original Transcript'

def download_contentdm_image(base_url, collection, item_id, output_path="documents"):
    item_id = int(item_id)

    dir = f"{output_path}/{collection}/{item_id}"

    # check to see if the folder already exists, only fetch if it doesn't
        # if(os.path.isdir(dir)):
        #     print(f"{item_id}: {dir} already exists. Not fetching.")
        # else:
    
    # Build manifest URL
    manifest_url = f"{base_url}/iiif/info/{collection}/{item_id}/manifest.json"
    print("Fetching manifest:", manifest_url)

    response = requests.get(manifest_url)
    if response.status_code != 200:
        print(f"Failed to fetch manifest for item {item_id}: HTTP {response.status_code}")
        return

    manifest = response.json()

    # Each "canvas" is an image/page
    canvases = manifest["sequences"][0]["canvases"]

    for i, canvas in enumerate(canvases, start=1):

            # if the file doesn't exist, pull it
            if(not os.path.exists(f"{dir}/{item_id}_page{i}.jpg")):
                # The IIIF image service ID is inside the "images" block
                service_id = canvas["images"][0]["resource"]["service"]["@id"]

                # Request the full image (100% size)
                img_url = f"{service_id}/full/pct:100/0/default.jpg"
                os.makedirs(dir, exist_ok=True)
                print(f"Downloading page {i}: {img_url}")

                img_data = requests.get(img_url).content
                f"{service_id}/full/pct:100/0/default.jpg"

                filename = f"{dir}/{item_id}_page{i}.jpg"  # fetch from folder later with regex <item_id>_page*
                with open(filename, "wb") as f:
                    f.write(img_data)
                print("Saved:", filename)

def process_row(row, base_url, collection):
    """Process a single row"""
    id, tr = row
    id = int(id)
    
    # Thread-safe printing
    with threading.Lock():
        print("\n", id)
    
    # Fetch and output image
    download_contentdm_image(
        base_url=base_url,
        collection=collection,
        item_id=id
    )
    return id

# Parallelize the loop
max_workers = 40
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = []
    
    for row in df[[id_col, tr_col]].itertuples(index=False, name=None):
        future = executor.submit(process_row, row, base_url, collection)
        futures.append(future)
    
    # Wait for all tasks to complete
    for future in as_completed(futures):
        try:
            result = future.result()
            # Optionally handle the result
        except Exception as e:
            print(f"Error processing row: {e}")



 981
Fetching manifest: http://cdm17173.contentdm.oclc.org//iiif/info/p17173coll38/981/manifest.json

 998
Fetching manifest: http://cdm17173.contentdm.oclc.org//iiif/info/p17173coll38/998/manifest.json

 999
Fetching manifest: http://cdm17173.contentdm.oclc.org//iiif/info/p17173coll38/999/manifest.json

 1000
Fetching manifest: http://cdm17173.contentdm.oclc.org//iiif/info/p17173coll38/1000/manifest.json

 1001
Fetching manifest: http://cdm17173.contentdm.oclc.org//iiif/info/p17173coll38/1001/manifest.json

 1002
Fetching manifest: http://cdm17173.contentdm.oclc.org//iiif/info/p17173coll38/1002/manifest.json

 1003
Fetching manifest: http://cdm17173.contentdm.oclc.org//iiif/info/p17173coll38/1003/manifest.json

 1010
Fetching manifest: http://cdm17173.contentdm.oclc.org//iiif/info/p17173coll38/1010/manifest.json

 1011
Fetching manifest: http://cdm17173.contentdm.oclc.org//iiif/info/p17173coll38/1011/manifest.json

 1012
Fetching manifest: http://cdm17173.contentdm.oclc.org//iiif/inf