In [1]:
import csv
import functools
import json
import os

import cv2
import numpy as np
import requests

In [2]:
import requests
        
def extract_link_from_json(json_url, condition, timeout=30):
    """Fetch a manifest and return the first image link whose target matches.

    The JSON document at ``json_url`` is downloaded and its
    ``sequences`` -> ``canvases`` are scanned in order. For every canvas only
    the first entry of ``images`` is inspected: when ``condition`` occurs in
    that entry's ``on`` value, the entry's ``resource['@id']`` is returned.

    Args:
        json_url: URL of the manifest JSON document.
        condition: Text that must occur in the image entry's ``on`` value.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``@id`` link of the first matching image, or ``None`` when no
        canvas matches.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(json_url, timeout=timeout)
    # Fail with the HTTP error itself instead of a confusing JSON decode error
    # when the manifest does not exist.
    response.raise_for_status()
    data = response.json()

    for sequence in data['sequences']:
        for canvas in sequence['canvases']:
            images = canvas['images']
            if not images:
                # A canvas without images cannot match; skip it rather than
                # crashing on images[0].
                continue
            first_image = images[0]
            if condition in first_image['on']:
                return first_image['resource']['@id']

    return None

json_files_url = 'https://dss.digitalbibleonline.org/manifests/all/'


def _identifier_from_link(link):
    """Return the part of an image link between '/iiif/' and the next '/full'.

    Raises:
        ValueError: If either marker is missing, so a malformed link is never
            turned into a silently wrong identifier.
    """
    prefix_marker = '/iiif/'
    marker_index = link.find(prefix_marker)
    if marker_index == -1:
        raise ValueError(f"Image link has no '{prefix_marker}' part: {link!r}")
    start_index = marker_index + len(prefix_marker)
    end_index = link.find('/full', start_index)
    if end_index == -1:
        raise ValueError(f"Image link has no '/full' part: {link!r}")
    return link[start_index:end_index]


# For every image name, fetch its manifest, locate the NIR recto image and
# write that image's identifier on the corresponding line of the output file.
with open('image_names.txt', 'r') as f, open('ir_image_identifiers.txt', 'w') as output_file:
    for line in f:
        line = line.rstrip() + '.json'
        json_file_url = f"{json_files_url}{line}"
        link = extract_link_from_json(json_file_url, "Near Infra-Red (NIR) -Recto")
        if link is None:
            # Stop with a message naming the manifest instead of failing with
            # an AttributeError on None below.
            raise ValueError(
                f"No 'Near Infra-Red (NIR) -Recto' image found in {json_file_url}"
            )
        identifier = _identifier_from_link(link)
        output_file.write(identifier + '\n')


In [3]:
import csv

def combine_lists(image_names_file, image_identifiers_file, output_file):
    """Pair image names with identifiers line by line and save them as CSV.

    Both input files are read in full and every line is stripped of
    surrounding whitespace. Line ``i`` of the names file is matched with line
    ``i`` of the identifiers file.

    Args:
        image_names_file: Path of the text file with one image name per line.
        image_identifiers_file: Path of the text file with one identifier per line.
        output_file: Path of the CSV file to write; it gets the header
            ``image_name,image_identifier`` followed by one row per pair.

    Raises:
        ValueError: If the two input files have a different number of lines.
    """
    with open(image_names_file, 'r') as names_handle, \
            open(image_identifiers_file, 'r') as ids_handle:
        stripped_names = [raw.strip() for raw in names_handle]
        stripped_ids = [raw.strip() for raw in ids_handle]

    # Guard clause: both inputs must describe the same number of images
    if len(stripped_names) != len(stripped_ids):
        raise ValueError("The number of elements in the lists doesn't match.")

    header = ['image_name', 'image_identifier']
    with open(output_file, 'w', newline='') as csv_handle:
        csv_out = csv.writer(csv_handle)
        csv_out.writerow(header)
        for pair in zip(stripped_names, stripped_ids):
            csv_out.writerow(pair)

# Build the name -> identifier CSV from the two text files produced earlier
image_names_file, image_identifiers_file = 'image_names.txt', 'ir_image_identifiers.txt'
output_file = 'ir_image_names_identifiers.csv'

combine_lists(
    image_names_file=image_names_file,
    image_identifiers_file=image_identifiers_file,
    output_file=output_file,
)


In [4]:
def read_names_identifiers(image_names_identifiers_file):
    """Read a CSV file with a header row into a list of row dictionaries.

    Args:
        image_names_identifiers_file: Path of the CSV file to read. Its first
            line supplies the column names used as dictionary keys.

    Returns:
        A list with one dictionary per data row, in file order.
    """
    # The csv module expects files opened with newline='' so that it can do
    # its own line-ending handling (otherwise line breaks embedded in quoted
    # fields are altered).
    with open(image_names_identifiers_file, 'r', newline='') as csv_file:
        return list(csv.DictReader(csv_file))

In [7]:
def download_iiif_image(identifier, timeout=60):
    """Download an image tile by tile and stitch the tiles into one array.

    The image's ``info.json`` supplies the full image size and the tile size.
    Each tile region is then requested as a JPEG, decoded as grayscale and
    copied into its position in the result.

    Args:
        identifier: Image identifier appended to the IIIF host prefix.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A 2-D ``uint8`` NumPy array of shape ``(height, width)``.

    Raises:
        requests.HTTPError: If ``info.json`` or a tile request fails.
        RuntimeError: If a downloaded tile cannot be decoded as an image.
    """
    # Construct the URL prefix using the identifier
    prefix = 'https://iaa.iiifhosting.com/iiif/' + identifier + '/'

    # Read info.json for the image and tile dimensions
    info_response = requests.get(prefix + 'info.json', timeout=timeout)
    info_response.raise_for_status()
    info = info_response.json()
    image_height = info['height']
    image_width = info['width']

    tile_spec = info['tiles'][0]
    tile_width = tile_spec['width']
    # The IIIF Image API makes the tile height optional; when it is omitted
    # the tiles are square, i.e. the height equals the width.
    tile_height = tile_spec.get('height', tile_width)

    # Every pixel is overwritten by a tile below, so no initialisation is needed
    stitched_image = np.empty((image_height, image_width), dtype=np.uint8)

    # Download tile by tile; tiles on the right/bottom edge are clipped to the image
    for y in range(0, image_height, tile_height):
        adj_height = min(tile_height, image_height - y)
        for x in range(0, image_width, tile_width):
            adj_width = min(tile_width, image_width - x)
            tile_url = f"{prefix}{x},{y},{adj_width},{adj_height}/full/0/default.jpg"
            tile_response = requests.get(tile_url, timeout=timeout)
            tile_response.raise_for_status()
            tile_array = np.frombuffer(tile_response.content, dtype=np.uint8)
            tile_image = cv2.imdecode(tile_array, cv2.IMREAD_GRAYSCALE)
            if tile_image is None:
                # imdecode reports undecodable data by returning None
                raise RuntimeError(f"Could not decode tile {tile_url}")
            stitched_image[y:y + adj_height, x:x + adj_width] = tile_image

    return stitched_image

In [8]:
import asyncio

def background(f):
    """Decorator that runs ``f`` in the event loop's default executor.

    Calling the decorated function does not wait for ``f``; it returns the
    future produced by ``loop.run_in_executor``, which can be awaited to get
    ``f``'s return value or exception.

    Args:
        f: The callable to run in the executor.

    Returns:
        A wrapper accepting the same positional and keyword arguments as ``f``.
    """
    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # run_in_executor() forwards positional arguments only, so bind all
        # arguments (including keywords) up front with functools.partial.
        call = functools.partial(f, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, call)
    return wrapped

image_names_identifiers_file = 'ir_image_names_identifiers.csv'
image_download_folder = 'data/dss_dataset/ir_images/'
# cv2.imwrite does not create missing directories, so make sure the target
# folder exists before any download tries to save into it.
os.makedirs(image_download_folder, exist_ok=True)
names_identifiers = read_names_identifiers(image_names_identifiers_file)

# Each call is handed to the executor by @background and returns a future
@background
def download_and_save_images(name, identifier):
    """Download one full-size image and save it as ``<name>.jpg``.

    The file is written into ``image_download_folder``.

    Args:
        name: Image name used as the output file name (without extension).
        identifier: Image identifier passed to ``download_iiif_image``.

    Raises:
        IOError: If OpenCV reports that the image could not be written.
    """
    full_size_image = download_iiif_image(identifier)
    target_path = image_download_folder + name + '.jpg'
    # cv2.imwrite signals failure through its return value, not an exception
    if not cv2.imwrite(target_path, full_size_image):
        raise IOError(f"Could not write image to {target_path}")

def _report_download_failure(future, name):
    """Print a message when a background download ended with an exception."""
    if future.cancelled():
        return
    error = future.exception()
    if error is not None:
        print(f"Download of '{name}' failed: {error!r}")


# Start one background download per CSV row. The futures are kept so that
# failures are reported instead of vanishing, and so that a later cell can
# wait for completion with `await asyncio.gather(*download_futures)`.
download_futures = []
for row in names_identifiers:
    future = download_and_save_images(row['image_name'], row['image_identifier'])
    future.add_done_callback(
        lambda done, name=row['image_name']: _report_download_failure(done, name)
    )
    download_futures.append(future)