In [20]:
import base64
import json
import os
import time
import xml.etree.ElementTree as ET

import requests
from IPython.display import IFrame

To use the IAW API, we first need to specify the API base URL and an API token. The token is sent in the `Authorization` header of every request.

In [21]:
# Base URL of the IAW API. It can be overridden with the IAW_API_BASE
# environment variable; a trailing slash is stripped because the endpoint
# paths below are appended as '/collections', '/images/local', and so on.
iaw_api_base = (
    os.environ.get('IAW_API_BASE')
    or 'https://iaw-server.ardc-hdcl-sia-iaw.cloud.edu.au/api'
).rstrip('/')

# API token. Supply it through the IAW_API_TOKEN environment variable.
# NOTE(review): the fallback value is a credential stored in the notebook itself.
# It is kept only so existing runs keep working; it should be rotated and removed.
iaw_api_token = (
    os.environ.get('IAW_API_TOKEN')
    or '3|dajsYsH0Dx87OXlAWk7T5ZDRJGgCMaV5WJhgKnth32114c9a'
)

# Headers sent with every request: ask for JSON and authenticate with the token.
headers = {
    'Accept': 'application/json',
    'Authorization': f'Bearer {iaw_api_token}'
}

The following cells define helper functions, one per IAW API operation. The import routines further down call them in sequence.

In [22]:
def create_collection(collection_name, collection_description):
    """Create a collection through the IAW API.

    The name and description are posted as form fields to
    ``{iaw_api_base}/collections`` using the shared ``headers``.

    Returns:
        The ``id`` field of the JSON response when the server answers with
        HTTP 200. For any other status the response body is printed and
        ``None`` is returned.
    """
    endpoint = f'{iaw_api_base}/collections'
    payload = {"name": collection_name, "description": collection_description}
    response = requests.post(endpoint, headers=headers, data=payload)

    # Only HTTP 200 is treated as success by this helper.
    if response.status_code != 200:
        print('Failed to create collection')
        print(response.text)
        return None

    created_id = response.json()["id"]
    print(f'Collection created successfully with ID {created_id}')
    return created_id


In [23]:
def post_image(name, path, description, width, height):
    """Upload ``images/<path>`` to the IAW API and report how it was rescaled.

    The file is read in binary mode, base64-encoded and posted as form data
    (type ``iiif``, source ``this``) to ``{iaw_api_base}/images/local``.
    The width and height reported in the response may differ from the
    ``width`` / ``height`` passed in (taken from the XML file by the callers),
    so the ratio actual/declared is computed for each axis; callers use the
    ratios to rescale annotation coordinates.

    Returns:
        On HTTP 200, a dict with keys ``id``, ``source`` (the response's
        ``iiif_url``), ``width_ratio`` and ``height_ratio``. For any other
        status the response body is printed and ``None`` is returned.
    """
    # Read the image and turn it into base64 text for the form field.
    with open(f'images/{path}', 'rb') as handle:
        payload_b64 = base64.b64encode(handle.read()).decode('utf-8')

    form = {
        "name": name,
        "description": description,
        "type": "iiif",
        "source": "this",
        "file": payload_b64,
    }
    response = requests.post(f'{iaw_api_base}/images/local', headers=headers, data=form)

    if response.status_code != 200:
        print('Failed to create image')
        print(response.text)
        return None

    uploaded = response.json()
    served_width = uploaded['width']
    served_height = uploaded['height']

    # Scale factors from the declared size to the size reported by the server.
    scale_x = float(served_width) / float(width)
    scale_y = float(served_height) / float(height)

    print(f'Image created successfully with id {uploaded["id"]}')
    return {
        "id": uploaded["id"],
        "source": uploaded["iiif_url"],
        "width_ratio": scale_x,
        "height_ratio": scale_y,
    }

In [24]:
def create_image_set_to_collection(collection_id, image_ids, name, description, attribution):
    """Create an image set inside a collection and attach images to it.

    Posts form data to ``{iaw_api_base}/collections/{collection_id}/image-sets``.
    The ids of the previously uploaded images are sent under the form key
    ``image_ids[]`` and the set is created with ``published`` set to True.

    Args:
        collection_id: id of the collection that will own the image set.
        image_ids: list of image ids to add to the set.
        name, description, attribution: metadata stored on the image set.

    Returns:
        The ``id`` of the new image set on HTTP 200; otherwise the response
        body is printed and ``None`` is returned.
    """
    image_set_data = {
        "name": name,
        "description": description,
        "attribution": attribution,
        "published": True,
        "image_ids[]": image_ids,
    }

    url = f'{iaw_api_base}/collections/{collection_id}/image-sets'
    response = requests.post(url, headers=headers, data=image_set_data)

    if response.status_code != 200:
        print('Failed to create image set')
        # Print the raw body: an error response is not guaranteed to be JSON,
        # and response.json() would raise and hide the real failure.
        print(response.text)
        return None

    new_image_set = response.json()
    print(f'Image set for collection {collection_id} created successfully with ID {new_image_set["id"]}')
    return new_image_set["id"]

In [25]:
def create_annotation_set(image_set_id, name, description):
    """Create an annotation set attached to an existing image set.

    Posts ``name``, ``description`` and ``image_set_id`` as form data to
    ``{iaw_api_base}/image-sets/{image_set_id}/annotation-sets``.

    Returns:
        The ``id`` of the new annotation set on HTTP 200; otherwise the
        response body is printed and ``None`` is returned.
    """
    annotation_set_data = {
        "name": name,
        "description": description,
        "image_set_id": image_set_id,
    }

    url = f'{iaw_api_base}/image-sets/{image_set_id}/annotation-sets'
    response = requests.post(url, headers=headers, data=annotation_set_data)

    if response.status_code != 200:
        print('Failed to create annotation set')
        # Print the raw body: an error response is not guaranteed to be JSON,
        # and response.json() would raise and hide the real failure.
        print(response.text)
        return None

    new_annotation_set = response.json()
    print(f'Annotation set for image set ID {image_set_id} created successfully with ID {new_annotation_set["id"]}')
    return new_annotation_set["id"]

In [26]:
def adjust_polygon_points(points, width_ratio, height_ratio):
    """Scale polygon points by per-axis ratios before they are uploaded.

    Args:
        points: iterable of ``"x,y"`` strings, or a single whitespace-separated
            string of such pairs. Blank entries (as produced by splitting on a
            single space when the source has doubled or trailing spaces) are
            ignored.
        width_ratio: factor applied to every x coordinate.
        height_ratio: factor applied to every y coordinate.

    Returns:
        The scaled points as one space-separated string of ``"x,y"`` pairs,
        with coordinates formatted as floats.
    """
    if isinstance(points, str):
        points = points.split()

    adjusted_points = []
    for point in points:
        # Skip empty tokens instead of failing to unpack them.
        if not point or not point.strip():
            continue
        x, y = map(float, point.split(","))
        adjusted_points.append(f"{x * width_ratio},{y * height_ratio}")

    return " ".join(adjusted_points)

In [27]:
def create_annotation(annotation_set_id, image_id, annotation_title, image_source, polygon):
    """Upload one annotation for an image into an annotation set.

    The annotation targets ``image_source`` with an ``SvgSelector`` whose value
    is ``polygon`` (an SVG string built by the callers from the XML ``points``
    attribute). ``annotation_title`` is stored as the single English value of
    the ``title`` field. The payload is sent as JSON to
    ``{iaw_api_base}/annotation-sets/{annotation_set_id}/annotations``.

    Returns:
        The ``id`` of the new annotation on HTTP 200; otherwise the response
        body is printed and ``None`` is returned.
    """
    # Region of the image that the annotation refers to.
    target = {
        "source": image_source,
        "selector": {
            "type": "SvgSelector",
            "value": polygon
        }
    }

    # Annotation fields: the transcribed text goes in as the English title.
    field = {
        "title": {
            "en": {
                "values": [
                    annotation_title
                ]
            },
        }
    }

    annotation_data = {
        "annotation_type_id": 1,
        "annotation_set_id": annotation_set_id,
        "image_id": image_id,
        "target": target,
        "fields": field
    }

    url = f'{iaw_api_base}/annotation-sets/{annotation_set_id}/annotations'
    response = requests.post(url, headers=headers, json=annotation_data)

    if response.status_code != 200:
        print('Failed to create annotation')
        # Print the raw body: an error response is not guaranteed to be JSON,
        # and response.json() would raise and hide the real failure.
        print(response.text)
        return None

    new_annotation = response.json()
    print(f'Annotation for annotation set ID {annotation_set_id} created successfully with ID {new_annotation["id"]}')
    # Return the id like the other create_* helpers do.
    return new_annotation["id"]

In [28]:
def publish_image_and_annotation_set(collection_id, image_set_id, annotation_set_id):
    """Publish an image set and then its annotation set.

    Sends ``{"published": True}`` with PUT first to
    ``{iaw_api_base}/collections/{collection_id}/image-sets/{image_set_id}``
    and, if that succeeds, to
    ``{iaw_api_base}/image-sets/{image_set_id}/annotation-sets/{annotation_set_id}``.

    Returns:
        The public URL of the published image set, built from the
        ``publication.id`` field of the first response, when both requests
        return HTTP 200. If either request fails, the response body is printed
        and ``None`` is returned.
    """
    publish_data = {
        "published": True,
    }

    url = f'{iaw_api_base}/collections/{collection_id}/image-sets/{image_set_id}'
    response = requests.put(url, headers=headers, json=publish_data)

    if response.status_code != 200:
        print('Failed to publish image set')
        # Raw body: an error response is not guaranteed to be JSON.
        print(response.text)
        return None

    image_set_data = response.json()
    print('Image set published successfully')
    # Format the id rather than concatenating, so a numeric id works as well.
    publication_id = image_set_data['publication']['id']
    published_url = f"https://iaw.ardc-hdcl-sia-iaw.cloud.edu.au/publications/image-sets/{publication_id}"

    # Publish the annotation set.
    url = f'{iaw_api_base}/image-sets/{image_set_id}/annotation-sets/{annotation_set_id}'
    response = requests.put(url, headers=headers, json=publish_data)

    if response.status_code != 200:
        print("Failed to publish annotation set")
        print(response.text)
        return None

    print("Annotation set published successfully")
    return published_url
        

In [29]:
# Namespace map for the PAGE XML schema used by the single-page import.
PAGE_NS = {"ns": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
# The notebook-level name ``ns`` is kept for backward compatibility. The import
# function below uses PAGE_NS directly, because ``ns`` is rebound to a
# different namespace map by a later cell.
ns = PAGE_NS

def import_single_xml(xml_file):
    """Import one PAGE XML transcription and its image into IAW.

    Steps: create a collection, upload the page image, create an image set and
    an annotation set, create one annotation per ``TextLine`` (polygon from the
    ``Coords`` ``points`` attribute, title from ``TextEquiv/Unicode``), publish
    both sets, then print the publication URL and display it in an IFrame.

    If any of the create/publish steps reports failure (returns ``None``), a
    message is printed and the import stops. Text lines that lack coordinates
    or transcribed text are skipped with a message.

    Args:
        xml_file: path to the PAGE XML file.
    """
    root = ET.parse(xml_file).getroot()

    # Create collection
    collection_name = "Sample Collection"
    collection_description = " "
    collection_id = create_collection(collection_name, collection_description)
    if collection_id is None:
        print('Import aborted: the collection could not be created')
        return

    # Upload image
    image_name = "0003_4966-P0000-000001-0010-010-001 page3"
    image_description = "0003_4966-P0000-000001-0010-010-001 page3"
    # NOTE(review): the declared size "500" x "500" is a placeholder and the
    # returned ratios are not applied to the coordinates below - confirm the
    # uploaded image keeps the pixel size the PAGE XML coordinates refer to.
    image = post_image("Sample image", image_name + ".jpg", image_description, "500", "500")
    if image is None:
        print('Import aborted: the image could not be uploaded')
        return
    image_id = image["id"]
    image_source = image["source"]

    # Create image set and add image to it
    image_set_id = create_image_set_to_collection(collection_id, [image_id], "Sample image", "Sample image", " ")
    if image_set_id is None:
        print('Import aborted: the image set could not be created')
        return

    # Create annotation set
    annotation_set_id = create_annotation_set(image_set_id, "Text Annotation", " ")
    if annotation_set_id is None:
        print('Import aborted: the annotation set could not be created')
        return

    # One annotation per TextLine that is a direct child of a TextRegion.
    for text_region in root.findall(".//ns:TextRegion", PAGE_NS):
        for text_line in text_region.findall("ns:TextLine", PAGE_NS):
            coords = text_line.find("ns:Coords", PAGE_NS)
            unicode_element = text_line.find("ns:TextEquiv/ns:Unicode", PAGE_NS)

            points = coords.attrib.get("points") if coords is not None else None
            text = unicode_element.text if unicode_element is not None else None
            if points is None or text is None:
                print('Skipping a text line without coordinates or text')
                continue

            polygon = "<svg><polygon points=\"" + points + "\"></polygon></svg>"
            create_annotation(annotation_set_id, image_id, text.strip(), image_source, polygon)

    # Publish image set and annotation set
    annotation_url = publish_image_and_annotation_set(collection_id, image_set_id, annotation_set_id)
    if annotation_url is None:
        print('Publishing failed; nothing to display')
        return

    print(annotation_url)
    iframe = IFrame(src=annotation_url, width="100%", height="700px")
    display(iframe)

# PAGE XML transcription to import (path relative to the notebook)
xml_file_path = "0003_4966-P0000-000001-0010-010-001 page3.xml"

# Run the single-page import on that file
import_single_xml(xml_file_path)

Collection created successfully with ID 181
Image created successfully with id 1633
Image set for collection 181 created successfully with ID 655
Annotation set for image set ID 655 created successfully with ID 652
Annotation for annotation set ID 652 created successfully with ID 14061
Annotation for annotation set ID 652 created successfully with ID 14062
Annotation for annotation set ID 652 created successfully with ID 14063
Annotation for annotation set ID 652 created successfully with ID 14064
Annotation for annotation set ID 652 created successfully with ID 14065
Annotation for annotation set ID 652 created successfully with ID 14066
Annotation for annotation set ID 652 created successfully with ID 14067
Annotation for annotation set ID 652 created successfully with ID 14068
Annotation for annotation set ID 652 created successfully with ID 14069
Annotation for annotation set ID 652 created successfully with ID 14070
Annotation for annotation set ID 652 created successfully with ID

In [30]:
# XML namespaces used by the TEI transcription files.
TEI_NS = {'ns': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}
# Name of the xml:id attribute as ElementTree exposes it.
XML_ID_ATTR = '{http://www.w3.org/XML/1998/namespace}id'
# The notebook-level name ``ns`` is kept for backward compatibility. The import
# function below uses TEI_NS directly so that it does not depend on whichever
# cell assigned ``ns`` last.
ns = TEI_NS

def import_multiple_xml(xml_file):
    """Import a multi-page TEI transcription and its images into IAW.

    Steps: create a collection; map each ``<l facs="#id">`` line of the TEI
    ``<text>`` to its text; upload the image of every ``<facsimile>``; create
    one image set holding all uploaded images and one annotation set; for every
    ``<zone rendition="Line">`` that has matching text, rescale its polygon
    with the ratios reported at upload and create an annotation; publish both
    sets, then print the publication URL and display it in an IFrame.

    Facsimiles with incomplete ``<graphic>`` data or a failed upload, and zones
    without ``points``, are skipped with a message. If a create/publish step
    reports failure (returns ``None``) or no image was uploaded, a message is
    printed and the import stops.

    Args:
        xml_file: path to the TEI XML file.
    """
    root = ET.parse(xml_file).getroot()

    # Create collection
    collection_name = "TEI Transcription"
    collection_description = ""
    collection_id = create_collection(collection_name, collection_description)
    if collection_id is None:
        print('Import aborted: the collection could not be created')
        return

    # Map "#zone-id" references to the transcribed text of each line.
    facs_text_map = {}
    text = root.find(".//ns:text", TEI_NS)
    if text is not None:
        for annotation_text in text.findall(".//ns:l", TEI_NS):
            facs_id = annotation_text.attrib.get("facs")
            text_content = annotation_text.text
            if facs_id and text_content:
                facs_text_map[facs_id] = text_content

    # Upload every image and remember its id, source URL and scale ratios.
    image_info_map = {}
    image_ids = []
    facsimiles = root.findall(".//ns:facsimile", TEI_NS)

    for facsimile in facsimiles:
        image_id = facsimile.attrib.get(XML_ID_ATTR)
        graphic = facsimile.find(".//ns:graphic", TEI_NS)
        if graphic is None:
            print(f'Skipping facsimile {image_id}: no graphic element')
            continue

        image_name = graphic.attrib.get("url")
        raw_width = graphic.attrib.get("width")
        raw_height = graphic.attrib.get("height")
        if not image_name or not raw_width or not raw_height:
            print(f'Skipping facsimile {image_id}: graphic lacks url, width or height')
            continue

        image_width = raw_width.replace("px", "")
        image_height = raw_height.replace("px", "")
        # The file on disk is named by the part of the url after the first
        # underscore; a url without an underscore is used unchanged.
        image_path = image_name.split('_', 1)[-1]

        image = post_image(image_id, image_path, image_name, image_width, image_height)
        if image is None:
            print(f'Skipping facsimile {image_id}: image upload failed')
            continue

        image_info_map[image_name] = {
            'id': image["id"],
            'source': image["source"],
            'width_ratio': image["width_ratio"],
            'height_ratio': image["height_ratio"]
        }
        image_ids.append(image["id"])

    if not image_ids:
        print('Import aborted: no image was uploaded')
        return

    # Create one image set for ALL images
    image_set_id = create_image_set_to_collection(collection_id, image_ids, "Image Collections", " ", " ")
    if image_set_id is None:
        print('Import aborted: the image set could not be created')
        return

    # Create annotation set for the image set
    annotation_set_id = create_annotation_set(image_set_id, "Text annotation", " ")
    if annotation_set_id is None:
        print('Import aborted: the annotation set could not be created')
        return

    for facsimile in facsimiles:
        graphic = facsimile.find(".//ns:graphic", TEI_NS)
        image_name = graphic.attrib.get("url") if graphic is not None else None
        image_info = image_info_map.get(image_name)
        if image_info is None:
            # This facsimile was skipped during upload.
            continue

        print("-" * 50)

        for zone in facsimile.findall(".//ns:zone[@rendition='Line']", TEI_NS):
            zone_id = zone.attrib.get(XML_ID_ATTR)

            # Only zones referenced by a transcribed line become annotations.
            annotation_text = facs_text_map.get(f"#{zone_id}")
            if annotation_text is None:
                continue

            points = zone.attrib.get("points")
            if not points:
                print(f'Skipping zone {zone_id}: no points attribute')
                continue

            # Rescale the polygon to the size of the image as stored by the server.
            adjusted_polygon = adjust_polygon_points(points.split(" "), image_info['width_ratio'], image_info['height_ratio'])
            polygon = f"<svg><polygon points=\"{adjusted_polygon}\"></polygon></svg>"

            create_annotation(annotation_set_id, image_info['id'], annotation_text, image_info['source'], polygon)
            # Short pause between annotation requests.
            time.sleep(0.1)

    # Publish image set and annotation set
    annotation_url = publish_image_and_annotation_set(collection_id, image_set_id, annotation_set_id)
    if annotation_url is None:
        print('Publishing failed; nothing to display')
        return

    print(annotation_url)
    iframe = IFrame(src=annotation_url, width="100%", height="700px")
    display(iframe)

# TEI XML transcription to import (path relative to the notebook)
xml_file_path = "04965-P0000-000001-0010-010-002.xml"

# Run the multi-page import on that file
import_multiple_xml(xml_file_path)

Collection created successfully with ID 182
Image created successfully with id 1634
Image created successfully with id 1635
Image created successfully with id 1636
Image created successfully with id 1637
Image created successfully with id 1638
Image created successfully with id 1639
Image created successfully with id 1640
Image created successfully with id 1641
Image created successfully with id 1642
Image created successfully with id 1643
Image created successfully with id 1644
Image created successfully with id 1645
Image created successfully with id 1646
Image created successfully with id 1647
Image created successfully with id 1648
Image created successfully with id 1649
Image created successfully with id 1650
Image created successfully with id 1651
Image created successfully with id 1652
Image created successfully with id 1653
Image created successfully with id 1654
Image created successfully with id 1655
Image created successfully with id 1656
Image created successfully with id 1