In [1]:
import requests
import json
from lxml import objectify, etree
from pprint import pprint

In [79]:
class Transkribus_API():
    """ Small client for the Transkribus REST API.

        The session id obtained by login() is stored on the instance and sent
        as the JSESSIONID cookie with every later request.
        All methods report problems by printing a message and returning False
        instead of raising. """

    def __init__(self, api_base_url="https://transkribus.eu/TrpServer/rest/"):
        """ api_base_url -- base URL of the REST API; endpoint paths are appended
                            to it unchanged, so it has to end with a slash. """

        # Not read or written by any method of this class.
        self.cols = {}
        self.api_base_url = api_base_url
        # False while logged out, otherwise the session id as a string.
        self.session_id = False
        # Not read or written by any method of this class.
        self.cache = False

    def _url(self, endpoint):
        """ Helper function that returns a full URL for requests to the REST API,
            i.e. the API base URL + the relative path of the endpoint. """

        return self.api_base_url + endpoint

    def _session_cookies(self):
        """ Helper function that returns the cookie dict carrying the stored session id. """

        return dict(JSESSIONID=self.session_id)

    def login(self, username, password):
        """ Performs a login and stores the session id in the "session_id"
            attribute of this instance.
            Returns the session id (str) if successful or False if not. """

        credentials = {"user": username,
                       "pw": password}
        response = requests.post(self._url("auth/login"), data=credentials)
        # A requests response is truthy only if the HTTP status signals success.
        if response:
            r = objectify.fromstring(response.content)
            print(f"TRANSKRIBUS: User {r.firstname} {r.lastname} ({r.userId}) logged in successfully.")
            self.session_id = str(r.sessionId)
            return self.session_id
        else:
            print("TRANSKRIBUS: Login failed. HTTP status:", response.status_code)
            return False

    def logout(self):
        """ Logs out and sets the "session_id" attribute to False.
            Returns True if successful or False if not (the session id is
            kept in that case). """

        response = requests.post(self._url("auth/logout"), cookies=self._session_cookies())
        if response:
            self.session_id = False
            print("TRANSKRIBUS: Logged out successfully.")
            return True
        else:
            print("TRANSKRIBUS: Logout failed. HTTP status:", response.status_code, response.content)
            return False

    def verify(self, username, password):
        """ Logs in and logs out to check whether the credentials
            are valid on the Transkribus server.
            Returns True or False. """

        if not self.login(username, password):
            return False
        # logout() takes no argument: it uses the session id that login() just
        # stored on the instance. The credentials count as valid even if the
        # logout request itself fails.
        self.logout()
        return True

    def request_endpoint(self, endpoint):
        """ Sends a GET request to a Transkribus API endpoint, the
            "endpoint" argument being a relative path to the REST API endpoint

            Cf. the list of available endpoints:
            https://transkribus.eu/TrpServer/Swadl/wadl.html

            The function tries to decode the raw content of the response,
            first as JSON, then as XML.
            It returns the decoded JSON data or an "objectify" object (lxml). If
            both conversions fail the raw content (bytes) is returned.
            Returns False if the server answers with an error status. """

        response = requests.get(self._url(endpoint), cookies=self._session_cookies())

        if not response:
            print(f'TRANSKRIBUS: ERROR when requesting "{endpoint}". HTTP status:', response.status_code)
            return False

        try:
            return response.json()
        except ValueError:
            # Not JSON (the JSON decode errors are subclasses of ValueError).
            pass

        try:
            return objectify.fromstring(response.content)
        except (etree.XMLSyntaxError, ValueError):
            return response.content  # fallback option if the server returns just text

    # Convenience functions to query certain endpoints:

    def get_collections(self):
        """ Get the metadata of the owner's collections.
            Returns the decoded response (for JSON a list of dicts, one per
            collection) if successful or False if not.
            An empty result is returned as False, too. """

        endpoint = "collections/list"
        collections = self.request_endpoint(endpoint)
        return collections if collections else False

    def get_documents_in_collection(self, colId):
        """ Get the metadata of the documents in a collection.
            Returns the decoded response (for JSON a list of dicts, one per
            document) if successful or False if not.
            An empty result is returned as False, too.

            colId -- collection ID in Transkribus (int) """

        endpoint = f"collections/{colId}/list"
        documents = self.request_endpoint(endpoint)
        return documents if documents else False

    def get_pages_in_document(self, colId, docId):
        """ Get the basic metadata of the pages in a document.
            Returns the decoded response (for JSON a list of dicts, one per
            page) if successful or False if not.
            An empty result is returned as False, too.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int) """

        endpoint = f"collections/{colId}/{docId}/pages"
        pages = self.request_endpoint(endpoint)
        return pages if pages else False

    def get_page_xml(self, colId, docId, pageNr):
        """ Get the XML content of a page.
            Returns an "objectify" object (lxml) or False if not successful.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int)
            pageNr -- page ID in Transkribus (int)

            The returned object X has two attributes: X.Metadata and X.Page.
            X.Page is empty if there are no transcripts yet.
            If there exists a transcription X.Page has further attributes
            (i and j are list indices counting from 0):

            X.Page.Metadata
                  .ReadingOrder
                  .values()   -> list containing imgFileName, width (px), height (px)
                  .TextRegion[i].Coords.attrib['points']               -> coordinates of the whole text region (1)
                                .TextEquiv.Unicode                     -> utf-8 string of the transcription of the whole text region
                                .TextLine[j].Coords.attrib['points']   -> coordinates of this line (1) (2)
                                            .Baseline.attrib['points'] -> coordinates of this baseline (2)
                                            .TextEquiv.Unicode         -> utf-8 string of the transcription

            (1) Instead of .attrib['points'] you can say .values()[0].
            (2) The line is a polygon around the line of text, the Baseline is a line below the text.

            You can check for the existence of attributes: if hasattr(X.Page, "TextRegion")…
            Get a list of existing attributes: X.Page.__dict__

            """

        endpoint = f"collections/{colId}/{docId}/{pageNr}/text"
        page_xml = self.request_endpoint(endpoint)
        # request_endpoint() already returns False on failure and never None,
        # so the result is passed through unchanged. No truthiness test is used
        # here (unlike in the other convenience functions).
        return page_xml if page_xml is not None else False

    def upload_page_xml(self, colId, docId, pageNr, new_status, page_xml):
        """ Upload page XML data to the Transkribus server using a POST request.
            Returns True or False.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int)
            pageNr -- page ID in Transkribus (int)
            new_status -- new_status of the page. Possible values are NEW, IN_PROGRESS, DONE, FINAL, GT.
            page_xml -- the page content as an lxml object, e.g. the (modified)
                        return value of get_page_xml()

            tsId = transcript ID of the last version of this transcription (int).
            After the upload Transkribus will generate a new transcription Id and save
            the old one as the 'parentTsId' of the new transcription. """

        # Get the transcript ID of the latest transcription:
        current_transcript = self.request_endpoint(f"collections/{colId}/{docId}/{pageNr}/curr")
        if not current_transcript:
            return False
        tsId = current_transcript['tsId']

        headers = {'Content-Type': 'text/xml'}
        params = {'status': new_status,
                  'parent': tsId,
                  'overwrite': 'false'}
        # Convert the page_xml object to a pretty-printed string. No encoding is
        # passed to tostring(), so lxml serialises to ASCII and writes non-ASCII
        # characters as numeric character references; decoding the result as
        # utf-8 is therefore always possible.
        data = etree.tostring(page_xml, pretty_print=True, xml_declaration=True).decode("utf-8")

        response = requests.post(self._url(f"collections/{colId}/{docId}/{pageNr}/text"),
                                 headers=headers,
                                 params=params,
                                 cookies=self._session_cookies(),
                                 data=data)

        if response:
            print(f"Uploaded page {colId}/{docId}/{pageNr} successfully: {response.status_code}")
            print(response.content.decode(encoding="utf-8"))
            return True
        else:
            print(f"ERROR while uploading {colId}/{docId}/{pageNr}: {response.status_code}")
            print(response.content.decode(encoding="utf-8"))
            return False

    def update_page_status(self, colId, docId, pageNr, new_status):
        """ Update the status of the current transcript of a page with new_status.
            Returns True or False.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int)
            pageNr -- page ID in Transkribus (int)
            new_status -- new_status of the page. Possible values are NEW, IN_PROGRESS, DONE, FINAL, GT. """

        # Get the transcript ID of the latest transcription:
        current_transcript = self.request_endpoint(f"collections/{colId}/{docId}/{pageNr}/curr")
        if not current_transcript:
            return False
        tsId = current_transcript['tsId']

        params = {'status': new_status}

        endpoint = f"collections/{colId}/{docId}/{pageNr}/{tsId}"
        response = requests.post(self._url(endpoint),
                                 params=params,
                                 cookies=self._session_cookies())

        if response:
            print(f"TRANSKRIBUS: Updated status of page {pageNr} to {new_status} in collection {colId}, document {docId}.")
            print(response.content.decode(encoding="utf-8"))
            return True
        else:
            print(f"TRANSKRIBUS: ERROR: Could not update status of page {pageNr} to {new_status} in collection {colId}, document {docId}.")
            print(response.content.decode(encoding="utf-8"))
            return False


In [80]:
#client = Transkribus_API()
#session_id = client.login(input("Transkribus user: "), getpass.getpass("Transkribus password: "))  # requires "import getpass"; never store credentials in the notebook

TRANSKRIBUS: User Sarah Bloos (192043) logged in successfully.


In [7]:
#client.get_collections()

[{'type': 'trpCollection',
  'colId': 240344,
  'colName': 'a12110417@unet.univie.ac.at Collection',
  'description': 'a12110417@unet.univie.ac.at',
  'crowdsourcing': False,
  'elearning': False,
  'pageId': 60726316,
  'url': 'https://files.transkribus.eu/Get?fileType=view&id=DAEQZNPMDNXLWKBWOHOHPWDD',
  'thumbUrl': 'https://files.transkribus.eu/Get?fileType=thumb&id=DAEQZNPMDNXLWKBWOHOHPWDD',
  'nrOfDocuments': 2,
  'role': 'Owner',
  'accountingStatus': 1}]

In [11]:
#colId = 240344

# Get the metadata of all the documents in this collection:
#client.get_documents_in_collection(colId)

[{'type': 'trpDocMetadata',
  'docId': 1620944,
  'title': 'ABO_+Z25849202.pdf',
  'uploadTimestamp': 1697809383282,
  'uploader': 'a12110417@unet.univie.ac.at',
  'uploaderId': 192043,
  'nrOfPages': 146,
  'pageId': 60726590,
  'url': 'https://files.transkribus.eu/Get?fileType=view&id=XSHYUCVAMDHEVTGYNMZZLNXM',
  'thumbUrl': 'https://files.transkribus.eu/Get?fileType=thumb&id=XSHYUCVAMDHEVTGYNMZZLNXM',
  'status': 0,
  'fimgStoreColl': 'TrpDoc_DEA_1620944',
  'origDocId': 0,
  'collectionList': {'colList': [{'colId': 240344,
     'colName': 'a12110417@unet.univie.ac.at Collection',
     'description': 'a12110417@unet.univie.ac.at',
     'crowdsourcing': False,
     'elearning': False,
     'nrOfDocuments': 0}]},
  'attributes': [],
  'mainColId': 240344,
  'isInMain': True}]

In [14]:
#DocId = 1620944

In [16]:
#client.get_pages_in_document(colId, DocId)

[{'pageId': 60726590,
  'docId': 1620944,
  'pageNr': 1,
  'key': 'XSHYUCVAMDHEVTGYNMZZLNXM',
  'imageId': 49050367,
  'url': 'https://files.transkribus.eu/Get?id=XSHYUCVAMDHEVTGYNMZZLNXM&fileType=view',
  'thumbUrl': 'https://files.transkribus.eu/Get?id=XSHYUCVAMDHEVTGYNMZZLNXM&fileType=thumb',
  'md5Sum': 'af1e13e6df99557ba41994501f0d8fad',
  'fileSize': 405983,
  'imgFileName': 'p001.jpg',
  'tsList': {'transcripts': [{'tsId': 137766257,
     'parentTsId': -1,
     'key': 'RSRKOWCRPLMDCPYWDSXPJWGH',
     'pageId': 60726590,
     'docId': 1620944,
     'pageNr': 1,
     'url': 'https://files.transkribus.eu/Get?id=RSRKOWCRPLMDCPYWDSXPJWGH',
     'status': 'NEW',
     'userName': 'a12110417@unet.univie.ac.at',
     'userId': 192043,
     'timestamp': 1697809383282,
     'md5Sum': 'f9a015bf7a5e78e297fdbed8f491a16c',
     'nrOfRegions': 0,
     'nrOfTranscribedRegions': 0,
     'nrOfWordsInRegions': 0,
     'nrOfLines': 0,
     'nrOfTranscribedLines': 0,
     'nrOfWordsInLines': 0,
     

In [19]:
#pageNr = 8
#doc = client.get_page_xml(colId, DocId, pageNr)


In [20]:
def check_page(page_xml):
    """ Verify that a page contains the elements needed for further processing:
        at least one TextRegion, one Baseline and one TextEquiv element
        (anywhere below page_xml, in the PAGE 2013-07-15 namespace).

        Returns True if all three are present, otherwise the error message
        (str) for the first kind of element that is missing. Note that the
        error messages are non-empty strings, so compare the result with
        "is True" instead of relying on its truthiness.

        page_xml -- a lxml.objectify object of a page in Transkribus """

    namespace = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"

    # (element name, message returned if no such element exists), checked in this order.
    required_elements = (
        ("TextRegion", "PAGE-XML: ERROR: No TextRegions found."),
        ("Baseline", "PAGE-XML: ERROR: No BaseLines found."),
        ("TextEquiv", "PAGE-XML: ERROR: Lines contain no text."),
    )

    for tag, error_message in required_elements:
        if page_xml.find(".//" + namespace + tag) is None:
            return error_message

    return True

#check_page(doc)

True

In [33]:
#doc.Page.TextRegion[0].TextLine[3].TextEquiv.Unicode

'Wenn es für jeden Reisenden unerläßlich ist, sich'

In [34]:
#import re

#def get_custom_attributes(string):
    #custom_attributes = re.compile(r'\{(\w*?):(\w*?);\}')
    #return dict(custom_attributes.findall(string))

In [39]:
# Store the namespace string used by the Transkribus page_xml format:
#ns = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"

#for line in doc.Page.iter(f"{ns}TextLine"): # Cf. the section "tree iteration" in https://lxml.de/tutorial.html
    # Get the attributes of the TextRegion:
    #custom_attributes = get_custom_attributes(line.getparent().attrib['custom'])
    #regionNr = custom_attributes['index']
    
    # In the following line you could filter TextRegions tagged with a specific tag (like "paragraph"):
    #if custom_attributes.get("type"): # No filtering at the moment.
    #print(line.attrib['id'], line.TextEquiv.Unicode)

tr_1_tl_1 I. Wegweiser.
tr_1_tl_2 Orientirung in Wien.
tr_1_tl_3 Rundgang um die Bastei.
tr_1_tl_4 Wenn es für jeden Reisenden unerläßlich ist, sich
tr_1_tl_5 in einer fremden Stadt zu orientiren, so ist es we¬
tr_1_tl_6 nigstens in Wien sehr leicht. Der Stephansplatz
tr_1_tl_7 legt so ziemlich mitten in der Stadt, und mit Hilfe
tr_1_tl_8 des Planes wird man bald die Hauptpulsader der
tr_1_tl_9 innern Stadt kennen lernen; diese führt nämlich vom
tr_1_tl_10 Burgthor durch die Burg über den Michaels¬
tr_1_tl_11 platz, Kohlmarkt, Graben, Stephanö¬
tr_1_tl_12 platz, Bischofs- und Nothenthurmstraße
tr_1_tl_13 zum Nothenthurmthore, und bildet die Figur
tr_1_tl_14 einer doppelten
tr_1_tl_15 — Stufe, deren innerer Winkel
tr_1_tl_16 St. Stephan ist. Zu einer dieser Straßen, auf
tr_1_tl_17 einen dieser Plätze wird man sich bald finden oder
tr_1_tl_18 erfragen. — Eine Besteigung des Stephans¬
tr_1_tl_19 thurmes gibt wohl eine vollständige Uebersicht der
tr_1_tl_20 Stadt und Umgebung, aber zur Ori

In [68]:
#page = []

#for line in doc.Page.iter(f"{ns}TextLine"): # Cf. the section "tree iteration" in https://lxml.de/tutorial.html
    # Get the attributes of the TextRegion:
    #custom_attributes = get_custom_attributes(line.getparent().attrib['custom'])
    #lineNr = get_custom_attributes(line.attrib['custom'])['index']
    #raw_data = line.TextEquiv.Unicode
    #for i in lineNr:
        #for x in raw_data:
            #page[i] = x
    
    #new_line = {'Zeile': f"{lineNr}",
    #            'Text': raw_data}
    #page.append(new_line)
    

In [69]:
#print(page)

[{'Zeile': '0', 'Text': 'I. Wegweiser.'}, {'Zeile': '1', 'Text': 'Orientirung in Wien.'}, {'Zeile': '2', 'Text': 'Rundgang um die Bastei.'}, {'Zeile': '3', 'Text': 'Wenn es für jeden Reisenden unerläßlich ist, sich'}, {'Zeile': '4', 'Text': 'in einer fremden Stadt zu orientiren, so ist es we¬'}, {'Zeile': '5', 'Text': 'nigstens in Wien sehr leicht. Der Stephansplatz'}, {'Zeile': '6', 'Text': 'legt so ziemlich mitten in der Stadt, und mit Hilfe'}, {'Zeile': '7', 'Text': 'des Planes wird man bald die Hauptpulsader der'}, {'Zeile': '8', 'Text': 'innern Stadt kennen lernen; diese führt nämlich vom'}, {'Zeile': '9', 'Text': 'Burgthor durch die Burg über den Michaels¬'}, {'Zeile': '10', 'Text': 'platz, Kohlmarkt, Graben, Stephanö¬'}, {'Zeile': '11', 'Text': 'platz, Bischofs- und Nothenthurmstraße'}, {'Zeile': '12', 'Text': 'zum Nothenthurmthore, und bildet die Figur'}, {'Zeile': '13', 'Text': 'einer doppelten'}, {'Zeile': '14', 'Text': '— Stufe, deren innerer Winkel'}, {'Zeile': '15', 'Text'

In [81]:
#client.update_page_status(240344, 1620944, 8, "IN_PROGRESS")

TRANSKRIBUS: Updated status of page 8 to IN_PROGRESS in collection 240344, document 1620944.



True

In [82]:
#client.logout()

TRANSKRIBUS: Logged out successfully.


True