<a href="https://colab.research.google.com/github/NiekVerhoeff/workshop/blob/main/transkribus_indexing_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Transkribus Indexing Pipeline**

This notebook lets you use an OpenAI GPT model through its API to extract structured data from your Transkribus documents. The first section is for testing and refining your prompt. In the second section you apply that prompt to all documents in a given Transkribus collection.

If you are using this notebook in Colab, you need to store your OPENAI_API_KEY, TRANSKRIBUS_USERNAME and TRANSKRIBUS_PASSWORD as secrets in the Secrets menu on the left (key symbol).



## **Test your Prompt**

In [None]:
import requests
import json
from lxml import objectify, etree
from pprint import pprint
import os
from google.colab import userdata

# Read the credentials from the Colab secrets store (key symbol in the left
# sidebar). The secret names must match the strings passed to userdata.get().
TRANSKRIBUS_USERNAME = userdata.get('TRANSKRIBUS_USERNAME')
TRANSKRIBUS_PASSWORD = userdata.get('TRANSKRIBUS_PASSWORD')
# Sent as the bearer token in the Authorization header of the OpenAI request.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

#@title Get transcription with Transkribus API

class Transkribus_Web():
    """ Thin client for the Transkribus REST API.

        login() stores the session id returned by the server; every later
        request sends it back as the JSESSIONID cookie.

        All available endpoints: https://transkribus.eu/TrpServer/Swadl/wadl.html
        Documentation: https://readcoop.eu/transkribus/docu/rest-api/
        Official TranskribusPyClient: https://github.com/Transkribus/TranskribusPyClient """

    def __init__(self, api_base_url="https://transkribus.eu/TrpServer/rest/"):
        """ Create a client that is not logged in yet.

            api_base_url -- base URL of the REST API, including the trailing slash """

        # `cols` and `cache` are initialised here but not read by any method
        # of this class.
        self.cols = {}
        self.api_base_url = api_base_url
        # False while logged out; the session id string after login().
        self.session_id = False
        self.cache = False

    # Internal helper functions:

    def _url(self, endpoint):
        """ Return the full URL of an endpoint, i.e. the API base URL
            followed by the relative path of the endpoint. """

        return self.api_base_url + endpoint

    # Core functionality: login, logout, send GET requests

    def login(self, username, password):
        """ Log in and store the session id in "session_id".

            Returns the session id (str) on success or False if the server
            answered with an error status. """

        credentials = {'user': username,
                       'pw': password}
        response = requests.post(self._url("auth/login"), data=credentials)
        # A requests.Response is falsy for 4xx/5xx status codes.
        if not response:
            print("TRANSKRIBUS: Login failed. HTTP status:", response.status_code)
            return False

        r = objectify.fromstring(response.content)
        print(f"TRANSKRIBUS: User {r.firstname} {r.lastname} ({r.userId}) logged in successfully.")
        self.session_id = str(r.sessionId)
        return self.session_id

    def logout(self):
        """ Log out and reset "session_id" to False.

            Returns True on success, False otherwise (the stored session id
            is kept in that case). """

        cookies = dict(JSESSIONID=self.session_id)
        response = requests.post(self._url("auth/logout"), cookies=cookies)
        if not response:
            print("TRANSKRIBUS: Logout failed. HTTP status:", response.status_code, response.content)
            return False

        self.session_id = False
        print("TRANSKRIBUS: Logged out successfully.")
        return True

    def verify(self, username, password):
        """ Log in and log out again to check whether the credentials
            are valid on the Transkribus server.
            Returns True or False. """

        if not self.login(username, password):
            return False
        # logout() reads the session id from self.session_id (set by login())
        # and takes no argument; passing one raised a TypeError.
        self.logout()
        return True

    def request_endpoint(self, endpoint):
        """ Send a GET request to a Transkribus API endpoint, the
            "endpoint" argument being a relative path to the REST API endpoint.

            Cf. the list of available endpoints:
            https://transkribus.eu/TrpServer/Swadl/wadl.html

            The body is decoded as JSON first, then as XML ("objectify"
            object from lxml). If both conversions fail the raw content
            (bytes) is returned. Returns False if the server answered with
            an error status. """

        cookies = dict(JSESSIONID=self.session_id)
        response = requests.get(self._url(endpoint), cookies=cookies)

        if not response:
            print(f'TRANSKRIBUS: ERROR when requesting "{endpoint}". HTTP status:', response.status_code)
            return False

        try:
            return response.json()
        except ValueError:
            # Not JSON (JSON decode errors derive from ValueError): try XML next.
            pass

        try:
            return objectify.fromstring(response.content)
        except (etree.LxmlError, ValueError):
            # Fallback option if the server returns just text.
            return response.content

    # Convenience functions to query certain endpoints:

    def get_collections(self):
        """ Get the metadata of the owner's collections.
            Returns the decoded response if successful or False if not. """

        collections = self.request_endpoint("collections/list")
        return collections if collections else False

    def get_collections_ids(self):
        """ Get the ids and names of the owner's collections.
            Returns a list of dictionaries containing 'colId' and 'colName'
            keys if successful, or False if not. """

        collections = self.request_endpoint("collections/list")
        if not collections:
            return False
        return [{"colId": col["colId"], "colName": col["colName"]} for col in collections]

    def get_documents_in_collection(self, colId):
        """ Get the metadata of the documents in a collection.
            Returns the decoded response if successful or False if not.

            colId -- collection ID in Transkribus (int) """

        documents = self.request_endpoint(f"collections/{colId}/list")
        return documents if documents else False

    def get_documents_in_collection_ids(self, colId):
        """ Get the ids and titles of the documents in a collection.
            Returns a list of dictionaries containing 'docId' and 'title'
            keys if successful, or False if not.

            colId -- collection ID in Transkribus (int) """

        documents = self.request_endpoint(f"collections/{colId}/list")
        if not documents:
            return False
        return [{"docId": doc["docId"], "title": doc["title"]} for doc in documents]

    def get_pages_in_document(self, colId, docId):
        """ Get the basic metadata of the pages in a document.
            Returns the decoded response if successful or False if not.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int) """

        pages = self.request_endpoint(f"collections/{colId}/{docId}/pages")
        return pages if pages else False

    def get_pagesnrs_in_document(self, colId, docId):
        """ Count the pages of a document.
            Returns the number of entries in the page list if successful,
            or False if not (also when the list is empty).

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int) """

        pages = self.request_endpoint(f"collections/{colId}/{docId}/pages")
        if not pages:
            return False
        return len(pages)

    def get_page_xml(self, colId, docId, pageNr):
        """ Get the XML content of a page.
            Returns an "objectify" object (lxml) or False if not successful.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int)
            pageNr -- page ID in Transkribus (int)

            The returned object X has two attributes: X.Metadata and X.Page.
            X.Page is empty if there are no transcripts yet.
            If there exists a transcription X.Page has further attributes
            (i and j are list indices counting from 0):

            X.Page.Metadata
                  .ReadingOrder
                  .values()   -> list containing imgFileName, width (px), height (px)
                  .TextRegion[i].Coords.attrib['points']               -> coordinates of the whole text region (1)
                                .TextEquiv.Unicode                     -> utf-8 string of the transcription of the whole text region
                                .TextLine[j].Coords.attrib['points']   -> coordinates of this line (1) (2)
                                            .BaseLine.attrib['points'] -> coordinates of this baseline (2)
                                            .TextEquiv.Unicode         -> utf-8 string of the transcription

            (1) Instead of .attrib['points'] you can say .values()[0].
            (2) The line is a polygon around the line of text, the BaseLine is a line below the text.

            You can check for the existence of attributes: if hasattr(X.Page, "TextRegion")…
            Get a list of existing attributes: X.Page.__dict__

            """

        page_xml = self.request_endpoint(f"collections/{colId}/{docId}/{pageNr}/text")
        # Identity check instead of truthiness — presumably because the truth
        # value of an lxml element depends on its children (TODO confirm).
        return page_xml if page_xml is not None else False

    def upload_page_xml(self, colId, docId, pageNr, new_status, page_xml):
        """ Upload page XML data to the Transkribus server using a POST request.
            Returns True or False.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int)
            pageNr -- page ID in Transkribus (int)
            new_status -- new_status of the page. Possible values are NEW, IN_PROGRESS, DONE, FINAL, GT.
            page_xml -- lxml object holding the page XML to upload

            tsId = transcript ID of the last version of this transcription (int).
            After the upload Transkribus will generate a new transcription Id and save
            the old one as the 'parentTsId' of the new transcription. """

        # Get the transcript ID of the latest transcription:
        current_transcript = self.request_endpoint(f"collections/{colId}/{docId}/{pageNr}/curr")
        if not current_transcript:
            return False
        tsId = current_transcript['tsId']

        headers = {'Content-Type': 'text/xml'}
        cookies = dict(JSESSIONID=self.session_id)
        params = {'status': new_status,
                  'parent': tsId,
                  'overwrite': 'false'}
        # Convert the page_xml object to a pretty-printed string:
        data = etree.tostring(page_xml, pretty_print=True, xml_declaration=True).decode("utf-8")

        response = requests.post(self._url(f"collections/{colId}/{docId}/{pageNr}/text"),
                                 headers=headers,
                                 params=params,
                                 cookies=cookies,
                                 data=data)

        if response:
            print(f"Uploaded page {colId}/{docId}/{pageNr} successfully: {response.status_code}")
        else:
            print(f"ERROR while uploading {colId}/{docId}/{pageNr}: {response.status_code}")
        # The server's answer is printed in both cases.
        print(response.content.decode(encoding="utf-8"))
        return bool(response)

    def update_page_status(self, colId, docId, pageNr, new_status):
        """ Update the status of the latest transcript of a page to new_status.
            Returns True or False.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int)
            pageNr -- page ID in Transkribus (int)
            new_status -- new_status of the page. Possible values are NEW, IN_PROGRESS, DONE, FINAL, GT. """

        # Get the transcript ID of the latest transcription:
        current_transcript = self.request_endpoint(f"collections/{colId}/{docId}/{pageNr}/curr")
        if not current_transcript:
            return False
        tsId = current_transcript['tsId']

        cookies = dict(JSESSIONID=self.session_id)
        params = {'status': new_status}

        response = requests.post(self._url(f"collections/{colId}/{docId}/{pageNr}/{tsId}"),
                                 params=params,
                                 cookies=cookies)

        if response:
            print(f"TRANSKRIBUS: Updated status of page {pageNr} to {new_status} in collection {colId}, document {docId}.")
        else:
            print(f"TRANSKRIBUS: ERROR: Could not update status of page {pageNr} to {new_status} in collection {colId}, document {docId}.")
        # The server's answer is printed in both cases.
        print(response.content.decode(encoding="utf-8"))
        return bool(response)

# Log in to Transkribus with the credentials read from the Colab secrets.
client = Transkribus_Web()
session_id = client.login(TRANSKRIBUS_USERNAME, TRANSKRIBUS_PASSWORD)
if not session_id:
    # login() already printed the HTTP status; stop here instead of failing
    # later with an unrelated-looking error.
    raise RuntimeError("TRANSKRIBUS: login failed, cannot fetch the test page.")

# Uncomment to list the ids and names of your collections:
#client.get_collections_ids()

colId = 154026 #@param {type:"integer"}
# Uncomment to list the ids and titles of the documents in this collection:
#client.get_documents_in_collection_ids(colId)

docId = 1048988 #@param {type:"integer"}
# Uncomment to get the number of pages in this document:
#client.get_pagesnrs_in_document(colId, docId)

pageNr = 1 #@param {type:"integer"}
doc = client.get_page_xml(colId, docId, pageNr)
if doc is False:
    # get_page_xml() returns False when the request failed.
    raise RuntimeError(f"TRANSKRIBUS: could not fetch page {pageNr} of document {docId} in collection {colId}.")

# Optional: to save the transcription to Google Drive, mount the drive and
# write `text` to a file, e.g.:
#from google.colab import drive
#drive.mount('/content/drive')
#file_path = '/content/drive/My Drive/my_text_file.txt'
#with open(file_path, 'w') as f:
#    f.write(text)

# Store the namespace string used by the Transkribus page_xml format:
ns = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"

# Collect the transcription of every text line on the page once.
page_lines = [str(line.TextEquiv.Unicode) for line in doc.Page.iter(f"{ns}TextLine")]

# Join the lines with a newline so that the last word of one line is not
# fused with the first word of the next one in the text sent to the model.
text = "\n".join(page_lines)

# Show the transcription line by line for a visual check.
for text2 in page_lines:
    print(text2)



In [None]:
#with open('/content/drive/MyDrive/my_text_file.txt', 'r') as file:
    #input_text = file.read()

#@markdown ---
#@markdown ###Extract structured data with a gpt-model with OpenAI API
#@markdown ######Check https://platform.openai.com/docs/models/overview for the models that you can use

# Instruction for the model (Dutch); editable through the Colab form.
prompt = "Maak een python list met valide json-objecten voor alle akten in de volgende tekst. Een python list begint en eindigt altijd met vierkante haken. De labels en waarde van een valide json-object zijn altijd gemarkeerd met dubbele quotes en een valide json-object heeft geen returns. In de json-objecten neem je de volgende gegevens op exact zoals ze in de tekst staan: voornaam_kind, achternaam_kind, voornaam_aangever, achternaam_aangever, beroep_aangever, leeftijd_aangever, voornaam_moeder, achternaam_moeder, beroep_moeder, geboorteplaats_kind, geboortedatum_kind in het format YYYY-MM-DD, straatnaam, huisnummer, huisnummer_toevoeging en akte_datum in het format YYYY-MM-DD. Als achternaam_kind neem je de achternaam van de aangever. Als het gegeven niet in de tekst staat, laat je de waarde in het json-object leeg. De json-objecten mogen geen geneste gegevens bevatten en geef geen andere tekst terug dan de python list met valide json-objecten." #@param {type:"string"}
model = "gpt-3.5-turbo-instruct" #@param {type:"string"}
# Put a blank line between the instruction and the transcription; without a
# separator the end of the prompt ran straight into the first word of the page.
prompt2 = prompt + "\n\n" + text
#@markdown

import requests

api_endpoint = 'https://api.openai.com/v1/completions'

headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {OPENAI_API_KEY}'
}

# temperature 0 is used so repeated runs of the same prompt are comparable.
data = {
    'model': model,
    'prompt': prompt2,
    'max_tokens': 1700,
    'temperature': 0,
}

response = requests.post(api_endpoint, headers=headers, json=data)

if response.status_code != 200:
    raise ValueError(f'Request failed with status code {response.status_code}: {response.text}')

response_json = response.json()

# Also covers an empty "choices" list, which would otherwise surface as an
# IndexError on the next line.
if not response_json.get('choices'):
    raise ValueError(f'Response does not contain "choices" key: {response_json}')

generated_text = response_json['choices'][0]['text']
#print(generated_text)

def strip_non_bracket_content(input_string):
    """Return the content of the square-bracket groups in *input_string*.

    Everything outside ``[...]`` is dropped. Only bracket pairs that contain
    no further square brackets are matched. The contents of several groups
    are joined with a comma, so wrapping the result in ``[`` and ``]`` still
    gives one well-formed list; empty groups are skipped.

    Returns an empty string when there is no bracket group.
    """
    # Local import: the cells that run before this function never import `re`.
    import re

    groups = re.findall(r'\[([^\[\]]*)\]', input_string)
    return ','.join(group for group in groups if group.strip())

# Re-wrap the extracted objects in square brackets and show the result.
stripped_string = f"[{strip_non_bracket_content(generated_text)}]"
print(stripped_string)

## **Extract Away!**

In [None]:
import requests
import json
from lxml import objectify, etree
from pprint import pprint
import os
from google.colab import userdata

# Read the credentials from the Colab secrets store (key symbol in the left
# sidebar). The variable names are Dutch: gebruikersnaam = username,
# wachtwoord = password.
gebruikersnaam = userdata.get('TRANSKRIBUS_USERNAME')
wachtwoord = userdata.get('TRANSKRIBUS_PASSWORD')
# Sent as the bearer token in the Authorization header of the OpenAI requests.
bearer_token = userdata.get('OPENAI_API_KEY')

#@title Use your prompt on all the documents of your Transkribus collection

class Transkribus_Web():
    """ Thin client for the Transkribus REST API.

        login() stores the session id returned by the server; every later
        request sends it back as the JSESSIONID cookie.

        All available endpoints: https://transkribus.eu/TrpServer/Swadl/wadl.html
        Documentation: https://readcoop.eu/transkribus/docu/rest-api/
        Official TranskribusPyClient: https://github.com/Transkribus/TranskribusPyClient """

    def __init__(self, api_base_url="https://transkribus.eu/TrpServer/rest/"):
        """ Create a client that is not logged in yet.

            api_base_url -- base URL of the REST API, including the trailing slash """

        # `cols` and `cache` are initialised here but not read by any method
        # of this class.
        self.cols = {}
        self.api_base_url = api_base_url
        # False while logged out; the session id string after login().
        self.session_id = False
        self.cache = False

    # Internal helper functions:

    def _url(self, endpoint):
        """ Return the full URL of an endpoint, i.e. the API base URL
            followed by the relative path of the endpoint. """

        return self.api_base_url + endpoint

    # Core functionality: login, logout, send GET requests

    def login(self, username, password):
        """ Log in and store the session id in "session_id".

            Returns the session id (str) on success or False if the server
            answered with an error status. """

        credentials = {'user': username,
                       'pw': password}
        response = requests.post(self._url("auth/login"), data=credentials)
        # A requests.Response is falsy for 4xx/5xx status codes.
        if not response:
            print("TRANSKRIBUS: Login failed. HTTP status:", response.status_code)
            return False

        r = objectify.fromstring(response.content)
        print(f"TRANSKRIBUS: User {r.firstname} {r.lastname} ({r.userId}) logged in successfully.")
        self.session_id = str(r.sessionId)
        return self.session_id

    def logout(self):
        """ Log out and reset "session_id" to False.

            Returns True on success, False otherwise (the stored session id
            is kept in that case). """

        cookies = dict(JSESSIONID=self.session_id)
        response = requests.post(self._url("auth/logout"), cookies=cookies)
        if not response:
            print("TRANSKRIBUS: Logout failed. HTTP status:", response.status_code, response.content)
            return False

        self.session_id = False
        print("TRANSKRIBUS: Logged out successfully.")
        return True

    def verify(self, username, password):
        """ Log in and log out again to check whether the credentials
            are valid on the Transkribus server.
            Returns True or False. """

        if not self.login(username, password):
            return False
        # logout() reads the session id from self.session_id (set by login())
        # and takes no argument; passing one raised a TypeError.
        self.logout()
        return True

    def request_endpoint(self, endpoint):
        """ Send a GET request to a Transkribus API endpoint, the
            "endpoint" argument being a relative path to the REST API endpoint.

            Cf. the list of available endpoints:
            https://transkribus.eu/TrpServer/Swadl/wadl.html

            The body is decoded as JSON first, then as XML ("objectify"
            object from lxml). If both conversions fail the raw content
            (bytes) is returned. Returns False if the server answered with
            an error status. """

        cookies = dict(JSESSIONID=self.session_id)
        response = requests.get(self._url(endpoint), cookies=cookies)

        if not response:
            print(f'TRANSKRIBUS: ERROR when requesting "{endpoint}". HTTP status:', response.status_code)
            return False

        try:
            return response.json()
        except ValueError:
            # Not JSON (JSON decode errors derive from ValueError): try XML next.
            pass

        try:
            return objectify.fromstring(response.content)
        except (etree.LxmlError, ValueError):
            # Fallback option if the server returns just text.
            return response.content

    # Convenience functions to query certain endpoints:

    def get_collections(self):
        """ Get the metadata of the owner's collections.
            Returns the decoded response if successful or False if not. """

        collections = self.request_endpoint("collections/list")
        return collections if collections else False

    def get_collections_ids(self):
        """ Get the ids and names of the owner's collections.
            Returns a list of dictionaries containing 'colId' and 'colName'
            keys if successful, or False if not. """

        collections = self.request_endpoint("collections/list")
        if not collections:
            return False
        return [{"colId": col["colId"], "colName": col["colName"]} for col in collections]

    def get_documents_in_collection(self, colId):
        """ Get the metadata of the documents in a collection.
            Returns the decoded response if successful or False if not.

            colId -- collection ID in Transkribus (int) """

        documents = self.request_endpoint(f"collections/{colId}/list")
        return documents if documents else False

    def get_documents_in_collection_ids(self, colId):
        """ Get the ids and titles of the documents in a collection.
            Returns a list of dictionaries containing 'docId' and 'title'
            keys if successful, or False if not.

            colId -- collection ID in Transkribus (int) """

        documents = self.request_endpoint(f"collections/{colId}/list")
        if not documents:
            return False
        return [{"docId": doc["docId"], "title": doc["title"]} for doc in documents]

    def get_pages_in_document(self, colId, docId):
        """ Get the basic metadata of the pages in a document.
            Returns the decoded response if successful or False if not.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int) """

        pages = self.request_endpoint(f"collections/{colId}/{docId}/pages")
        return pages if pages else False

    def get_pagesnrs_in_document(self, colId, docId):
        """ Count the pages of a document.
            Returns the number of entries in the page list if successful,
            or False if not (also when the list is empty).

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int) """

        pages = self.request_endpoint(f"collections/{colId}/{docId}/pages")
        if not pages:
            return False
        return len(pages)

    def get_page_xml(self, colId, docId, pageNr):
        """ Get the XML content of a page.
            Returns an "objectify" object (lxml) or False if not successful.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int)
            pageNr -- page ID in Transkribus (int)

            The returned object X has two attributes: X.Metadata and X.Page.
            X.Page is empty if there are no transcripts yet.
            If there exists a transcription X.Page has further attributes
            (i and j are list indices counting from 0):

            X.Page.Metadata
                  .ReadingOrder
                  .values()   -> list containing imgFileName, width (px), height (px)
                  .TextRegion[i].Coords.attrib['points']               -> coordinates of the whole text region (1)
                                .TextEquiv.Unicode                     -> utf-8 string of the transcription of the whole text region
                                .TextLine[j].Coords.attrib['points']   -> coordinates of this line (1) (2)
                                            .BaseLine.attrib['points'] -> coordinates of this baseline (2)
                                            .TextEquiv.Unicode         -> utf-8 string of the transcription

            (1) Instead of .attrib['points'] you can say .values()[0].
            (2) The line is a polygon around the line of text, the BaseLine is a line below the text.

            You can check for the existence of attributes: if hasattr(X.Page, "TextRegion")…
            Get a list of existing attributes: X.Page.__dict__

            """

        page_xml = self.request_endpoint(f"collections/{colId}/{docId}/{pageNr}/text")
        # Identity check instead of truthiness — presumably because the truth
        # value of an lxml element depends on its children (TODO confirm).
        return page_xml if page_xml is not None else False

    def upload_page_xml(self, colId, docId, pageNr, new_status, page_xml):
        """ Upload page XML data to the Transkribus server using a POST request.
            Returns True or False.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int)
            pageNr -- page ID in Transkribus (int)
            new_status -- new_status of the page. Possible values are NEW, IN_PROGRESS, DONE, FINAL, GT.
            page_xml -- lxml object holding the page XML to upload

            tsId = transcript ID of the last version of this transcription (int).
            After the upload Transkribus will generate a new transcription Id and save
            the old one as the 'parentTsId' of the new transcription. """

        # Get the transcript ID of the latest transcription:
        current_transcript = self.request_endpoint(f"collections/{colId}/{docId}/{pageNr}/curr")
        if not current_transcript:
            return False
        tsId = current_transcript['tsId']

        headers = {'Content-Type': 'text/xml'}
        cookies = dict(JSESSIONID=self.session_id)
        params = {'status': new_status,
                  'parent': tsId,
                  'overwrite': 'false'}
        # Convert the page_xml object to a pretty-printed string:
        data = etree.tostring(page_xml, pretty_print=True, xml_declaration=True).decode("utf-8")

        response = requests.post(self._url(f"collections/{colId}/{docId}/{pageNr}/text"),
                                 headers=headers,
                                 params=params,
                                 cookies=cookies,
                                 data=data)

        if response:
            print(f"Uploaded page {colId}/{docId}/{pageNr} successfully: {response.status_code}")
        else:
            print(f"ERROR while uploading {colId}/{docId}/{pageNr}: {response.status_code}")
        # The server's answer is printed in both cases.
        print(response.content.decode(encoding="utf-8"))
        return bool(response)

    def update_page_status(self, colId, docId, pageNr, new_status):
        """ Update the status of the latest transcript of a page to new_status.
            Returns True or False.

            colId -- collection ID in Transkribus (int)
            docId -- document ID in Transkribus (int)
            pageNr -- page ID in Transkribus (int)
            new_status -- new_status of the page. Possible values are NEW, IN_PROGRESS, DONE, FINAL, GT. """

        # Get the transcript ID of the latest transcription:
        current_transcript = self.request_endpoint(f"collections/{colId}/{docId}/{pageNr}/curr")
        if not current_transcript:
            return False
        tsId = current_transcript['tsId']

        cookies = dict(JSESSIONID=self.session_id)
        params = {'status': new_status}

        response = requests.post(self._url(f"collections/{colId}/{docId}/{pageNr}/{tsId}"),
                                 params=params,
                                 cookies=cookies)

        if response:
            print(f"TRANSKRIBUS: Updated status of page {pageNr} to {new_status} in collection {colId}, document {docId}.")
        else:
            print(f"TRANSKRIBUS: ERROR: Could not update status of page {pageNr} to {new_status} in collection {colId}, document {docId}.")
        # The server's answer is printed in both cases.
        print(response.content.decode(encoding="utf-8"))
        return bool(response)

client = Transkribus_Web()
# Log in with the credentials loaded at the top of this cell. The names
# TRANSKRIBUS_USERNAME / TRANSKRIBUS_PASSWORD only exist if the test cell was
# run first, so using them here raised a NameError in a fresh runtime.
session_id = client.login(gebruikersnaam, wachtwoord)

colId = 154026 #@param {type:"integer"}
docs_list = client.get_documents_in_collection_ids(colId)
if not docs_list:
    # get_documents_in_collection_ids() returns False on failure; fall back
    # to an empty list so that iterating over docs_list does not crash.
    print(f"TRANSKRIBUS: no documents found in collection {colId}.")
    docs_list = []

def generate_list(pageNrs):
    """Return the page numbers 1..pageNrs as a list.

    If pageNrs is not an int greater than zero, the error message string
    "pageNrs must be a positive integer." is returned instead of a list.
    """
    # Guard clause: reject anything that is not an int above zero.
    if not isinstance(pageNrs, int) or pageNrs <= 0:
        return "pageNrs must be a positive integer."
    return [page_number for page_number in range(1, pageNrs + 1)]

import re

# Namespace string used by the Transkribus page_xml format.
ns = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"
api_endpoint = 'https://api.openai.com/v1/completions'

headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {bearer_token}'
}

# The instruction (Dutch) and the model are the same for every page, so they
# are set once here instead of on every loop iteration.
prompt = "Maak een python list met valide json-objecten voor alle akten in de volgende tekst. Een python list begint en eindigt altijd met vierkante haken. De labels en waarde van een valide json-object zijn altijd gemarkeerd met dubbele quotes en een valide json-object heeft geen returns. Geef geen enkele andere tekst mee in je antwoord dan de python list met valide json-objecten. In de json-objecten neem je de volgende gegevens op exact zoals ze in de tekst staan: voornaam_kind, achternaam_kind, voornaam_aangever, achternaam_aangever, beroep_aangever, leeftijd_aangever, voornaam_moeder, achternaam_moeder, beroep_moeder, geboorteplaats_kind, geboortedatum_kind in het format YYYY-MM-DD, straatnaam, huisnummer, huisnummer_toevoeging en akte_datum in het format YYYY-MM-DD. Als achternaam_kind neem je de achternaam van de aangever. Als het gegeven niet in de tekst staat, laat je de waarde in het json-object leeg. De json-objecten mogen geen geneste gegevens bevatten" #@param {type:"string"}
model = "gpt-3.5-turbo-instruct" #@param {type:"string"}


def strip_non_bracket_content(input_string):
    """Return the content of the square-bracket groups in *input_string*.

    Everything outside ``[...]`` is dropped. Only bracket pairs that contain
    no further square brackets are matched. The contents of several groups
    are joined with a comma, so wrapping the result in ``[`` and ``]`` still
    gives one well-formed list; empty groups are skipped.
    """
    groups = re.findall(r'\[([^\[\]]*)\]', input_string)
    return ','.join(group for group in groups if group.strip())


for doc in docs_list:
    pageNrs = client.get_pagesnrs_in_document(colId, doc['docId'])
    if not pageNrs:
        # get_pagesnrs_in_document() returns False on failure. generate_list()
        # would then return an error string, and the loop below would request
        # one "page" per character of that string.
        print(f"TRANSKRIBUS: no pages found for document {doc['docId']}; skipping.")
        continue
    pageNrs_list = generate_list(pageNrs)
    #print(pageNrs_list)

    for item in pageNrs_list:
        transcription = client.get_page_xml(colId, doc['docId'], item)
        if transcription is False:
            # The request failed (already reported by the client); move on.
            print(f"TRANSKRIBUS: could not fetch page {item} of document {doc['docId']}; skipping.")
            continue

        # Join the text lines with newlines so that words at line ends are
        # not fused with the first word of the following line.
        text = "\n".join(
            str(line.TextEquiv.Unicode)
            for line in transcription.Page.iter(f"{ns}TextLine")
        )
        # Blank line between the instruction and the transcription; the
        # prompt has no trailing punctuation, so without a separator it ran
        # straight into the first word of the page.
        prompt2 = prompt + "\n\n" + text
        data = {
            'model': model,
            'prompt': prompt2,
            'max_tokens': 1700,
            'temperature': 0,
        }
        response = requests.post(api_endpoint, headers=headers, json=data)

        if response.status_code != 200:
            raise ValueError(f'Request failed with status code {response.status_code}: {response.text}')

        response_json = response.json()

        # Also covers an empty "choices" list, which would otherwise surface
        # as an IndexError on the next line.
        if not response_json.get('choices'):
            raise ValueError(f'Response does not contain "choices" key: {response_json}')

        generated_text = response_json['choices'][0]['text']
        #print(generated_text)

        stripped_string = '[' + strip_non_bracket_content(generated_text) + ']'
        print(stripped_string)