### 1. Download the data

In [256]:
import getpass
import json
import os
import sys

import pandas as pd
import requests

# Base URL of the Transkribus REST API and the read timeout (seconds) for each call.
TRANSKRIBUS_REST = 'https://transkribus.eu/TrpServer/rest'
REQUEST_TIMEOUT = 60

# Titles of the documents (manuscripts) in the collection that we want to include.
WANTED_TITLES = {
    'Brussel, KBR, 1805-1808',
    'Brussel, KBR, 2485',
    'Brussel, KBR, 2849-51',
    'Brussel, KBR, 2877-78',
    'Brussel, KBR, 2879-80',
    'Brussel, KBR, 2905-09',
    'Brussel, KBR, 2979',
    'Brussel, KBR, 3091',
    'Brussel, KBR, 3093-95',
    'Gent, UB, 1374',
    'Gent, UB, 941',
    'Parijs, Bibliothèque Mazarine, 920',
    "Parijs, Bibliothèque de l'Arsenal, 8224",
    "Philadelphia, Lewis E 199",
    "Sint-Petersburg, BAN, O 256",
    "Wenen, ÖNB, 12.857",
    "Wenen, ÖNB, 12.905",
    "Wenen, ÖNB, 13.708",
    "Wenen, ÖNB, 65",
}

# Create a session so that the login is reused by every later request
session = requests.Session()

# Credentials are never stored in the notebook: they are read from the
# TRANSKRIBUS_USER / TRANSKRIBUS_PASSWORD environment variables, with an
# interactive prompt as fallback.
login_payload = {
    "user": os.environ.get("TRANSKRIBUS_USER") or input("Transkribus username: "),
    "pw": os.environ.get("TRANSKRIBUS_PASSWORD") or getpass.getpass("Transkribus password: "),
}

# Log in
response = session.post(f'{TRANSKRIBUS_REST}/auth/login', data=login_payload, timeout=REQUEST_TIMEOUT)

# Check for successful login; raise instead of sys.exit(), which would report success (status 0)
if response.status_code == 200:
    print('Logged in successfully')
else:
    raise RuntimeError(f'Failed to log in (HTTP {response.status_code})')

# After login, get a list of collections using the session
collections_response = session.get(f'{TRANSKRIBUS_REST}/collections/list', timeout=REQUEST_TIMEOUT)
collections_response.raise_for_status()
collections_dict = collections_response.json()

# Find the collection ID for 'Silent Voices'
silent_voices_id = next(
    (collection['colId'] for collection in collections_dict if collection['colName'] == 'Silent Voices'),
    None,
)
if silent_voices_id is None:
    raise RuntimeError("Collection 'Silent Voices' not found.")

# Get documents in the Silent Voices collection
documents_response = session.get(f'{TRANSKRIBUS_REST}/collections/{silent_voices_id}/list', timeout=REQUEST_TIMEOUT)
documents_response.raise_for_status()
documents_dict = documents_response.json()

# One row per page: the metadata of that page's most recent transcript
data = []

for document in documents_dict:
    title = document['title']
    if title not in WANTED_TITLES:
        continue

    # Get full document data (page list including all transcript versions)
    doc_id = document['docId']
    doc_response = session.get(
        f'{TRANSKRIBUS_REST}/collections/{silent_voices_id}/{doc_id}/fulldoc',
        timeout=REQUEST_TIMEOUT,
    )
    doc_response.raise_for_status()
    doc_dict = doc_response.json()

    for page in doc_dict['pageList']['pages']:
        ts_list = page['tsList']['transcripts']
        if not ts_list:
            # max() would raise on an empty list; a page without transcript has nothing to download
            print(f"No transcript for {title}, page {page['pageNr']} - skipped")
            continue

        # Find the most recent transcript by comparing timestamp
        most_recent_ts = max(ts_list, key=lambda ts: ts['timestamp'])

        data.append([
            title,
            page['pageNr'],
            page['imgFileName'],
            most_recent_ts['status'],
            most_recent_ts['docId'],
            most_recent_ts.get('nrOfTranscribedRegions', 0),
            most_recent_ts.get('nrOfLines', 0),
            most_recent_ts.get('nrOfWords', 0),
            most_recent_ts['url'],  # download URL of the most recent transcript
        ])

# Create a DataFrame from the collected data
df = pd.DataFrame(data, columns=['codex_sig', 'transkribuspage', 'OriginalFilename', 'Status', 'DocId', 'NrOfTranscribedRegions', 'NrOfLines', 'NrOfWords', 'TranscriptURL'])

# Display the DataFrame
df

Logged in successfully


Unnamed: 0,codex_sig,transkribuspage,OriginalFilename,Status,DocId,NrOfTranscribedRegions,NrOfLines,NrOfWords,TranscriptURL
0,"Brussel, KBR, 2485",1,0_front_r.jpeg,GT,1002641,0,0,0,https://files.transkribus.eu/Get?id=XZSSLFCCHB...
1,"Brussel, KBR, 2485",2,0_front_v.jpeg,GT,1002641,0,0,0,https://files.transkribus.eu/Get?id=IJNNVGBLVW...
2,"Brussel, KBR, 2485",3,1r.jpeg,GT,1002641,0,0,0,https://files.transkribus.eu/Get?id=QVJVECLALX...
3,"Brussel, KBR, 2485",4,1v.jpeg,GT,1002641,1,21,0,https://files.transkribus.eu/Get?id=KIQXDUNKMG...
4,"Brussel, KBR, 2485",5,2r.jpeg,GT,1002641,1,21,0,https://files.transkribus.eu/Get?id=VDAQYONNZE...
...,...,...,...,...,...,...,...,...,...
5863,"Philadelphia, Lewis E 199",22,022_Lewis_E199_back.tif,IN_PROGRESS,1320265,0,0,0,https://files.transkribus.eu/Get?id=QCAURQABOA...
5864,"Philadelphia, Lewis E 199",23,023_Lewis_E199_spine.tif,IN_PROGRESS,1320265,0,0,0,https://files.transkribus.eu/Get?id=GCQOSAAOZG...
5865,"Philadelphia, Lewis E 199",24,024_Lewis_E199_foreedge.tif,IN_PROGRESS,1320265,0,0,0,https://files.transkribus.eu/Get?id=MNSPMYXBEW...
5866,"Philadelphia, Lewis E 199",25,025_Lewis_E199_topedge.tif,IN_PROGRESS,1320265,0,0,0,https://files.transkribus.eu/Get?id=XLZYULOUCM...


In [211]:
# Preview the first rows of the per-page transcript overview built in the previous cell.
df.head()

Unnamed: 0,codex_sig,transkribuspage,OriginalFilename,Status,DocId,NrOfTranscribedRegions,NrOfLines,NrOfWords,TranscriptURL
0,"Brussel, KBR, 2485",1,0_front_r.jpeg,GT,1002641,0,0,0,https://files.transkribus.eu/Get?id=XZSSLFCCHB...
1,"Brussel, KBR, 2485",2,0_front_v.jpeg,GT,1002641,0,0,0,https://files.transkribus.eu/Get?id=IJNNVGBLVW...
2,"Brussel, KBR, 2485",3,1r.jpeg,GT,1002641,0,0,0,https://files.transkribus.eu/Get?id=QVJVECLALX...
3,"Brussel, KBR, 2485",4,1v.jpeg,GT,1002641,1,21,0,https://files.transkribus.eu/Get?id=KIQXDUNKMG...
4,"Brussel, KBR, 2485",5,2r.jpeg,GT,1002641,1,21,0,https://files.transkribus.eu/Get?id=VDAQYONNZE...


In [212]:
import os
import requests

# Create the pagexml directory
base_dir = "../data/pagexmls"
os.makedirs(base_dir, exist_ok=True)

# Download the transcript file for each row in the DataFrame
for index, row in df.iterrows():
    title = row['codex_sig']
    url = row['TranscriptURL']

    # One sub-folder per manuscript, named after the codex signature as-is
    folder_name = title
    folder_path = os.path.join(base_dir, folder_name)
    os.makedirs(folder_path, exist_ok=True)

    # A single GET suffices: its response carries the same headers a separate HEAD request would
    response = requests.get(url, timeout=60)
    if not response.ok:
        # Do not store an error page as if it were a transcript; report and move on
        print(f"Failed (HTTP {response.status_code}): {url}")
        continue

    # Extract the filename from the Content-Disposition header, if there is one
    file_name = None
    header = response.headers.get('Content-Disposition', '')
    if 'filename=' in header:
        candidate = header.split('filename=')[1].split(';')[0].strip().strip('"')
        # basename() prevents a server-supplied name from pointing outside folder_path
        file_name = os.path.basename(candidate)

    if not file_name:
        # If the filename couldn't be extracted from the header, use a default name
        file_name = 'file_' + str(index) + '.xml'

    save_path = os.path.join(folder_path, file_name)

    # Write the downloaded bytes to disk
    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded: {save_path}")

Downloaded: ../data/pagexmls/Brussel, KBR, 2485/0_front_r.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/0_front_v.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/1r.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/1v.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/2r.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/2v.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/3r.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/3v.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/4r.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/4v.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/5r.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/5v.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/6r.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/6v.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/7r.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/7v.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/8r.xml
Downloaded: ../data/pagexmls/Brussel, KBR, 2485/8v.

### 2. Extract metadata from PAGE XML files

The code block below extracts metadata from PAGE XML files and stores the data in a pandas DataFrame. The extracted metadata includes the **manuscript signature**, the **Transkribus page number**, **image filename**, **image width** and **height**, **text region type**, **region coordinates**, **full text**, and **reading order**. 

Apart from this metadata, which can be extracted from the PAGE XML directly, we also calculate the **fraction** of the image that a region takes up. To this end, we first calculate the area of the region (using the Shoelace formula) and the area of the full image. Next, we divide the area of the text region by the area of the full image to get the fraction.

The full DataFrame is then exported to an Excel file named `manuscript_data_new.xlsx`.

In [263]:
import os
from tqdm import tqdm
import pandas as pd
from lxml import etree
import re

# Function to calculate area of a polygon given its coordinates
def polygon_area(coords):
    """Return the unsigned area of a polygon using the Shoelace formula.

    coords is an indexable sequence of vertices, each giving x at index 0
    and y at index 1. The last vertex is implicitly connected back to the
    first one. An empty sequence yields 0.0.
    """
    vertex_count = len(coords)
    signed_double_area = 0.0
    for position, current in enumerate(coords):
        # Wrap around so that the final edge closes the polygon
        following = coords[(position + 1) % vertex_count]
        signed_double_area += current[0] * following[1]
        signed_double_area -= following[0] * current[1]
    # The sign only reflects the winding direction, so drop it
    return abs(signed_double_area / 2.0)

In [264]:
def _first_xpath_value(tree, expression, ns, cast=str):
    """Return the first result of an XPath query converted with ``cast``, or None when nothing matches."""
    found = tree.xpath(expression, namespaces=ns)
    return cast(found[0]) if found else None


def extract_metadata(page_files):
    """Build a DataFrame with one row per TextRegion found in the given PAGE XML files.

    Parameters
    ----------
    page_files : iterable of str
        Paths of PAGE XML files. The name of the directory that contains a
        file is used as manuscript signature (underscores are replaced by ', ').

    Returns
    -------
    pandas.DataFrame
        Columns: codex_sig, transkribuspage, image_filename, page_xml_filename,
        image_width, image_height, region_type, coordinates, fraction, text,
        reading_order, status. ``fraction`` is the region area divided by the
        image area, formatted as a string with two decimals; ``coordinates`` and
        ``fraction`` are None for a region without a Coords element. Pages
        without any TextRegion contribute no rows.
    """
    ns = {'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
    columns = ['codex_sig', 'transkribuspage', 'image_filename', 'page_xml_filename', 'image_width', 'image_height', 'region_type', 'coordinates', 'fraction', 'text', 'reading_order', 'status']

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # concatenating a one-row frame per region re-copies the whole table every time.
    records = []

    for page_file in tqdm(page_files):
        page_filename = os.path.basename(page_file)
        with open(page_file, 'rb') as f:
            tree = etree.parse(f)

        # Change to match your directory structure
        manuscript = os.path.basename(os.path.dirname(page_file)).replace('_', ', ')

        # Page-level attributes. Values are converted to plain str/int so that the
        # stored rows do not carry lxml's XPath result objects around.
        image_filename = _first_xpath_value(tree, "//pc:Page/@imageFilename", ns)
        image_width = _first_xpath_value(tree, "//pc:Page/@imageWidth", ns, cast=int)
        image_height = _first_xpath_value(tree, "//pc:Page/@imageHeight", ns, cast=int)
        page_nr = _first_xpath_value(tree, "//pc:TranskribusMetadata/@pageNr", ns, cast=int)
        status = _first_xpath_value(tree, "//pc:TranskribusMetadata/@status", ns)

        image_area = image_width * image_height if image_width and image_height else None

        for text_region in tree.xpath('//pc:TextRegion', namespaces=ns):
            region_type = text_region.get('type')

            # The reading order is the value after "index:" in the custom attribute;
            # leave it empty when the attribute is missing or has no such entry.
            custom = text_region.get('custom')
            reading_order = None
            if custom and "index:" in custom:
                reading_order = custom.split("index:")[1].split(";")[0].strip()

            # Reset per region so a region without Coords never inherits the
            # coordinates or fraction of the previous region.
            text_region_coords = None
            formatted_fraction = None
            points = text_region.xpath('./pc:Coords[1]/@points', namespaces=ns)
            if points:
                text_region_coords = str(points[0])
                region_coords = []
                for pair in text_region_coords.split():
                    parts = pair.split(',')
                    region_coords.append((int(parts[0]), int(parts[1])))
                if image_area:
                    fraction = polygon_area(region_coords) / image_area
                    formatted_fraction = "{:.2f}".format(fraction)

            # Text of the region: the last TextEquiv of every line, one line per row
            full_text = []
            for text_line in text_region.xpath('.//pc:TextLine', namespaces=ns):
                unicode_list = text_line.xpath('./pc:TextEquiv[last()]/pc:Unicode', namespaces=ns)
                final_unicode = unicode_list[0].text if len(unicode_list) > 0 else ' '
                full_text.append(final_unicode)

            # filter(None, ...) drops lines whose Unicode element is empty
            final_text = '\n'.join(filter(None, full_text))

            records.append({
                'codex_sig': manuscript,
                'transkribuspage': page_nr,
                'image_filename': image_filename,
                'page_xml_filename': page_filename,
                'image_width': image_width,
                'image_height': image_height,
                'region_type': region_type,
                'coordinates': text_region_coords,
                'fraction': formatted_fraction,
                'text': final_text,
                'reading_order': reading_order,
                'status': status,
            })

    # dtype=object keeps missing values as None and ints as ints, like the
    # table that used to be grown from an empty (object-typed) DataFrame.
    return pd.DataFrame(records, columns=columns, dtype=object)

# Gather every .xml file below the download directory, in os.walk order
page_files = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('../data/pagexmls')
    for name in names
    if name.endswith('.xml')
]

# Extract the region metadata, order it by manuscript and page number,
# and renumber the index so that it reflects the new order
df = (
    extract_metadata(page_files)
    .sort_values(by=['codex_sig', 'transkribuspage'], ascending=[True, True])
    .reset_index(drop=True)
)

# Save the sorted DataFrame to an Excel file
df.to_excel('../data/manuscript_data_new.xlsx', index=False)


100%|██████████████████████████████████████████████████████████████████████████| 5874/5874 [00:31<00:00, 185.55it/s]


In [265]:
# Display the sorted region-level metadata table (one row per text region).
df

Unnamed: 0,codex_sig,transkribuspage,image_filename,page_xml_filename,image_width,image_height,region_type,coordinates,fraction,text,reading_order,status
0,"Brussel, KBR, 1805-1808",7,0007_KBR_1805-08_1r.tiff,0007_KBR_1805-08_1r.xml,5186,7123,header,"1405,478 3598,478 3598,721 1405,721",0.01,Een voorredene op sinte gregoriꝰ dyalogus,0,
1,"Brussel, KBR, 1805-1808",7,0007_KBR_1805-08_1r.tiff,0007_KBR_1805-08_1r.xml,5186,7123,paragraph,"1359,751 3792,751 3792,2645 1359,2645",0.12,DE zeere wise e saleghe gregoris paeus\nvan d...,1,
2,"Brussel, KBR, 1805-1808",8,47180129.tiff,0008_KBR_1805-08_1v.xml,5186,7123,paragraph,"1736,658 3985,658 3985,1643 1736,1643",0.06,wiste dat si hare te xpūs gheloeue ghege-\nuen...,0,IN_PROGRESS
3,"Brussel, KBR, 1805-1808",8,47180129.tiff,0008_KBR_1805-08_1v.xml,5186,7123,marginalia,"2414,1705 4016,1705 4016,1851 2414,1851",0.01,⸫ Nēmeer en vant ics jnt latijn .,1,IN_PROGRESS
4,"Brussel, KBR, 1805-1808",9,0009_KBR_1805-08_2r.tiff,0009_KBR_1805-08_2r.xml,5186,7123,paragraph,"859,5840 847,5678 835,5462 781,4838 751,4480 7...",0.23,Hier beghint een voʼredene\nop .Sʼ. gᵉgorius d...,0,IN_PROGRESS
...,...,...,...,...,...,...,...,...,...,...,...,...
13758,"Wenen, ÖNB, 65",193,50510902.jpg,00000193.xml,1944,2598,paragraph,"952,250 1747,250 1747,2108 952,2108",0.29,so verre kinne vandē volcomenen\nloue der heil...,2,IN_PROGRESS
13759,"Wenen, ÖNB, 65",193,50510902.jpg,00000193.xml,1944,2598,marginalia,"1757,1097 1757,1185 1825,1187 1825,1099",0.00,noᵃ,3,IN_PROGRESS
13760,"Wenen, ÖNB, 65",194,50510904.jpg,00000194.xml,1932,2625,paragraph,"281,276 281,2114 1029,2114 1029,276",0.27,glorificerē selen inder eewicheit .\nDe die pe...,0,DONE
13761,"Wenen, ÖNB, 65",194,50510904.jpg,00000194.xml,1932,2625,paragraph,"1105,276 1106,578 1866,578 1865,276",0.05,Dit boec es in dietsche vten lati-\nne ghetogh...,1,DONE


### 3. Merge with additional information stored in another spreadsheet

In [266]:
## OPTION 1: access sheet through Google Drive API
#!pip install gspread

import gspread
# Authenticate with the service-account key file and open the spreadsheet
sa = gspread.service_account(filename="../auth/silentvoices-fb9908394ef8.json")
sheet = sa.open_by_url("https://docs.google.com/spreadsheets/d/1EJQtcZ63ZA6Po3HAhBchMkrcwTQMhFp1W1XjkW1wwuc")

# Only the first 15 columns (A to O in Excel) of every worksheet are kept
N_SHEET_COLUMNS = 15

# Get list of all worksheets
all_worksheets = sheet.worksheets()

# One dataframe per worksheet, keyed by worksheet title
dfs = {}

for ws in all_worksheets:
    print(ws)

    # The 'OVERVIEW' sheet is not part of the metadata: skip it before
    # spending an API call on downloading its contents
    if ws.title == 'OVERVIEW':
        continue

    # Get all values of the worksheet
    values = ws.get_all_values()
    if not values:
        # An empty worksheet has no header row to build a dataframe from
        continue

    values = [row[:N_SHEET_COLUMNS] for row in values]
    # Convert to a dataframe, assuming first row is the header
    df_temp = pd.DataFrame(values[1:], columns=values[0])

    # Store the dataframe in the dictionary
    dfs[ws.title] = df_temp

# Concatenate the dataframes
add_df = pd.concat(dfs.values(), ignore_index=True)

<Worksheet 'OVERVIEW' id:1053394346>
<Worksheet 'KBR 394-98' id:2016912023>
<Worksheet 'KBR 1805-08' id:762308753>
<Worksheet 'KBR 2485' id:0>
<Worksheet 'KBR 2849-51' id:956390397>
<Worksheet 'KBR 2877-78' id:387871519>
<Worksheet 'KBR 2879-80' id:352703624>
<Worksheet 'KBR 2905-09' id:460677957>
<Worksheet 'KBR 2979' id:1622097260>
<Worksheet 'KBR 3091' id:1367553580>
<Worksheet 'KBR 3093-95' id:2032175414>
<Worksheet 'Ghent UB 941' id:1567686002>
<Worksheet 'Ghent UB 1374' id:773429885>
<Worksheet 'BA 8224' id:841632581>
<Worksheet 'MA 920' id:1158373373>
<Worksheet 'BAN O256' id:1901642224>
<Worksheet 'ÖNB 13708' id:19886449>
<Worksheet 'ÖNB 65' id:1868536383>
<Worksheet 'ÖNB 12905' id:2075030953>
<Worksheet 'ÖNB 12857' id:571355786>
<Worksheet 'Lewis E 199' id:1609201371>


In [271]:
import unicodedata

# Make sure the filename join key is a string on both sides
df['page_xml_filename'] = df['page_xml_filename'].astype(str)
add_df['page_xml_filename'] = add_df['page_xml_filename'].astype(str)

# Bring the signatures to Unicode normal form NFC on both sides so that visually
# identical names (e.g. with 'è' or 'Ö') compare equal in the merge.
# The vectorised .str accessor leaves missing values untouched instead of raising on them.
df['codex_sig'] = df['codex_sig'].str.normalize('NFC')
add_df['codex_sig'] = add_df['codex_sig'].str.normalize('NFC')

# Show the distinct signatures and dtypes of both frames to check that they line up
print(df['codex_sig'].unique())
print(df['codex_sig'].dtypes)
print(add_df['codex_sig'].unique())
print(add_df['codex_sig'].dtypes)

['Brussel, KBR, 1805-1808' 'Brussel, KBR, 2485' 'Brussel, KBR, 2849-51'
 'Brussel, KBR, 2877-78' 'Brussel, KBR, 2879-80' 'Brussel, KBR, 2905-09'
 'Brussel, KBR, 2979' 'Brussel, KBR, 3091' 'Brussel, KBR, 3093-95'
 'Gent, UB, 1374' 'Gent, UB, 941' 'Parijs, Bibliothèque Mazarine, 920'
 "Parijs, Bibliothèque de l'Arsenal, 8224" 'Philadelphia, Lewis E 199'
 'Sint-Petersburg, BAN, O 256' 'TRAINING, VALIDATION, SET, 2849-51, v1'
 'Wenen, ÖNB, 12.857' 'Wenen, ÖNB, 12.905' 'Wenen, ÖNB, 13.708'
 'Wenen, ÖNB, 65']
object
['Brussel, KBR, 394-98' 'Brussel, KBR, 1805-1808' 'Brussel, KBR, 2485'
 'Brussel, KBR, 2849-51' 'Brussel, KBR, 2877-78' 'Brussel, KBR, 2879-80'
 'Brussel, KBR, 2905-09' 'Brussel, KBR, 2979' 'Brussel, KBR, 3091'
 'Brussel, KBR, 3093-95' 'Gent, UB, 941' 'Gent, UB, 1374'
 "Parijs, Bibliothèque de l'Arsenal, 8224"
 'Parijs, Bibliothèque Mazarine, 920' 'Sint-Petersburg, BAN, O 256' ''
 'Wenen, ÖNB, 13.708' 'Wenen, ÖNB, 65' 'Wenen, ÖNB, 12.905'
 'Wenen, ÖNB, 12.857' 'Philadelphia, Lewi

In [272]:
# Left-join the spreadsheet metadata onto the region table: every region row is kept,
# matched on manuscript signature and PAGE XML filename
df_merged = df.merge(add_df, how='left', on=['codex_sig', 'page_xml_filename'])

# Write the merged table to Excel
df_merged.to_excel('../data/manuscript_data_metadata.xlsx')

In [273]:
# Display the merged table (region metadata plus the spreadsheet columns).
df_merged

Unnamed: 0,codex_sig,transkribuspage_x,image_filename,page_xml_filename,image_width,image_height,region_type,coordinates,fraction,text,...,layout,content,language,scribe,production unit,local Herne product,date,translator,transcription,HTR-model used
0,"Brussel, KBR, 1805-1808",7,0007_KBR_1805-08_1r.tiff,0007_KBR_1805-08_1r.xml,5186,7123,header,"1405,478 3598,478 3598,721 1405,721",0.01,Een voorredene op sinte gregoriꝰ dyalogus,...,corrected,proloog,middledutch,a,I,yes,1400,bibletranslator,GT,
1,"Brussel, KBR, 1805-1808",7,0007_KBR_1805-08_1r.tiff,0007_KBR_1805-08_1r.xml,5186,7123,paragraph,"1359,751 3792,751 3792,2645 1359,2645",0.12,DE zeere wise e saleghe gregoris paeus\nvan d...,...,corrected,proloog,middledutch,a,I,yes,1400,bibletranslator,GT,
2,"Brussel, KBR, 1805-1808",8,47180129.tiff,0008_KBR_1805-08_1v.xml,5186,7123,paragraph,"1736,658 3985,658 3985,1643 1736,1643",0.06,wiste dat si hare te xpūs gheloeue ghege-\nuen...,...,corrected,proloog,middledutch,a,I,yes,1400,bibletranslator,GT,
3,"Brussel, KBR, 1805-1808",8,47180129.tiff,0008_KBR_1805-08_1v.xml,5186,7123,marginalia,"2414,1705 4016,1705 4016,1851 2414,1851",0.01,⸫ Nēmeer en vant ics jnt latijn .,...,corrected,proloog,middledutch,a,I,yes,1400,bibletranslator,GT,
4,"Brussel, KBR, 1805-1808",9,0009_KBR_1805-08_2r.tiff,0009_KBR_1805-08_2r.xml,5186,7123,paragraph,"859,5840 847,5678 835,5462 781,4838 751,4480 7...",0.23,Hier beghint een voʼredene\nop .Sʼ. gᵉgorius d...,...,corrected,dialogenGreg1_3,middledutch,α,I,yes,1395,bibletranslator,GT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13758,"Wenen, ÖNB, 65",193,50510902.jpg,00000193.xml,1944,2598,paragraph,"952,250 1747,250 1747,2108 952,2108",0.29,so verre kinne vandē volcomenen\nloue der heil...,...,corrected,horlogium,middledutch,γ,IV,yes,1375-1400,,HTR,BigMiddleDutchModel_v2
13759,"Wenen, ÖNB, 65",193,50510902.jpg,00000193.xml,1944,2598,marginalia,"1757,1097 1757,1185 1825,1187 1825,1099",0.00,noᵃ,...,corrected,horlogium,middledutch,γ,IV,yes,1375-1400,,HTR,BigMiddleDutchModel_v2
13760,"Wenen, ÖNB, 65",194,50510904.jpg,00000194.xml,1932,2625,paragraph,"281,276 281,2114 1029,2114 1029,276",0.27,glorificerē selen inder eewicheit .\nDe die pe...,...,corrected,horlogium,middledutch,γ,IV,yes,1375-1400,,HTR,BigMiddleDutchModel_v2
13761,"Wenen, ÖNB, 65",194,50510904.jpg,00000194.xml,1932,2625,paragraph,"1105,276 1106,578 1866,578 1865,276",0.05,Dit boec es in dietsche vten lati-\nne ghetogh...,...,corrected,horlogium,middledutch,γ,IV,yes,1375-1400,,HTR,BigMiddleDutchModel_v2


In [None]:
# OPTION 2: download the codex_info file from GDrive and load the data from that file

# we're going to merge the spreadsheet with other information, relating to the corpus (such as: scribe, production unit, etc.)
fn = '../data/codex_info.xlsx'
# sheet_name=None loads every worksheet into a dict keyed by sheet name
sheets = pd.read_excel(fn, sheet_name=None)
# drop the overview-sheet; pop() with a default does not fail when the sheet is absent
sheets.pop('OVERVIEW', None)

In [124]:
sheets.keys() # names of the remaining worksheets (i.e. manuscript sigla); the overview sheet was removed in the previous cell

dict_keys(['KBR 394-98', 'KBR 1805-08', 'KBR 2485', 'KBR 2849-51', 'KBR 2877-78', 'KBR 2879-80', 'KBR 2905-09', 'KBR 2979', 'KBR 3091', 'KBR 3093-95', 'Ghent UB 941', 'Ghent UB 1374', 'BA 8224', 'MA 920', 'BAN O256', 'ÖNB 13708', 'ÖNB 65', 'ÖNB 12905', 'ÖNB 12857', 'Lewis E 199'])

In [125]:
# Spill-over columns that some worksheets carry beyond the metadata columns
columns_to_drop = ['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18']

# Stack all worksheets into one table with a fresh 0..n-1 index and
# remove whichever of the spill-over columns are present
add_df = (
    pd.concat(sheets.values(), ignore_index=True)
    .drop(columns=columns_to_drop, errors='ignore')
)

In [126]:
# Preview the first rows of the combined spreadsheet metadata.
add_df.head()

Unnamed: 0,codex_sig,transkribuspage,filename,folium,layout,content,language,scribe,production unit,local Herne product,date,translator,transcription,HTR-model used,CER?
0,"Brussel, KBR, 394-98",1,0001_KBR_394-98_cover.tiff,cover,corrected,none,none,none,none,no,1450-1500,,none,,
1,"Brussel, KBR, 394-98",2,0002_KBR_394-98_blank.tiff,blank,corrected,none,none,none,none,no,1450-1500,,none,,
2,"Brussel, KBR, 394-98",3,0003_KBR_394-98_blank.tiff,blank,corrected,none,none,none,none,none,none,,none,,
3,"Brussel, KBR, 394-98",4,0004_KBR_394-98_blank.tiff,blank,corrected,none,none,none,none,none,none,,none,,
4,"Brussel, KBR, 394-98",5,0005_KBR_394-98_1r.tiff,1r,corrected,calendar,none,A,I,yes,1373-1383,,none,,


In [127]:
# Left-join the spreadsheet metadata onto the region table: every region row is kept,
# matched on manuscript signature and Transkribus page number
df_merged = df.merge(add_df, how='left', on=['codex_sig', 'transkribuspage'])

# write the merged table to Excel
df_merged.to_excel('../data/manuscript_data_metadata.xlsx')

In [128]:
# Display the merged table (region metadata plus the spreadsheet columns).
df_merged

Unnamed: 0,codex_sig,transkribuspage,image_filename,page_xml_filename,image_width,image_height,region_type,coordinates,fraction,text,...,content,language,scribe,production unit,local Herne product,date,translator,transcription,HTR-model used,CER?
0,"Brussel, KBR, 1805-1808",7,0007_KBR_1805-08_1r.tiff,0007_KBR_1805-08_1r.xml,5186,7123,header,"1405,478 3598,478 3598,721 1405,721",0.01,Een voorredene op sinte gregoriꝰ dyalogus,...,proloog,middledutch,a,I,yes,1400,bibletranslator,GT,,
1,"Brussel, KBR, 1805-1808",7,0007_KBR_1805-08_1r.tiff,0007_KBR_1805-08_1r.xml,5186,7123,paragraph,"1359,751 3792,751 3792,2645 1359,2645",0.12,DE zeere wise e saleghe gregoris paeus\nvan d...,...,proloog,middledutch,a,I,yes,1400,bibletranslator,GT,,
2,"Brussel, KBR, 1805-1808",8,47180129.tiff,0008_KBR_1805-08_1v.xml,5186,7123,paragraph,"1736,658 3985,658 3985,1643 1736,1643",0.06,wiste dat si hare te xpūs gheloeue ghege-\nuen...,...,proloog,middledutch,a,I,yes,1400,bibletranslator,GT,,
3,"Brussel, KBR, 1805-1808",8,47180129.tiff,0008_KBR_1805-08_1v.xml,5186,7123,marginalia,"2414,1705 4016,1705 4016,1851 2414,1851",0.01,⸫ Nēmeer en vant ics jnt latijn .,...,proloog,middledutch,a,I,yes,1400,bibletranslator,GT,,
4,"Brussel, KBR, 1805-1808",9,0009_KBR_1805-08_2r.tiff,0009_KBR_1805-08_2r.xml,5186,7123,paragraph,"859,5840 847,5678 835,5462 781,4838 751,4480 7...",0.23,Hier beghint een voʼredene\nop .Sʼ. gᵉgorius d...,...,dialogenGreg1_3,middledutch,α,I,yes,1395,bibletranslator,GT,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13758,"Wenen, ÖNB, 65",193,50510902.jpg,00000193.xml,1944,2598,paragraph,"952,250 1747,250 1747,2108 952,2108",0.29,so verre kinne vandē volcomenen\nloue der heil...,...,,,,,,,,,,
13759,"Wenen, ÖNB, 65",193,50510902.jpg,00000193.xml,1944,2598,marginalia,"1757,1097 1757,1185 1825,1187 1825,1099",0.00,noᵃ,...,,,,,,,,,,
13760,"Wenen, ÖNB, 65",194,50510904.jpg,00000194.xml,1932,2625,paragraph,"281,276 281,2114 1029,2114 1029,276",0.27,glorificerē selen inder eewicheit .\nDe die pe...,...,,,,,,,,,,
13761,"Wenen, ÖNB, 65",194,50510904.jpg,00000194.xml,1932,2625,paragraph,"1105,276 1106,578 1866,578 1865,276",0.05,Dit boec es in dietsche vten lati-\nne ghetogh...,...,,,,,,,,,,


### 4. Add HTR-data to the dataframe

In [292]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the excel file
df = pd.read_excel('../data/manuscript_data_rich.xlsx')

# Filter dataframe by language
df = df[df['language'] == 'middledutch']

# Fill NaN values in the 'text' column with an empty string
df['text'] = df['text'].fillna('')

# Initialize CountVectorizer for text tokenization
vectorizer = CountVectorizer(tokenizer=lambda text: text.split())

results = []
# Group by codex_sig and iterate
for codex, group in df.groupby('codex_sig'):
    result = {}
    result['Witness'] = codex

    # Split production units on '-' and count unique units
    unique_production_units = set()
    for unit in group['production unit'].dropna():
        unique_production_units.update(unit.split('-'))
    result['Production units containing Middle Dutch text'] = len(unique_production_units)

    # Count unique folium values
    result['Folia (recto and verso)'] = group['folium'].nunique()

    # Tokenize text column and count total and unique tokens
    tokens = vectorizer.fit_transform(group['text']).toarray()
    total_tokens = tokens.sum()
    unique_tokens = len(vectorizer.get_feature_names_out())

    result['Number of tokens'] = total_tokens
    result['Type-token ratio'] = total_tokens / unique_tokens if unique_tokens != 0 else 0
    
    # Count unique characters
    unique_chars = set(''.join(group['text']))
    result['Number of unique characters'] = len(unique_chars)

    # Split scribe units on '-' and count unique units, disregard 'none', 'unknown' and empty cells
    unique_scribes = set()
    for scribe in group['scribe'].dropna():
        if scribe.lower() not in ['none', 'unknown']:
            unique_scribes.update(scribe.split('-'))
    result['Number of unique scribes'] = len(unique_scribes)

    results.append(result)

# Convert results to dataframe
result_df = pd.DataFrame(results)

In [293]:
# Display the per-witness summary table.
result_df

Unnamed: 0,Witness,Production units containing Middle Dutch text,Folia (recto and verso),Number of tokens,Type-token ratio,Number of unique characters,Number of unique scribes
0,"Brussel, KBR, 1805-1808",3,132,95503,6.727932,107,4
1,"Brussel, KBR, 2485",2,135,27442,5.985169,87,1
2,"Brussel, KBR, 2849-51",7,693,154399,8.09601,122,1
3,"Brussel, KBR, 2877-78",2,330,90181,7.614709,84,1
4,"Brussel, KBR, 2879-80",3,202,82113,7.431713,93,3
5,"Brussel, KBR, 2905-09",10,384,46954,5.734489,94,2
6,"Brussel, KBR, 2979",2,49,216,1.61194,45,3
7,"Brussel, KBR, 3091",1,452,75179,7.69252,96,2
8,"Brussel, KBR, 3093-95",2,374,40444,5.194452,101,2
9,"Gent, UB, 1374",5,264,77717,7.281645,116,1


In [294]:
# Save the result_df DataFrame to an Excel file, without the index column.
# The bare filename is resolved against the current working directory.
result_df.to_excel("manuscript_analysis.xlsx", index=False)
