### Script for parsing Transkribus PageXML

In [225]:
import os
from tqdm import tqdm
import pandas as pd
from lxml import etree
import re

In [226]:
## we go through all the manuscripts folders, getting the paths to all available PageXML-files

page_files = []

for manuscript_folder in sorted(os.listdir('../manuscripts')):
    if manuscript_folder == '.DS_Store':
        continue  # skip .DS_Store files
    page_folder = os.path.join('../manuscripts', manuscript_folder, 'page')
    for filename in sorted(os.listdir(page_folder)):
        if filename == '.DS_Store':
            continue  # skip .DS_Store files
        page_file = os.path.join(page_folder, filename)
        page_files.append(page_file)
print(page_files[0:5])

['../manuscripts/Brussel,_KBR,_1805-1808/page/0001_KBR_1805-08_cover.xml', '../manuscripts/Brussel,_KBR,_1805-1808/page/0002_KBR_1805-08_blank.xml', '../manuscripts/Brussel,_KBR,_1805-1808/page/0003_KBR_1805-08_blank.xml', '../manuscripts/Brussel,_KBR,_1805-1808/page/0004_KBR_1805-08_blank.xml', '../manuscripts/Brussel,_KBR,_1805-1808/page/0005_KBR_1805-08_blank.xml']


In [227]:
# 

def extract_metadata(page_files):
    ns = {'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
    metadata = pd.DataFrame(columns=['codex_sig', 'transkribuspage', 'image_filename', 'image_width', 'image_height', 'region_type', 'coordinates', 'text', 'reading_order', 'status'])
    for page_file in tqdm(page_files):
        with open(page_file, 'rb') as f:
            
            # use regex to extract the text between the second and third slash
            manuscript = re.search(r'/(?P<text>[^/]+)', page_file[page_file.find('/') + 1 :]).group('text')
            manuscript = manuscript.replace('_', ' ')

            tree = etree.parse(f)

            image_filename = tree.xpath("//pc:Page/@imageFilename", namespaces=ns)
            
            image_width = tree.xpath("//pc:Page/@imageWidth", namespaces=ns)
            image_width = int(image_width[0]) if image_width else None
            
            image_height = tree.xpath("//pc:Page/@imageHeight", namespaces=ns)
            image_height = int(image_height[0]) if image_height else None

            page_nr = tree.xpath("//pc:TranskribusMetadata/@pageNr", namespaces=ns)
            page_nr = int(page_nr[0]) if page_nr else None
            status = tree.xpath("//pc:TranskribusMetadata/@status", namespaces=ns)
            
            for text_region in tree.xpath('//pc:TextRegion', namespaces=ns):
                region_type = text_region.get('type')
                custom = text_region.get('custom')
                reading_order = custom.split("index:")[1].split(";")[0].strip()
                
                for coords in text_region.xpath('./pc:Coords[1]/@points', namespaces=ns):
                    text_region_coords = coords

                full_text = []
                
                # iterate over each TextLine in the TextRegion, get the text contained in the last Unicode-element
                for text_line in text_region.xpath('.//pc:TextLine', namespaces=ns):
                    unicode_list = text_line.xpath('./pc:TextEquiv[last()]/pc:Unicode', namespaces=ns)
                    # sometimes, the Unicode-element is empty, and for that reason xpath can't find it
                    # therefore, we add a check to ensure that the list is not empty before trying to access its first element
                    if len(unicode_list) > 0:
                        final_unicode = unicode_list[0].text
                    else:
                        final_unicode = ' '
                    
                    # print the final Unicode element
                    full_text.append(final_unicode)
                    
                final_text = '\n'.join(filter(None, full_text)) #!!!
                
                row = {'codex_sig': manuscript, 'transkribuspage': page_nr, 'image_filename': image_filename, 'image_width': image_width, 'image_height': image_height, 'region_type': region_type, 'coordinates': text_region_coords, 'text': final_text, 'reading_order' : reading_order, 'status': status}
                metadata = pd.concat([metadata, pd.DataFrame(row)], ignore_index=True)

    return metadata

df = extract_metadata(page_files)
df['text'].replace('\n', ' ', regex=True) # remove \n in text
df.to_excel('manuscript_data.xlsx')


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6354/6354 [00:18<00:00, 350.48it/s]


In [228]:
df.head()

Unnamed: 0,codex_sig,transkribuspage,image_filename,image_width,image_height,region_type,coordinates,text,reading_order,status
0,"Brussel, KBR, 1805-1808",7,0007_KBR_1805-08_1r.tiff,5186,7123,header,"1405,478 3598,478 3598,721 1405,721",Een voorredene op sinte gregoriꝰ dyalogus,0,GT
1,"Brussel, KBR, 1805-1808",7,0007_KBR_1805-08_1r.tiff,5186,7123,paragraph,"1359,751 3792,751 3792,2645 1359,2645",DE zeere wise e saleghe gregoris paeus\nvan d...,1,GT
2,"Brussel, KBR, 1805-1808",8,0008_KBR_1805-08_1v.tiff,5186,7123,paragraph,"1736,658 3985,658 3985,1643 1736,1643",wiste dat si hare te xpūs gheloeue ghege-\nuen...,0,GT
3,"Brussel, KBR, 1805-1808",8,0008_KBR_1805-08_1v.tiff,5186,7123,marginalia,"2414,1705 4016,1705 4016,1851 2414,1851",⸫ Nēmeer en vant ics jnt latijn .,1,GT
4,"Brussel, KBR, 1805-1808",9,0009_KBR_1805-08_2r.tiff,5186,7123,paragraph,"859,5840 847,5678 835,5462 781,4838 751,4480 7...",Hier beghint een voʼredene\nop .Sʼ. gᵉgorius d...,0,GT


In [230]:
# now, we're going to merge the spreadsheet with other information, relating to the corpus (such as: scribe, production unit, etc.)
fn = '../data/codex_info.xlsx'
sheets = pd.read_excel(fn, sheet_name=None)
del sheets['OVERVIEW']

In [231]:
sheets.keys()

dict_keys(['KBR 394-98', 'KBR 1805-08', 'KBR 2485', 'KBR 2849-51', 'KBR 2877-78', 'KBR 2879-80', 'KBR 2905-09', 'KBR 2979', 'KBR 3091', 'KBR 3093-95', 'Ghent UB 941', 'Ghent UB 1374', 'BA 8224', 'MA 920', 'BAN O256', 'ÖNB 13708', 'ÖNB 65', 'ÖNB 12905', 'ÖNB 12857', 'Lewis E 199'])

In [232]:
H = pd.concat(sheets.values())
H.reset_index(inplace=True, drop=True)
H.drop(['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18'], inplace=True, axis=1)

In [233]:
H.head()

Unnamed: 0,codex_sig,transkribuspage,filename,folium,layout,content,language,scribe,production unit,local Herne product,date,translator,transcription,HTR-model used,CER?
0,"Brussel, KBR, 394-98",1.0,0001_KBR_394-98_cover.tiff,cover,corrected,none,none,none,none,no,1450-1500,,none,,
1,"Brussel, KBR, 394-98",2.0,0002_KBR_394-98_blank.tiff,blank,corrected,none,none,none,none,no,1450-1500,,none,,
2,"Brussel, KBR, 394-98",3.0,0003_KBR_394-98_blank.tiff,blank,corrected,none,none,none,none,none,none,,none,,
3,"Brussel, KBR, 394-98",4.0,0004_KBR_394-98_blank.tiff,blank,corrected,none,none,none,none,none,none,,none,,
4,"Brussel, KBR, 394-98",5.0,0005_KBR_394-98_1r.tiff,1r,corrected,calendar,none,A,I,yes,1373-1383,,none,,


In [234]:
# we merge the two dataframes
df_merged = pd.merge(df, H, on=['codex_sig', 'transkribuspage'], how='left')

# save merged dataframe
df_merged.to_excel('silentvoices_manuscript_data.xlsx')