In [90]:
## Precomposed characters

# from Wikipedia: 
# A precomposed character (alternatively composite character or decomposable character) is a Unicode entity 
# that can also be defined as a sequence of one or more other characters. 
# A precomposed character may typically represent a letter with a diacritical mark, such as é (Latin small letter e with acute accent).  

# e.g., 'Åström' can be represented as unicode as both:

# Åström (U+00C5 U+0073 U+0074 U+0072 U+00F6 U+006D) --> PREcomposed
# Åström (U+0041 U+030A U+0073 U+0074 U+0072 U+006F U+0308 U+006D) --> DEcomposed

# The script takes as input Page-xml files rendered by Transkribus (via output). 
# It reads in XML files from a specific directory and performs text cleaning on them. 
# Specifically, it searches for certain Decomposed Unicode characters in the text 
# and replaces them with other, Precomposed Unicode characters or a combination of characters based on the dictionary provided. 


In [91]:
import xml.etree.ElementTree as ET
import os

NSMAP = {'PcGts': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'} 

in_path = '../data/UB_Ghent,_1374_(Heber-Serrure_Codex)/page'
out_path = '../data/UB_Ghent,_1374_(Heber-Serrure_Codex)/newpage'

for filename in os.listdir(in_path):
    if not filename.endswith('.xml'): 
        continue
    fullname = os.path.join(in_path, filename)
    
    tree = ET.parse(fullname)
    root = tree.getroot()
    
    for item in root.findall('.//PcGts:Unicode', namespaces=NSMAP):
        text = item.text
        if text is None:
            continue
        text = text.split()
        for word in text:
            #if u'\u0304' in word:
                #print(filename, '-->', word)
            #if u'\u0305' in word:
             #   print(filename, '-->', word)              
            #if u'\u1DD1' in word:
            #    print(filename, '-->', word)
            #if 'r̄' in word:
            #    print(filename, '-->', word)
            #if 'e̅' in word:
                #print(filename, '-->', word)
           # if u'\u0065' + u'\u0113' in word:
                #print(filename, '-->', word)
             if u'\u0065' + u'\u0304' in word:
                print(filename, '-->', word)
                
                

In [98]:
import re

def multiple_replace(dict, text):
    # create regular expression from dictionary keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
    
    # For each match, look-up corresponding value in dictionary       
    return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)

dict = {
        "a" + u'\u0304' : u'\u0101', # a
        "a" + u'\u0305' : u'\u0101', # a
    
        "A" + u'\u0304' : u'\u0100', # A
        "A" + u'\u0305' : u'\u0100', # A
    
        "e" + u'\u0304' : u'\u0113', # e
        "e" + u'\u0305' : u'\u0113', # e
    
        "E" + u'\u0304' : u'\u0112', # E
        "E" + u'\u0305' : u'\u0112', # E
    
        "i" + u'\u0304' : u'\u012b', # i
        "i" + u'\u0305' : u'\u012b', # i
    
        "I" + u'\u0304' : u'\u012A', # I
        "I" + u'\u0305' : u'\u012A', # I
    
        "o" + u'\u0304' : u'\u014D', # o
        "o" + u'\u0305' : u'\u014D', # o
    
        "O" + u'\u0304' : u'\u014C', # O
        "O" + u'\u0305' : u'\u014C', # O
    
        "u" + u'\u0304' : u'\u016B', # u
        "u" + u'\u0305' : u'\u016B', # u
    
        "U" + u'\u0304' : u'\u016A', # U
        "U" + u'\u0305' : u'\u016A', # U
    
        "j" + u'\u0304' : u'\ue554', # j
        "j" + u'\u0305' : u'\ue554', # j
    
        "J" + u'\u0304' : u'\ue154', # J
        "J" + u'\u0305' : u'\ue154', # J
    
        "m" + u'\u0304' : u'\ue5b8', # m
        "m" + u'\u0305' : u'\ue5b8', # m

        "n" + u'\u0304' : u'\ue5DC', # n
        "n" + u'\u0305' : u'\ue5DC', # n
    
        "p" + u'\u0304' : u'\ue665', # p
        "p" + u'\u0305' : u'\ue665', # p
    
        "q" + u'\u0304' : u'\ue681', # q
        "q" + u'\u0305' : u'\ue681', # q
    
        "y" + u'\u0304' : u'\u0233', # y
        "y" + u'\u0305' : u'\u0233', # y
    
        "Y" + u'\u0304' : u'\u0232', # Y
        "Y" + u'\u0305' : u'\u0232', # Y
        
        "Noᷓ" : "Noᵃ",
        "noᷓ" : "noᵃ",
    
        "ᵘ" : "ᵃ",
    
        "ᶦ" : "ⁱ", # small capital superscript i to regular superscript i
    
        "@" : "¶",
    
        "'" : "ʼ",
    
        "b" + u'\u0305' : "b" + u'\u0304', # b
    
        "r" + u'\u0305' : "r" + u'\u0304', # r
    
        "c" + u'\u0305' : "c" + u'\u0304', # c
    
        "t²" : "tᷣ"
    
        } 


In [99]:
in_path = '../data/UB_Ghent,_1374_(Heber-Serrure_Codex)/page'
out_path = '../data/UB_Ghent,_1374_(Heber-Serrure_Codex)/newpage'

for filename in os.listdir(in_path):
    if not filename.endswith('.xml'): 
        continue
    fullname = os.path.join(in_path, filename)
    with open(fullname) as text:
        new_text = multiple_replace(dict, text.read())

    with open(f'{out_path}/{filename}', "w") as result:
        result.write(new_text)

In [101]:
chars = []
for filename in sorted(os.listdir(out_path)):
    if not filename.endswith('.xml'): 
        continue
    fullname = os.path.join(out_path, filename)
    
    tree = ET.parse(fullname)
    root = tree.getroot()
    
    for item in root.findall('.//PcGts:Unicode', namespaces=NSMAP):
        text = item.text
        #print(text)

        if text is None:
            continue
        for char in text:
            chars.append(char)
        text = text.split()
        for word in text:
            if u'\u0304' in word:
                print('File -->', filename, '-->', word)
            if u'\u0305' in word:
                print('File -->', filename, '-->', word)
            if u'\u1DD1' in word:
                print('File -->', filename, '-->', word)
            #if 'r̄' in word:
            #    print(filename, '-->', word)
            #if 'ē' in word:
             #   print(filename, '-->', word)
            if '᷑' in word:
                print('File -->', filename, '-->', word)
                # ur!
            if '"' in word:
                print('File -->', filename, '-->', word)
            if "'" in word:
                print('File -->', filename, '-->', word)
            if "ᵘ" in word:
                print('File -->', filename, '-->', word)
            if '̄' in word:
                print('File -->', filename, '-->', word)
                # '̄' --> only r , s and c can show up!
            if  "§" in word:
                print('File -->', filename, '-->', word)
            if  "," in word:
                print('File -->', filename, '-->', word)
            if  "#" in word:
                print('File -->', filename, '-->', word)
            if  "¬" in word:
                print('File -->', filename, '-->', word)
            if  "+" in word:
                print('File -->', filename, '-->', word)
            if  "/" in word:
                print('File -->', filename, '-->', word)
            if  "²" in word:
                print('File -->', filename, '-->', word)
            if  "ᶻ" in word:
                print('File -->', filename, '-->', word)
            if  "@" in word:
                print('File -->', filename, '-->', word)
            if  "[" in word:
                print('File -->', filename, '-->', word)
            if  "" in word:
                print('File -->', filename, '-->', word)
                #propp (ligatuur)
            if  'ᷓ' in word:
                print('File -->', filename, '-->', word)
                # golfje in plaats van superscripte a
            if  '!' in word:
                print('File -->', filename, '-->', word)
                
               # t²
            # A76D LATIN SMALL LETTER IS
                
unique_chars = sorted(set(chars))
print(unique_chars)

File --> 114v.xml --> dic̄
File --> 114v.xml --> dic̄
File --> 114v.xml --> dic̄
File --> 114v.xml --> dic̄
File --> 121r.xml --> gr̄en
File --> 121r.xml --> gr̄en
File --> 121r.xml --> gr̄en
File --> 121r.xml --> gr̄en
File --> 124r.xml --> gr̄e
File --> 124r.xml --> gr̄e
File --> 124r.xml --> gr̄e
File --> 124r.xml --> gr̄e
File --> 132v.xml --> :+
File --> 132v.xml --> +
File --> 132v.xml --> +
File --> 132v.xml --> :+
File --> 132v.xml --> +
File --> 132v.xml --> +
File --> 1r.xml --> That̅
File --> 1r.xml --> That̅
File --> 36v.xml --> stoer̄
File --> 36v.xml --> stoer̄
File --> 36v.xml --> stoer̄
File --> 36v.xml --> stoer̄
File --> 40v.xml --> dꝫ̅
File --> 40v.xml --> dꝫ̅
File --> 41r.xml --> gr̄e
File --> 41r.xml --> gr̄e
File --> 41r.xml --> gr̄e
File --> 41r.xml --> gr̄e
File --> 43r.xml --> scⁱf²e
File --> 43r.xml --> scⁱf²e
File --> 44r.xml --> qr̄r
File --> 44r.xml --> qr̄r
File --> 44r.xml --> qr̄i
File --> 44r.xml --> qr̄i
File --> 44r.xml --> qr̄r
File --> 44r.xml --> q