In [10]:
pip install python-Levenshtein

Note: you may need to restart the kernel to use updated packages.


In [14]:
!pip install grapheme
!pip install pandas
!pip install seaborn
!pip install scipy
#!pip install --upgrade numpy
!pip install numpy==1.26.2



In [15]:
from glob import glob 
import os 
from itertools import combinations 
import grapheme 
from collatex import * 
from tqdm import tqdm 
import numpy as np 
import pandas as pd
import seaborn as sb 
import matplotlib.pyplot as plt 

from lxml import etree
from re import sub 
import xml.etree.ElementTree as ET
import string

In [16]:
# Derive the witness sigla from the XML file names (xml_<siglum>.xml), sorted.
sigles = sorted(
    os.path.basename(xml_path).replace('xml_', '').replace('.xml', '')
    for xml_path in glob('../data/xml/*.xml')
)
sigles

['A',
 'B2',
 'Br',
 'C',
 'D',
 'G',
 'H',
 'L',
 'M',
 'Me',
 'P',
 'R',
 'b',
 'd1',
 'd2',
 'd3',
 'd4',
 'd5',
 'd6']

In [17]:
def get_gap_lines(tree):
    """Return the keys of all numbered <l> lines that contain a <gap> element.

    Keys are built the same way extract_lines() builds its base keys:
    ``<n of the enclosing <text>>_<n of the <l> without its first
    '_'-separated part>`` (or the full ``n`` when it has no underscore).
    Only <text> elements carrying an ``n`` attribute are inspected.
    The counter suffix that extract_lines() appends to repeated keys is
    not reproduced here.

    Parameters
    ----------
    tree : parsed XML tree (or element) offering ``iterfind``.

    Returns
    -------
    list of str
        One key per matching line, in document order.
    """
    ns = "{" + NSMAP["MVN"] + "}"
    gap_lines = []
    for text in tree.iterfind('.//' + ns + 'text'):
        if 'n' not in text.attrib:
            continue
        # The key prefix is the title of the enclosing <text>; the original
        # code referenced `title` without ever assigning it in this function.
        title = text.attrib['n']
        for line in text.iterfind('.//' + ns + 'l'):
            if 'n' not in line.attrib:
                continue
            if line.find('.//' + ns + 'gap') is None:
                continue
            n_value = line.attrib['n']
            parts = n_value.split('_')
            if len(parts) > 1:
                # Drop the first part of n and keep the rest.
                k = title + '_' + "_".join(parts[1:])
            else:
                k = title + '_' + n_value
            gap_lines.append(k)

    return gap_lines

In [18]:
# TEI namespace used to qualify every tag name.
NSMAP = {'MVN': 'http://www.tei-c.org/ns/1.0'}

# Elements stripped from the tree before extraction: one tag set for
# expan=True and one for expan=False (see extract_lines).
removes = ('teiHeader', 'fw', 'supplied', 'abbr')
removes_expan_false = ('teiHeader', 'fw', 'supplied', 'ex', 'expan')

# The 26 lowercase ASCII letters.
chars = set('abcdefghijklmnopqrstuvwxyz')

In [23]:
import re

# Characters substituted for <g ref="..."> glyph references. '#bar' is not
# listed because its rendering depends on the sep_abbr flag of extract_lines().
# NOTE(review): the original if-chain had a second, unreachable '#pbardes'
# branch mapping to U+A755 (ꝕ); presumably it was meant for a different ref
# name -- TODO confirm and add that ref here.
_GLYPH_CHARS = {
    '#apomod': '\u02bc',   # ʼ
    '#usmod': '\ua770',    # ꝰ
    '#condes': '\ua76f',   # ꝯ
    '#para': '\xb6',       # ¶
    '#etfin': '\ua76b',    # ꝫ
    '#pbardes': '\ua751',  # ꝑ
    '#pflour': '\ua753',   # ꝓ
    '#rrot': '\ua75b',     # ꝛ
}

# Replacement for the text of <hi rend="superscript"> when that text is exactly
# one of these letters (there is no entry for 'q').
_SUPERSCRIPTS = {'a': 'ᵃ', 'b': 'ᵇ', 'c': 'ᶜ', 'd': 'ᵈ', 'e': 'ᵉ', 'f': 'ᶠ',
                 'g': 'ᵍ', 'h': 'ʰ', 'i': 'ᶦ', 'j': 'ʲ', 'k': 'ᵏ', 'l': 'ˡ',
                 'm': 'ᵐ', 'n': 'ⁿ', 'o': 'ᵒ', 'p': 'ᵖ', 'r': 'ʳ', 's': 'ˢ',
                 't': 'ᵗ', 'u': 'ᵘ', 'v': 'ᵛ', 'w': 'ʷ', 'x': 'ˣ', 'y': 'ʸ', 'z': 'ᶻ'}


def _clean_line(text, punct, lower, collapse_ws=False):
    """Drop newlines; optionally remove punctuation (incl. ¶ and ⸫) and lowercase."""
    text = re.sub(r'\n', '', text)
    if not punct:
        text = text.translate(str.maketrans('', '', string.punctuation + '¶' + '⸫'))
        if collapse_ws:
            text = re.sub(r'\s+', ' ', text).strip()
    if lower:
        text = text.lower()
    return text


def extract_lines(xml_file, expan = True,
                  punct = True, lower = True,
                  sep_abbr = True):
    """Extract the text of every line of a TEI file into a dict keyed by line id.

    The tree is walked event by event; text and tails are concatenated and the
    accumulated string is stored each time the end of an <lb> is reached. The
    key is ``<n of the current <text>>_<n of the current <l> minus its first
    '_'-separated part>``. When the same key is stored again, the later
    occurrences get ``_1``, ``_2``, ... appended.

    Parameters
    ----------
    xml_file : path or file object handed to ``etree.parse``.
    expan : bool
        True strips the elements listed in ``removes``; False strips those in
        ``removes_expan_false``.
    punct : bool
        False removes ``string.punctuation`` plus '¶' and '⸫' from each line.
    lower : bool
        True lowercases each line.
    sep_abbr : bool
        True renders the '#bar' glyph as '_' (U+005F); False renders it as the
        combining macron U+0304.

    Returns
    -------
    dict mapping key -> line text.

    Side effects
    ------------
    Prints the set of keys that occurred more than once and the number of
    extracted lines.
    """
    tree = etree.parse(xml_file)

    stripped_tags = removes if expan else removes_expan_false
    etree.strip_elements(tree,
                         *["{" + NSMAP["MVN"] + "}" + s for s in stripped_tags],
                         with_tail=False)

    glyphs = dict(_GLYPH_CHARS)
    glyphs['#bar'] = '\u005f' if sep_abbr else '\u0304'

    lines = {}
    key_count = {}
    duplicate_keys = set()

    def store(key, value):
        # Some stanzas appear twice: repeated keys get a running counter suffix.
        if key in key_count:
            key_count[key] += 1
            duplicate_keys.add(key)
        else:
            key_count[key] = 0
        unique_key = f"{key}_{key_count[key]}" if key_count[key] > 0 else key
        lines[unique_key] = value

    text = ''
    k = ''
    title = ''  # stays empty if an <l n> is met before any <text n>

    for action, node in etree.iterwalk(tree, events=("start", "end")):
        tag_only = node.tag.replace("{http://www.tei-c.org/ns/1.0}", "")  # remove ns for easier access

        if 'n' in node.attrib and tag_only == 'text':
            title = node.attrib['n']

        if 'n' in node.attrib and tag_only == 'l':
            n_value = node.attrib['n']
            parts = n_value.split('_')
            if len(parts) > 1:
                k = title + '_' + "_".join(parts[1:])
            else:
                k = title + '_' + n_value

        # Nothing to collect when these elements open.
        if action == 'start' and tag_only in ('text', 'lg', 'lb'):
            continue
        # Structural containers: neither their own text nor their tail is
        # collected (their children are still visited).
        if tag_only in ("group", "text", "MVN", "body", "cb", "p", "note"):
            continue

        if action == 'start':
            if tag_only == 'g':
                # Known glyph refs become a single character; any other ref
                # (or a missing one) is appended verbatim.
                ref = node.attrib.get('ref', '')
                text += glyphs.get(ref, ref)

            if tag_only == 'hi' and node.attrib.get('rend') == 'superscript':
                # Only single letters present in the table are kept.
                if node.text in _SUPERSCRIPTS:
                    text += _SUPERSCRIPTS[node.text]

            elif tag_only == 'ex':
                # Expansions are wrapped in '*' ... '€' markers.
                if node.text:
                    text += '*' + node.text + '€'

            elif tag_only == 'del':
                if node.text:
                    text += f"<del>{node.text}</del>"

            elif tag_only == 'pc':
                # punctuation marks; an empty <pc/> contributes nothing
                if node.text:
                    text += node.text.strip()

            elif tag_only == 'num':
                # roman numerals are wrapped in dots
                if node.text:
                    text += ('.' + node.text + '.').strip()

            elif tag_only == 'damage':
                text += '[...]'

            elif node.text:
                text += node.text

        else:
            # 'end' event: attach the tail AFTER the child nodes were processed.
            if node.tail and node.tail != "\t":
                text += node.tail

        if tag_only in ('l', 'lg') and k:
            text = re.sub(r'\n', '', text)

        # Only the 'end' event of <lb> gets here: it closes the current line.
        if tag_only == 'lb' and k:
            store(k, _clean_line(text, punct, lower))
            text = ''

    # Catch a dangling last line, but only if real content is left over.
    # Storing leftover whitespace unconditionally would overwrite the line
    # that was stored last under the same key.
    dangling = _clean_line(text, punct, lower, collapse_ws=True)
    if dangling.strip():
        store(k, dangling)

    print("Keys that occur more than once:", duplicate_keys)
    num_orig_lines = len(lines)
    print(num_orig_lines)

    return lines
# Try the extraction on a single witness (index 3 of the sorted sigla) and
# print the resulting dict.
d = extract_lines(
    f'../data/xml/xml_{sigles[3]}.xml',
    expan=True,
    punct=True,
    lower=True,
)
print(d)

Keys that occur more than once: {'disticha_IV,48_0004', 'disticha_IV,48_0001', 'disticha_IV,48_0003', 'disticha_IV,48_0002'}
298
{'prologue_prologue_0045': 'dit seide catoen en*de€ sp*ra€c ald*us€', 'prologue_prologue_0046': 'si deus est animus', 'disticha_I,01_0001': 'nu merct sone wat ic ghebiede', 'disticha_I,01_0002': 'en*de€ wat es dat ic di bediede', 'disticha_I,01_0003': 'dats dattu di d*aer€ ane salt ke*er€en', 'disticha_I,01_0004': 'dattu gode salt e*m€mer eeren', 'disticha_I,02_0001': 'du salt des maerghins vrouch vp stae*n€', 'disticha_I,02_0002': 'en*de€ om dine bederue gaen', 'disticha_I,02_0003': 'me*n€ seghet die te langhe slaept', 'disticha_I,02_0004': 'dat he*m€ die slaep onduchtich maect', 'disticha_I,03_0001': '¶ wachti dat me*n€ niet en seghet', 'disticha_I,03_0002': 'dat vele tale*n€ andi leghet', 'disticha_I,03_0003': 'me*n€ seghet menegherande tale', 'disticha_I,03_0004': 'ne sittet niemene goed*er€s wale', 'disticha_I,04_0001': 'sone doe alse die vroede doet', '

In [24]:
# Extract the lines of every witness; mss maps siglum -> {line key: text}.
mss = dict()
for sigle in tqdm(sigles):
    mss[sigle] = extract_lines(
        f'../data/xml/xml_{sigle}.xml',
        expan=True,
        punct=True,
        lower=True,
        sep_abbr=False,
    )

100%|████████████████████████████████████████████| 19/19 [00:00<00:00, 194.62it/s]

Keys that occur more than once: set()
393
Keys that occur more than once: set()
263
Keys that occur more than once: set()
34
Keys that occur more than once: {'disticha_IV,48_0004', 'disticha_IV,48_0001', 'disticha_IV,48_0003', 'disticha_IV,48_0002'}
298
Keys that occur more than once: set()
287
Keys that occur more than once: set()
206
Keys that occur more than once: set()
261
Keys that occur more than once: set()
72
Keys that occur more than once: {'disticha_I,11_0001', 'disticha_I,11_0002', 'disticha_II,14_0004', 'disticha_II,14_0003'}
233
Keys that occur more than once: set()
80
Keys that occur more than once: {'disticha_II,08_0002', 'disticha_II,08_0003', 'disticha_III,20_0001', 'disticha_III,21_0004', 'disticha_III,21_0003', 'disticha_III,20_0003', 'disticha_II,08_0001', 'disticha_III,20_0004', 'disticha_III,20_0002', 'disticha_III,21_0001', 'disticha_III,21_0002', 'disticha_II,08_0004'}
188
Keys that occur more than once: set()
72
Keys that occur more than once: set()
108
Keys th




In [25]:
# Display the sigla for which lines were extracted into `mss`.
mss.keys()

dict_keys(['A', 'B2', 'Br', 'C', 'D', 'G', 'H', 'L', 'M', 'Me', 'P', 'R', 'b', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6'])

In [26]:
import re
# Collect the union of all line keys over every witness.
idx = set()
for ms in mss:
    idx |= set(mss[ms])

def repl(idx):
    """Turn a line key into a sortable, hyphen-separated row label.

    The key is split on '_'. A first part of 'prologue', 'disticha' or
    'epilogue' is prefixed with '1 ', '2 ' or '3 ' so that a plain string
    sort puts the sections in that order. Keys with four or more parts keep
    the first three parts plus the last one (any parts in between are
    dropped); shorter keys keep all their parts.

    Parameters
    ----------
    idx : str
        Line key such as 'disticha_I,01_0001'.

    Returns
    -------
    str
        The label, e.g. '2 disticha-I,01-0001'. A string is returned for
        every input, including keys with fewer than three parts (which the
        previous version returned as a list).
    """
    section_rank = {
        'prologue': '1 prologue',
        'disticha': '2 disticha',
        'epilogue': '3 epilogue',
    }
    parts = idx.split('_')
    parts[0] = section_rank.get(parts[0], parts[0])
    if len(parts) >= 4:
        # first, second and third part plus the last one
        parts = parts[:3] + parts[-1:]
    return '-'.join(parts)

# Row labels (one per line key, rewritten by repl) and column labels
# (one per witness), both sorted once.
idx = sorted(repl(i) for i in idx)
witnesses = sorted(mss.keys())

# Position lookups, so that filling the table does not need a linear
# list.index() search per cell. setdefault keeps the FIRST position of a
# label, which is what list.index() returned.
row_of = {}
for pos, label in enumerate(idx):
    row_of.setdefault(label, pos)
col_of = {w: pos for pos, w in enumerate(witnesses)}

# Cells of lines that a witness does not have stay None.
cells = np.empty([len(idx), len(witnesses)], dtype="object")

for ms in mss.keys():
    for l in mss[ms]:
        # The '*' ... '€' markers around expansions become <i> ... </i>.
        cells[row_of[repl(l)], col_of[ms]] = mss[ms][l].replace('*', '<i>').replace('€', '</i>')

lines = pd.DataFrame(cells, index=idx, columns=witnesses)
# escape=False so that the inserted HTML tags are rendered rather than shown.
lines.to_html('../data/xlsx/synoptic.html', escape=False)