In [10]:
!pip install --upgrade collatex

Requirement already up-to-date: collatex in c:\users\sofie\anaconda3\lib\site-packages (2.2)


In [45]:
from glob import glob
import os
from itertools import combinations

from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt

In [52]:
# Derive one sigle per transcription file by dropping the 'xml_' and '.xml'
# parts of the file name, and keep the sigles in sorted order.
sigles = sorted(
    os.path.basename(xml_path).replace('xml_', '').replace('.xml', '')
    for xml_path in glob('../data/xml_martijn/*.xml')
)
sigles

['A',
 'Ant',
 'B',
 'BR',
 'C',
 'D',
 'D2',
 'E',
 'F',
 'G',
 'Ge',
 'K',
 'L',
 'O',
 'W',
 'Y',
 'Z']

In [53]:
from lxml import etree
from re import sub #re — Regular expression operations #
import xml.etree.ElementTree as ET
import string


# Namespace of the TEI elements, keyed by the prefix used in this notebook.
NSMAP = {
    'MVN': 'http://www.tei-c.org/ns/1.0',
}

# Elements stripped from the tree when extract_lines runs with expan=True.
removes = (
    'teiHeader',
    'fw',
    'supplied',
    'abbr',
)

# Elements stripped from the tree when extract_lines runs with expan=False.
removes_expan_false = (
    'teiHeader',
    'fw',
    'supplied',
    'ex',
    'expan',
)

# Letter preceding a <g ref="#bar"/> -> the character that replaces it.
# U+00F1 is the lowercase n-tilde and U+00D1 the uppercase one; the original
# table had those two swapped.
_BAR_REPLACEMENTS = {
    'a': '\u0101', 'A': '\u0100',
    'e': '\u0113', 'E': '\u0112',
    'n': '\u00F1', 'N': '\u00D1',
    'o': '\u014D', 'O': '\u014C',
    'u': '\u016B', 'U': '\u016A',
    'i': '\u012B', 'I': '\u012A',
    'j': '\u025F', 'J': '\u0248',
    'm': '\u1E3F', 'M': '\u1E3E',
    'Y': '\u0232', 'y': '\u0233',
    'h': '\uE517',
    'p': '\u1E55',
}

# Value of the `ref` attribute of a <g> element -> character appended to the text.
# NOTE(review): the original if/elif chain tested '#pbardes' twice; the second
# branch (emitting U+A755) could never run. Its intended ref name is not
# visible here, so it is left out -- add it under the correct key once confirmed.
_GLYPHS = {
    '#apomod': '\u02bc',
    '#usmod': '\ua770',
    '#condes': '\ua76f',
    '#para': '\xb6',
    '#etfin': '\ua76b',
    '#pbardes': '\ua751',
    '#pflour': '\ua753',
}

# Single letters inside <hi rend="superscript"> -> superscript character.
_SUPERSCRIPTS = {
    'a': 'ᵃ', 'b': 'ᵇ', 'c': 'ᶜ', 'd': 'ᵈ', 'e': 'ᵉ', 'f': 'ᶠ',
    'g': 'ᵍ', 'h': 'ʰ', 'i': 'ᶦ', 'j': 'ʲ', 'k': 'ᵏ', 'l': 'ˡ',
    'm': 'ᵐ', 'n': 'ⁿ', 'o': 'ᵒ', 'p': 'ᵖ', 'r': 'ʳ', 's': 'ˢ',
    't': 'ᵗ', 'u': 'ᵘ', 'v': 'ᵛ', 'w': 'ʷ', 'x': 'ˣ', 'y': 'ʸ', 'z': 'ᶻ',
}


def extract_lines(xml_file, expan = True, punct = True, lower = True):
    """Extract the text of every numbered line from a TEI XML transcription.

    Parameters
    ----------
    xml_file : path or file object accepted by ``etree.parse``
        The transcription to read.
    expan : bool, default True
        Selects which elements are stripped before the text is read:
        ``removes`` when true, ``removes_expan_false`` when false.
    punct : bool, default True
        When false, every character of ``string.punctuation`` is removed
        from the extracted text.
    lower : bool, default True
        When true, the extracted text is lowercased.

    Returns
    -------
    dict
        Maps ``"<n of the enclosing text>-<n of the l element>"`` to the line
        text. ``<l>`` elements containing a ``<gap>`` are skipped, and entries
        whose text is empty or whitespace-only are dropped.

    Raises
    ------
    ValueError
        If an ``<l n=...>`` is met before any ``<text n=...>`` (the original
        code failed with an UnboundLocalError in that situation).
    """
    ns = "{" + NSMAP["MVN"] + "}"
    tree = etree.parse(xml_file)

    # Drop the unwanted elements; with_tail=False keeps the text that follows them.
    unwanted = removes if expan else removes_expan_false
    etree.strip_elements(tree, *[ns + tag for tag in unwanted], with_tail=False)

    # Remove every line that contains a gap. Iterate over a snapshot so the
    # tree is not modified while it is being traversed.
    for line in list(tree.iterfind('.//' + ns + 'l')):
        if line.find('.//' + ns + 'gap') is not None:
            line.getparent().remove(line)

    strip_punct = str.maketrans('', '', string.punctuation)

    lines = {}
    text = ""     # text collected since the last committed line
    k = ''        # key of the line currently being collected
    title = None  # value of the `n` attribute of the enclosing <text>

    for action, node in etree.iterwalk(tree, events=("start", "end")):
        # Work with the bare tag name, without the namespace.
        tag_only = node.tag.replace(ns, "")

        if 'n' in node.attrib and tag_only == 'text':
            title = node.attrib['n']

        if 'n' in node.attrib and tag_only == "l":
            if title is None:
                raise ValueError(
                    "found an <l> element before any <text> element with an 'n' attribute"
                )
            k = title + '-' + node.attrib['n']

        if action == 'start' and tag_only in ('text', 'lb'):
            # Nothing to collect when these elements open.
            continue

        elif tag_only in ("group", "text", "MVN", "body", "cb", "p"):
            # Pure containers: neither their text nor their tail is collected.
            continue

        elif action == 'start':
            if tag_only == 'g':
                # A <g> without a ref attribute carries no glyph information
                # and is skipped (the original raised a KeyError here).
                ref = node.attrib.get('ref')
                if ref == '#bar':
                    # Replace the letter written just before the bar, if any.
                    if text and text[-1] in _BAR_REPLACEMENTS:
                        text = text[:-1] + _BAR_REPLACEMENTS[text[-1]]
                elif ref is not None:
                    # Known glyphs are encoded; unknown refs are kept verbatim
                    # so that they remain visible in the output.
                    text += _GLYPHS.get(ref, ref)

            if tag_only == 'hi' and node.attrib.get('rend') == 'superscript':
                # NOTE(review): superscript content that is not a single
                # mapped letter is dropped -- confirm that this is intended.
                if node.text in _SUPERSCRIPTS:
                    text += _SUPERSCRIPTS[node.text]

            elif tag_only == 'pc':
                # An empty <pc/> contributes nothing (the original appended
                # the literal string 'None').
                text += (node.text or '').strip()

            elif tag_only == 'num':
                # Numerals are wrapped in dots.
                if node.text:
                    text += ('.' + node.text + '.').strip()

            elif node.text:
                text += node.text

        elif action == 'end':
            # The tail is the text between this element's end tag and the next
            # tag; it is appended once the element's children are processed.
            # A tail consisting of a single tab is ignored.
            if node.tail and node.tail != "\t":
                text += node.tail

        if tag_only == 'lb':
            # The end of an <lb> commits the collected text under the current key.
            if k:
                text = sub(r'\n', '', text)
                if not punct:
                    text = text.translate(strip_punct)
                if lower:
                    text = text.lower()

                lines[k] = text
                text = ''

    # Drop the keys that ended up without any text.
    return {key: value for key, value in lines.items() if value.strip()}

# Try the extraction on the first witness with expan=False (the elements in
# removes_expan_false are stripped), punctuation removed and text lowercased.
d = extract_lines(
    f'../data/xml_martijn/xml_{sigles[0]}.xml',
    expan=False,
    punct=False,
    lower=True,
)

In [54]:
# Extract every witness with expan=True, punctuation removed and text
# lowercased; mss maps each sigle to the dict returned by extract_lines.
mss = {}
for sigle in tqdm(sigles):
    mss[sigle] = extract_lines(
        f'../data/xml_martijn/xml_{sigle}.xml',
        expan=True,
        punct=False,
        lower=True,
    )

100%|███████████████████████████████████████████| 17/17 [00:00<00:00, 42.79it/s]


In [55]:
# shape of the result: {'eerste_martijn_001': [...], 'eerste_martijn_002': [...]}

def transform_input(sigles, mss):
    """Regroup per-witness lines into one witness list per line.

    sigles -- the witness identifiers to include, in the order in which they
              should appear in each list.
    mss    -- maps each sigle to a dict of {line name: line text}.

    Returns a dict mapping every line name to a list of
    {"id": sigle, "content": line text} dicts, one per witness that has
    that line.
    """
    witnesses_by_line = {}
    for sigle in sigles:
        for line_name, content in mss[sigle].items():
            witness = {"id": sigle, "content": content}
            witnesses_by_line.setdefault(line_name, []).append(witness)
    return witnesses_by_line

# Regroup the extracted lines so that each line name maps to its list of witnesses.
# NOTE(review): this rebinds the builtin name `input` at notebook scope; the
# name is kept because the send_to_collatex(input) call further down reads it.
input = transform_input(sigles, mss)

# send to collatex

In [56]:
#useful website: http://interedition.github.io/collatex/  .html
import json
from collatex import *
import re

In [57]:
def send_to_collatex(input, verbose=True):
    """Collate every line across its witnesses with CollateX.

    Parameters
    ----------
    input : dict
        Maps a line name to a list of witness dicts of the form
        ``{"id": ..., "content": ...}``. (The name shadows the builtin but is
        kept because it is part of the function's interface.)
    verbose : bool, default True
        When true, each line is additionally collated in horizontal layout
        and the resulting table is printed, as the function always did
        before. Pass False to skip that second collation and its output.

    Returns
    -------
    dict
        Maps every line name to the string form of its alignment table in
        vertical layout.
    """
    # near_match=True is combined with segmentation=False, as in the original call.
    options = dict(near_match=True, segmentation=False, output='table')

    output = {}
    for line_name, witnesses in input.items():

        def witness_data():
            # Hand collate() a fresh JSON-style dict on every call. The
            # original built a JSON string from the list's repr and swapped
            # the quotes with a regex, which produced invalid JSON as soon as
            # a witness text contained an apostrophe, a double quote or a
            # backslash; json.loads() yielded exactly this structure.
            return {"witnesses": [dict(witness) for witness in witnesses]}

        output[line_name] = str(collate(witness_data(), layout="vertical", **options))
        if verbose:
            print(collate(witness_data(), layout="horizontal", **options))
    return output

# Collate all lines (send_to_collatex also prints one horizontal table per
# line), then show the stored vertical-layout table of a single line.
test = send_to_collatex(input)
print(test["Tweede Martijn-003"])


+---+----------+---------+-----+-------+------+
| A | vvaphene | martin  | hoe | salt  | gaen |
| B | waphene  | martin  | hoe | salt  | gaen |
| C | uuapene  | martijn | hoe | zalt  | gaen |
| D | wapen    | martijn | hoe | salt  | gaen |
| F | wapene   | merten  | hoe | saelt | gaen |
| G | wapen    | merten  | hoe | saelt | gaen |
| L | waphene  | merten  | hoe | saelt | gaen |
| O | waphene  | martin  | hoe | salt  | gaen |
| Y | waphene  | martijn | hoe | saelt | gaen |
+---+----------+---------+-----+-------+------+
+---+-----+------+---------+------+--------+-------+
| A | sal | die  | weerelt | hiet | langhe | staen |
| B | sal | de   | werelt  | yet  | lang   | staen |
| C | zal | dese | werelt  | iet  | langhe | staen |
| D | sal | dese | werelt  | -    | lange  | staen |
| F | sal | dese | werelt  | yet  | lange  | staen |
| G | sal | dese | werelt  | -    | lange  | staen |
| L | sal | dese | werelt  | -    | lange  | staen |
| O | sal | dese | weerelt | yet  | langhe | sta