In [531]:
from lxml import etree
from tqdm import tqdm
from collections import Counter
import pandas as pd

# Witness sigla; each one selects the input file xml_<sigle>.xml read in the loop below.
sigles = ['A', 'Ant', 'B', 'BR', 'C', 'D', 'D2', 'E', 'F', 'G', 'Ge', 'K', 'L', 'O', 'W', 'Y', 'Z']

# Create a dictionary to store word counts for each witness
# (keyed by sigle; it is filled in by the loop below).
word_counts_dict = {}

# Process the witnesses one by one; tqdm wraps the list to show loop progress.
for sigle in tqdm(sigles):
    # Relative path of this witness's XML file.
    xml_file = f"../collateconstraints/data/xml_martijn/xml_{sigle}.xml"

    def extract_text_within_expan(element):
        """Return the lower-cased text contained in ``element``, in document order.

        Every text fragment (an element's ``text`` or a descendant's ``tail``)
        is stripped of surrounding whitespace and lower-cased before the
        fragments are concatenated, so whitespace between fragments is dropped.

        Args:
            element: An ElementTree-style element; only ``text``, ``tail`` and
                iteration over direct children are used.

        Returns:
            str: The concatenated text found inside ``element``; an empty
            string when the element holds no text.
        """
        pieces = []

        def collect(node):
            # Own text first, then each child followed by that child's tail:
            # this is document order. Appending a node's tail before visiting
            # its children would scramble the text around nested elements.
            if node.text:
                pieces.append(node.text.strip().lower())
            for child in node:
                collect(child)
                if child.tail:
                    pieces.append(child.tail.strip().lower())

        # The tail of ``element`` itself is deliberately not collected: it is
        # the text that follows the closing tag, i.e. outside the element.
        collect(element)
        return ''.join(pieces).strip()

    # TEI namespace mapping; resolves the "tei:" prefix in the path below.
    NSMAP = {"tei": "http://www.tei-c.org/ns/1.0"}
    tree = etree.parse(xml_file)

    # One extracted string per <expan> element found anywhere in the document.
    abbreviations = [
        extract_text_within_expan(node)
        for node in tree.iterfind('.//tei:expan', namespaces=NSMAP)
    ]

    # Tally identical strings and file the tally under this witness's sigle.
    word_counts = Counter(abbreviations)
    word_counts_dict[sigle] = word_counts

# Build one small frame per witness holding its 10 most frequent strings,
# tagged with the witness sigle.
dfs = [
    pd.DataFrame(
        word_counts.most_common(10),
        columns=['Expanded Word', 'Count'],
    ).assign(Witness=sigle)
    for sigle, word_counts in word_counts_dict.items()
]

# Stack the per-witness frames under a single fresh 0..n-1 index.
result_df = pd.concat(dfs, ignore_index=True)

# Show the combined table as plain text.
print(result_df)

100%|██████████████████████████████████████████| 17/17 [00:00<00:00, 143.13it/s]

    Expanded Word  Count Witness
0            ende    289       A
1             hem     71       A
2            daer     68       A
3             van     66       A
4             den     59       A
5             men     30       A
6            omme     25       A
7            maer     25       A
8          sonder     24       A
9            want     21       A
10           ende      3     Ant
11            der      1     Ant
12         merten      1     Ant
13            dat      1     Ant
14        gelogen      1     Ant
15          neger      1     Ant
16        menegen      1     Ant
17     metterhant      1     Ant
18           lken      1     Ant
19           inne      1     Ant
20           ende    224       B
21            den     47       B
22            hem     46       B
23          minne     37       B
24         sonder     34       B
25            der     32       B
26            van     22       B
27         werelt     18       B
28           omme     18       B
29        




In [652]:
from lxml import etree
from tqdm import tqdm
from collections import Counter
import pandas as pd

# Witness sigla; each one selects the input file xml_<sigle>.xml read in the loop below.
sigles = ['A', 'Ant', 'B', 'BR', 'C', 'D', 'D2', 'E', 'F', 'G', 'Ge', 'K', 'L', 'O', 'W', 'Y', 'Z']

# Create a dictionary to store word counts for each witness
# (keyed by sigle; it is filled in by the loop below).
word_counts_dict = {}

# Process the witnesses one by one; tqdm wraps the list to show loop progress.
for sigle in tqdm(sigles):
    # Relative path of this witness's XML file.
    xml_file = f"../collateconstraints/data/xml_martijn/xml_{sigle}.xml"

    def extract_text_within_expan(element):
        """Render the content of ``element`` as a single display string.

        The subtree is walked in document order. Along the way:

        * a node whose ``ref`` attribute names a known glyph contributes the
          matching Unicode character;
        * text of a node with ``rend="superscript"`` is converted letter by
          letter to superscript characters (letters without a superscript
          form in the table are kept as they are);
        * all other text and tails are stripped and lower-cased.

        Only attribute values are inspected; the tag name of a node is not
        checked.

        Args:
            element: An ElementTree-style element; ``text``, ``tail``,
                ``attrib`` and iteration over direct children are used.

        Returns:
            str: The rendered content of ``element``; an empty string when
            there is nothing to render.
        """
        # Value of the ``ref`` attribute -> character standing in for the glyph.
        glyph_by_ref = {
            '#bar': '\u0304',      # combining macron
            '#apomod': '\u02bc',   # modifier letter apostrophe
            '#usmod': '\ua770',    # modifier letter us
            '#condes': '\ua76f',   # latin small letter con
            '#para': '\xb6',       # pilcrow sign
            '#etfin': '\ua76b',    # latin small letter et
            '#pbardes': '\ua751',  # p with stroke through descender
            '#pflour': '\ua753',   # p with flourish
        }
        superscript_dict = {'a':'ᵃ', 'b':'ᵇ', 'c':'ᶜ', 'd':'ᵈ', 'e':'ᵉ', 'f':'ᶠ',
                            'g':'ᵍ', 'h':'ʰ', 'i':'ᶦ', 'j':'ʲ', 'k':'ᵏ', 'l':'ˡ',
                            'm':'ᵐ', 'n':'ⁿ', 'o':'ᵒ', 'p':'ᵖ', 'r':'ʳ', 's':'ˢ',
                            't':'ᵗ', 'u':'ᵘ', 'v':'ᵛ', 'w':'ʷ', 'x':'ˣ', 'y': 'ʸ', 'z': 'ᶻ'}
        pieces = []

        def collect(node):
            glyph = glyph_by_ref.get(node.attrib.get('ref'))
            if glyph:
                pieces.append(glyph)

            if node.text:
                content = node.text.strip().lower()
                if node.attrib.get('rend') == 'superscript':
                    # Map per character and fall back to the plain letter, so
                    # multi-letter or unmapped superscripts are not lost.
                    content = ''.join(superscript_dict.get(ch, ch) for ch in content)
                pieces.append(content)

            # Children first, each followed by its own tail: document order.
            for child in node:
                collect(child)
                if child.tail:
                    pieces.append(child.tail.strip().lower())

        # The tail of ``element`` itself is deliberately not collected: it is
        # the text that follows the closing tag, i.e. outside the element.
        collect(element)
        return ''.join(pieces).strip()

    # TEI namespace mapping; resolves the "tei:" prefix in the path below.
    NSMAP = {"tei": "http://www.tei-c.org/ns/1.0"}
    tree = etree.parse(xml_file)

    # One rendered string per <abbr> element found anywhere in the document.
    abbreviations = [
        extract_text_within_expan(abbr_node)
        for abbr_node in tree.iterfind('.//tei:abbr', namespaces=NSMAP)
    ]

    # Dump the full list of rendered forms for this witness.
    print(abbreviations)

    # Tally identical strings and file the tally under this witness's sigle.
    word_counts = Counter(abbreviations)
    word_counts_dict[sigle] = word_counts

# Build one small frame per witness holding its 10 most frequent strings,
# tagged with the witness sigle.
# NOTE(review): in this cell the strings are collected from <abbr> elements,
# so the 'Expanded Word' column label may be misleading - confirm before reuse.
dfs = [
    pd.DataFrame(
        word_counts.most_common(10),
        columns=['Expanded Word', 'Count'],
    ).assign(Witness=sigle)
    for sigle, word_counts in word_counts_dict.items()
]

# Stack the per-witness frames under a single fresh 0..n-1 index.
result_df = pd.concat(dfs, ignore_index=True)

# Show the combined table as plain text.
print(result_df)

100%|██████████████████████████████████████████| 17/17 [00:00<00:00, 121.98it/s]

['mʼtin', 'wʼelt', 'lāghe', 'staē', 'cranckē', 'vᵒuwe', 'vʼeere', 'sondʼ', 'heʼen', 'dē', 'valschē', 'heʼen', 'conē', 'en̄', 'pluckē', 'vandē', 'en̄', 'dē', 'rechtē', 'bespottē', 'en̄', 'vʼscroue', 'ghedoghē', 'en̄', 'en̄', 'hē', 'indē', 'helschē', 'sondʼ', 'ouʼ', 'sprekē', 'ondʼ', 'wanē', 'quā', 'wilē', 'en̄', 'en̄', 'indē', 'en̄', 'ꝯpaen', 'hebbē', 'vʼstaē', 'dʼ', 'vʼclaerne', 'dinē', 'beuaē', 'vʼmochte', 'dē', 'hemelschē', 'traē', 'en̄', 'en̄', 'wʼelt', 'vʼheffene', 'bouē', 'vʼuwe', 'en̄', 'en̄', 'en̄', 'heʼen', 'en̄', 'gheuē', 'eenē', 'en̄', 'willē', 'wanē', 'ōme', 'en̄', 'dē', 'scalckē', 'radē', 'en̄', 'sceldē', 'dē', 'heʼe', 'wordē', 'clēmene', 'ōmaer', 'en̄', 'wʼelt', 'en̄', 'en̄', 'vʼbannē', 'en̄', 'wedʼ', 'bidʼ', 'sonnē', 'aldʼ', 'wʼelt', 'oghē', 'dʼ', 'ieghē', 'ghetoghē', 'hʼ', 'vʼdonckert', 'hē', 'en̄', 'vʼplecht', 'dē', 'droeghē', 'ouʼ', 'en̄', 'en̄', 'hē', 'en̄', 'dʼ', 'ōme', 'wāt', 'dʼ', 'wanē', 'comē', 'wordē', 'wāt


