In [3]:
from lxml import etree
import re
import csv
import pandas as pd

In [4]:
# Location of the TEI XML transcription to analyse.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another computer.
path = '/Users/deborahleem/Documents/1_PhD/XMLSloane2021/output_person_place_misc_cleaned_up_25042021.xml'
# Hand the path to lxml instead of an open() handle: lxml then opens and closes
# the file itself (no leaked file object) and reads the raw bytes, so the
# encoding is taken from the XML declaration rather than the locale default.
doc = etree.parse(path)
root = doc.getroot()

In [5]:
# Catalogue entries to leave out of the analysis.
# Keys are page ids (the xml:id of a <pb>, as returned by find_pb_xml_id);
# values are the cleaned catalogue numbers on that page that must be skipped
# (see check_if_excluded, which tests membership in these lists).
# The trailing comment on each line presumably names the catalogue the page
# belongs to -- verify against the source volume.
# NOTE(review): "1" is listed twice for EPF108491179v and EPF108491184v; this is
# harmless for membership tests, possibly the numbering restarts on those pages.
excluded = {
  "EPF108491027v": ["2108","2109","2110","2111"], # "Miscellanies"
  "EPF108491177v": ["1","2"], # "Antiquities"
  "EPF108491178v": ["1","2","3","4"], # "Antiquities"
  "EPF108491179v": ["4","5","6","7","8","1"], # "Antiquities"
  "EPF108491180v": ["2","3"], # "Antiquities"
  "EPF108491181v": ["4","5"], # "Antiquities"
  "EPF108491182v": ["1","2","3","4","5"], # "Antiquities"
  "EPF108491183v": ["6","7","8","9","10","11","12"], # "Antiquities"
  "EPF108491184v": ["13","14","1","2","3","4","5","1"], # "Antiquities"
  "EPF108491186v": ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19"], # "Antiquities"
  "EPF108491187v": ["20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39"], # "Antiquities"
  "EPF108491188v": ["40","41","42","43","44","45","46","47","48","1","2","3","4","5","6","7","8","9"], # "Antiquities"
  "EPF108491189v": ["10","11","12","1","2","3","4","5","6"], # "Antiquities"
  "EPF108491190v": ["7","8","9","10","11"], # "Antiquities"
}

In [6]:
#To remove some elements
def remove_element(el):
    """Detach ``el`` from its parent while keeping the text that follows it.

    In lxml the text after an element's closing tag is stored on the element
    itself as ``el.tail``, so a plain ``parent.remove(el)`` would drop that
    text as well.  Before removing ``el`` the tail is therefore re-attached:

    * to the tail of the previous sibling, when there is one;
    * otherwise to the end of the parent's leading text.

    Parameters
    ----------
    el : element
        The element to remove.  It must have a parent (``el.getparent()`` is
        used without a None check, so a root element raises AttributeError).

    Returns
    -------
    None
        The tree is modified in place.
    """
    parent = el.getparent()
    if el.tail:
        prev = el.getprevious()
        # Compare against None explicitly: the truth value of an lxml element
        # reflects whether it has children, so ``if prev:`` is False for every
        # childless sibling and the tail would end up in the wrong place.
        if prev is not None:
            prev.tail = (prev.tail or '') + el.tail
        else:
            parent.text = (parent.text or '') + el.tail
    parent.remove(el)

# Strip every <add rend="del"> and <add rend="pencil"> from the document before
# the rest of the analysis: their content is confusing next to catnum values.
# The matches are collected into a list first because the tree is modified
# while looping.
for add_el in list(root.iterfind('.//{*}add')):
    if add_el.attrib.get('rend') not in ('del', 'pencil'):
        continue
    # These three additions are deliberately left in the document.
    if add_el.text in ('-1861', '2094.', '-382.'):
        continue
    remove_element(add_el)

  


In [7]:
def find_enclosing_div3(element):
    """Return the nearest TEI <div3> at or above ``element``.

    The search starts with ``element`` itself and climbs through its parents.
    Returns None when ``element`` is None or no <div3> is found.
    """
    div3_tag = '{http://www.tei-c.org/ns/1.0}div3'
    node = element
    while node is not None and node.tag != div3_tag:
        node = node.getparent()
    return node
    
def find_pb_xml_id(element):
    """Return the xml:id of the <pb> that precedes the <div3> around ``element``.

    Starting at the div3 returned by find_enclosing_div3, the previous
    siblings are visited one by one until a TEI <pb> is met; its xml:id
    attribute is returned.  Returns None when there is no enclosing div3 or
    no preceding <pb> sibling.  A <pb> without an xml:id raises KeyError.
    """
    pb_tag = '{http://www.tei-c.org/ns/1.0}pb'
    id_attr = '{http://www.w3.org/XML/1998/namespace}id'
    node = find_enclosing_div3(element)
    while node is not None and node.tag != pb_tag:
        node = node.getprevious()
    if node is None:
        return None
    return node.attrib[id_attr]

In [8]:
#div2 <label> contains a catalogue name for each catalogue in Misc volume
def find_div2_label(element):
    """Return the label text of the nearest TEI <div2> at or above ``element``.

    The text of the div2's direct <label> child is concatenated and stripped
    of surrounding whitespace.  Returns None when no div2 is found or when the
    nearest div2 has no <label> child.
    """
    div2_tag = '{http://www.tei-c.org/ns/1.0}div2'
    node = element
    while node is not None and node.tag != div2_tag:
        node = node.getparent()
    if node is None:
        return None
    label = node.find('{*}label')
    if label is None:
        return None
    return ''.join(label.itertext()).strip()

In [9]:
#Clean up catnum

def cleanup_catnum(text):
    """Normalise the raw text of a catalogue number.

    Runs of spaces, tabs, newlines, dots, dashes, underscores, question marks
    and check marks are removed from the beginning and the end of ``text``; the
    same characters directly in front of an asterisk are dropped (the asterisk
    stays); finally every ". " inside the text becomes a single space.
    """
    substitutions = (
        (r'^[ ✓\t\n.\-_?]+', ""),    # leading noise
        (r'[ ✓\t\n.\-_?]+$', ""),    # trailing noise
        (r'[ ✓\t\n.\-_?]+\*', "*"),  # noise right before an asterisk
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.replace('. ', ' ')

In [10]:
#Page number <pb> contains a unique ID for each page
# NOTE: this re-defines find_pb_xml_id, which an earlier cell already defines
# with the same behaviour; one of the two definitions can be dropped.
def find_pb_xml_id(element):
    """Return the xml:id of the <pb> that precedes the <div3> around ``element``.

    Walks backwards over the previous siblings of the enclosing div3 until a
    TEI <pb> is found.  Returns None when there is no enclosing div3 or no
    preceding <pb>; a <pb> without an xml:id raises KeyError.
    """
    sibling = find_enclosing_div3(element)
    while sibling is not None:
        if sibling.tag != '{http://www.tei-c.org/ns/1.0}pb':
            sibling = sibling.getprevious()
            continue
        return sibling.attrib['{http://www.w3.org/XML/1998/namespace}id']
    return None

In [11]:
#find catalogue number
def find_catnums_in_div3(div3):
    """Return the cleaned text of every <catnum> that is a direct child of ``div3``."""
    cleaned = []
    for catnum_el in div3.findall("{*}catnum"):
        raw = ''.join(catnum_el.itertext())
        cleaned.append(cleanup_catnum(raw))
    return cleaned

In [12]:
#to find all underlined in div3s
def find_underlines_in_div3(div3):
    """Return the stripped text of every <add rend="underline"> below ``div3``."""
    texts = []
    for underline in div3.findall(".//{*}add[@rend='underline']"):
        texts.append(''.join(underline.itertext()).strip())
    return texts

In [13]:
# Work out which person (or place) an underlined word refers to, so that we
# know which underlined words are person names and which are place names.
# The lookup first checks inside the underline element, then the elements enclosing it.
"""
2. Issue:
<pb xml:id="EPF108491019"/> catnum225 Handisyd not picked up
Because it's deleted

"""


def find_person_for_element(element):
    """Return the ``ref`` of the <persName> linked to ``element``, or None.

    A <persName> nested anywhere inside ``element`` wins.  Otherwise the
    search climbs from ``element`` itself through its parents and uses the
    first TEI <persName> it meets.  A persName without a ``ref`` attribute
    raises KeyError.
    """
    nested = element.find(".//{*}persName")
    if nested is not None:
        return nested.attrib['ref']
    pers_tag = '{http://www.tei-c.org/ns/1.0}persName'
    node = element
    while node is not None and node.tag != pers_tag:
        node = node.getparent()
    return None if node is None else node.attrib['ref']

In [14]:
def find_place_for_element(element):
    """Return the ``ref`` of the <placeName> linked to ``element``, or None.

    A <placeName> nested anywhere inside ``element`` wins.  Otherwise the
    search climbs from ``element`` itself through its parents and uses the
    first TEI <placeName> it meets.  A placeName without a ``ref`` attribute
    raises KeyError.
    """
    nested = element.find(".//{*}placeName")
    if nested is not None:
        return nested.attrib['ref']
    place_tag = '{http://www.tei-c.org/ns/1.0}placeName'
    node = element
    while node is not None and node.tag != place_tag:
        node = node.getparent()
    return None if node is None else node.attrib['ref']

In [15]:
#looking for underlined person in the whole div3
def find_underlined_persons(element):
    """Return the person refs attached to the underlined additions below ``element``.

    Every <add rend="underline"> is passed to find_person_for_element;
    underlines that yield no ref (None or an empty string) are left out.
    """
    person_refs = []
    for underline in element.findall(".//{*}add[@rend='underline']"):
        ref = find_person_for_element(underline)
        if ref:
            person_refs.append(ref)
    return person_refs

In [16]:
def find_underlined_places(element):
    """Return the place refs attached to the underlined additions below ``element``.

    Every <add rend="underline"> is passed to find_place_for_element;
    underlines that yield no ref (None or an empty string) are left out.
    """
    place_refs = []
    for underline in element.findall(".//{*}add[@rend='underline']"):
        ref = find_place_for_element(underline)
        if ref:
            place_refs.append(ref)
    return place_refs

In [17]:
"""
Add text
Add text xml
Look at add_rend.ipynb where xml and text paragraph are extracted

"""

'\nAdd text\nAdd text xml\nLook at add_rend.ipynb where xml and text paragraph are extracted\n\n'

In [42]:
def text_in_tag(tag):
    """Return all text inside ``tag`` as one line.

    The text of the element and its descendants is concatenated, every run of
    whitespace is collapsed to a single space, and the ends are stripped.
    """
    raw_text = ''.join(tag.itertext())
    single_spaced = re.sub(r'\s+', ' ', raw_text)
    return single_spaced.strip()

def xml_in_tag(tag):
    """Return the XML markup of ``tag`` as a text string.

    The previous version wrapped the bytes from ``etree.tostring`` in
    ``str()``, which produced the bytes *repr* (a leading ``b'``, newlines
    shown as a literal backslash-n) and then trimmed the end with a regex.
    Asking lxml for a unicode string gives the real markup, and
    ``with_tail=False`` leaves out the text that follows the element's closing
    tag, so nothing has to be cut off afterwards.

    Parameters
    ----------
    tag : lxml element
        The element to serialise.

    Returns
    -------
    str
        The serialised element, ending at its closing tag.
    """
    return etree.tostring(tag, encoding='unicode', with_tail=False)

In [64]:
# Empty results table: one column per value collected for each <div3>.
df_underlined = pd.DataFrame({
    column: []
    for column in (
        'catnum',
        'underline',
        'label',
        'xml_id',
        'person',
        'place',
        'text',
        'xml',
    )
})

In [65]:
def check_if_excluded(element):
    """Tell whether ``element`` (a <div3>) must be left out of the results.

    An element is excluded when

    * the label of its enclosing <div2> contains the word "Index", or
    * one of its direct <catnum> children, once cleaned, is listed in
      ``excluded`` under the xml:id of the page the catnum is on.

    Parameters
    ----------
    element : lxml element
        The element whose direct <catnum> children are checked.

    Returns
    -------
    bool
        True when the element should be skipped, False otherwise.
    """
    label = find_div2_label(element)
    # find_div2_label returns None when there is no div2 or no label; guard it
    # so the membership test cannot raise TypeError.
    if label is not None and 'Index' in label:
        return True
    for catnum in element.findall("{*}catnum"):
        number = cleanup_catnum("".join(catnum.itertext()))
        # Pages that are not listed in ``excluded`` have nothing to skip.
        if number in excluded.get(find_pb_xml_id(catnum), ()):
            return True
    return False

In [66]:
#to remove [] from the csv
def join_list(catnums):
    """Join the strings in ``catnums`` into one string separated by ", "."""
    separator = ", "
    return separator.join(catnums)


In [67]:
#to extract text within <add rend="underline"> </add>
# One row is produced for every <div3> that contains underlined text and is
# not excluded.  The rows are collected in a plain list and turned into a
# DataFrame once at the end: DataFrame.append was removed in pandas 2.0, and
# appending row by row copies the whole frame on every iteration.  Building
# the frame from scratch also makes this cell safe to re-run (no duplicated rows).
underlined_records = []
for div3 in root.iter("{*}div3"):
    underlines = find_underlines_in_div3(div3)
    if not underlines:
        continue
    if check_if_excluded(div3):
        continue

    underlined_records.append({
        'catnum': join_list(find_catnums_in_div3(div3)),
        'underline': join_list(underlines),
        'label': find_div2_label(div3),
        'xml_id': find_pb_xml_id(div3),
        'person': join_list(find_underlined_persons(div3)),
        'place': join_list(find_underlined_places(div3)),
        'text': text_in_tag(div3),
        'xml': xml_in_tag(div3),
    })

# Passing the column names keeps the column order fixed, even with no rows.
df_underlined = pd.DataFrame(
    underlined_records,
    columns=['catnum', 'underline', 'label', 'xml_id', 'person', 'place', 'text', 'xml'],
)

In [68]:
# Show the frame (pandas abbreviates long frames, as in the output below),
# then write all rows to 'underlined.csv'; the relative path is resolved
# against the directory the notebook runs in.
print(df_underlined)
df_underlined.to_csv('underlined.csv')

     catnum                         underline                         label  \
0      1799                       bow, Amyand                 Miscellanies.   
1      1800                       Arrows, Id.                 Miscellanies.   
2      1801                trumpet, Theobalds                 Miscellanies.   
3      1802                              shoe                 Miscellanies.   
4      1803                            girdle                 Miscellanies.   
...     ...                               ...                           ...   
3871    298                       Jasper haft  Agate cups botles spoons &c.   
3872    299                           Another  Agate cups botles spoons &c.   
3873    300                           Another  Agate cups botles spoons &c.   
3874    301                        Jasper cup  Agate cups botles spoons &c.   
3875    302  Amethyst, snuff box, Montmorency  Agate cups botles spoons &c.   

             xml_id                 person place  \

In [None]:
#total number of entries (catnum) extracted with underlined words
# `df` was never defined in this notebook; the results live in df_underlined,
# which holds one row per extracted entry.
len(df_underlined)

In [None]:
#how many person names underlined


In [None]:
#how many countries underlined

