In [45]:
# Extract (from Sloane records only, excluding indexes) every catnum whose entry mentions a place name, a person name, or both,
# together with the page number (<pb>), the <div2> label, and the text within the <div3>.
# Some div3s have no catnum.
from lxml import etree
import re
import csv
import pandas as pd

In [46]:
# Location of the TEI transcription to analyse.
path = '/Users/deborahleem/Documents/PhD/SloaneDBwork19/Miscellanies5Ah_MASTER_fixed_whitespaceDL.xml'
# Hand lxml the filename rather than an open() handle: no file object is left
# unclosed, and the parser reads the raw bytes itself instead of receiving text
# already decoded with the platform's default encoding.
doc = etree.parse(path)
root = doc.getroot()

In [47]:
def remove_element(el):
    """Detach ``el`` from its parent while keeping the text that follows it.

    The element's tail text is moved onto the preceding sibling's tail, or
    onto the parent's text when ``el`` is the first child, before the element
    itself is removed. The element's own text and children are discarded.

    Args:
        el: an element exposing ``getparent()``, ``getprevious()`` and
            ``tail`` (as lxml elements do). It must have a parent.
    """
    parent = el.getparent()
    if el.tail:
        prev = el.getprevious()
        # Compare with None explicitly: an lxml element is falsy when it has no
        # children, so a plain `if prev:` would skip a childless previous
        # sibling and put the tail in front of it (on parent.text) instead.
        if prev is not None:
            prev.tail = (prev.tail or '') + el.tail
        else:
            parent.text = (parent.text or '') + el.tail
    parent.remove(el)

# Remove <add rend="del"> and <add rend="pencil"> elements from the document
# before the rest of the analysis, keeping the text that follows each one.
# Three additions are deliberately kept, identified by their exact text.
# NOTE(review): the reason for keeping those three is not recorded here.
# list(...) snapshots the matches first because the tree is modified while looping.
for e in list(root.iterfind('.//{*}add')):
    if e.get('rend') not in ('del', 'pencil'):
        continue
    if e.text in ('-1861', '2094.', '-382.'):
        continue
    remove_element(e)


In [48]:
# Both tables map a key to a list of number strings; the trailing comment on
# each entry names a section.
# presumably: key = xml:id of a page (<pb>), values = catnums on that page that
# are not Sloane records — TODO confirm.
# NOTE(review): part_excluded is not read by any of the visible cells — confirm
# whether these catnums were meant to be filtered out of the extraction.
part_excluded = {
  "EPF108491027v": ["2108","2109","2110","2111"], # "Miscellanies"
  "EPF108491177v": ["1","2"], # "Antiquities"
  "EPF108491277": ["57"], # "Mathematical"
}

# The extraction loop tests `page_id in excluded`, i.e. only the keys are used:
# a div3 whose page id appears here is skipped whatever its catnum is.
# The listed numbers are kept as a record; some lists contain repeated values.
excluded = {
  "EPF108491178v": ["1","2","3","4"], # "Antiquities"
  "EPF108491179v": ["4","5","6","7","8","1"], # "Antiquities"
  "EPF108491180v": ["2","3"], # "Antiquities"
  "EPF108491181v": ["4","5"], # "Antiquities"
  "EPF108491182v": ["1","2","3","4","5"], # "Antiquities"
  "EPF108491183v": ["6","7","8","9","10","11","12"], # "Antiquities"
  "EPF108491184v": ["13","14","1","2","3","4","5","1"], # "Antiquities"
  "EPF108491186v": ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19"], # "Antiquities"
  "EPF108491187v": ["20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39"], # "Antiquities"
  "EPF108491188v": ["40","41","42","43","44","45","46","47","48","1","2","3","4","5","6","7","8","9"], # "Antiquities"
  "EPF108491189v": ["10","11","12","1","2","3","4","5","6"], # "Antiquities"
  "EPF108491190v": ["7","8","9","10","11"], # "Antiquities"
}
#last two entries on EPF108491271 wrapped in div3 are not Sloane, Pictures &c.

In [49]:
def find_div2_label(element):
    """Return the label text of the nearest TEI <div2> at or above ``element``.

    Starting with ``element`` itself, the ancestors are climbed until a
    ``div2`` in the TEI namespace is reached. The text of that div2's first
    ``label`` child (any namespace) is joined and stripped of surrounding
    whitespace.

    Returns:
        The label text, or None when there is no enclosing div2 or the div2
        has no label child.
    """
    div2_tag = '{http://www.tei-c.org/ns/1.0}div2'
    node = element
    # Climb until a <div2> is reached or the ancestors run out.
    while node is not None and node.tag != div2_tag:
        node = node.getparent()
    if node is None:
        return None
    heading = node.find('{*}label')
    if heading is None:
        return None
    return ''.join(heading.itertext()).strip()

In [50]:
def find_enclosing_div3(element):
    """Return the nearest TEI <div3> at or above ``element``.

    ``element`` itself is checked first, then each ancestor in turn.

    Returns:
        The matching element, or None when no div3 in the TEI namespace is
        found on the way up.
    """
    div3_tag = '{http://www.tei-c.org/ns/1.0}div3'
    node = element
    while node is not None and node.tag != div3_tag:
        node = node.getparent()
    return node
    
def find_pb_xml_id(element):
    """Return the xml:id of the <pb> that precedes the div3 enclosing ``element``.

    The enclosing div3 is located first; the search then walks backwards
    through that div3's preceding siblings only (no descent into them and no
    climb to ancestors) until a TEI ``pb`` element is met.

    Returns:
        The ``xml:id`` attribute of that pb, or None when there is no
        enclosing div3 or no pb among its preceding siblings.

    Raises:
        KeyError: if the pb that is found carries no ``xml:id`` attribute.
    """
    pb_tag = '{http://www.tei-c.org/ns/1.0}pb'
    xml_id_attr = '{http://www.w3.org/XML/1998/namespace}id'
    node = find_enclosing_div3(element)
    while node is not None and node.tag != pb_tag:
        node = node.getprevious()
    if node is None:
        return None
    return node.attrib[xml_id_attr]

In [51]:
def cleanup_catnum(text):
    """Tidy the raw text of a catnum.

    "Junk" characters are: space, tab, newline, dot, dash, underscore,
    question mark and the check mark (✓). Three passes run in order:

    1. a leading run of junk characters is removed;
    2. a trailing run of junk characters is removed;
    3. a run of junk characters immediately before an asterisk is dropped,
       keeping the asterisk.

    Finally every remaining ". " (dot followed by a space) becomes a space.
    """
    substitutions = (
        (r'^[ ✓\t\n.\-_?]+', ""),
        (r'[ ✓\t\n.\-_?]+$', ""),
        (r'[ ✓\t\n.\-_?]+\*', "*"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.replace('. ', ' ')

In [52]:
#ignore non-Sloane catnums and index <label type="index">. End of index </div2>


In [53]:
def _normalise_name(node):
    """Return the text inside ``node`` with whitespace runs collapsed to one space and the ends trimmed."""
    return re.sub(r'\s+', " ", ''.join(node.itertext())).strip()


# One dict per <div3> that mentions at least one person or place, with the keys
# div2_label, person, place, catnum (in that order; the CSV export relies on it).
people_csv_data = []

for div3 in root.iter("{*}div3"):
    label = find_div2_label(div3)
    # Skip index sections. find_div2_label returns None when the div3 has no
    # labelled <div2> above it; such entries are kept rather than crashing on
    # None.startswith.
    if label is not None and label.startswith('Index'):
        continue

    persons = [_normalise_name(person) for person in div3.iter('{*}persName')]
    places = [_normalise_name(place) for place in div3.iter('{*}placeName')]
    if not persons and not places:
        continue

    # A div3 on an excluded page is dropped whatever its catnum is: only the
    # keys of `excluded` are consulted.
    # NOTE(review): `part_excluded` is not consulted here — confirm whether its
    # catnums should also be filtered out.
    if find_pb_xml_id(div3) in excluded:
        continue

    catnums = [cleanup_catnum(''.join(catnum.itertext())) for catnum in div3.iter('{*}catnum')]

    people_csv_data.append({
        "div2_label": label,
        "person": persons,
        "place": places,
        # Some div3s have no catnum; record a single empty string for them.
        "catnum": catnums or [''],
    })

In [54]:
# Create the dataframe from the collected rows in a single step.
# (DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.)
# Each list is stored as its str() form, so an empty list appears as the text
# '[]' — the filters in the following cells rely on that.
df = pd.DataFrame(
    [
        {
            'div2_label': row['div2_label'],
            'person': str(row['person']),
            'place': str(row['place']),
            'catnum': str(row['catnum']),
        }
        for row in people_csv_data
    ],
    columns=['div2_label', 'person', 'place', 'catnum'],
)
df.head()

Unnamed: 0,div2_label,person,place,catnum
0,Miscellanies.,['Mr. Amyand.'],['East Indies'],['1799']
1,Miscellanies.,['Mr. Theobalds.'],['Norway'],['1801']
2,Miscellanies.,['Mr. Roberts'],[],['1806']
3,Miscellanies.,['Mr. M'],['Lisbon'],['1807']
4,Miscellanies.,[],['Lisbone'],['1808']


In [55]:
# Total number of extracted records (rows of df).
df.shape[0]

1401

In [56]:
# Records with no person, i.e. records that mention a place name only.
# The person column holds str(list), so an empty list is the text '[]'.
# Wrap the expression in len(...) to get the count instead of the rows.
df.loc[df['person'].eq('[]')]

Unnamed: 0,div2_label,person,place,catnum
4,Miscellanies.,[],['Lisbone'],['1808']
6,Miscellanies.,[],['China'],['1810']
10,Miscellanies.,[],['Malabar'],['1818']
11,Miscellanies.,[],['England'],['1819']
12,Miscellanies.,[],['London'],['1']
...,...,...,...,...
1388,Agate cups botles spoons &c.,[],['East India'],['194']
1391,Agate cups botles spoons &c.,[],['Suratte'],['203']
1394,Agate cups botles spoons &c.,[],['Germany'],['218']
1398,Agate cups botles spoons &c.,[],['India'],['239']


In [57]:
# Records with neither a person nor a place.
df.loc[df['person'].eq('[]') & df['place'].eq("[]")]

Unnamed: 0,div2_label,person,place,catnum


In [58]:
# Number of records with no place, i.e. records that mention a person only.
len(df.loc[df['place'].eq("[]")])

555

In [59]:
# Number of records in which both a person and a place are present.
len(df.loc[df['person'].ne('[]') & df['place'].ne("[]")])

343

In [60]:
# Spot check: the record whose only catnum is 1823.
df.loc[df['catnum'].eq("['1823']")]

Unnamed: 0,div2_label,person,place,catnum
20,Miscellanies.,[],['Surinam'],['1823']


In [61]:
# Write one CSV line per extracted record, in the row's key order
# (div2_label, person, place, catnum). List values are written in their
# str() form. No header line is written.
# newline='' is what the csv module requires of the file object (otherwise
# extra blank lines can appear on some platforms); the explicit encoding makes
# the file independent of the machine's default encoding.
with open('people_and_places.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(row.values() for row in people_csv_data)