In [1]:
#Find person names that match the list in the header. Identify the ones that do not match.
#Print names that match with their <idno>viaf (header) if exists; catnum; div2; div3 content; and page number
#Print the same for the unmatched name.

#Latest file name Miscellanies5Ah_MASTER_fixed_whitespaceDL-KS_190321 (1).xml on 20 March 2021

#Excludes the Index sections.
#Ignores digits from <add rend="del"> and <add rend="pencil">, which occurs once with catnum 2082.
#From <catnum>, removes: -; ____; ?; .; and a tick in front of a number.
#Excludes non-Sloane catalogue entries.

#Last updated 21st March 2021

from lxml import etree
import re
import csv

In [2]:
#path = '/Users/deborahleem/Documents/1_PhD/SloaneDBwork19/Miscellanies5Ah_MASTER_fixed_whitespaceDL-KS_190321 (1).xml'
path = '/Users/deborahleem/Documents/1_PhD/SloaneDBwork19/modified_misc3.xml'
# Give lxml the filename rather than an open() handle: the handle was never
# closed, and lxml then reads the bytes itself.
doc = etree.parse(path)
# `mytree` and `root` are two names for the same root element; later cells use both.
mytree = doc.getroot()
root = mytree

In [3]:
#Need to exclude non-Sloane catnums and ignore the Index sections.

In [4]:
# Table of catalogue numbers (as strings) listed per key; the trailing comment on
# each row names the section ("Miscellanies" / "Antiquities").
# NOTE(review): the keys presumably are page (<pb> xml:id) identifiers and the values
# the non-Sloane catnums to leave out on that page — confirm.
# NOTE(review): nothing in the visible cells reads `excluded`, so no filtering by
# this table is happening yet — confirm whether that is intended.
# NOTE(review): "1" is listed twice for EPF108491184v — confirm this is intended.
excluded = {
  "EPF108491027v": ["2108","2109","2110","2111"], # "Miscellanies"
  "EPF108491177v": ["1","2"], # "Antiquities"
  "EPF108491178v": ["1","2","3","4"], # "Antiquities"
  "EPF108491179v": ["4","5","6","7","8","1"], # "Antiquities"
  "EPF108491180v": ["2","3"], # "Antiquities"
  "EPF108491181v": ["4","5"], # "Antiquities"
  "EPF108491182v": ["1","2","3","4","5"], # "Antiquities"
  "EPF108491183v": ["6","7","8","9","10","11","12"], # "Antiquities"
  "EPF108491184v": ["13","14","1","2","3","4","5","1"], # "Antiquities"
  "EPF108491186v": ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19"], # "Antiquities"
  "EPF108491187v": ["20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39"], # "Antiquities"
  "EPF108491188v": ["40","41","42","43","44","45","46","47","48","1","2","3","4","5","6","7","8","9"], # "Antiquities"
  "EPF108491189v": ["10","11","12","1","2","3","4","5","6"], # "Antiquities"
  "EPF108491190v": ["7","8","9","10","11"], # "Antiquities"
}

In [5]:
def remove_element(el):
    """Detach ``el`` from its parent while keeping the text that follows it.

    The text after an element's end tag is held in ``el.tail``, so removing
    the element alone would drop that text. The tail is first appended to the
    previous sibling's tail or, when ``el`` has no previous sibling, to the
    parent's text.

    Parameters
    ----------
    el : element offering ``getparent()``, ``getprevious()`` and ``tail``
        (an lxml element in this notebook). It must have a parent.
    """
    parent = el.getparent()
    if el.tail:
        prev = el.getprevious()
        # Compare with None explicitly. Truth-testing an lxml element asks
        # "does it have children?", so a childless previous sibling would be
        # mistaken for "no sibling" and the tail would end up on parent.text.
        if prev is not None:
            prev.tail = (prev.tail or '') + el.tail
        else:
            parent.text = (parent.text or '') + el.tail
    parent.remove(el)

# Before any analysis, drop the <add> elements whose rend is "del" or "pencil";
# their content is confusing when it sits next to a catnum.
for add_el in list(root.iterfind('.//{*}add')):
    if add_el.attrib.get('rend') not in ('del', 'pencil'):
        continue
    # Additions with exactly one of these three texts are left in the document.
    if add_el.text in ('-1861', '2094.', '-382.'):
        continue
    remove_element(add_el)

In [6]:
# Which persName goes with which ID.
# Both tables are keyed by the leading text of each header <persName>:
#   idmap   : name -> '#' + xml:id of the enclosing <person>
#   viafmap : name -> text of that <person>'s first <idno>, when it has one
idmap = {}
viafmap = {}
for person in mytree.findall('.//{http://www.tei-c.org/ns/1.0}listPerson/{http://www.tei-c.org/ns/1.0}person'):
    # Called person_id, not `id`, so the builtin id() stays usable in later cells.
    person_id = person.attrib['{http://www.w3.org/XML/1998/namespace}id']
    # The <idno> hangs off the <person>, so look it up once rather than per name.
    idno = person.find("{*}idno")
    for pers_name in person.findall('{http://www.tei-c.org/ns/1.0}persName'):
        text = pers_name.text
        if text is None:
            # No leading text to key on; a None key could never match a name.
            continue
        idmap[text] = '#' + person_id
        if idno is not None:
            viafmap[text] = idno.text

In [None]:
# Inspect the name -> <idno> text lookup (shows the whole dict).
viafmap

In [8]:
def find_enclosing_div3(element):
    """Return the closest TEI <div3> at or above ``element``.

    ``element`` itself is checked first, then each parent in turn.
    Returns None when no <div3> is found (or when ``element`` is None).
    """
    node = element
    while node is not None and node.tag != '{http://www.tei-c.org/ns/1.0}div3':
        node = node.getparent()
    return node
    
def find_pb_xml_id(element):
    """Return the xml:id of the <pb> that precedes the <div3> enclosing ``element``.

    Starting at the enclosing <div3>, the search steps back through its
    previous siblings until a TEI <pb> is reached. Returns None when there is
    no enclosing <div3> or no <pb> before it. A <pb> without an xml:id
    attribute raises KeyError.
    """
    node = find_enclosing_div3(element)
    while node is not None and node.tag != '{http://www.tei-c.org/ns/1.0}pb':
        node = node.getprevious()
    if node is None:
        return None
    return node.attrib['{http://www.w3.org/XML/1998/namespace}id']
                
# for catnum in root.iterfind('.//{*}catnum'):
#     print(find_pb_xml_id(catnum))

In [9]:
def find_div2_label(element):
    """Return the stripped <label> text of the closest <div2> at or above ``element``.

    Returns None when no <div2> is found, or when that <div2> has no
    direct <label> child.
    """
    node = element
    while node is not None and node.tag != '{http://www.tei-c.org/ns/1.0}div2':
        node = node.getparent()
    if node is None:
        return None
    label = node.find('{*}label')
    if label is None:
        return None
    return ''.join(label.itertext()).strip()

In [10]:
#Clean up catnum

def cleanup_catnum(text):
    """Strip clutter from a catalogue-number string and return the result.

    Clutter is any of: space, tab, newline, '.', '-', '_', '?' and the tick
    mark '✓'. It is removed
      * from the start and the end of the string, and
      * wherever a run of it sits directly in front of a '*'.
    Finally every '. ' still inside the string is replaced by one space.
    """
    clutter = ' ✓\t\n.-_?'
    trimmed = text.strip(clutter)
    trimmed = re.sub(r'[ ✓\t\n.\-_?]+\*', "*", trimmed)
    return trimmed.replace('. ', ' ')

In [11]:
def cleanup_person_name(name):
    """Tidy a transcribed person name and return it.

    Steps, in order:
      1. drop whitespace at the start of the string;
      2. drop every '=' (which marks a word continued on the next line)
         together with the whitespace around it;
      3. close the gap in split abbreviations such as "M r" -> "Mr ".

    Dots, a final 's and repeated spaces are left as they are.

    NOTE(review): each abbreviation replacement ends in a space and the space
    that followed the match is kept, so "M r Smith" becomes "Mr  Smith" (two
    spaces). That behaviour is preserved here; confirm it is what the header
    matching expects.
    """
    # Remove all whitespace at the start.
    name = re.sub(r'^\s+', "", name)
    # '=' means "continued on next line": remove it. The pattern is a raw string
    # so that \s is not treated as an (invalid) Python string escape.
    name = re.sub(r'\s*=[\s=]*', '', name)
    # Split abbreviation -> joined form, applied in this order, whole words only.
    abbreviations = (
        ('M r', 'Mr '),
        ('M rs', 'Mrs '),
        ('D r', 'Dr '),
        ('S r', 'Sr '),
        ('Cap t', 'Capt '),
        ('Esq r', 'Esqr '),
        ('W m', 'Wm '),
    )
    for split_form, joined_form in abbreviations:
        name = re.sub(r'\b' + split_form + r'\b', joined_form, name)

    return name

In [12]:
#Page number
def find_pb_xml_id(element):
    """Return the xml:id of the nearest <pb> before the <div3> enclosing ``element``.

    This repeats, with the same behaviour, the find_pb_xml_id defined in an
    earlier cell; when cells run top to bottom this later definition is the
    one in effect for the cells below.

    Returns None when there is no enclosing <div3> or no <pb> among its
    previous siblings. A <pb> without an xml:id attribute raises KeyError.
    """
    sibling = find_enclosing_div3(element)
    while True:
        if sibling is None:
            return None
        if sibling.tag == '{http://www.tei-c.org/ns/1.0}pb':
            return sibling.attrib['{http://www.w3.org/XML/1998/namespace}id']
        sibling = sibling.getprevious()

In [13]:
def find_catnums_in_div3(div3):
    """Return the cleaned text of each <catnum> that is a direct child of ``div3``."""
    cleaned = []
    for catnum in div3.findall("{*}catnum"):
        raw_text = ''.join(catnum.itertext())
        cleaned.append(cleanup_catnum(raw_text))
    return cleaned

# Build one row per <persName> found inside a <div3>, whether or not the name
# is in the header list (names not in the header get None for id and viaf).
# Row layout: name, header id, viaf, catnums, div2 label, page id, div3 text, div3 XML.
# NOTE(review): the `excluded` table defined above is not consulted here — confirm
# whether the non-Sloane catnums should be filtered out of these rows.
matches = {False: 0, True: 0}

# {False: 524, True: 603}

data = []

for pers_name in mytree.findall('.//{http://www.tei-c.org/ns/1.0}div3//{http://www.tei-c.org/ns/1.0}persName'):
    # Decide about Index sections first, so skipped names cost no text extraction.
    # find_div2_label() returns None when there is no labelled <div2> above the
    # name; such a name is not in an Index, so it is kept rather than crashing
    # on None.startswith().
    label = find_div2_label(pers_name)
    if label is not None and label.startswith('Index'):
        continue

    # All text inside the persName tag, with runs of whitespace collapsed.
    name = re.sub(r'\s+', " ", ''.join(pers_name.itertext()))
    name = cleanup_person_name(name)

    # The XPath above guarantees an enclosing <div3>.
    div3 = find_enclosing_div3(pers_name)
    div3_text = ''.join(div3.itertext()).strip()
    catnums = find_catnums_in_div3(div3)
    page = find_pb_xml_id(pers_name)

    # etree.tounicode(div3) gives the XML of the whole entry.
    row = [name, idmap.get(name, None), viafmap.get(name, None), catnums, label, page, div3_text, etree.tounicode(div3)]
    data.append(row)

#         matches[name in idmap] += 1
#     if name in idmap:
#         x.attrib['ref'] = idmap[name]
        
#print matches and count non-matches    
# print(matches)

# 'people_match_nonmatch_text.csv' gives the names that match from the header
# (including normalised names) and also the ones that do not have their names
# in the header.
# newline='' is what the csv module asks for when opening the file; without it,
# row endings (and the line breaks embedded in the text fields) can be
# translated by the platform. The encoding is stated so the non-ASCII
# characters in the text do not depend on the machine's default.
with open('people_match_nonmatch_text.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(data)

In [None]:
!less /Users/deborahleem/scripts/notebooks/sloane/names_not_in_header.csv

,name,id,2,3,4,5,6,7
3,Mr. M c. Cormick,,,['1807'],Miscellanies.,EPF108491002v,"-1807.
            A  stopple & a leather boracho or bottle wherein is carried wine on a
              journey wt. a wooden
              horn mouth to
              drink out of it. from Mr. M
                c. Cormick a surgeon who brought
              it from Lisbon.
            
              
              19 N","<div3 xmlns=""http://www.tei-c.org/ns/1.0"" xmlns:ea=""http://www.enlightenmentarchitectures.org"">
            <ea:catnum type=""primary"" place=""margin"">-1807.</ea:catnum>
            <p>A  stopple &amp; a <material>leather</material> boracho or bottle wherein is carried wine on<lb/> a
              journey w<hi rend=""sup"">t</hi>. a <material>wooden</material>
              <material>horn</material> mouth<lb/> to
              drink out of it. from <persName>M<hi rend=""sup"">r</hi>. M
                <hi rend=""sup"">c</hi>. <add rend=""underline"">Cormick</add></persName> a surgeon w

In [None]:
# new stuff requested
# pandas is imported here because this is the first cell that uses `pd`; with
# the import only further down, a fresh kernel run top to bottom fails with
# NameError at this point.
import pandas as pd

# NOTE: `path` previously held the XML input path; from here on it names the CSV.
path = 'people_match_nonmatch_text.csv'
# The file has no header row, so the columns are numbered 0, 1, 2, ...
df = pd.read_csv(path, header=None)

In [None]:
# Give the first two numbered columns readable names; the rest keep their numbers.
df = df.rename(columns={0: 'name', 1: 'id'})

In [None]:
# Rows with no value in the 'id' column.
id_where_null = df.loc[df['id'].isna()]

In [None]:
!pwd

In [None]:
# Save the rows without an id to 'names_not_in_header.csv' in the working directory.
id_where_null.to_csv('names_not_in_header.csv')

In [None]:

# For each distinct name without an id: non-null count of every other column.
grouped = id_where_null.groupby('name').count()


In [None]:
# Display the per-name counts (the whole frame is shown).
grouped

In [None]:
# Number of rows that have no id.
len(id_where_null)

In [None]:
# end

In [None]:
# Number of distinct name keys in idmap.
len(idmap)

In [None]:
import pandas as pd

# Load a different export and replace `df` with it, naming the columns by hand.
# NOTE(review): the CSV written earlier in this notebook has eight fields per
# row, while seven names are given here. If this file has the same layout the
# names will not line up with the fields as intended — confirm.
p = '5_people_match_nonmatch_text_with_dot.csv'
df = pd.read_csv(
    p,
    header=None,
    names=['Name', 'id', 'viaf', 'catnum', 'd', 'e', 'f'],
)

In [None]:
!pwd

In [None]:
# Number of rows in df.
len(df)

In [None]:
# How many rows have the viaf column populated.
len(df[df.viaf.notnull()])

In [None]:
# Distinct viaf values among the rows that have one: every value is listed
# once, however many rows carry it.
unique_viaf = df.loc[df.viaf.notnull(), 'viaf'].drop_duplicates()
len(unique_viaf)

In [None]:
# All rows (duplicates included) whose viaf column is populated.
pop_viaf = df[df.viaf.notnull()]

In [None]:
# These are values that are unique (they have zero repeats)
# NOTE(review): 158 and 88 are typed in by hand — presumably counts read off
# earlier cell outputs. Confirm, and derive them from the data so the figure
# survives a change of input file.
158-88

In [None]:
# For each viaf value: non-null count of every other column. Values that occur
# only once are included too; `grouped` is reused here for this new table.
grouped = pop_viaf.groupby('viaf').count()
grouped.head()

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# The 30 viaf values with the highest 'id' count. The previous `.a` selected a
# column that does not exist under the current column names and raised
# AttributeError.
# NOTE(review): 'id' (the sort key) is shown here; swap in another column if a
# different count was intended.
grouped.sort_values('id', ascending=False)['id'].head(30)

In [None]:
import numpy as np
# Scratch check: identity comparison of the np.nan object with itself.
np.nan is np.nan

In [None]:
def get_name(a, b):
    """Return ``b`` with every '#' removed, or ``a`` when ``b`` is missing.

    Parameters
    ----------
    a : fallback value, returned unchanged when ``b`` is missing.
    b : string such as '#some_id', or a missing value (NaN / None).
    """
    # pd.isna() recognises any missing value. The former `b is not np.nan`
    # identity test only recognised the np.nan object itself, so another NaN
    # float or None fell through to .replace() and raised AttributeError.
    if pd.isna(b):
        return a
    return b.replace('#', '')


# Add a 'normalised_name' column by calling get_name() on each row's 'Name' and
# 'id'. This writes the new column into df itself.
df['normalised_name'] = df.apply(lambda row: get_name(row['Name'], row['id']), axis=1)

In [None]:
# Scratch: take the 'id' value of the row labelled 0 to inspect it in the next cells.
x = df.loc[0, 'id']

In [None]:
# Scratch: show the Python type of that value.
type(x)

In [None]:
# Scratch: is that value the np.nan object itself (identity, not equality)?
x is np.nan

In [None]:
# Preview the first rows of df again.
df.head()

In [None]:
# Per normalised name: non-null count of each column, largest 'Name' count first.
(
    df.groupby('normalised_name')
      .count()
      .sort_values('Name', ascending=False)
)

In [None]:
# Name, normalised name and catnum for every row, ordered by normalised name (A-Z).
(
    df.sort_values('normalised_name', ascending=True)
      [['Name', 'normalised_name', 'catnum']]
)

In [None]:
# Write the same three columns, ordered by normalised name, to 'all_names_normalized.csv'.
(
    df.sort_values('normalised_name', ascending=True)
      [['Name', 'normalised_name', 'catnum']]
      .to_csv('all_names_normalized.csv')
)