# Hathi Parse MARC add Authorized Name

This script expects to work on a TSV that has MARC XML in a `hathi_marc` column, likely added previously by the `download_hathi_marc` script. It adds an `author_marc` column containing the full name heading taken from the first of the 100, 110, 111, 700, 710 or 711 MARC fields that is present, with its subfields assembled in the order $a $b $c $q $d $g.


In [157]:
import pandas as pd
import pymarc
import io


In [158]:
# Location of the TSV to process. The same path is read with pd.read_csv and
# then overwritten with df.to_csv later in this file, so the input is replaced.
path_to_tsv = "/Users/m/Downloads/data-tmp/hathitrust_post45fiction_metadata.tsv"


In [159]:
# MARC tags checked for a name heading, in priority order: the 1xx author
# fields first, then the 7xx contributor fields as a fallback.
_NAME_FIELD_TAGS = ('100', '110', '111', '700', '710', '711')

# Subfield codes that make up the heading, in the order they are joined.
_NAME_SUBFIELD_CODES = ('a', 'b', 'c', 'q', 'd', 'g')


def _find_name_field(record):
    """Return the first field of ``record`` whose tag is in _NAME_FIELD_TAGS, or None."""
    for tag in _NAME_FIELD_TAGS:
        if tag in record:
            # indexing a record by tag gives the first field with that tag;
            # there may be more 7xx contributors, but only the first is used
            # (often the editor)
            return record[tag]
    return None


def _build_heading(field):
    """Join the name subfields present in ``field`` with single spaces, in heading order."""
    parts = []
    for code in _NAME_SUBFIELD_CODES:
        if code in field:
            value = field[code]
            # a field without $a used to crash the whole run; now every
            # subfield, including $a, is optional
            if value is not None:
                parts.append(value)
    return ' '.join(parts)


def add_auth_name(d):
    """Add an ``author_marc`` entry to ``d`` built from its MARC XML.

    The MARC XML in ``d['hathi_marc']`` is parsed and the first record is
    searched for a name field (100, 110, 111, 700, 710, 711, in that order).
    The heading is assembled from subfields $a $b $c $q $d $g, surrounding
    whitespace is stripped, and one trailing period is removed.

    Args:
        d: a row-like mapping with a ``hathi_marc`` key (a pandas Series when
            called through ``DataFrame.apply(..., axis=1)``). It is modified
            in place.

    Returns:
        The same ``d``. ``author_marc`` is only set when a non-empty heading
        was found; otherwise a message is printed and ``d`` is returned
        unchanged.
    """
    marc_xml = d['hathi_marc']

    # do we have some marc data? (missing values are not strings)
    if not isinstance(marc_xml, str):
        print("No MARC data to parse:",d)
        return d

    # the pymarc library expects a file to open; we have a string, so wrap it
    # in a file-like object that is already positioned at the start
    records = pymarc.marcxml.parse_xml_to_array(io.StringIO(marc_xml))

    # parsing returns a list of records; we expect exactly one, but guard
    # against XML that contains no record at all instead of raising IndexError
    if not records:
        print("No MARC data to parse:", d)
        return d

    field = _find_name_field(records[0])
    if field is None:
        print("No Author found!:", marc_xml)
        return d

    # assemble the heading in the correct order; strip so that stray trailing
    # whitespace does not hide the trailing period checked below
    name = _build_heading(field).strip()

    # have seen empty "" 100 fields
    if not name:
        print("No Author found!:", marc_xml)
        return d

    # remove the optional trailing period on all headings if there
    if name.endswith('.'):
        name = name[:-1]

    d['author_marc'] = name
    return d

In [160]:
# Load the TSV; the first row holds the column names.
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)

# Drop leftover "Unnamed: N" columns. These are the unnamed index columns that
# earlier writes of this file produced when the DataFrame index was saved.
df = df.drop(columns=df.filter(regex=r"^Unnamed").columns)

# Run add_auth_name over every row (axis=1 passes each row as a Series).
df = df.apply(add_auth_name, axis=1)

# Overwrite the input file with the result. index=False keeps the DataFrame
# index out of the file so re-running does not add another unnamed column.
df.to_csv(path_to_tsv, sep='\t', index=False)



No Author found!: <?xml version="1.0" encoding="UTF-8"?><collection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/MARC21/slim" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"><record><leader>00410nam a2200145Ia 4500</leader><controlfield tag="001">006502944</controlfield><controlfield tag="003">MiAaHDL</controlfield><controlfield tag="005">20130926000000.0</controlfield><controlfield tag="006">m        d        </controlfield><controlfield tag="007">cr bn ---auaua</controlfield><controlfield tag="008">850413s1946    enk           000 1 eng d</controlfield><datafield tag="035" ind1=" " ind2=" "><subfield code="a">sdr-nrlfGLAD17165742-B</subfield></datafield><datafield tag="035" ind1=" " ind2=" "><subfield code="a">(OCoLC)9596441</subfield></datafield><datafield tag="040" ind1=" " ind2=" "><subfield code="a">BAT</subfield><subfield code="c">BAT</subfield><subfield code="d">CUY</subfield><subf