# Parse MARC and add Publisher Name

This script expects a TSV that has MARC XML in one of its columns. For each row it adds a `publisher_marc` column containing the full publisher name found in MARC field 260 $b.


In [None]:
import pandas as pd
import pymarc
import io


In [None]:
# Path of the TSV to process; the final cell writes its result back to this same path.
path_to_tsv = "/Users/m/Downloads/data-tmp/hathitrust_post45fiction_metadata.tsv"
# Name of the TSV column whose cells hold the MARC XML string for each row.
marc_xml_column = "hathi_marc"

In [None]:
def add_auth_name(d):
    """Add the MARC 260 $b publisher name to a row as ``publisher_marc``.

    Despite its name, this function extracts the publisher, not an author.

    Args:
        d: A row (such as the Series passed by ``DataFrame.apply(axis=1)``)
            holding a MARC XML string under the key named by the
            module-level ``marc_xml_column``.

    Returns:
        The same row object. ``publisher_marc`` is set only when the record
        has a 260 $b whose stripped value is longer than three characters;
        in every other case the row is returned unchanged.
    """
    marc_xml = d[marc_xml_column]

    # do we have some marc data? (anything that is not a string is treated as missing)
    if not isinstance(marc_xml, str):
        print("No MARC data to parse:", d)
        return d

    # the pymarc library expects a file to open, we don't have files we have strings,
    # so wrap the string in a file-like object so it can be parsed
    with io.StringIO(marc_xml) as f:
        records = pymarc.marcxml.parse_xml_to_array(f)

    # parsing returns a list of records; we expect exactly one per cell, but XML
    # without any record must not abort the whole run with an IndexError
    if not records:
        print("No MARC record found in XML:", d)
        return d
    record = records[0]

    # nothing to add when the record has no 260 field or the field has no $b
    if '260' not in record or 'b' not in record['260']:
        return d

    pub = record['260']['b'].strip()
    if len(pub) <= 3:
        print(pub, 'too short')
        return d

    # remove the optional trailing comma, plus any whitespace that sat before it
    if pub.endswith(','):
        pub = pub[:-1].rstrip()

    d['publisher_marc'] = pub
    return d

In [None]:
# load the tsv
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)
# drop leftover "Unnamed: N" columns created when an earlier run saved the row index
df = df.drop(columns=df.filter(regex="Unname").columns)

# run our function over every row
df = df.apply(add_auth_name, axis=1)
# overwrite the input file; index=False keeps the row index out of the TSV so
# repeated runs do not keep adding a new unnamed column
df.to_csv(path_to_tsv, sep='\t', index=False)

