# Add OL Work ID

This script expects that you have downloaded the Open Library editions dump file: https://openlibrary.org/developers/dumps
It first loops through your data and builds a lookup of all the IDs (OCLC numbers and ISBNs) it has for each record, then loops through the Open Library dump to see which editions match.
Fill out the column names below; if your data does not have one of these columns, set that variable to None.


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import json


In [None]:
# --- Input files -----------------------------------------------------------
# The TSV holding your records (it is also overwritten with the result).
path_to_tsv = "/Users/m/Downloads/data-tmp/hathitrust_post45fiction_metadata.tsv"
# The Open Library editions dump downloaded from the dumps page.
path_to_ol_editions = "/Users/m/Downloads/ol_dump_editions_2023-07-31.txt"

# --- Column names in the TSV -----------------------------------------------
# Column used to backfill the result into your data; it should be populated
# for every record.
record_id = "recordid"
# Column holding the OCLC Classify XML blob; set to None if you do not have it.
oclc_classify = "oclc_classify"
# Column holding a single OCLC number; set to None if you do not have it.
oclc = "oclc"
# Column holding "|"-separated ISBNs; set to None if you do not have it.
isbns = "isbns"

# --- Lookup tables filled in while the notebook runs -------------------------
# identifier -> record id for the first two, record id -> OL works for the last.
lookup_oclc, lookup_isbn, record_id_lookup = {}, {}, {}



In [None]:
def parse_classify_row(d):
    """Register every identifier found in one row in the global lookup tables.

    Reads the columns named by the module-level settings ``oclc_classify``,
    ``oclc`` and ``isbns`` (each may be None to disable that source) and maps
    every OCLC number / ISBN found to the row's ``record_id`` value in
    ``lookup_oclc`` / ``lookup_isbn``.

    Args:
        d: one row of the metadata table (indexable by column name), as
            passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The row, unchanged, so the function can be used with ``apply``.

    Raises:
        ValueError: if the ``oclc`` cell is populated but is not numeric
            (``int()`` is applied to it).
    """
    row_id = d[record_id]

    # Classify XML blob: every edition's OCLC number points back to this row.
    # Non-string cells (e.g. NaN for an empty cell) carry no blob to parse.
    if oclc_classify is not None and isinstance(d[oclc_classify], str):
        data = extract_classify(d[oclc_classify])
        if data is not None:
            for edition in data["editions"]:
                # An edition without an oclc attribute comes back as None;
                # skip it instead of registering the literal key "None".
                if edition["oclc"] is not None:
                    lookup_oclc[str(edition["oclc"])] = row_id

    # Single OCLC number column; int() drops a trailing ".0" from float cells.
    if oclc is not None and not pd.isnull(d[oclc]):
        lookup_oclc[str(int(d[oclc]))] = row_id

    # "|"-separated ISBN column.
    if isbns is not None and not pd.isnull(d[isbns]):
        for isbn in d[isbns].split("|"):
            isbn = isbn.strip()
            # Ignore empty fragments produced by leading/trailing/double "|".
            if isbn:
                lookup_isbn[isbn] = row_id

    return d

In [None]:
def add_ol_work(d):
    """Set ``d['ol_work']`` to the first matched Open Library work key, if any.

    The row's ``record_id`` value is converted to ``str`` before the lookup
    in the module-level ``record_id_lookup``. Rows without a match, or whose
    matched works list is empty, are returned untouched.

    Args:
        d: one row of the metadata table (indexable by column name).

    Returns:
        The row, with ``ol_work`` set when a work was found.
    """
    works = record_id_lookup.get(str(d[record_id]))

    # Guard against an empty works list, which would otherwise raise
    # IndexError on works[0].
    if works:
        d['ol_work'] = works[0]['key']

    return d

In [None]:
def _classify_attr(tag, name, cast=None):
    """Return attribute *name* of *tag* (optionally passed through *cast*), or None if absent."""
    if not tag.has_attr(name):
        return None
    value = tag[name]
    return value if cast is None else cast(value)


def _most_popular_nsfa(soup, scheme):
    """Return the ``nsfa`` attribute of the ``mostpopular`` tag under *scheme*, or None."""
    scheme_tag = soup.find(scheme)
    if scheme_tag is None:
        return None
    # Tag names are matched in lower case because the HTML parser lower-cases them.
    popular = scheme_tag.find("mostpopular")
    if popular is None or not popular.has_attr("nsfa"):
        return None
    return popular["nsfa"]


def extract_classify(xml):
    """Parse an OCLC Classify XML response into a plain dictionary.

    Args:
        xml: the Classify response; it is converted with ``str()`` before
            parsing.

    Returns:
        None when the document has no ``work`` element. Otherwise a dict with
        the keys ``work_statement_responsibility``, ``work_editions``,
        ``work_eholdings``, ``work_format``, ``work_holdings``,
        ``work_itemtype``, ``work_owi``, ``work_title``, ``main_oclc``,
        ``authors`` (list of name/lccn/viaf dicts), ``work_author``,
        ``normalized_ddc``, ``normalized_lcc``, ``headings``, ``editions``
        (list of dicts) and ``largest_holding_oclc`` (the ``oclc`` of the
        first edition element, or None when there are no editions).

    Raises:
        ValueError: if a count attribute (editions/holdings/eholdings) is
            present but not an integer.
    """
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's parser-guess warning).
    soup = BeautifulSoup(str(xml), "html.parser")

    work_soup = soup.find("work")
    if work_soup is None:
        return None

    results = {
        'work_statement_responsibility': _classify_attr(work_soup, 'author'),
        'work_editions': _classify_attr(work_soup, 'editions', int),
        'work_eholdings': _classify_attr(work_soup, 'eholdings', int),
        'work_format': _classify_attr(work_soup, 'format'),
        'work_holdings': _classify_attr(work_soup, 'holdings', int),
        'work_itemtype': _classify_attr(work_soup, 'itemtype'),
        'work_owi': _classify_attr(work_soup, 'owi'),
        'work_title': _classify_attr(work_soup, 'title'),
        'main_oclc': work_soup.text,
    }

    results['authors'] = []
    for author_tag in soup.find_all("author"):
        lccn = _classify_attr(author_tag, 'lc')
        viaf = _classify_attr(author_tag, 'viaf')
        results['authors'].append({
            "name": author_tag.text,
            # The literal string 'null' is treated as a missing identifier.
            "lccn": None if lccn == 'null' else lccn,
            "viaf": None if viaf == 'null' else viaf,
        })

    # Try to find the main contributor: the first name in the "|"-separated
    # author attribute of the work, matched against the author elements.
    results['work_author'] = None
    statement = results['work_statement_responsibility']
    if statement is not None:
        first_author = statement.split("|")[0].strip()
        for author in results['authors']:
            if author['name'].strip() == first_author:
                results['work_author'] = author

    results["normalized_ddc"] = _most_popular_nsfa(soup, "ddc")
    results["normalized_lcc"] = _most_popular_nsfa(soup, "lcc")

    results["headings"] = [
        {
            # .get() yields None instead of raising when an attribute is missing.
            "id": heading.get('ident'),
            "src": heading.get('src'),
            "value": heading.text,
        }
        for heading in soup.find_all("heading")
    ]

    results["editions"] = [
        {
            'author': _classify_attr(e, 'author'),
            'eholdings': _classify_attr(e, 'eholdings', int),
            'format': _classify_attr(e, 'format'),
            'holdings': _classify_attr(e, 'holdings', int),
            'itemtype': _classify_attr(e, 'itemtype'),
            'language': _classify_attr(e, 'language'),
            'oclc': _classify_attr(e, 'oclc'),
            'title': _classify_attr(e, 'title'),
        }
        for e in soup.find_all("edition")
    ]

    # Always provide the key so callers see a consistent schema; the value is
    # taken from the first edition element in document order.
    results["largest_holding_oclc"] = (
        results["editions"][0]['oclc'] if results["editions"] else None
    )

    return results

In [None]:
# ---------------------------------------------------------------------------
# First pass (slow; kept commented out once record_id_lookup.json exists).
# It builds the identifier lookups from the TSV, scans the Open Library
# editions dump for matching OCLC numbers / ISBNs, and saves the result.
# Uncomment to regenerate record_id_lookup.json.
# ---------------------------------------------------------------------------
# load the tsv
# df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)
# df.drop(df.filter(regex="Unname"),axis=1, inplace=True)

# # run our function over all of them to build the lookups
# df = df.apply(lambda d: parse_classify_row(d),axis=1 )


# print('lookup_oclc',len(lookup_oclc))
# print('lookup_isbn',len(lookup_isbn))
# counter = 0
# with open(path_to_ol_editions) as ol_file:

#     for line in ol_file:
#         counter=counter+1
#         if counter % 1000000 == 0:
#             print(counter)

#         # the JSON record is read from the fifth tab-separated field
#         json_text = line.split("\t")[4]
#         try:
#             data = json.loads(json_text)
#         except json.JSONDecodeError:
#             print("Bad parse:", json_text)
#             continue  # do not fall through with stale/undefined data
#         if 'works' in data:
#
#             if 'oclc_numbers' in data:
#                 for o in data['oclc_numbers']:
#                     if o in lookup_oclc:
#                         record_id_lookup[lookup_oclc[o]] = data['works']
#             if 'isbn_10' in data:
#                 for i in data['isbn_10']:
#                     if i in lookup_isbn:
#                         record_id_lookup[lookup_isbn[i]] = data['works']
#             if 'isbn_13' in data:
#                 for i in data['isbn_13']:
#                     if i in lookup_isbn:
#                         record_id_lookup[lookup_isbn[i]] = data['works']


# json.dump(record_id_lookup, open('record_id_lookup.json','w'))

# ---------------------------------------------------------------------------
# Second pass: load the saved matches and write the work keys into the TSV.
# ---------------------------------------------------------------------------
# Use a context manager so the file handle is closed after loading.
with open('record_id_lookup.json') as lookup_file:
    record_id_lookup = json.load(lookup_file)
print('record_id_lookup',len(record_id_lookup))

df2 = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)
# Drop the "Unnamed..." columns. This must filter df2 itself: `df` only exists
# when the first pass above is uncommented, so referencing it raised NameError.
df2.drop(df2.filter(regex="Unname"),axis=1, inplace=True)
# add in our data based on record_id
df2 = df2.apply(lambda d: add_ol_work(d),axis=1 )
# overwrite back out
# NOTE(review): to_csv writes the index as an extra first column by default,
# which is presumably where the "Unnamed" columns dropped above come from;
# consider index=False if nothing downstream relies on that column.
df2.to_csv(path_to_tsv, sep='\t')
