# Download OCLC MARC from Classify Results

This script downloads MARC records from OCLC for the largest-holdings edition found in the Classify data (the first edition listed in each Classify response). The Classify XML blob must already be present in the data (in the `oclc_classify` column) so that it can be parsed.


In [None]:
import pandas as pd
import time
from bs4 import BeautifulSoup
import json
import requests


In [None]:
# --- Input / output file ---------------------------------------------------
# TSV that is loaded with pandas and then written back to the same path.
path_to_tsv = '/Users/m/Downloads/data-tmp/iowa_writers_workshop-hathitrust_metadata.tsv'

# --- Column names in the TSV -----------------------------------------------
record_id = 'recordid'            # row identifier, used for the progress counter
isbn = 'isbns'                    # only referenced by an optional (commented-out) filter
oclc_classify = 'oclc_classify'   # column holding the Classify XML blob
oclc_marc = 'oclc_marc'           # column that receives the downloaded record text

# --- OCLC request settings -------------------------------------------------
WSkey = 'xxxxxxxxx'                     # sent as the `wskey` query parameter
user_agent = 'YOUR PROJECT NAME HERE'   # sent as the User-Agent header
pause_between_req = 1                   # passed to time.sleep() (seconds)

# Record ids processed so far; used only to print a running count.
counter = list()

In [None]:
def add_oclc_marc(d):
    """Download the OCLC record for one row and store it in the row.

    Intended to be used with ``DataFrame.apply(..., axis=1)``.  Relies on the
    notebook-level settings ``counter``, ``record_id``, ``oclc_classify``,
    ``oclc_marc``, ``WSkey``, ``user_agent`` and ``pause_between_req``.

    The row is left untouched when it has no Classify XML string, when the
    ``oclc_marc`` column is already filled, when the Classify XML cannot be
    parsed, when no usable OCLC number is found, when the request fails, or
    when the response does not contain a ``<leader>`` element.

    Args:
        d: A row (pandas Series) of the metadata table.

    Returns:
        The same row, with ``d[oclc_marc]`` set to the response text when a
        record was downloaded.
    """
    # Progress indicator: running count of rows seen.
    counter.append(d[record_id])
    print(len(counter), end="|")

    # do we have some classify data?
    if not isinstance(d[oclc_classify], str):
        return d

    # You can add other criteria here, e.g. only rows without an ISBN yet:
    #     if not pd.isnull(d[isbn]): return d

    # Skip rows that already have a record so re-runs resume where they left off.
    if not pd.isnull(d[oclc_marc]):
        return d

    data = extract_classify(d[oclc_classify])
    if data is None:
        return d

    # .get(): the key is absent when the Classify response listed no editions;
    # indexing it directly raised KeyError from inside the error handler.
    oclc_number = data.get('largest_holding_oclc')
    try:
        oclc_number_int = int(oclc_number)
    except (TypeError, ValueError):
        print("Cannot convert to int", oclc_number)
        return d

    url = f"https://www.worldcat.org/webservices/catalog/content/{oclc_number_int}?wskey={WSkey}"

    try:
        r = requests.get(
            url,
            headers={'Accept': 'application/json', 'User-Agent': user_agent},
            timeout=60,  # never hang the whole run on one stalled connection
        )
    except requests.RequestException as err:
        # Leave the row empty so a later run retries it.
        print("Request failed:", oclc_number, err)
        return d
    finally:
        # Pause after every request attempt, not only after successful ones.
        time.sleep(pause_between_req)

    if '<leader>' not in r.text:
        print("No OCLC record XML:", oclc_number)
        return d

    d[oclc_marc] = r.text
    return d

In [None]:
def extract_classify(xml):
    """Parse an OCLC Classify XML blob into a plain dictionary.

    Args:
        xml: The Classify response; it is converted with ``str()`` before
            parsing.

    Returns:
        ``None`` when no ``<work>`` element is found.  Otherwise a dict with:

        * ``work_*`` keys copied from the ``<work>`` attributes (``editions``,
          ``eholdings`` and ``holdings`` converted to int; missing -> None),
        * ``main_oclc``: the text of ``<work>``,
        * ``authors``: list of ``{"name", "lccn", "viaf"}`` dicts,
        * ``work_author``: the entry of ``authors`` whose name matches the
          first ``|``-separated name of the work's ``author`` attribute,
          or None,
        * ``normalized_ddc`` / ``normalized_lcc``: the ``nsfa`` attribute of
          the most popular DDC / LCC entry, or None,
        * ``headings``: list of ``{"id", "src", "value"}`` dicts,
        * ``editions``: list of per-edition attribute dicts,
        * ``largest_holding_oclc``: the ``oclc`` value of the first edition
          listed, or None when there are no editions (the key is always
          present).

    Raises:
        ValueError: if a numeric attribute (e.g. ``holdings``) is not an
            integer string.
    """
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed.  The lowercase tag names used below (e.g. "mostpopular")
    # presumably rely on an HTML-style parser lowercasing tags -- confirm
    # before pinning a specific parser here.
    soup = BeautifulSoup(str(xml))

    work_soup = soup.find("work")
    if work_soup is None:
        return None

    results = {
        'work_statement_responsibility': work_soup.get('author'),
        'work_editions': _classify_int_attr(work_soup, 'editions'),
        'work_eholdings': _classify_int_attr(work_soup, 'eholdings'),
        'work_format': work_soup.get('format'),
        'work_holdings': _classify_int_attr(work_soup, 'holdings'),
        'work_itemtype': work_soup.get('itemtype'),
        'work_owi': work_soup.get('owi'),
        'work_title': work_soup.get('title'),
        'main_oclc': work_soup.text,
    }

    # The name is the text before any '[' in the element text; the
    # literal string 'null' in an identifier attribute means "no identifier".
    results['authors'] = [
        {
            "name": a.text.split('[')[0].strip(),
            "lccn": _classify_null_to_none(a.get('lc')),
            "viaf": _classify_null_to_none(a.get('viaf')),
        }
        for a in soup.find_all("author")
    ]

    # try to find the first main contributor
    results['work_author'] = None
    responsibility = results['work_statement_responsibility']
    if responsibility is not None:
        first_author = responsibility.split("|")[0].strip()
        for a in results['authors']:
            if a['name'].strip() == first_author:
                results['work_author'] = a

    results["normalized_ddc"] = _classify_most_popular_nsfa(soup, "ddc")
    results["normalized_lcc"] = _classify_most_popular_nsfa(soup, "lcc")

    # .get(): a heading lacking an attribute yields None instead of KeyError.
    results["headings"] = [
        {"id": h.get('ident'), "src": h.get('src'), "value": h.text}
        for h in soup.find_all("heading")
    ]

    results["editions"] = [
        {
            'author': e.get('author'),
            'eholdings': _classify_int_attr(e, 'eholdings'),
            'format': e.get('format'),
            'holdings': _classify_int_attr(e, 'holdings'),
            'itemtype': e.get('itemtype'),
            'language': e.get('language'),
            'oclc': e.get('oclc'),
            'title': e.get('title'),
        }
        for e in soup.find_all("edition")
    ]

    # Always set the key so callers can test for None instead of hitting a
    # KeyError when the response lists no editions.
    editions = results["editions"]
    results["largest_holding_oclc"] = editions[0]['oclc'] if editions else None

    return results


def _classify_int_attr(tag, name):
    """Return attribute *name* of *tag* as an int, or None when it is absent."""
    value = tag.get(name)
    return None if value is None else int(value)


def _classify_null_to_none(value):
    """Map the literal string 'null' to None; pass anything else through."""
    return None if value == 'null' else value


def _classify_most_popular_nsfa(soup, scheme):
    """Return the `nsfa` attribute of the first "mostpopular" element inside the first *scheme* element, or None."""
    scheme_soup = soup.find(scheme)
    if scheme_soup is None:
        return None
    most_popular = scheme_soup.find("mostpopular")
    if most_popular is None:
        return None
    return most_popular.get('nsfa')

In [None]:
# load the tsv
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)

# Split the dataframe into chunks so progress can be saved as we go, without
# rewriting the entire file after every single record.
n = 100  # chunk row size
list_df = [df[i:i+n] for i in range(0, df.shape[0], n)]

# loop through each chunk
for idx, df_chunk in enumerate(list_df):
    print("")
    counter = []  # restart the per-chunk progress count printed by add_oclc_marc
    # To skip ahead, uncomment this; the number is the chunk index to skip to
    # (chunk index, not row number).
    # if idx < 23:
    #     continue

    print("Working on chunk ", idx, 'of', len(list_df))
    list_df[idx] = df_chunk.apply(add_oclc_marc, axis=1)

    # Save progress: processed chunks plus the still-untouched ones.
    # index=False keeps the row index out of the file; it is not read back as
    # an index above, so writing it added a new "Unnamed: 0" column on every
    # re-run.
    reformed_df = pd.concat(list_df)
    reformed_df.to_csv(path_to_tsv, sep='\t', index=False)

