# OCLC Download Classify

This script adds the Classify XML blob from the OCLC API to a TSV file. It is a preparatory script, to be run before other scripts that extract metadata from the Classify data.

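It is a requirement that you have the OCLC number, author, and title columns in the data: the author and title are used to query the API, and the OCLC number is used to identify each record in the log output.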

This script modifies the TSV file itself in batches. Should the script time out or hit another error, you can rerun it and it will pick up where it left off. Always run it on a backup of your original data files.

It creates a new column in the file, `oclc_classify`, which holds the Classify XML response.

In [None]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup


## Config
Set these variables below based on your setup

`path_to_tsv` - the path to the TSV file you want to run it on

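`id_column_name` - the name of the column header that contains the OCLC record ID number

`id_author_name` - the name of the column header that contains the author, used to query the API

`id_title_name` - the name of the column header that contains the title, used to query the API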

`user_agent` - this is the value put into the headers on each request. It is good practice to identify your client/project when working with open, free APIs

`pause_between_req` - number of seconds to wait between each API call

`WSkey` - your OCLC WSKey (API key), sent with each request

In [None]:
# TSV file that is loaded, updated and written back in place.
path_to_tsv = '/Users/m/Downloads/data-tmp/nyt_hardcover_fiction_bestsellers-titles.tsv'

# Column holding the OCLC record id; printed when a row is skipped.
id_column_name = 'oclc'

# Columns whose values are sent as the author/title query parameters.
id_author_name = "author"
id_title_name = "title"

# Client/project identification string intended for the request headers.
user_agent = "YOUR PROJECT NAME HERE"

# Seconds to sleep after each record is processed.
pause_between_req = 0

# OCLC WSKey, sent as the X-OCLC-API-Key request header.
WSkey = 'xxxx'



In [None]:
def add_oclc(d):
    """Fetch the OCLC Classify XML for one row and store it in `oclc_classify`.

    The API is queried with the row's author and title. A single-work
    response (code 2) is stored directly. A multi-work response (code 4) is
    resolved by re-querying with the `owi` of the work that has the most
    holdings. Error responses are printed and the row is left untouched so a
    later run can retry it.

    Args:
        d: one row of the TSV (the record passed by
            `DataFrame.apply(..., axis=1)`). The columns named by
            `id_column_name`, `id_author_name` and `id_title_name` are read.

    Returns:
        The same row `d`. `oclc_classify` is set to the raw response text
        when a work could be parsed from it; otherwise the row is unchanged.

    Raises:
        requests.RequestException: if a request fails or exceeds the timeout.
    """
    classify_url = 'https://metadata.api.oclc.org/classify/'
    # Without a timeout a stalled connection would block the run forever.
    request_timeout = 60

    # if there is already a value skip it
    if 'oclc_classify' in d and isinstance(d['oclc_classify'], str):
        print('Skip',d[id_column_name])
        return d

    # # if you want to add any logic to only download some records add it here
    # if type(d['author_lccn']) == str:
    #     print('Skip',d[id_column_name])
    #     return d

    # Send the configured user agent so the client identifies itself.
    headers = {'X-OCLC-API-Key': WSkey, 'User-Agent': user_agent}
    params = {'author': d[id_author_name], 'title': d[id_title_name], 'summary' : 'false', 'maxRecs':100}
    r = requests.get(classify_url, params=params, headers=headers, timeout=request_timeout)
    body = r.text

    work_parsed = None
    work_unparsed = None

    # Error codes reported by the API, with the message printed for each.
    error_messages = (
        ('100', '100: No input. The method requires an input argument.'),
        ('101', '101: Invalid input. The standard number argument is invalid.'),
        ('102', '102: ?.'),
        ('200', '200: Unexpected error.'),
    )

    if '<response code="2"/>' in body:
        # Single-work response: use it as is.
        work_parsed = extract_classify(body)
        work_unparsed = body

    elif '<response code="4"/>' in body:
        # Multi-work response: pick the work with the most holdings.
        soup = BeautifulSoup(str(body))
        work_soup = soup.find("works")
        candidates = work_soup.find_all("work") if work_soup is not None else []
        # max() keeps the first of equally-held works and, unlike a
        # "greater than zero" scan, still selects one when all holdings are 0.
        largest_work = max(
            candidates,
            key=lambda work: int(work.get('holdings', 0)),
            default=None,
        )

        if largest_work is None or not largest_work.has_attr('owi'):
            print(params, '4: Multiple works matched but none could be selected.')
        else:
            print(largest_work)
            owi_params = {'owi': largest_work['owi'], 'summary' : 'false', 'maxRecs':100}
            r = requests.get(classify_url, params=owi_params, headers=headers, timeout=request_timeout)

            work_parsed = extract_classify(r.text)
            work_unparsed = r.text

    else:
        for code, message in error_messages:
            # The code is matched both plain and with backslash-escaped quotes.
            plain = '<response code="%s"/>' % code
            escaped = '<response code=\\"%s\\"/>' % code
            if plain in body or escaped in body:
                print(params, message)
                break
        else:
            print("unknown Problem:",body)

    # Only store the response when a work could actually be parsed from it.
    if work_parsed is not None:
        d['oclc_classify'] = work_unparsed

    time.sleep(pause_between_req)

    return d

In [None]:
def extract_classify(xml):
    """Parse an OCLC Classify XML response into a plain dictionary.

    Args:
        xml: the response text; it is converted with `str()` before parsing.

    Returns:
        None when no `work` element is found. Otherwise a dict with:
        - `work_*` keys taken from the attributes of the first `work`
          element (None when an attribute is absent; `work_editions`,
          `work_eholdings` and `work_holdings` are converted to int),
        - `main_oclc`: the text of the `work` element,
        - `authors`: list of `{name, lccn, viaf}` dicts,
        - `work_author`: the entry of `authors` whose name equals the first
          `|`-separated part of the work's `author` attribute, or None,
        - `normalized_ddc` / `normalized_lcc`: the `nsfa` attribute of the
          `mostpopular` element under `ddc` / `lcc`, or None,
        - `headings`: list of `{id, src, value}` dicts,
        - `editions`: list of per-edition attribute dicts,
        - `largest_holding_oclc`: the `oclc` of the first edition; this key
          is present only when at least one edition exists.

    Raises:
        ValueError: if a numeric attribute cannot be converted to int.
    """
    soup = BeautifulSoup(str(xml))

    work_soup = soup.find("work")
    if work_soup is None:
        return None

    results = {
        'work_statement_responsibility': _classify_attr(work_soup, 'author'),
        'work_editions': _classify_attr(work_soup, 'editions', int),
        'work_eholdings': _classify_attr(work_soup, 'eholdings', int),
        'work_format': _classify_attr(work_soup, 'format'),
        'work_holdings': _classify_attr(work_soup, 'holdings', int),
        'work_itemtype': _classify_attr(work_soup, 'itemtype'),
        'work_owi': _classify_attr(work_soup, 'owi'),
        'work_title': _classify_attr(work_soup, 'title'),
        'main_oclc': work_soup.text,
    }

    # The name is cut at the first '[' and the literal string 'null' is
    # treated as a missing identifier.
    results['authors'] = [
        {
            "name": a.text.split('[')[0].strip(),
            "lccn": _classify_null_to_none(_classify_attr(a, 'lc')),
            "viaf": _classify_null_to_none(_classify_attr(a, 'viaf')),
        }
        for a in soup.find_all("author")
    ]

    # try to find the first main contributor
    results['work_author'] = None
    statement = results['work_statement_responsibility']
    if statement is not None:
        first_author = statement.split("|")[0].strip()
        for a in results['authors']:
            # No break: when several authors share the name the last one wins.
            if a['name'] == first_author:
                results['work_author'] = a

    results["normalized_ddc"] = _classify_most_popular_nsfa(soup, "ddc")
    results["normalized_lcc"] = _classify_most_popular_nsfa(soup, "lcc")

    # Missing attributes become None, like every other attribute read here.
    results["headings"] = [
        {
            "id": _classify_attr(h, 'ident'),
            "src": _classify_attr(h, 'src'),
            "value": h.text,
        }
        for h in soup.find_all("heading")
    ]

    results["editions"] = [
        {
            'author': _classify_attr(e, 'author'),
            'eholdings': _classify_attr(e, 'eholdings', int),
            'format': _classify_attr(e, 'format'),
            'holdings': _classify_attr(e, 'holdings', int),
            'itemtype': _classify_attr(e, 'itemtype'),
            'language': _classify_attr(e, 'language'),
            'oclc': _classify_attr(e, 'oclc'),
            'title': _classify_attr(e, 'title'),
        }
        for e in soup.find_all("edition")
    ]

    # NOTE(review): this takes the first edition, so it presumably relies on
    # the API listing editions by holdings, largest first - confirm.
    if results["editions"]:
        results["largest_holding_oclc"] = results["editions"][0]['oclc']

    return results


def _classify_attr(tag, name, cast=None):
    """Return attribute `name` of `tag` (through `cast` if given), or None when absent."""
    if not tag.has_attr(name):
        return None
    value = tag[name]
    return value if cast is None else cast(value)


def _classify_null_to_none(value):
    """Map the literal string 'null' to None and pass anything else through."""
    return None if value == 'null' else value


def _classify_most_popular_nsfa(soup, scheme):
    """Return the `nsfa` attribute of `mostpopular` under the `scheme` element, or None."""
    scheme_soup = soup.find(scheme)
    if scheme_soup is None:
        return None
    most_popular = scheme_soup.find("mostpopular")
    if most_popular is None:
        return None
    return _classify_attr(most_popular, 'nsfa')

In [None]:
# load the tsv
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)

# Split the dataframe into chunks so progress can be saved as we go without
# rewriting the entire file after every single record.
n = 100  # chunk row size
list_df = [df[i:i+n] for i in range(0, df.shape[0], n)]

# loop through each chunk
for idx, df_chunk in enumerate(list_df):

    # if you want it to skip X number of chunks uncomment this, the number is
    # the index of the chunk to skip to
    # if idx < 707:
    #     continue

    print("Working on chunk ", idx, 'of', len(list_df))
    list_df[idx] = df_chunk.apply(add_oclc, axis=1)

    # Save the whole file after every chunk so a rerun can pick up from here.
    # index=False keeps the row index out of the file; writing it would add
    # another "Unnamed: 0" column every time the file is re-read and saved.
    reformed_df = pd.concat(list_df)
    reformed_df.to_csv(path_to_tsv, sep='\t', index=False)


