# Download Google Books by Title/Author

This script adds various fields from Google Books to each row of a TSV file, based on a title and author search.

In [None]:
import pandas as pd
import requests
import time
import string
import unicodedata

## Config
Set these variables below based on your setup

`path_to_tsv` - the path to the TSV file you want to run it on

`title_column` - the name of the column header that contains the title
`author_column` - the name of the column header that contains the author

`user_agent` - the value put into the `User-Agent` header on each request; it is good practice to identify your client/project when working with free, open APIs

`pause_between_req` - number of seconds to wait between each API call



In [None]:
# Tab-separated file that is read at start-up and rewritten as results come in.
path_to_tsv = "/Users/m/Downloads/data-tmp/nyt_hardcover_fiction_bestsellers-titles.tsv"

# Header names of the columns holding the title and the author.
title_column, author_column = "title", "author"

# Sent as the User-Agent header on every request; put your project name here.
user_agent = "YOUR PROJECT NAME HERE"

# Seconds to wait between API calls.
pause_between_req = 1


In [None]:
def download_google_books(d):
    """Search Google Books for one row's title/author and copy fields from the match.

    `d` is one row of the TSV (a pandas Series when called through
    `DataFrame.apply(..., axis=1)`; anything with `.get` and item assignment
    works). The title and author are read from the columns named by
    `title_column` and `author_column`.

    A result is accepted only when its title equals the row's title after both
    have been passed through `normalize_string`, so an unrelated first hit is
    never written into the row. On a match these columns are set:
    `google_books_id`, `google_books_title`, `google_books_subtitle`,
    `google_books_authors`, `google_books_publisher`,
    `google_books_published_date`, `google_books_page_count`,
    `google_books_categories`, `google_books_language`,
    `google_books_isbn_10`, `google_books_isbn_13` and
    `google_books_description`. Multi-valued fields are joined with `|`.

    The row is returned unchanged when it already has a `google_books_id`,
    has no usable title, the request fails, or nothing matches.
    """

    # if there is already a value skip it, so an interrupted run can be resumed
    if isinstance(d.get('google_books_id'), str):
        print('Skip', d.get(title_column))
        return d

    title = d.get(title_column)
    author = d.get(author_column)

    # blank cells are read back by pandas as NaN (a float), so require a real string
    if not isinstance(title, str) or not title.strip():
        print("No title to search with:", title)
        return d

    # the title is quoted so it is searched as a phrase; the author is left
    # unquoted so a cell holding several names is not required to match exactly
    query = 'intitle:"' + title.replace('"', ' ').strip() + '"'
    if isinstance(author, str) and author.strip():
        query += ' inauthor:' + author.replace('"', ' ').strip()

    try:
        r = requests.get(
            "https://www.googleapis.com/books/v1/volumes",
            params={'q': query, 'maxResults': 10},
            headers={'Accept': 'application/json', 'User-Agent': user_agent},
            timeout=30,
        )
        r.raise_for_status()
        items = r.json().get('items') or []
    except (requests.RequestException, ValueError) as err:
        # network error, HTTP error status, or a body that is not JSON
        print("Google Books request failed:", title, err)
        return d
    finally:
        # pause after every request, including failed ones, to respect the rate limit
        time.sleep(pause_between_req)

    wanted = normalize_string(title)
    volume = next(
        (
            item for item in items
            if normalize_string((item.get('volumeInfo') or {}).get('title', '')) == wanted
        ),
        None,
    )
    if volume is None:
        print("No Google Books match:", title)
        return d

    info = volume.get('volumeInfo') or {}
    identifiers = {
        entry.get('type'): entry.get('identifier')
        for entry in info.get('industryIdentifiers') or []
    }

    d['google_books_id'] = volume.get('id')
    d['google_books_title'] = info.get('title')
    d['google_books_subtitle'] = info.get('subtitle')
    d['google_books_authors'] = '|'.join(info.get('authors') or [])
    d['google_books_publisher'] = info.get('publisher')
    d['google_books_published_date'] = info.get('publishedDate')
    d['google_books_page_count'] = info.get('pageCount')
    d['google_books_categories'] = '|'.join(info.get('categories') or [])
    d['google_books_language'] = info.get('language')
    d['google_books_isbn_10'] = identifiers.get('ISBN_10')
    d['google_books_isbn_13'] = identifiers.get('ISBN_13')
    d['google_books_description'] = info.get('description')

    return d

In [None]:
def normalize_string(s):
    """Reduce a value to a loose comparison key.

    The value is converted with `str()`, ASCII punctuation is removed, the text
    is case-folded, combining accent marks are stripped, the standalone word
    "the" is dropped, and the remaining words are joined by single spaces.

    Only the whole word "the" is removed: "Mother" stays "mother" rather than
    becoming "mor", and no double spaces are left where a word was removed.
    """
    s = str(s)
    s = s.translate(str.maketrans('', '', string.punctuation))
    # casefold() already covers lower(), so a separate lower() call is not needed
    s = s.casefold()
    # decompose accented letters and drop the combining marks (category Mn)
    s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    # split() with no argument also collapses runs of whitespace
    return ' '.join(word for word in s.split() if word != 'the')

In [None]:
# load the tsv
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)

# split the dataframe into chunks so progress can be saved as we go,
# without rewriting the entire file after every single record
n = 100  # chunk row size
list_df = [df[i:i+n] for i in range(0, df.shape[0], n)]

# loop through each chunk
for idx, df_chunk in enumerate(list_df):

    # if you want it to skip X number of chunks uncomment this, the number is the chunk index to skip to
    # if idx < 537:
    #     continue

    print("Working on chunk ", idx, 'of', len(list_df))
    list_df[idx] = df_chunk.apply(download_google_books, axis=1)

    # save everything processed so far back to the same file.
    # index=False keeps the row index out of the file; otherwise every
    # re-run would read it back as an extra "Unnamed: 0" column.
    pd.concat(list_df).to_csv(path_to_tsv, sep='\t', index=False)


