In [11]:
import requests
from time import sleep
import pickle

In [2]:
# Load the categories

category_ids = {}

# Each non-blank line of categories.txt is expected to look like "<category name> - <numeric id>".
# The encoding is given explicitly so that non-ASCII category names do not depend on the
# platform's default encoding.
with open("categories.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (for example a trailing empty line at the end of the file).
            continue
        # Split on the LAST " - " so a category name that itself contains " - " stays intact.
        name, cat_id = line.rsplit(" - ", 1)
        category_ids[name] = int(cat_id)

        
category_ids["Essays on Pets"]

36228

## Getting book details for a particular category

In [3]:
def get_book_details_by_category(cat_id, per_page=48, start_page=1, delay=1, timeout=30):
    """Fetch every result page for one category and return the combined docs.

    Pages are requested one after another, starting at ``start_page``, until the
    server answers with a page that contains no docs. The query filter restricts
    results to ``mfgpartno`` values 'eb' or 'tp' (ebooks and paperbacks, according
    to the original author) whose ``podate`` is not later than now.

    Args:
        cat_id: Category id sent as the ``categoryId`` query parameter.
        per_page: Value sent as the ``pageNumber`` query parameter
            (presumably the page size -- confirm against the API).
        start_page: Number of the first page to request.
        delay: Seconds to sleep between two consecutive page requests.
        timeout: Seconds to wait for the server before a request is abandoned.

    Returns:
        A list holding the ``["response"]["docs"]`` entries of all pages, in
        the order they were received.

    Raises:
        requests.exceptions.RequestException: on a timeout, a connection
            failure or an HTTP error status.
        ValueError, KeyError: if a response body is not JSON of the expected shape.
    """

    # Construct the parts of the URL that stay the same for every page
    base_url = "https://www.harlequin.com/item_solr_query.ajx?"
    category = f"categoryId={cat_id}"
    filters = "&queryString= AND (mfgpartno:'eb' OR mfgpartno:'tp') AND (podate:[* TO NOW])"
    page_num = start_page

    # Holder for results sets
    holder = []

    while True:
        full_url = base_url + category + f"&reqKey=Category&page={page_num}&pageNumber={per_page}" + filters
        # Without a timeout a single stalled connection would block the whole scrape forever.
        response = requests.get(full_url, timeout=timeout)
        # Surface HTTP errors directly instead of failing later while parsing an error page.
        response.raise_for_status()
        docs = response.json()["response"]["docs"]
        if not docs:
            # An empty page marks the end of the category; no need to pause after it.
            break
        holder.extend(docs)
        page_num += 1
        # Be polite to the server between consecutive page requests.
        sleep(delay)

    return holder

## Getting book details for all genres (slow)

In [4]:
def get_book_details_for_categories(categories):
    """Collect book details for every category in ``categories``.

    Each category is fetched with ``get_book_details_by_category`` and the
    results are appended to one flat list. A category whose fetch raises an
    ordinary exception is reported and skipped, so one bad category does not
    abort the run. No de-duplication is performed across categories.

    Args:
        categories: Mapping of category name (used only for progress and
            failure messages) to the category id passed to the fetcher.

    Returns:
        A single list with the docs of all categories that were fetched
        successfully, in iteration order of ``categories``.
    """

    # Holder for all books
    holder = []

    total = len(categories)

    for count, (key, value) in enumerate(categories.items(), start=1):
        # Blank the progress line, then rewrite it in place (carriage return, no newline).
        print(" " * 200, end="\r")
        print(f"Trying {key} ({count}/{total}); {len(holder)} books gathered so far.                       ", end="\r")
        try:
            holder.extend(get_book_details_by_category(value))
        except Exception as exc:
            # Best effort: keep going with the remaining categories, but say why this one
            # failed. KeyboardInterrupt / SystemExit are deliberately NOT caught so that a
            # long-running scrape can still be interrupted.
            print(f"Failed with category {key}: {exc!r}")
    return holder

In [5]:
# Scrape every category in category_ids; each category needs at least one HTTP request, so this takes a while
books = get_book_details_for_categories(category_ids)

Trying Ménage (450/450); 75597 books gathered so far.                                                                                                                                                   

In [10]:
len(books)

75624

## Save the raw data

In [12]:
# Persist the scraped records so they can be reloaded later without scraping again
with open("books.pkl", "wb") as pickle_out:
    pickle.dump(books, pickle_out)