In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

def _collect_field_values(soup, label_text):
    """Return the values listed under a labelled bib field as one string.

    Looks for a ``.cp-bib-field-label`` element whose text contains
    ``label_text``, takes the next ``ul.values-list`` after it, and joins the
    text of each ``span.formatted-value`` with ", " after removing dots and
    surrounding whitespace.

    Returns None when the label or its value list is not present.
    """
    label = soup.select_one(f'.cp-bib-field-label:-soup-contains("{label_text}")')
    if label is None:
        return None

    values_ul = label.find_next('ul', class_='values-list')
    if values_ul is None:
        return None

    cleaned = []
    for li in values_ul.find_all('li'):
        span = li.select_one('span.formatted-value')
        if span is None:
            # Skip list items that carry no formatted value instead of crashing.
            continue
        cleaned.append(span.text.replace(".", "").strip())

    # NOTE: values that already contain commas cannot be told apart from the
    # separator once joined.
    return ', '.join(cleaned)


def get_genre_subject(url):
    """Open a record page in Chrome and scrape its Genre and Subject fields.

    Parameters
    ----------
    url : str
        Address of the record page to load.

    Returns
    -------
    tuple
        ``(genre_values, subject_values)``; each is a ", "-joined string, or
        None when the corresponding field is not found in the page.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException
        If the "Full details" link is not found on the page.
    """
    driver = webdriver.Chrome()
    try:
        # The implicit wait only affects element lookups, so it must be set
        # before the first lookup to have any effect.
        driver.implicitly_wait(5)
        driver.get(url)

        # Click on the "Full details" link to open the overlay.
        driver.find_element(By.XPATH, "//a[@data-key='full-details-link']").click()

        # Best-effort wait: this lookup polls (up to the implicit wait) until at
        # least one field label exists. Result is unused; an empty list is fine.
        # NOTE(review): if labels already exist before the overlay opens, this
        # returns immediately — confirm against the live page.
        driver.find_elements(By.CSS_SELECTOR, ".cp-bib-field-label")

        html = driver.page_source
    finally:
        # Always release the browser, even when a step above raised.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    genre_values = _collect_field_values(soup, "Genre")
    subject_values = _collect_field_values(soup, "Subject")

    return genre_values, subject_values

In [2]:
# Run get_genre_subject on a single record page.

url = "https://kdl.bibliocommons.com/v2/record/S174C631702"

# Known limitation: the values come back joined with commas, but some
# subjects/genres already contain commas, so the joined string cannot be
# split back into the original values reliably.
genre_values, subject_values = get_genre_subject(url)

In [19]:
#Taking in a KDL link to get basic information about a single book

import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

def _second_half(text):
    """Return the second half of ``text`` (from index ``len(text) // 2``)."""
    # NOTE(review): presumably the scraped text contains the value twice, so
    # the second half is the clean value — confirm against the page markup.
    return text[len(text) // 2:]


def book_info(url):
    """Scrape basic information about a single record page.

    Parameters
    ----------
    url : str
        Address of the record page.

    Returns
    -------
    pandas.DataFrame
        One row with the columns Title, Author, Item Type, Rating, Status,
        Description, Genre, Subject and Link. A field whose element is not
        found in the page is None (Description falls back to a fixed message).
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.content, 'html.parser')

    # Title
    title_tag = soup.find("h1", class_='cp-heading heading-modest title heading--linked')
    title = _second_half(title_tag.text) if title_tag is not None else None

    # Description - some titles have no description listed
    description_tag = soup.find("div", class_="cp-bib-description")
    if description_tag is not None:
        description = description_tag.text
    else:
        description = "No description found in Bibliocommons"

    # Item Type & Availability both come from one comma-separated
    # screen-reader message, so look it up once and index into its fields.
    content = soup.find(id='content')
    format_message = None
    if content is not None:
        format_message = content.find(
            "span", class_="cp-screen-reader-message cp-format-chooser-sr-message"
        )
    fields = format_message.text.split(",") if format_message is not None else []
    item_type = fields[1].strip() if len(fields) > 1 else None
    status = fields[4].replace(".", "").strip() if len(fields) > 4 else None

    # Rating: text before the first "(" with the first 12 characters dropped.
    # NOTE(review): the 12 presumably skips a fixed label prefix — confirm.
    rating_tag = soup.find(class_='rating-info')
    rating = rating_tag.text.split("(")[0][12:] if rating_tag is not None else None

    # Genre & Subject (needs a browser session, see get_genre_subject)
    genre_values, subject_values = get_genre_subject(url)

    # Author
    author_tag = soup.find("div", class_='cp-author-link')
    author = _second_half(author_tag.text) if author_tag is not None else None

    new_data = {
        "Title": title,
        "Author": author,
        "Item Type": item_type,
        "Rating": rating,
        "Status": status,
        "Description": description,
        'Genre': genre_values,
        'Subject': subject_values,
        'Link': url,
    }

    # All values are scalars, so an explicit single-row index is required.
    return pd.DataFrame(new_data, index=[0])

In [18]:
# Try out book_info on a single record; `data` is the one-row DataFrame it returns.
url = "https://kdl.bibliocommons.com/v2/record/S174C940822"
data = book_info(url)

# Store the scraped row in `df`.

# Start from an empty frame that has the same columns book_info produces.
df = pd.DataFrame(columns=["Title", "Author", "Item Type", "Rating", "Status", "Description", "Genre", "Subject", "Link"], index=None) # need to initialize df

# Add the scraped row; ignore_index renumbers the rows from 0.
df = pd.concat([df, data], ignore_index=True)

In [None]:
# Troubleshooting: getting the number of copies and availability
# All of the circulation info is visible on the web page, but for some reason I can't get that HTML when I scrape it...

from bs4 import BeautifulSoup

# Hard-coded sample of the circulation markup, used to test the parsing
# offline instead of scraping the live page.
html_content = """
<div class="panel-section circulation-section"><div class="cp-circulation-info"><span class="availability-status"><span class="cp-availability-status available" data-key="availability-status-available">Available but not holdable</span></span><div class="circulation-details"><div><span aria-hidden="true" class="total-copies-count"><span class="circulation-count">41</span> copies</span><span class="cp-screen-reader-message">41 copies</span></div><div><span aria-hidden="true" class="available-count"><span class="circulation-count">3</span> available</span><span class="cp-screen-reader-message">3 available</span></div><div><span aria-hidden="true" class="on-hold-count"><span class="circulation-count">108</span> on hold</span><span class="cp-screen-reader-message">108 on hold</span></div></div></div></div>
"""
# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find every div with class "circulation-details" (copies / available / on hold counts)
circs = soup.find_all('div', class_='circulation-details')

# Print the combined text of each block. Each count appears twice in the
# markup (a visible span and a screen-reader span), so the text repeats it.
for circ in circs:
    print(circ.text)

In [11]:
# Getting the number of pages in the staff list so that I can then iterate over that many pages

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re 
import math
import time

url = "https://kdl.bibliocommons.com//list/share/1709636399_kdl_adults/2480807429_literary_easter_eggs_in_tom_lake_by_ann_patchett"
html = requests.get(url)
s = BeautifulSoup(html.content, 'html.parser')

# The item count is looked up under two different class names; try
# 'item_count' first and fall back to 'item_count_label'.
total_books = s.find('span', class_='item_count')
if total_books is None:
    total_books = s.find('span', class_='item_count_label')
if total_books is None:
    raise ValueError(f"No item count element found on {url}")

# Keep the first run of digits in the count text.
match = re.search(r'\d+', total_books.text)
if match is None:
    # Fail loudly here rather than leaving totbok/numpages undefined (or stale
    # from an earlier run) for the cells that use them later.
    raise ValueError(f"Item count text contains no digits: {total_books.text!r}")

totbok = int(match.group())

# 25 items per page; math.ceil rounds up so a partial last page is counted.
numpages = math.ceil(totbok/25)

print(numpages)

1


In [22]:
# Getting all the item ids of one staff list page scraped 

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Collected numeric item ids for this staff list page.
item_id_list = []
url = "https://kdl.bibliocommons.com/list/share/2069242809_cekovach1/2480796599_teen_must_reads"
html = requests.get(url)
s = BeautifulSoup(html.content, 'html.parser')

# Every list entry's title sits in a div with class 'list_item_title'.
book_divs = s.find_all('div', class_='list_item_title')

for book_div in book_divs:
    link = book_div.find('a')
    if not link:
        # Entry without an anchor: nothing to extract.
        continue

    # The id is the fourth "/"-separated piece of the href.
    item_id = link['href'].split("/")[3]
    if not item_id.endswith("174"):
        continue

    # Drop the trailing "174" and keep the rest as an integer.
    item_id = int(item_id[:-3])
    item_id_list.append(item_id)

In [9]:
# looping over the URL to get every page of books scraped 

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re 
import math
import time
import random

# Collected numeric item ids across every page of the staff list.
item_id_list = []
url = "https://kdl.bibliocommons.com/list/share/2069242809_cekovach1/2480796599_teen_must_reads"

# `numpages` is not defined in this cell; it must already exist in the kernel.
# Pages are numbered from 1, hence range(1, numpages + 1).
for page in range(1, numpages + 1):
    full_url = f"{url}?page={page}"
    html = requests.get(full_url)

    # Random pause of 1-10 seconds between page requests.
    sleep_time = random.randint(1,10)
    time.sleep(sleep_time)

    s = BeautifulSoup(html.content, 'html.parser')
    book_divs = s.find_all('div', class_='list_item_title')

    for book_div in book_divs:
        link = book_div.find('a')
        if not link:
            continue

        # The id is the fourth "/"-separated piece of the href.
        item_id = link['href'].split("/")[3]
        if not item_id.endswith("174"):
            # Author's note: this inadvertently filters out all non-books in the list.
            continue

        item_id = int(item_id[:-3])
        item_id_list.append(item_id)

In [None]:
# Looping through the entire list of book id numbers to get their information 

#with the current settings it took 11 mins 30 seconds to run, also only produces 72 of 78 items because we're excluding non books atm *shrug*

import random
import time

# Only the first five ids are scraped here, to keep the trial run short.
newlist = item_id_list[:5]

# Collect the one-row frames and combine them once at the end.
# DataFrame.append was removed in pandas 2.0, so it cannot be used here.
frames = []
intervals = 0

# The loop variable is `item_id`, not `id`, so the builtin id() is not
# overwritten for the rest of the kernel session.
for item_id in newlist:
    url = f"https://kdl.bibliocommons.com/v2/record/S174C{item_id}"

    # Random pause of 1-10 seconds between records.
    sleep_time = random.randint(1,10)
    time.sleep(sleep_time)

    frames.append(book_info(url))

    intervals += 1
    # Report progress against the number of ids actually being scraped.
    print(intervals, "out of ", len(newlist), "\n")

if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # No ids to scrape: keep an empty frame with the columns book_info produces.
    df = pd.DataFrame(columns=["Title", "Author", "Item Type", "Rating", "Status", "Description", "Genre", "Subject", "Link"])

In [20]:
# "get_books_from_staff_list" function takes a staff list as input and returns information about all the books in that staff list

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re 
import math
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By

# Run book_info & get_genre_subject function first!

def _item_ids_from_list_page(soup):
    """Return the integer item ids linked from one parsed staff-list page.

    For each ``div.list_item_title`` anchor, the fourth "/"-separated piece of
    the href is taken; ids not ending in "174" are skipped, and the trailing
    "174" is removed before converting to int.
    """
    ids = []
    for book_div in soup.find_all('div', class_='list_item_title'):
        link = book_div.find('a')
        if link is None:
            continue

        parts = link.get('href', '').split("/")
        if len(parts) <= 3:
            # href too short to contain an id segment; skip instead of crashing.
            continue

        raw_id = parts[3]
        if raw_id.endswith("174"):  # author's note: inadvertently filters out non-books
            ids.append(int(raw_id[:-3]))
    return ids


def get_books_from_staff_list(staff_pick_url):
    """Scrape every book on a staff list and return its details.

    Requires ``book_info`` (and therefore ``get_genre_subject``) to be defined.

    Parameters
    ----------
    staff_pick_url : str
        Address of the staff list.

    Returns
    -------
    tuple
        ``(staff_list_books, item_id_list, totbok, numpages)``: the DataFrame
        of scraped books, the list of integer item ids, the item count read
        from the page, and the number of pages derived from it (25 per page).

    Raises
    ------
    ValueError
        If the item count cannot be read from the first page.
    """
    columns = ["Title", "Author", "Item Type", "Rating", "Status", "Description", "Genre", "Subject", "Link"]

    html = requests.get(staff_pick_url)
    s = BeautifulSoup(html.content, 'html.parser')

    time.sleep(random.randint(1,15)) # Sleep time adjustments

    # The count is looked up under two class names; try both.
    total_books = s.find('span', class_='item_count')
    if total_books is None:
        total_books = s.find('span', class_='item_count_label')

    match = re.search(r'\d+', total_books.text) if total_books is not None else None
    if match is None:
        # Without a count there is no page range to iterate; fail with a clear
        # message instead of an unbound-variable error further down.
        raise ValueError(f"Could not determine the number of items in staff list: {staff_pick_url}")

    totbok = int(match.group())
    numpages = math.ceil(totbok/25) # round up so a partial last page is counted

    time.sleep(2) # breaking up the initial page requests

    # Use "&" when the URL already carries a query string.
    separator = "&" if "?" in staff_pick_url else "?"

    item_id_list = []
    for page in range(1, numpages + 1):  # pages are numbered from 1
        full_url = staff_pick_url + f"{separator}page={page}"
        html = requests.get(full_url)

        time.sleep(random.randint(1,15)) # Sleep time adjustments

        s = BeautifulSoup(html.content, 'html.parser')
        item_id_list.extend(_item_ids_from_list_page(s))

    # List name for the progress messages, read from the most recently parsed
    # page; fall back to the URL when the heading is missing.
    heading = s.find('h1', class_='list_title')
    name_of_staff_list = heading.text.strip() if heading is not None else staff_pick_url

    # Collect the one-row frames and combine once rather than re-concatenating
    # the growing frame on every iteration.
    frames = []
    for intervals, item_id in enumerate(item_id_list, start=1):
        url = f"https://kdl.bibliocommons.com/v2/record/S174C{item_id}"

        time.sleep(random.randint(1,15))  # Sleep time adjustments

        frames.append(book_info(url))

        print(f"Scraped item {intervals} out of {totbok} from \"{name_of_staff_list}\" Staff List \n")

    if frames:
        staff_list_books = pd.concat(frames, ignore_index=True)
    else:
        staff_list_books = pd.DataFrame(columns=columns)

    return staff_list_books, item_id_list, totbok, numpages

In [None]:
# Run get_books_from_staff_list on one staff list.
# Author's timing note: takes 22 minutes to run on the Teen Must Reads list.

staff_pick_url = "https://kdl.bibliocommons.com/list/share/1709638099_kdl_staffpicks/2469845619_anticipated_new_non-fiction" 

# Unpack the four results: the books DataFrame, the item ids, the item count
# read from the page, and the number of pages.
staff_list_books, item_id_list, totbok, numpages = get_books_from_staff_list(staff_pick_url)

In [19]:
# Optional: save a DataFrame to an Excel file.
# NOTE(review): this is an absolute path on one specific Windows machine;
# change it before running elsewhere.
file_path = r"C:\Users\Ryan\Desktop\book_list2.xlsx"

# Write `df` (whatever the kernel currently has bound to that name) without
# the index column.
df.to_excel(file_path, index=False)

#print(f"DataFrame saved to {file_path}")

In [21]:
# Gathering a list of all of the staff lists from a URL  

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

def staff_list_accumulation(list_of_staff_lists):
    """Scrape a page that lists staff lists and return one row per list.

    Parameters
    ----------
    list_of_staff_lists : str
        Address of the search page that shows the staff lists.

    Returns
    -------
    pandas.DataFrame
        Columns SL_Title, Category, SL_Description, SL_Created and SL_Link.
        SL_Created is a datetime parsed with the format '%b %d, %Y', or None
        when the date element is missing. SL_Link is an absolute URL, or None
        when the title has no link.
    """
    from urllib.parse import urljoin

    base_url = "https://kdl.bibliocommons.com/"
    columns = ['SL_Title', 'Category', 'SL_Description', 'SL_Created', 'SL_Link']

    html = requests.get(list_of_staff_lists)
    s = BeautifulSoup(html.content, 'html.parser')

    # Find all date elements
    date_boxes = s.find_all('div', class_='dataPair clearfix small list_created_date')

    # Find all titles, categories, and descriptions
    titles = s.find_all('span', class_='title')
    categories = s.find_all('div', class_='list_type small')
    descriptions = s.find_all('div', class_='description')

    # NOTE(review): zip pairs the four lists purely by position and stops at the
    # shortest one. If any entry lacks one of these elements, later rows would
    # be misaligned — confirm every entry always has all four.
    rows = []
    for date_box, title, category, description in zip(date_boxes, titles, categories, descriptions):
        # Extract the date
        date_u = date_box.find('span', class_='value')
        date = datetime.strptime(date_u.text.strip(), '%b %d, %Y') if date_u else None

        # Extract the list link. It is computed fresh for every row, so a title
        # without an anchor yields None rather than reusing the previous row's
        # link. urljoin also avoids a doubled slash for hrefs starting with "/".
        anchor = title.find('a')
        href = anchor.get('href') if anchor is not None else None
        full_link = urljoin(base_url, href) if href else None

        rows.append({
            'SL_Title': title.text.strip(),
            'Category': category.text.strip(),
            'SL_Description': description.text.strip(),
            'SL_Created': date,
            'SL_Link': full_link,
        })

    # Build the frame once; `columns` keeps the layout even when no rows were found.
    sl_df = pd.DataFrame(rows, columns=columns)

    return sl_df

In [13]:
# Run staff_list_accumulation on its own to collect information about the
# staff lists shown on one KDL search page.

list_of_staff_lists = "https://kdl.bibliocommons.com/search?creator_library=174&creator_type=STAFF&display_quantity=2&page=1&q=ignored&search_category=alllists&supress=true&t=alllists&title=All+staff+lists"

# One row per staff list found on that page.
sl_df = staff_list_accumulation(list_of_staff_lists)

#sl_df

In [2]:
from Functions import *

In [None]:
                        ###################################################### THE MONSTER ######################################################

# Look at a page of staff lists, iterate through each list, scrape the books on
# it, and combine everything into monster_books. Combines everything done above.

list_of_staff_lists = "https://kdl.bibliocommons.com/search?creator_library=174&creator_type=STAFF&display_quantity=1&page=6&q=ignored&search_category=alllists&supress=true&t=alllists&title=All+staff+lists"

sl_df = staff_list_accumulation(list_of_staff_lists)

staff_list_links = sl_df['SL_Link'].tolist()
staff_list_titles = sl_df['SL_Title'].tolist()

# Per-list frames are collected here and combined once after the loop. If a
# scrape fails part-way, the lists finished so far are still in scraped_frames.
scraped_frames = []

# Iterate through staff_list_links
for i, staff_pick_url in enumerate(staff_list_links):
    if pd.isna(staff_pick_url):
        # No usable link for this staff list; nothing to scrape.
        continue

    # Call the function and get the results
    staff_list_books, item_id_list, totbok, numpages = get_books_from_staff_list(staff_pick_url)

    # Tag every book with the title of the staff list it came from
    staff_list_books['Staff List Name'] = staff_list_titles[i]

    scraped_frames.append(staff_list_books)

# pd.concat raises on an empty list, so fall back to an empty frame.
if scraped_frames:
    monster_books = pd.concat(scraped_frames, ignore_index=True)
else:
    monster_books = pd.DataFrame()

In [7]:
# Join each scraped book to its staff list's metadata: an inner merge matching
# the book's 'Staff List Name' against the staff list's 'SL_Title', after
# which the now-redundant 'SL_Title' column is dropped.
all_results_df = monster_books.merge(
    sl_df,
    how='inner',
    left_on='Staff List Name',
    right_on='SL_Title',
).drop(columns=['SL_Title'])

In [8]:
# Optional: save the merged results to an Excel file.
# NOTE(review): this is an absolute path on one specific Windows machine;
# change it before running elsewhere.
file_path = r"C:\Users\Ryan\Coding Projects\Web Scraping\Saved Data Files\book_list_monster.xlsx"

# Write all_results_df without the index column.
all_results_df.to_excel(file_path, index=False)

#print(f"DataFrame saved to {file_path}")

In [None]:
# Grabbing the number of pages in a staff list section -
# not really relevant, because you can put 100 staff lists on a single page via the URL, and I won't be doing more than that unless I go psycho

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re 
import math
import time

url = "https://kdl.bibliocommons.com/search?creator_library=174&creator_type=STAFF&display_quantity=100&page=2&q=ignored&search_category=alllists&supress=true&t=alllists&title=All+staff+lists"
html = requests.get(url)
s = BeautifulSoup(html.content, 'html.parser')

# Pagination bar of the search results page.
total_list = s.find('div', class_='utility_bar_paginate')
# Collect every run of digits in the bar's text into a list.
digits = re.findall(r'\d+', total_list.text)
# The third number is used as the total count of staff lists.
# NOTE(review): presumably the bar reads like "X to Y of Z" — confirm.
total_staff_lists = int(digits[2])

# Round up so a partial last page is counted.
# NOTE(review): this divides by 25 although the URL requests
# display_quantity=100 — confirm which page size applies.
staff_list_pages = math.ceil(total_staff_lists/25)
staff_list_pages
