In [3]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import urllib.parse

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [5]:
chrome_options = Options()

In [6]:
chrome_service = Service(ChromeDriverManager().install())

In [7]:
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

In [8]:
import json

In [17]:
file_path = '../web scraping/libwebscraping.json'

with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [20]:
work_titles = []

for entry in data:
    # Assuming each entry is a dictionary and has a key 'workTitle'
    if 'workTitle' in entry:
        work_titles.append(entry['workTitle'])


In [9]:
import logging
logging.basicConfig(filename='selenium_debug.log', level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


In [14]:
def search_worldcat(search_title):
    driver = webdriver.Chrome()  # or the driver you are using
    encoded_title = urllib.parse.quote_plus(search_title.strip())
    search_url = f"https://search.worldcat.org/search?q={encoded_title}"
    driver.get(search_url)
    logging.debug(f"Navigating to URL: {search_url}")

    try:
        # Attempt to accept cookies if the consent banner appears
        try:
            # Wait for the cookie acceptance button to be clickable
            wait_for_cookies = WebDriverWait(driver, 10)
            accept_cookies_button = wait_for_cookies.until(
                EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
            )
            # Click the accept cookies button
            accept_cookies_button.click()
            print("Cookies accepted.")
        except Exception as e:
            print("Cookie consent dialog not found or error accepting cookies:", e)
        
        # Proceed with the original function logic
        # Wait for the search results to be present
        wait = WebDriverWait(driver, 30)
        all_results = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@data-testid, 'title-')]")))

        # Check if the first result matches the search title
        if all_results:
            first_result = all_results[0]
            first_result_title = first_result.text.strip().lower()
            logging.debug(f"First search result title: {first_result_title}")

            if search_title.strip().lower() == first_result_title:
                first_result.click()

                # Extract the dynamic part from the URL
                current_url = driver.current_url
                dynamic_part = current_url.split('/')[-1]

                # Wait for the subject information to load
                subject_selector = f"span[aria-labelledby='subject-{dynamic_part}']"
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, subject_selector)))
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Find the parent span element and then extract all a tags within it
                subject_container = soup.select_one(subject_selector)
                subject_elements = subject_container.find_all("a", class_="MuiTypography-root")
                subjects = [element.get_text() for element in subject_elements]
                print(subjects)
                return subjects
            else:
                logging.debug(f"The first result does not match the search title: {search_title}")
        else:
            logging.debug("No search results found.")
    except Exception as e:
        print("Error processing the search result:", e)
        logging.exception("Error processing the search result")
    finally:
        driver.quit()

    return None

In [15]:
title = "Your Paradise"
subjects = search_worldcat(title)
if subjects:
    print("Subjects for", title, ":", subjects)

Cookies accepted.
Subjects for Your Paradise : ['Corée du Sud Conditions sociales Romans, nouvelles, etc', 'Fiction', 'Korea (South)', 'Korea (South) Social conditions Fiction', 'Leprosy Patients', 'Leprosy Patients Korea (South)', 'Lépreux Corée du Sud', 'Romans', 'Social conditions']


In [19]:
with open('./refined_fiction_libwebscraping.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [21]:
for entry in data:
    subjects = search_worldcat(entry['sourceTitle'])
    if subjects is not None:
        entry['subjects'] = subjects


Cookies accepted.
Cookies accepted.
Cookies accepted.
Cookies accepted.
Cookies accepted.
Cookies accepted.
Cookie consent dialog not found or error accepting cookies: Message: stale element reference: stale element not found
  (Session info: chrome=121.0.6167.139); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
0   chromedriver                        0x0000000104d667dc chromedriver + 4040668
1   chromedriver                        0x0000000104d5e9e0 chromedriver + 4008416
2   chromedriver                        0x00000001049d1870 chromedriver + 284784
3   chromedriver                        0x00000001049ded9c chromedriver + 339356
4   chromedriver                        0x00000001049d6cb8 chromedriver + 306360
5   chromedriver                        0x00000001049d6dc4 chromedriver + 306628
6   chromedriver                        0x00000001049d552c chromedriver + 300332

WebDriverException: Message: disconnected: Unable to receive message from renderer
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=121.0.6167.139)
Stacktrace:
0   chromedriver                        0x0000000102b4e7dc chromedriver + 4040668
1   chromedriver                        0x0000000102b469e0 chromedriver + 4008416
2   chromedriver                        0x00000001027b9870 chromedriver + 284784
3   chromedriver                        0x00000001027a2a38 chromedriver + 191032
4   chromedriver                        0x00000001027a2a6c chromedriver + 191084
5   chromedriver                        0x00000001027a2294 chromedriver + 189076
6   chromedriver                        0x00000001027c3e00 chromedriver + 327168
7   chromedriver                        0x0000000102837220 chromedriver + 799264
8   chromedriver                        0x00000001027f174c chromedriver + 513868
9   chromedriver                        0x00000001027f2044 chromedriver + 516164
10  chromedriver                        0x0000000102b13a04 chromedriver + 3799556
11  chromedriver                        0x0000000102b17ee4 chromedriver + 3817188
12  chromedriver                        0x0000000102afc260 chromedriver + 3703392
13  chromedriver                        0x0000000102b18a2c chromedriver + 3820076
14  chromedriver                        0x0000000102aef01c chromedriver + 3649564
15  chromedriver                        0x0000000102b35e3c chromedriver + 3939900
16  chromedriver                        0x0000000102b35fb4 chromedriver + 3940276
17  chromedriver                        0x0000000102b46660 chromedriver + 4007520
18  libsystem_pthread.dylib             0x000000018756f034 _pthread_start + 136
19  libsystem_pthread.dylib             0x0000000187569e3c thread_start + 8


In [18]:
with open('updated_libwebscraping.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, ensure_ascii=False, indent=4)