In [29]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import urllib.parse

In [38]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


In [6]:
# Chrome launch options; no flags are added in the visible cells, so this is
# a default Options object.
chrome_options = Options()

In [7]:
# Build the driver Service from whatever ChromeDriverManager().install()
# returns (presumably the path of a downloaded/cached chromedriver binary --
# confirm against the webdriver_manager docs).
chrome_service = Service(ChromeDriverManager().install())

In [11]:
# Start a Chrome session using the service and options configured above.
# NOTE(review): search_worldcat() opens its own webdriver.Chrome() and does
# not use this module-level `driver`; nothing in the visible cells calls
# driver.quit() on it, so this browser may stay open -- confirm it is needed.
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

In [14]:
import json

In [17]:
# JSON file to load (relative path, resolved against the current working directory).
file_path = '../web scraping/libwebscraping.json'

# Read the whole file as UTF-8 text and parse it into `data`.
with open(file_path, mode='r', encoding='utf-8') as file:
    data = json.loads(file.read())

In [20]:
# Collect the 'workTitle' value of every entry that has one, in file order.
# Entries are expected to be dicts; those without the key are skipped.
work_titles = [entry['workTitle'] for entry in data if 'workTitle' in entry]


In [52]:
import logging

# Configure the root logger: DEBUG and above go to selenium_debug.log, each
# record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    filename='selenium_debug.log',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)


In [58]:
def search_worldcat(search_title, results_timeout=30, subject_timeout=10):
    """Search WorldCat for a title and return the subjects of the first hit.

    A new Chrome session is opened for the call and is always closed before
    the function returns. The first search result is opened only when its
    link text equals ``search_title`` (case-insensitive, surrounding
    whitespace ignored); the subject links found on the record page that
    opens are then returned.

    Args:
        search_title: Title to look up.
        results_timeout: Seconds to wait for result links on the search page.
        subject_timeout: Seconds to wait for the record page to open, and
            then for its subject block to appear.

    Returns:
        A list of subject strings, or None when the search yields no
        results, the first result's title does not match, or an error occurs
        while loading or processing the pages (errors are printed and logged
        with a traceback).
    """
    wanted_title = search_title.strip()
    encoded_title = urllib.parse.quote_plus(wanted_title)
    search_url = f"https://search.worldcat.org/search?q={encoded_title}"

    driver = webdriver.Chrome()  # swap in another driver here if needed
    try:
        # Navigation happens inside the try so that a failed page load still
        # reaches the finally clause and the browser gets closed.
        logging.debug(f"Navigating to URL: {search_url}")
        driver.get(search_url)

        # An empty element list never satisfies the wait, so a search with
        # no hits shows up as a timeout rather than as an empty list.
        try:
            all_results = WebDriverWait(driver, results_timeout).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, "//a[contains(@data-testid, 'title-')]")
                )
            )
        except TimeoutException:
            all_results = []

        if not all_results:
            logging.debug("No search results found.")
            return None

        first_result = all_results[0]
        first_result_title = first_result.text.strip().lower()
        logging.debug(f"First search result title: {first_result_title}")

        if wanted_title.lower() != first_result_title:
            logging.debug(f"The first result does not match the search title: {search_title}")
            return None

        # Remember where we are, click, then wait until the browser has
        # actually left the results page before reading the new URL.
        results_url = driver.current_url
        first_result.click()
        WebDriverWait(driver, subject_timeout).until(EC.url_changes(results_url))

        # The record id is the last path segment; parse the URL so a query
        # string, fragment or trailing slash cannot end up in the selector.
        record_path = urllib.parse.urlparse(driver.current_url).path
        dynamic_part = record_path.rstrip('/').split('/')[-1]

        # Wait for the subject information to load, then parse the page.
        subject_selector = f"span[aria-labelledby='subject-{dynamic_part}']"
        WebDriverWait(driver, subject_timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, subject_selector))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Every subject is an <a> inside the labelled span.
        subject_container = soup.select_one(subject_selector)
        subject_elements = subject_container.find_all("a", class_="MuiTypography-root")
        return [element.get_text() for element in subject_elements]
    except Exception as e:
        print("Error processing the search result:", e)
        logging.exception("Error processing the search result")
        return None
    finally:
        driver.quit()

In [59]:
# Try the lookup on a single title. search_worldcat() returns None when it
# finds no match or hits an error, so nothing is printed in those cases
# (nor when the returned subject list is empty).
title = "Your Paradise"
subjects = search_worldcat(title)
if subjects:
    print("Subjects for", title, ":", subjects)

Subjects for Your Paradise : ['Corée du Sud Conditions sociales Romans, nouvelles, etc', 'Fiction', 'Korea (South)', 'Korea (South) Social conditions Fiction', 'Leprosy Patients', 'Leprosy Patients Korea (South)', 'Lépreux Corée du Sud', 'Romans', 'Social conditions']
