# Advanced Crawling

## Crawling and scraping a webpage in amazon

In [2]:
import requests
from bs4 import BeautifulSoup

# URL of the Amazon product page
url = "https://www.amazon.com/Atomic-Habits-Proven-Build-Break/dp/0735211299/?_encoding=UTF8&pd_rd_w=fifPC&content-id=amzn1.sym.f2128ffe-3407-4a64-95b5-696504f68ca1&pf_rd_p=f2128ffe-3407-4a64-95b5-696504f68ca1&pf_rd_r=93MK9W7SB3AATE6Y30BX&pd_rd_wg=iexfr&pd_rd_r=1866f822-5759-48aa-aed7-33eb83d8a0b6&ref_=pd_hp_d_btf_crs_zg_bs_283155"

try:
    # Send a GET request to the webpage
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    response.raise_for_status()  # Check for request errors

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Attempt to extract product title and price
    title = soup.find('span', id='productTitle')
    price = soup.find('span', class_='a-offscreen')  # Common class for price

    # Check if elements were found
    if title:
        print("Product Title:", title.get_text(strip=True))
    else:
        print("Failed to find product title. The content may be dynamically loaded.")

    if price:
        print("Price:", price.get_text(strip=True))
    else:
        print("Failed to find price. The content may be dynamically loaded.")

except requests.RequestException as e:
    print(f"Error fetching the webpage: {e}")

Failed to find product title. The content may be dynamically loaded.
Failed to find price. The content may be dynamically loaded.


In [None]:
url = "https://www.amazon.com/Atomic-Habits-Proven-Build-Break/dp/0735211299/?_encoding=UTF8&pd_rd_w=fifPC&content-id=amzn1.sym.f2128ffe-3407-4a64-95b5-696504f68ca1&pf_rd_p=f2128ffe-3407-4a64-95b5-696504f68ca1&pf_rd_r=93MK9W7SB3AATE6Y30BX&pd_rd_wg=iexfr&pd_rd_r=1866f822-5759-48aa-aed7-33eb83d8a0b6&ref_=pd_hp_d_btf_crs_zg_bs_283155" 
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    ),
    # Adding the "Accept-Language" header helps ensure Amazon returns the English version of the page,
    # which can prevent issues where content is dynamically loaded or hidden based on region or language.
    "Accept-Language": "en-US,en;q=0.9",
}

response = requests.get(url, headers=headers)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    # Example: Extract the book title
    title_tag = soup.find("span", id="productTitle")
    if title_tag:
        print("Book Title:", title_tag.get_text(strip=True))
    else:
        print("Book title not found.")
else:
    print(f"Failed to fetch page. Status code: {response.status_code}")


Book Title: Atomic Habits: An Easy & Proven Way to Build Good Habits & Break Bad Ones


In [3]:
import random
import undetected_chromedriver as uc
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

USER_AGENTS = [
    # A few common user agents
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
]

user_agent = random.choice(USER_AGENTS)

# try:
import undetected_chromedriver as uc
# Set up Selenium with Chrome in headless mode
chrome_options = Options()
chrome_binary = "/opt/google/chrome/chrome"
chrome_options.add_argument(f'user-agent={user_agent}')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument('--headless')         # Run Chrome without GUI
chrome_options.add_argument('--disable-gpu')      # Disable GPU acceleration

# Initialize the WebDriver
driver = uc.Chrome(options=chrome_options, browser_executable_path=chrome_binary)

# Navigate to the webpage
driver.get(url)

# Wait for the product title to load (max 10 seconds)
title_element = WebDriverWait(driver, 50).until(
    EC.presence_of_element_located((By.ID, 'productTitle'))
)
print("Product Title:", title_element.text.strip())

# Try multiple strategies to find the price
price = None
price_selectors = [
    (By.CLASS_NAME, 'a-offscreen'),  # Common price class
    (By.ID, 'price'),  # ID for main price
    (By.CLASS_NAME, 'a-price-whole'),  # Whole price part
    (By.XPATH, "//span[contains(@class, 'a-price') and contains(@class, 'a-text-price')]//span"),  # Price with discount
    (By.XPATH, "//span[@data-a-size='xl']//span[@class='a-offscreen']")  # Larger price display
]

for by, selector in price_selectors:
    try:
        price_element = WebDriverWait(driver, 50).until(
            EC.presence_of_element_located((by, selector))
        )
        price_text = price_element.text.strip()
        print("price text: ", price_text)
        if price_text and '$' in price_text:  # Ensure it's a valid price
            price = price_text
            break
    except TimeoutException:
        continue

if price:
    print("Price:", price)
else:
    print("Failed to find price. The price element may not be present or is dynamically loaded with a different structure.")
# except TimeoutException:
#     print("Failed to load elements. The page may not have loaded correctly or elements are missing.")
# except WebDriverException as e:
#     print(f"Error with WebDriver: {e}")
# finally:
#     # Clean up by closing the browser
#     driver.quit()

Product Title: Atomic Habits: An Easy & Proven Way to Build Good Habits & Break Bad Ones
price text:  
price text:  11
price text:  
price text:  
Failed to find price. The price element may not be present or is dynamically loaded with a different structure.


## Crawling Goodreads.com

In [50]:
import requests
from bs4 import BeautifulSoup

# --- Prepare headers and cookies for the request ---
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "en-US,en;q=0.9,fa-IR;q=0.8,fa;q=0.7",
    "referer": "https://www.goodreads.com/book/show/40121378-atomic-habits?ac=1&from_search=true&qid=okcu46oZQB&rank=1",
    "sec-ch-ua": '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
}

# You may need to update the cookies below with valid session cookies from your browser if authentication is required
cookies = {
    "ccsid": "492-7803361-5127910",
    "locale": "en",
    "blocking_sign_in_interstitial": "true",
    "csm-sid": "140-1904891-1149508",
    "__qca": "P1-dcf4d398-b23a-409a-a59d-eb03958544de",
    "session-id": "139-4638899-0436333",
    "csm-hit": "tb:s-VK2SR48DSB4YR958HVRT|1753281489948&t:1753281489949",
    "ubid-main": "132-4139823-5053422",
    "lc-main": "en_US",
    "session-id-time": "2384001530l",
    "session-token": 'u65B3YcOHKls1AxXOCfKGuVaAp1yKDJmitfEZmMHLyx4WfkOOsH5FDmuQJGh9dA/rFSSecAzTrkJexsobtadH/oamEcY1vvltjEUcopjyzcnfSSv30ennQltlT9WkUu63FMjIrEaEQhTr34BjZoGh4v/+tG558lF2T8jRfWyHA7JY98O+DiT2ddvAjKpN8TbffR9Yejpoxydi4Mm7ymcdz94nIdsT57eGlVtgSDsS47J3CuGOCWtUSuwtjyVK64Y+ZLxWc2MxmaFBcYu/msSajgYPwf4eNnvkktaTvn/17fKYzY521DksPCGHePJ8/eI7JLzacxYRB+o2ox1J+60HwvdDAyyf0e5DTEZABJtucfWKBiO/5R3yQ==',
    "x-main": "cuan1qUXnegr6@riNsHN7b47Ck1?cj9ETfvl1t1ssagMDkjY??UiAH9Kb1FgIQ?c",
    "at-main": "Atza|IwEBIO92cFCkZUBPNU77aExHRArakpB05s0gtzdONR0LC_9ImsTegmhue2_fAV8sP0UtKGGBvVDY810io19Ao7NRAah0k6nm1VbHPIZ0LvDrxW0YEz-kZ5DpiQ7L38pGGxITohzP4jRqIay4icj9mQ63D-8aXpT0MlaxN1AvFYHoD5dMzuSsOV8imxgzM0-kx7TrACJFl6xgVQUVQJZT35gvkuyg0ZXhHl8T60DdjC1wx60qySdA8PWDEQNhysdIM-6xWhU",
    "sess-at-main": "2O1lPqp43Pd0oeUvVdq0SGLxnE1K5mNvYdv/gtlNeI4=",
    "likely_has_account": "true",
    "_session_id2": "4cb74df2b59f2989f81fe9fdbbc1fcae",
    "jwt_token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsImtpZCI6ImZSNXpfWTVjYXZQMllsaXU3eks0YUNJVEJPcVBWdGtxTE9XVURfV3dGOTQifQ.eyJpc3MiOiJodHRwczovL3d3dy5nb29kcmVhZHMuY29tIiwic3ViIjoia2NhOi8vcHJvZmlsZTpnb29kcmVhZHMvQTE5MzBBT05CQ09RTlUiLCJhdWQiOiI2M2RjMjRlN2M2MTFlNmYxNzkyZjgxMzA1OGYyMTU2MGJkOGM2OTM4ZDU0YS5nb29kcmVhZHMuY29tIiwidXNlcl9pZCI6MTkyMzI4MjkyLCJyb2xlIjoidXNlciIsIm5vbmNlIjpudWxsLCJleHAiOjE3NTMyODU1OTIsImlhdCI6MTc1MzI4NTI5Mn0.jLk1Q6Did_zPUkQNC5khGgQWxlKPE8V_aWVECZ3SAAGf28zK8eVXhxfBg6hdlG7_nGvcJpnunpOQRwaaphtnqk2CJdSzvU5H3jLEMZyzjNuFHa2XX4FerdpSdJrJs14mcvK0cDfe6woBwNbYyHzKnci5SjPxeB-DqhQ4nDF7i59fQwBXk7DMz5GkLA5axJb2xWMOOhgLToR_smMQbSVkD4OPVGEIGTdup47XfNZhm30q3bCoePpQ90LEbW9taFESHj1oXMTJzHSGPqCp7btLJUS5_s7-37cy1CgyRbx6Aftnd1Md37QqbsHV5w1NfJfAGC11z7m5w9mlhzNqc8_8eA"
}

# --- Perform the search ---
keyword = "data science"
search_url = f"https://www.goodreads.com/search?q={requests.utils.quote(keyword)}&ref=nav_sb_noss_l_4"

books = []  # List to store all book details

response = requests.get(search_url, headers=headers, cookies=cookies)
if response.status_code == 200:
    print(f"Searched for '{keyword}' successfully.")
    soup = BeautifulSoup(response.text, "html.parser")
    # Try to find the results table
    results_table = soup.select_one("table.tableList")
    if results_table:
        rows = results_table.select("tr")
        print(f"Found {len(rows)} search results.")
        for idx, row in enumerate(rows[:10], 1):  # Show up to 10 results
            # Extract book title and URL
            title_elem = row.select_one("a.bookTitle")
            title = title_elem.get_text(strip=True) if title_elem else "N/A"
            book_url = (
                "https://www.goodreads.com" + title_elem["href"]
                if title_elem and title_elem.has_attr("href")
                else "N/A"
            )

            # Extract author name and URL
            author_elem = row.select_one("a.authorName")
            author = author_elem.get_text(strip=True) if author_elem else "N/A"
            author_url = (
                author_elem["href"] if author_elem and author_elem.has_attr("href") else "N/A"
            )

            # Extract image source (cover)
            image_elem = row.select_one("img.bookCover")
            image_src = image_elem["src"] if image_elem and image_elem.has_attr("src") else "N/A"

            # Extract minirating and other details
            minirating_elem = row.select_one("span.minirating")
            minirating = minirating_elem.get_text(strip=True) if minirating_elem else "N/A"

            # Extract publication year if available
            pub_year = "N/A"
            greytext_elem = row.select_one("span.greyText.smallText.uitext")
            if greytext_elem:
                import re
                match = re.search(r'published\s+(\d{4})', greytext_elem.get_text())
                if match:
                    pub_year = match.group(1)

            # Extract number of editions if available
            editions_elem = row.select_one("a.greyText[href*='/work/editions/']")
            editions = editions_elem.get_text(strip=True) if editions_elem else "N/A"

            # Append book details to the list
            books.append({
                "title": title,
                "book_url": book_url,
                "author": author,
                "author_url": author_url,
                "image_src": image_src,
                "minirating": minirating,
                "pub_year": pub_year,
                "editions": editions
            })

            print(f"{idx}. {title} by {author}")
        # Optionally, print the books list or use it elsewhere
        # print(books)
    else:
        print("No search results table found on the page.")
else:
    print(f"Failed to search. Status code: {response.status_code}")

Searched for 'data science' successfully.
Found 20 search results.
1. Data Science for Business: What You Need to Know about Data Mining and Data-Analytic Thinking by Foster Provost
2. Data Smart: Using Data Science to Transform Information into Insight by John W. Foreman
3. Data Science from Scratch: First Principles with Python by Joel Grus
4. R for Data Science: Import, Tidy, Transform, Visualize, and Model Data by Hadley Wickham
5. Mindmasters: The Data-Driven Science of Predicting and Changing Human Behavior by Sandra Matz
6. Doing Data Science: Straight Talk from the Frontline by Cathy O'Neil
7. Machine Learning For Absolute Beginners: A Plain English Introduction (Second Edition) by Oliver Theobald
8. Python Data Science Handbook: Essential Tools for Working with Data by Jake VanderPlas
9. Numsense! Data Science for the Layman: No Math Added by Annalyn Ng
10. Data Science (The MIT Press Essential Knowledge series) by John D. Kelleher


In [56]:
url = "https://www.goodreads.com/book/show/17912916-data-science-for-business?from_search=true&from_srp=true&qid=EVSW0x6JUR&rank=1"
resp = requests.get(url)
with open("page_content.html", "w") as f:
    f.write(resp.text)

In [1]:
from bs4 import BeautifulSoup
import json
import re

with open("page_content.html", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

extracted = {}

# --- Title ---
# 1. h1 tag
title = None
h1 = soup.find("h1", {"data-testid": "bookTitle"})
if h1:
    title = h1.get_text(strip=True)
# 2. meta og:title
if not title:
    meta_og_title = soup.find("meta", property="og:title")
    if meta_og_title and meta_og_title.get("content"):
        title = meta_og_title["content"]
# 3. meta twitter:title
if not title:
    meta_tw_title = soup.find("meta", attrs={"name": "twitter:title"})
    if meta_tw_title and meta_tw_title.get("content"):
        title = meta_tw_title["content"]
# 4. Schema.org script
schema_title = None
schema_data = None
for script in soup.find_all("script", type="application/ld+json"):
    try:
        data = json.loads(script.string)
        if isinstance(data, dict) and data.get("@type") == "Book":
            schema_data = data
            schema_title = data.get("name")
            break
    except Exception:
        continue
if not title and schema_title:
    title = schema_title
extracted["title"] = title

# --- Authors ---
authors = []
# 1. ContributorLink__name spans
for a in soup.select("a.ContributorLink span.ContributorLink__name[data-testid='name']"):
    name = a.get_text(strip=True)
    if name:
        authors.append(name)
# 2. Schema.org
if schema_data and "author" in schema_data:
    schema_authors = schema_data["author"]
    if isinstance(schema_authors, dict):
        schema_authors = [schema_authors]
    for author in schema_authors:
        if isinstance(author, dict) and author.get("@type") == "Person":
            name = author.get("name")
            if name and name not in authors:
                authors.append(name)
extracted["authors"] = authors

# --- Rating ---
rating = None
rating_count = None
if schema_data and "aggregateRating" in schema_data:
    agg = schema_data["aggregateRating"]
    rating = agg.get("ratingValue")
    rating_count = agg.get("ratingCount")
extracted["rating"] = rating
extracted["rating_count"] = rating_count

# --- Description ---
description = None
for meta_name in ["description", "og:description", "twitter:description"]:
    meta = soup.find("meta", attrs={"name": meta_name}) or soup.find("meta", property=meta_name)
    if meta and meta.get("content"):
        description = meta["content"]
        break
extracted["description"] = description

# --- Genres ---
genres = []
# Try to find genre links or tags
for genre_tag in soup.select("a[href*='/genres/'], span.BookPageGenreLink, a.BookPageGenreLink"):
    genre = genre_tag.get_text(strip=True)
    if genre and genre not in genres:
        genres.append(genre)
# Sometimes genres are in divs with role="list" and data-testid="genresList"
for genre_tag in soup.select("[data-testid='genresList'] a, [data-testid='genresList'] span"):
    genre = genre_tag.get_text(strip=True)
    if genre and genre not in genres:
        genres.append(genre)
extracted["genres"] = genres

# --- Number of Pages ---
num_pages = None
if schema_data and "numberOfPages" in schema_data:
    num_pages = schema_data["numberOfPages"]
if not num_pages:
    # Try to find in text
    m = re.search(r"(\d+)\s+pages", html)
    if m:
        num_pages = m.group(1)
extracted["number_of_pages"] = num_pages

# --- Published Data ---
published = None
if schema_data and "datePublished" in schema_data:
    published = schema_data["datePublished"]
if not published:
    # Try to find in text
    pub_match = re.search(r"Published\s+([A-Za-z]+\s+\d{1,2}[a-z]{0,2},?\s+\d{4})", html)
    if pub_match:
        published = pub_match.group(1)
extracted["published"] = published

# --- ISBN ---
isbn = None
if schema_data and "isbn" in schema_data:
    isbn = schema_data["isbn"]
else:
    # Try to find ISBN in text
    m = re.search(r"ISBN(?:13)?:?\s*([\d\-]+)", html)
    if m:
        isbn = m.group(1)
extracted["isbn"] = isbn

# --- Original Title ---
original_title = None
# Look for "Original Title" label
orig_title_elem = soup.find(string=re.compile(r"Original Title", re.I))
if orig_title_elem:
    parent = orig_title_elem.find_parent()
    if parent:
        next_sib = parent.find_next_sibling()
        if next_sib:
            original_title = next_sib.get_text(strip=True)
extracted["original_title"] = original_title

# --- Edition Details ---
edition_details = None
for h4 in soup.find_all("h4", class_="Text__title4"):
    if "This edition" in h4.get_text():
        # Get next sibling or parent block
        sib = h4.find_next_sibling()
        if sib:
            edition_details = sib.get_text(strip=True)
        break
extracted["edition_details"] = edition_details

# --- Other Editions Links ---
other_editions_links = []
for h4 in soup.find_all("h4", class_="Text__title4"):
    if "More editions" in h4.get_text():
        # Find links under this section
        for a in h4.find_all_next("a", href=True, limit=10):
            if "/work/editions/" in a["href"]:
                other_editions_links.append(a["href"])
        break
extracted["other_editions_links"] = other_editions_links

# --- More Information Tags ---
more_info = {}
for h4 in soup.find_all("h4", class_="Text__title4"):
    if "More information" in h4.get_text():
        # Get next sibling or parent block
        sib = h4.find_next_sibling()
        if sib:
            for li in sib.find_all("li"):
                key = li.find("span")
                val = li.find("div")
                if key and val:
                    more_info[key.get_text(strip=True)] = val.get_text(strip=True)
        break
extracted["more_information"] = more_info

# --- Book Statistics Tags ---
book_stats = {}
# Try to find stats near "Ratings & Reviews"
for h2 in soup.find_all("h2", class_="Text__title2"):
    if "Ratings & Reviews" in h2.get_text():
        stats_block = h2.find_next("div")
        if stats_block:
            for stat in stats_block.find_all("span"):
                txt = stat.get_text(strip=True)
                if re.match(r"[\d,]+", txt):
                    book_stats.setdefault("stats", []).append(txt)
        break
extracted["book_statistics"] = book_stats

# --- People read or reading ---
people_reading = []
for h3 in soup.find_all("h3", class_="Text__title3"):
    if "Friends & Following" in h3.get_text():
        # Find user avatars or names in this section
        section = h3.find_parent()
        if section:
            for user in section.find_all("a", href=True):
                if "/user/show/" in user["href"]:
                    name = user.get_text(strip=True)
                    if name:
                        people_reading.append(name)
        break
extracted["people_reading"] = people_reading

# --- Suggested Books Comments ---
suggested_books = []
for h3 in soup.find_all("h3", class_="Text__title3"):
    if "Readers also enjoyed" in h3.get_text():
        section = h3.find_parent()
        if section:
            for a in section.find_all("a", href=True):
                if "/book/show/" in a["href"]:
                    suggested_books.append({
                        "title": a.get_text(strip=True),
                        "url": a["href"]
                    })
        break
extracted["suggested_books"] = suggested_books

# --- Comments/Discussion ---
comments = []
for h2 in soup.find_all("h2", class_="Text__h2"):
    if "Join the discussion" in h2.get_text():
        section = h2.find_parent()
        if section:
            for comment in section.find_all("div", class_=re.compile("Comment")):
                txt = comment.get_text(strip=True)
                if txt:
                    comments.append(txt)
        break
extracted["comments"] = comments

# --- Print or Save the extracted data ---
import pprint
pprint.pprint(extracted)


{'authors': ['Foster Provost', 'Tom Fawcett', 'Foster Provost'],
 'book_statistics': {},
 'comments': [],
 'description': 'Read 177 reviews from the world’s largest community \n'
                '    for readers. Written by renowned data science experts '
                'Foster Provost and Tom Fawcett, Data Science for …',
 'edition_details': None,
 'genres': ['Business',
            'Nonfiction',
            'Technology',
            'Science',
            'Computer Science',
            'Programming',
            'Technical',
            'GenresBusinessNonfictionTechnologyScienceComputer '
            'ScienceProgrammingTechnical',
            'Genres',
            '...more'],
 'isbn': '9781449361327',
 'more_information': {},
 'number_of_pages': 413,
 'original_title': None,
 'other_editions_links': [],
 'people_reading': [],
 'published': None,
 'rating': 4.13,
 'rating_count': 2586,
 'suggested_books': [],
 'title': 'Data Science for Business: What You Need to Know about Data Min

In [65]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json

# Set up Selenium WebDriver (headless for efficiency)
chrome_options = Options()
chrome_binary = "/opt/google/chrome/chrome"
chrome_options.add_argument(f'user-agent={user_agent}')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument('--headless')         # Run Chrome without GUI
chrome_options.add_argument('--disable-gpu') 

driver = uc.Chrome(options=chrome_options, browser_executable_path=chrome_binary)
# driver = webdriver.Chrome(options=chrome_options)

def safe_get_text(element):
    return element.text.strip() if element else None

try:
    url = "https://www.goodreads.com/book/show/17912916-data-science-for-business?from_search=true&from_srp=true&qid=EVSW0x6JUR&rank=1"
    driver.get(url)
    time.sleep(2)  # Let the page load

    # Click all 'more' and 'Show more' buttons
    while True:
        buttons = driver.find_elements(By.XPATH, "//button[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'more')]")
        clickable_buttons = [btn for btn in buttons if btn.is_displayed() and btn.is_enabled()]
        if not clickable_buttons:
            break
        for btn in clickable_buttons:
            try:
                driver.execute_script("arguments[0].click();", btn)
                time.sleep(0.5)
            except Exception:
                continue

    info = {}

    # Title
    try:
        title_elem = driver.find_element(By.XPATH, "//h1[@data-testid='bookTitle']")
        info["title"] = safe_get_text(title_elem)
    except Exception:
        info["title"] = None

    # Authors
    authors = []
    try:
        author_elems = driver.find_elements(By.XPATH, "//a[contains(@class, 'ContributorLink')]//span[@data-testid='name']")
        for a in author_elems:
            name = safe_get_text(a)
            if name:
                authors.append(name)
    except Exception:
        pass
    info["authors"] = authors if authors else None

    # Book image
    try:
        img_elem = driver.find_element(By.XPATH, "//img[contains(@class, 'ResponsiveImage')]")
        info["image_src"] = img_elem.get_attribute("src")
    except Exception:
        info["image_src"] = None

    # Description (from meta tag for reliability)
    try:
        desc_elem = driver.find_element(By.XPATH, "//meta[@name='description']")
        info["description"] = desc_elem.get_attribute("content")
    except Exception:
        info["description"] = None

    # Genres
    genres = []
    try:
        genre_elems = driver.find_elements(By.XPATH, "//a[@data-testid='genreLink']")
        for g in genre_elems:
            genre = safe_get_text(g)
            if genre:
                genres.append(genre)
    except Exception:
        pass
    info["genres"] = genres if genres else None

    # Ratings (from schema.org script tag)
    try:
        # Find the <script type="application/ld+json"> with Book schema
        script_elems = driver.find_elements(By.XPATH, "//script[@type='application/ld+json']")
        schema_data = None
        for s in script_elems:
            try:
                data = json.loads(s.get_attribute("innerHTML"))
                if isinstance(data, dict) and data.get("@type") == "Book":
                    schema_data = data
                    break
            except Exception:
                continue
        if schema_data:
            agg = schema_data.get("aggregateRating", {})
            info["ratingValue"] = agg.get("ratingValue")
            info["ratingCount"] = agg.get("ratingCount")
            info["numberOfPages"] = schema_data.get("numberOfPages")
            info["isbn"] = schema_data.get("isbn")
            info["schema_org_name"] = schema_data.get("name")
            # Authors from schema
            if "author" in schema_data:
                schema_authors = schema_data["author"]
                if isinstance(schema_authors, list):
                    info["schema_authors"] = [a.get("name") for a in schema_authors if "name" in a]
                elif isinstance(schema_authors, dict):
                    info["schema_authors"] = [schema_authors.get("name")]
    except Exception:
        pass

    # Minirating (try to find a rating summary)
    try:
        minirating_elem = driver.find_element(By.XPATH, "//span[contains(@class, 'Text__body3')]")
        info["minirating"] = safe_get_text(minirating_elem)
    except Exception:
        info["minirating"] = None

    # Details (pages, publication, etc.)
    details = {}
    try:
        details_section = driver.find_element(By.XPATH, "//div[@data-testid='bookDetails']")
        rows = details_section.find_elements(By.XPATH, "./div")
        for row in rows:
            try:
                label = row.find_element(By.XPATH, ".//div[contains(@class, 'BookDetailsRow__label')]")
                value = row.find_element(By.XPATH, ".//div[contains(@class, 'BookDetailsRow__value')]")
                details[safe_get_text(label)] = safe_get_text(value)
            except Exception:
                continue
    except Exception:
        pass
    if details:
        info["details"] = details

    # Original title, edition details, more editions, more info, book stats, etc.
    # These are often in <h4> or <div> blocks with nearby text
    try:
        h4s = driver.find_elements(By.XPATH, "//h4[contains(@class, 'Text__title4')]")
        for h4 in h4s:
            h4_text = safe_get_text(h4)
            if not h4_text:
                continue
            if "This edition" in h4_text:
                # Edition details are likely in the next sibling
                try:
                    edition_details = h4.find_element(By.XPATH, "following-sibling::*[1]")
                    info["edition_details"] = safe_get_text(edition_details)
                except Exception:
                    pass
            elif "More editions" in h4_text:
                try:
                    more_editions = h4.find_element(By.XPATH, "following-sibling::*[1]")
                    links = more_editions.find_elements(By.XPATH, ".//a")
                    info["more_editions_links"] = [l.get_attribute("href") for l in links if l.get_attribute("href")]
                except Exception:
                    pass
            elif "More information" in h4_text:
                try:
                    more_info = h4.find_element(By.XPATH, "following-sibling::*[1]")
                    info["more_information"] = safe_get_text(more_info)
                except Exception:
                    pass
    except Exception:
        pass

    # Book statistics (ratings, pages, etc. already included above)

    # People read or reading (look for "Friends & Following" or similar)
    try:
        friends_elem = driver.find_element(By.XPATH, "//h3[contains(@class, 'Text__title3') and contains(., 'Friends & Following')]")
        info["friends_and_following_section"] = safe_get_text(friends_elem)
    except Exception:
        pass

    # Suggested books comments ("Readers also enjoyed")
    try:
        readers_enjoyed_elem = driver.find_element(By.XPATH, "//h3[contains(@class, 'Text__title3') and contains(., 'Readers also enjoyed')]")
        # Get book links in this section
        parent = readers_enjoyed_elem.find_element(By.XPATH, "following-sibling::*[1]")
        links = parent.find_elements(By.XPATH, ".//a")
        info["readers_also_enjoyed_links"] = [l.get_attribute("href") for l in links if l.get_attribute("href")]
    except Exception:
        pass

    # Comments/discussion ("Join the discussion")
    try:
        discussion_elem = driver.find_element(By.XPATH, "//h2[contains(@class, 'Text__h2') and contains(., 'Join the discussion')]")
        info["discussion_section"] = safe_get_text(discussion_elem)
    except Exception:
        pass

    print(info)

finally:
    driver.quit()


{'title': None, 'authors': None, 'image_src': None, 'description': None, 'genres': None, 'minirating': None}


In [None]:
# Playwright async code to extract book info from Goodreads

import asyncio
from playwright.async_api import async_playwright
import json
import re

url = "https://www.goodreads.com/book/show/17912916-data-science-for-business?from_search=true&from_srp=true&qid=EVSW0x6JUR&rank=1"

async def extract_schema_org_data(page):
    # Extract the Schema.org JSON-LD script
    scripts = await page.query_selector_all('script[type="application/ld+json"]')
    for script in scripts:
        try:
            data = json.loads(await script.inner_text())
            if isinstance(data, dict) and data.get("@type") == "Book":
                return data
        except Exception:
            continue
    return {}

async def safe_text(el):
    try:
        return (await el.inner_text()).strip()
    except Exception:
        return None

async def safe_attr(el, attr):
    try:
        return await el.get_attribute(attr)
    except Exception:
        return None

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, timeout=60000)
        await page.wait_for_load_state("networkidle")

        info = {}

        # --- TITLE ---
        # Try h1 first
        title_el = await page.query_selector('h1.Text.Text__title1[data-testid="bookTitle"]')
        info["title"] = await safe_text(title_el) if title_el else None

        # Try meta tags as backup
        if not info["title"]:
            meta_title = await page.query_selector('meta[property="og:title"]')
            if not meta_title:
                meta_title = await page.query_selector('meta[name="twitter:title"]')
            info["title"] = await safe_attr(meta_title, "content") if meta_title else None

        # Try Schema.org as last resort
        schema = await extract_schema_org_data(page)
        if not info["title"] and schema:
            info["title"] = schema.get("name")

        # --- AUTHORS ---
        authors = []
        # Try to get authors from the visible page
        # The author block is under h3.Text.Text__title3.Text__regular, then a.ContributorLink > span.ContributorLink__name[data-testid="name"]
        author_block = await page.query_selector('h3.Text.Text__title3.Text__regular:has-text("Author")')
        if author_block:
            # Find all following sibling a.ContributorLink > span.ContributorLink__name[data-testid="name"]
            author_spans = await page.query_selector_all('a.ContributorLink span.ContributorLink__name[data-testid="name"]')
            for a in author_spans:
                name = await safe_text(a)
                if name and name not in authors:
                    authors.append(name)
        else:
            # Fallback: try all a.ContributorLink span.ContributorLink__name[data-testid="name"]
            author_spans = await page.query_selector_all('a.ContributorLink span.ContributorLink__name[data-testid="name"]')
            for a in author_spans:
                name = await safe_text(a)
                if name and name not in authors:
                    authors.append(name)
        # Fallback: try Schema.org
        if not authors and schema:
            schema_authors = schema.get("author", [])
            if isinstance(schema_authors, dict):
                schema_authors = [schema_authors]
            for a in schema_authors:
                if isinstance(a, dict) and a.get("@type") == "Person" and a.get("name"):
                    authors.append(a["name"])
        info["authors"] = authors

        # --- DESCRIPTION ---
        desc = None
        for sel in [
            'meta[name="description"]',
            'meta[property="og:description"]',
            'meta[name="twitter:description"]'
        ]:
            desc_meta = await page.query_selector(sel)
            if desc_meta:
                desc = await safe_attr(desc_meta, "content")
                if desc:
                    break
        info["description"] = desc

        # --- GENRES ---
        # Try to find genre chips or similar
        genres = []
        # Try to find all links to /genres/ that are visible
        genre_els = await page.query_selector_all('a[href*="/genres/"]')
        for g in genre_els:
            genre = await safe_text(g)
            if genre and genre not in genres:
                genres.append(genre)
        # If not found, try to find genre chips by class (sometimes genres are in span or div with role="listitem")
        if not genres:
            genre_chips = await page.query_selector_all('[data-testid="genresList"] [role="listitem"]')
            for chip in genre_chips:
                genre = await safe_text(chip)
                if genre and genre not in genres:
                    genres.append(genre)
        info["genres"] = genres

        # --- EDITION DETAILS ---
        # Look for h4.Text.Text__title4:has-text("This edition")
        edition_details = None
        h4_edition = await page.query_selector('h4.Text.Text__title4:has-text("This edition")')
        if h4_edition:
            sibling = await h4_edition.evaluate_handle("el => el.nextElementSibling")
            if sibling:
                edition_details = await page.evaluate("el => el && el.innerText", sibling)
        info["edition_details"] = edition_details

        # --- MORE EDITIONS LINKS ---
        more_editions_links = []
        h4_more_editions = await page.query_selector('h4.Text.Text__title4:has-text("More editions")')
        if h4_more_editions:
            sibling = await h4_more_editions.evaluate_handle("el => el.nextElementSibling")
            if sibling:
                # Only get <a> tags with hrefs that look like book links
                links = await page.query_selector_all('h4.Text.Text__title4:has-text("More editions") + * a[href]')
                for l in links:
                    href = await safe_attr(l, "href")
                    if href and "/book/show/" in href and href not in more_editions_links:
                        more_editions_links.append(href)
        info["more_editions_links"] = more_editions_links

        # --- MORE INFORMATION ---
        more_information = None
        h4_more_info = await page.query_selector('h4.Text.Text__title4:has-text("More information")')
        if h4_more_info:
            sibling = await h4_more_info.evaluate_handle("el => el.nextElementSibling")
            if sibling:
                more_information = await page.evaluate("el => el && el.innerText", sibling)
        info["more_information"] = more_information

        # --- FRIENDS & FOLLOWING ---
        friends_section = None
        friends_h3 = await page.query_selector('h3.Text.Text__title3:has-text("Friends & Following")')
        if friends_h3:
            # Try to get the next sibling, which may contain the list or count
            sibling = await friends_h3.evaluate_handle("el => el.nextElementSibling")
            if sibling:
                friends_section = await page.evaluate("el => el && el.innerText", sibling)
            else:
                friends_section = await safe_text(friends_h3)
        info["friends_and_following_section"] = friends_section

        # --- READERS ALSO ENJOYED ---
        readers_links = []
        readers_h3 = await page.query_selector('h3.Text.Text__title3:has-text("Readers also enjoyed")')
        if readers_h3:
            sibling = await readers_h3.evaluate_handle("el => el.nextElementSibling")
            if sibling:
                # Only get <a> tags with hrefs that look like book links
                links = await page.query_selector_all('h3.Text.Text__title3:has-text("Readers also enjoyed") + * a[href]')
                for l in links:
                    href = await safe_attr(l, "href")
                    if href and "/book/show/" in href and href not in readers_links:
                        readers_links.append(href)
        info["readers_also_enjoyed_links"] = readers_links

        # --- DISCUSSION SECTION ---
        discussion_section = None
        discussion_h2 = await page.query_selector('h2.Text.Text__h2:has-text("Join the discussion")')
        if discussion_h2:
            sibling = await discussion_h2.evaluate_handle("el => el.nextElementSibling")
            if sibling:
                discussion_section = await page.evaluate("el => el && el.innerText", sibling)
            else:
                discussion_section = await safe_text(discussion_h2)
        info["discussion_section"] = discussion_section

        # --- BOOK STATISTICS (ratings, pages, etc.) ---
        # Use schema.org if available
        if schema:
            info["rating_value"] = schema.get("aggregateRating", {}).get("ratingValue")
            info["rating_count"] = schema.get("aggregateRating", {}).get("ratingCount")
            info["number_of_pages"] = schema.get("numberOfPages")
            info["isbn"] = schema.get("isbn")
            info["published_date"] = schema.get("datePublished")
            info["original_title"] = schema.get("name")
        else:
            info["rating_value"] = None
            info["rating_count"] = None
            info["number_of_pages"] = None
            info["isbn"] = None
            info["published_date"] = None
            info["original_title"] = None

        # Try to get rating from visible page if not in schema
        if not info.get("rating_value"):
            # Try to find the rating value near the "Ratings & Reviews" h2
            rating_value = None
            rating_h2 = await page.query_selector('h2.Text.Text__title2:has-text("Ratings & Reviews")')
            if rating_h2:
                # Look for a span with data-testid="ratingStar" or similar nearby
                rating_span = await page.query_selector('span[data-testid="ratingStar"]')
                if rating_span:
                    rating_value = await safe_text(rating_span)
                else:
                    # Try to find a span with class "RatingStatistics__rating"
                    rating_span = await page.query_selector('span.RatingStatistics__rating')
                    if rating_span:
                        rating_value = await safe_text(rating_span)
            info["rating_value"] = rating_value

        # Print the extracted info
        print(json.dumps(info, indent=2, ensure_ascii=False))

        await browser.close()

# If running in a notebook, use asyncio.run only if not already in an event loop
import sys

if sys.platform == "win32":
    asyncio.run(main())
else:
    try:
        import nest_asyncio
        nest_asyncio.apply()
        loop = asyncio.get_event_loop()
        if loop.is_running():
            task = loop.create_task(main())
        else:
            loop.run_until_complete(main())
    except RuntimeError:
        asyncio.run(main())


{
  "title": null,
  "authors": [],
  "description": null,
  "genres": [],
  "edition_details": null,
  "more_editions_links": [],
  "more_information": null,
  "friends_and_following_section": null,
  "readers_also_enjoyed_links": [],
  "discussion_section": null,
  "rating_value": null,
  "rating_count": null,
  "number_of_pages": null,
  "isbn": null,
  "published_date": null,
  "original_title": null
}
