#### Import Libraries

In [11]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from seleniumwire import webdriver as wiredriver
from bs4 import BeautifulSoup
import time

#### Define Functions

In [12]:
def get_driver():
    """Create a headless Chrome driver backed by selenium-wire.

    Returns:
        A ``seleniumwire`` Chrome WebDriver started with the ``--headless`` flag.
    """
    options = Options()
    # Pass the flag explicitly instead of assigning ``options.headless``: that
    # property was deprecated in Selenium 4.8 and removed in 4.13, so on current
    # Selenium the assignment no longer adds the flag and Chrome opens a visible
    # window. ``--headless`` is the same argument the old property used to add.
    options.add_argument("--headless")
    return wiredriver.Chrome(options=options)

def get_site_data(driver, url, scrolls_num, sleep_time):
    """Open ``url`` in ``driver`` and press END repeatedly to scroll the page.

    Args:
        driver: WebDriver used to load the page.
        url: Address to open.
        scrolls_num: Number of times the END key is sent to the ``<body>`` element.
        sleep_time: Seconds to pause after every key press, and once more at the end.

    Returns:
        The same ``driver`` object that was passed in, after loading and scrolling.
    """
    driver.get(url)
    page_body = driver.find_element(By.TAG_NAME, 'body')

    # Every END press scrolls down to trigger dynamic content loading; the pause
    # that follows gives that content time to load before the next press.
    for _scroll in range(scrolls_num):
        page_body.send_keys(Keys.END)
        time.sleep(sleep_time)

    # One last pause so content requested by the final scroll can load as well.
    time.sleep(sleep_time)
    return driver


#### Gather HTML Data From Goodreads -> goodreads_html_data

In [13]:
# Shelf listing to scrape, split into path and query string for readability
# (adjacent string literals are concatenated into a single URL).
url = (
    "https://www.goodreads.com/review/list/167714888-roee-bar"
    "?utf8=%E2%9C%93&utf8=%E2%9C%93&shelf=&title=roee-bar&per_page=infinite"
)

# Start the browser, load the shelf and scroll it, then snapshot the rendered HTML.
driver = get_driver()
driver = get_site_data(driver, url, scrolls_num=8, sleep_time=5)
goodreads_html_data = driver.page_source


#### Get Book URLs From Gathered Data -> href_values

In [14]:
# Parse the shelf page HTML with BeautifulSoup
soup = BeautifulSoup(goodreads_html_data, 'html.parser')

# The books table is looked up inside the 'js-dataTooltip' div. find() returns
# None when nothing matches, so guard the div before searching inside it.
data_div = soup.find('div', class_='js-dataTooltip')
books_table = data_div.find('table', class_='table stacked') if data_div else None

# Abort with an exception rather than exit(1): exit() is a helper meant for the
# interactive shell, whereas raising stops this cell with a clear error and
# leaves the kernel and its variables available for inspection.
if not books_table:
    raise RuntimeError("Table not found.")

# Every book row has a 'field cover' cell whose link points at the book page
cover_tds = books_table.find_all('td', class_='field cover')

# Collect the href of the link in each cover cell, skipping cells that have no
# link with an href instead of failing on a None lookup.
cover_links = (td.find('a', href=True) for td in cover_tds)
href_values = [link['href'] for link in cover_links if link is not None]

for href_value in href_values:
    print(href_value)

/book/show/40549476-this-is-marketing
/book/show/324748.The_Dip
/book/show/53479927-the-practice
/book/show/7235533-the-way-of-kings
/book/show/37903770-norse-mythology
/book/show/10339170-the-10x-rule
/book/show/59616977-building-a-second-brain
/book/show/50542735-your-next-five-moves
/book/show/115625.The_Psychology_of_Selling
/book/show/155981.Psycho_Cybernetics_A_New_Way_to_Get_More_Living_Out_of_Life
/book/show/25054961-dotcom-secrets
/book/show/199017130-the-diary-of-a-ceo
/book/show/52967057-copywriting-secrets
/book/show/26535513-storytelling-with-data
/book/show/578736.Zig_Ziglar_s_Secrets_of_Closing_the_Sale
/book/show/43306206-the-courage-to-be-disliked
/book/show/3063393-pragmatic-thinking-and-learning
/book/show/123005124-the-32-principles
/book/show/8576838-the-win-without-pitching-manifesto
/book/show/66693.The_Crystal_Shard
/book/show/33004.The_Weekend_Millionaire_s_Secrets_to_Investing_in_Real_Estate
/book/show/25454056-sword-of-destiny
/book/show/18656031-baptism-of-f

#### Gather HTML Data From A Book Page -> book_html_data

In [15]:
# Build the absolute URL of the first book collected from the shelf
# (the extracted hrefs are site-relative paths such as /book/show/...).
url = "https://www.goodreads.com" + href_values[0]

# Reuse the already-open driver to load and scroll the book page,
# then snapshot the rendered HTML.
driver = get_site_data(driver, url, scrolls_num = 5, sleep_time = 5)
book_html_data = driver.page_source

#### Define Book Page Functions

In [19]:
# Gets the book's name from the book's page soup
def get_book_title(soup):
    """Return the 'aria-label' of the <h1> inside the page's title section.

    Args:
        soup: Parsed book page (a BeautifulSoup object).

    Returns:
        The value of the 'aria-label' attribute on the first <h1> found inside
        the first <div> with class 'BookPageTitleSection__title'.

    Note:
        There is no guard for missing elements: if either lookup finds nothing,
        the chained access below fails.
    """
    heading = soup.find('div', class_='BookPageTitleSection__title').find('h1')
    return heading['aria-label']

# Gets the book's genres from the book's page soup
def get_book_genres(soup):
    """Collect the genre names shown on a book page.

    Args:
        soup: Parsed book page (a BeautifulSoup object).

    Returns:
        list: Text of the 'Button__labelItem' span inside every
        'BookPageMetadataSection__genreButton' span, in the order found.
        Genre buttons without a label span are skipped; the list is empty
        when the page has no genre buttons.
    """
    genre_buttons = soup.find_all('span', class_='BookPageMetadataSection__genreButton')

    genres = []
    for button in genre_buttons:
        label = button.find('span', class_='Button__labelItem')
        # find() returns None when the button has no label span; skip that
        # button rather than failing on None.text.
        if label is not None:
            genres.append(label.text)
    return genres

#### Gather Relevant Data From a Book Page's HTML Data

In [20]:
# Parse the book page HTML with BeautifulSoup.
# Note: this rebinds `soup`, which previously held the parsed shelf page.
soup = BeautifulSoup(book_html_data, 'html.parser')

# Read the book's title from the parsed book page
book_title = get_book_title(soup)

# Read the list of genre names from the parsed book page
books_genres = get_book_genres(soup)


# Show the extracted genres as a quick check of the scrape
print(books_genres)

['Business', 'Nonfiction', 'Entrepreneurship', 'Audiobook', 'Buisness', 'Leadership', 'Psychology']


In [18]:
# Close the browser session held by `driver`; the HTML already read from
# page_source is kept in its own variables and is not affected.
driver.quit()