## 1. Installing the necessary libraries and Browser Driver

In [None]:
# Import the required libraries
import csv
import os
import re
import time

import lxml.html
from bs4 import BeautifulSoup
from lxml import etree
from lxml import html
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## 2. Getting metadata
Here, we still use Selenium to simulate the human action of browsing a webpage to scrape book reviews, i.e., username, time of the book review, content of the book review, user label, rating, and support of the book review (i.e., the number of likes).

Since Goodreads displays only the top 10 pages of reviews for each book, with 30 reviews per page, at most 300 reviews can be retrieved per book. We therefore use two procedures: one for books with 300 or more reviews, and one for books with fewer than 300 reviews. This keeps the code simpler and reduces the chance of errors.

The code can be reused: just replace the URL with that of another science fiction book. The list of science fiction book URLs has already been scraped in get_metadata.

**If the amount of book reviews is greater than or equal to 300,** please choose to use the first code.

In [None]:
# Path to the locally downloaded ChromeDriver executable.
chromedriver_path = '/Users/wanshuo/Desktop/Master/DH_MA_thesis/dataset/chromedriver-mac-arm64/chromedriver'

# Browser options for this scraping session.
chrome_options = webdriver.ChromeOptions()
# Image preference: 1 loads images, 2 blocks them. Images are not scraped here,
# so they are blocked to make pages load faster.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)

# The driver location has to be given to Selenium through a Service object.
# Passing it to the browser as a '--webdriver-path' argument (as was done
# before) does not tell Selenium which driver executable to start.
# If the file is not present on this machine, fall back to Selenium's default
# driver discovery instead of failing.
driver_kwargs = {"options": chrome_options}
if os.path.isfile(chromedriver_path):
    driver_kwargs["service"] = Service(executable_path=chromedriver_path)

# Start the browser and open the book page.
driver = webdriver.Chrome(**driver_kwargs)
driver.get('https://www.goodreads.com/book/show/63103521-ruins-of-sea-and-souls')
# Fixed 10-second pause so the page can finish loading.
time.sleep(10)
# Maximize the window so the elements clicked later are visible.
driver.maximize_window()

# Absolute XPath of the close button on the popup that may cover the page.
close_button_xpath = '/html/body/div[3]/div/div[1]/div/div/button'
try:
    # Look the button up and click it in a single step.
    driver.find_element(By.XPATH, close_button_xpath).click()
except NoSuchElementException:
    # No popup was shown, so there is nothing to dismiss.
    pass

# Pause for 2 seconds before the next interaction with the page.
time.sleep(2)

# The first 'Load More' control sits inside the ReviewsSection; every later
# click targets a button found through a different XPath.
first_load_more_xpath = '//*[@id="ReviewsSection"]/div[6]/div[4]/a'
next_load_more_xpath = '//*[@id="__next"]/div[2]/main/div[1]/div[2]/div[5]/div[5]/div/button'

# Click 'Load More' up to 9 times to reveal additional reviews.
for i in range(9):
    load_more_button_xpath = first_load_more_xpath if i == 0 else next_load_more_xpath

    try:
        load_more_button = driver.find_element(By.XPATH, load_more_button_xpath)
    except NoSuchElementException:
        # Without this guard a missing button aborts the whole run and the
        # reviews already loaded are lost. Stop clicking and carry on instead.
        print(f"'Load More' button not found after {i} click(s); continuing with the reviews loaded so far.")
        break

    # Move to the button before clicking it.
    ActionChains(driver).move_to_element(load_more_button).click().perform()
    # Fixed 10-second pause so the newly requested reviews can load.
    time.sleep(10)

# Capture the fully loaded page, then close the browser. Everything below works
# on the captured HTML only, and quitting here stops each run from leaving a
# Chrome/chromedriver process behind.
try:
    html_content = driver.page_source
finally:
    driver.quit()

# Parse the captured HTML with BeautifulSoup.
soup = BeautifulSoup(html_content, 'html.parser')

# Result lists, one per extracted field; they are filled by the steps below.
name_list = []
review_count_list = []
followers_list = []
rating_list = []
date_list = []
likes_list = []
tag_list = []
review_texts_list = []

def _parse_count(text):
    """Return the first number found in *text* as a digit string, or None.

    Thousands separators are removed and a 'k' / 'm' suffix written directly
    after the number is expanded, so '4,115 followers' gives '4115' and
    '1.2k followers' gives '1200'. Simply appending '000' to the digits, as was
    done before, turned '1.2k' into '12000'.
    """
    match = re.search(r'(\d[\d,]*(?:\.\d+)?)(?:([kKmM])\b)?', text)
    if match is None:
        return None
    number = float(match.group(1).replace(',', ''))
    scale = {'k': 1_000, 'm': 1_000_000}.get((match.group(2) or '').lower(), 1)
    # round() removes floating-point noise such as 84.9 * 1000 = 84900.00000000001.
    return str(round(number * scale))


# Reviewer names: the text of the link inside each name block.
for name_element in soup.find_all('div', class_='ReviewerProfile__name'):
    name_link = name_element.a
    # Store None when the link is missing so the list keeps one entry per block.
    name_list.append(name_link.text if name_link is not None else None)

# Review count and follower count both come from the same meta block, so they
# are read in a single pass. Each list gets exactly one entry per block.
for meta_element in soup.find_all('div', class_='ReviewerProfile__meta'):
    span_elements = meta_element.find_all('span')

    # The first span holds the review count.
    review_count = _parse_count(span_elements[0].text) if span_elements else None
    review_count_list.append(review_count)

    # The follower count is in the first span whose text mentions 'follower'
    # (this also matches 'followers').
    followers_span = next(
        (span for span in span_elements if 'follower' in span.text), None)
    if followers_span is not None:
        followers_count = _parse_count(followers_span.text)
    else:
        # No follower information for this reviewer.
        followers_count = None
    followers_list.append(followers_count)

# Build an lxml tree from the captured HTML so XPath can be used.
root = html.fromstring(html_content)
# One 'ReviewCard__row' section is selected per review.
reviews = root.xpath('//section[@class="ReviewCard__row"]')
for review in reviews:
    # Default when the review shows no usable star rating.
    rating = None
    stars = review.xpath('.//span[@class="RatingStars RatingStars__small"]')
    if stars:
        # The rating is read from the 'aria-label' attribute of the stars span.
        label = stars[0].attrib.get('aria-label', '')
        if 'Rating' in label:
            # The rating value is the second whitespace-separated word.
            rating = label.split()[1]
    rating_list.append(rating)

# Review dates: the link text inside 'Text Text__body3' spans.
for candidate in soup.find_all('span', class_='Text Text__body3'):
    date_link = candidate.a
    if date_link is None:
        # Spans of this class without a link are skipped.
        continue
    date_list.append(date_link.text)

# Build an lxml tree from the captured HTML so XPath can be used.
root = lxml.html.fromstring(html_content)
# One 'ReviewCard__tags' section is selected per tag area.
html_tags = root.xpath('//section[@class="ReviewCard__tags"]')
for html_tag in html_tags:
    # Label text of every inline tag button in this section.
    tags = html_tag.xpath('.//a[contains(@class, "Button--tag-inline")]/span[@class="Button__labelItem"]/text()')
    # Join the tags with a fullwidth semicolon (U+FF1B). The separator literal
    # had been corrupted into the mis-decoded sequence 'Ôºõ'.
    tags_string = "\uff1b".join(tags)
    # A section without tags contributes an empty string.
    tag_list.append(tags_string)

# Build an lxml tree from the captured HTML so XPath can be used.
root = lxml.html.fromstring(html_content)
# One 'SocialFooter' footer is selected per entry.
html_likes = root.xpath('//footer[@class="SocialFooter"]')
# Relative XPath of the button label text that carries the like count.
like_label_xpath = './/button[contains(@class, "Button Button--inline Button--small Button--subdued")]/span[@class="Button__labelItem"]/text()'
for footer in html_likes:
    labels = footer.xpath(like_label_xpath)
    # 'like' also matches 'likes', so one substring test covers both forms.
    if labels and 'like' in labels[0]:
        # Keep only the digit characters of the label.
        likes_count = ''.join(ch for ch in labels[0] if ch.isdigit())
    else:
        # No like information in this footer.
        likes_count = None
    likes_list.append(likes_count)

# Build an lxml tree from the captured HTML so XPath can be used.
root = lxml.html.fromstring(html_content)
# One 'ReviewText__content' section is selected per review body.
html_contents = root.xpath('//section[@class="ReviewText__content"]')
# Relative XPath of the text nodes that make up the review body.
# NOTE(review): '/text()' selects only text nodes that are direct children of
# the 'Formatted' span; text inside nested elements is presumably left out -
# confirm whether that is intended.
review_text_xpath = './/div[contains(@class, "TruncatedContent__text TruncatedContent__text--large")]/span[@class="Formatted"]/text()'
# Concatenate the text nodes of each section and trim surrounding whitespace.
review_texts_list.extend(
    ''.join(section.xpath(review_text_xpath)).strip()
    for section in html_contents
)

# Combine the per-field lists into one record per review; zip() stops at the
# shortest list.
review_rows = zip(name_list, review_count_list, followers_list, rating_list,
                  date_list, tag_list, likes_list, review_texts_list)
for index, row in enumerate(review_rows):
    name, review_count, followers, rating, date, tags, likes, div_content = row
    # Print one line per review, numbered from 1.
    print(f"Review {index + 1}: Reviewer: {name}, Review Count: {review_count}, Followers: {followers}, Rating: {rating}, Date: {date}, Tags: {tags}, Likes: {likes}, Content: {div_content}")


Review 1: Reviewer: Maddi, Review Count: 111, Followers: 3, Rating: 5, Date: July 29, 2023, Tags: , Likes: 35, Content: Ahem. May I have your attention please? Do you enjoy badass magic systems, strong FMCs, court politics, found family, sexy-broody-winged fae, steamy and secret romance, beautiful and cohesive world building, and phenomenally written character growth? No? Then you will HATE this series. Move along.Now, onto our review for those of you that have good taste.I just wanted to give Creon a hug throughout this whole book. He is such a great MMC and he and Em just complement each other so beautifully, even when they are butting heads and working through their trauma together. Their relationship really developed so much in this book; I am so impressed by how Lisette manages to hold on to just the right amount of tension and conflict without compromising the feeling of a realistic, loving relationship.I LOVED getting to see all of our beloved side characters in one place and ta

**If the amount of book reviews is less than 300,** please choose to use the second code.

In [None]:
# Path to the locally downloaded ChromeDriver executable.
chromedriver_path = '/Users/wanshuo/Desktop/Master/DH_MA_thesis/dataset/chromedriver-mac-arm64/chromedriver'

# Browser options for this scraping session.
chrome_options = webdriver.ChromeOptions()
# Image preference: 1 loads images, 2 blocks them. Images are not scraped here,
# so they are blocked to make pages load faster.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)

# The driver location has to be given to Selenium through a Service object.
# Passing it to the browser as a '--webdriver-path' argument (as was done
# before) does not tell Selenium which driver executable to start.
# If the file is not present on this machine, fall back to Selenium's default
# driver discovery instead of failing.
driver_kwargs = {"options": chrome_options}
if os.path.isfile(chromedriver_path):
    driver_kwargs["service"] = Service(executable_path=chromedriver_path)

# Start the browser and open the book page.
driver = webdriver.Chrome(**driver_kwargs)
driver.get('https://www.goodreads.com/book/show/95585170-gilmat')
# Fixed 10-second pause so the page can finish loading.
time.sleep(10)
# Maximize the window so the elements clicked later are visible.
driver.maximize_window()

# Absolute XPath of the close button on the popup that may cover the page.
close_button_xpath = '/html/body/div[3]/div/div[1]/div/div/button'
try:
    # Look the button up and click it in a single step.
    driver.find_element(By.XPATH, close_button_xpath).click()
except NoSuchElementException:
    # No popup was shown, so there is nothing to dismiss.
    pass
# Pause for 2 seconds before the next interaction with the page.
time.sleep(2)

# XPath of the first 'Load More' control, located inside the reviews section.
load_more_button_xpath = '//*[@id="ReviewsSection"]/div[6]/div[4]/a'
# XPath of the 'Load More' button used for every click after the first one.
other_load_more_button_xpath = '//*[@id="__next"]/div[2]/main/div[1]/div[2]/div[5]/div[5]/div/button'

# Per this notebook's notes, Goodreads shows at most 10 pages of reviews per
# book, so at most 9 further pages can be loaded. The cap also guarantees the
# loop below ends even if the button never disappears.
MAX_LOAD_MORE_CLICKS = 9


def scroll_to_next_page(load_more_button_xpath):
    """Click the 'Load More' control at the given XPath and wait for new reviews.

    Args:
        load_more_button_xpath: XPath that locates the control to click.

    Raises:
        NoSuchElementException: If no element matches the XPath.
    """
    load_more_button = driver.find_element(By.XPATH, load_more_button_xpath)
    # Move to the button before clicking it.
    ActionChains(driver).move_to_element(load_more_button).click().perform()
    # Fixed 10-second pause so the newly requested reviews can load.
    time.sleep(10)


# Keep clicking 'Load More' until the button is gone or the page cap is reached.
for _ in range(MAX_LOAD_MORE_CLICKS):
    try:
        scroll_to_next_page(load_more_button_xpath)
    except NoSuchElementException:
        # No button left: every available review has been loaded.
        break
    # After the first click, switch to the XPath of the follow-up button.
    load_more_button_xpath = other_load_more_button_xpath


# Capture the fully loaded page, then close the browser. Everything below works
# on the captured HTML only, and quitting here stops each run from leaving a
# Chrome/chromedriver process behind.
try:
    html_content = driver.page_source
finally:
    driver.quit()

# Parse the captured HTML with BeautifulSoup.
soup = BeautifulSoup(html_content, 'html.parser')

# Result lists, one per extracted field; they are filled by the steps below.
name_list = []
review_count_list = []
followers_list = []
rating_list = []
date_list = []
likes_list = []
tag_list = []
review_texts_list = []

def _parse_count(text):
    """Return the first number found in *text* as a digit string, or None.

    Thousands separators are removed and a 'k' / 'm' suffix written directly
    after the number is expanded, so '4,115 followers' gives '4115' and
    '1.2k followers' gives '1200'. Simply appending '000' to the digits, as was
    done before, turned '1.2k' into '12000'.
    """
    match = re.search(r'(\d[\d,]*(?:\.\d+)?)(?:([kKmM])\b)?', text)
    if match is None:
        return None
    number = float(match.group(1).replace(',', ''))
    scale = {'k': 1_000, 'm': 1_000_000}.get((match.group(2) or '').lower(), 1)
    # round() removes floating-point noise such as 84.9 * 1000 = 84900.00000000001.
    return str(round(number * scale))


# Reviewer names: the text of the link inside each name block.
for name_element in soup.find_all('div', class_='ReviewerProfile__name'):
    name_link = name_element.a
    # Store None when the link is missing so the list keeps one entry per block.
    name_list.append(name_link.text if name_link is not None else None)

# Review count and follower count both come from the same meta block, so they
# are read in a single pass. Each list gets exactly one entry per block.
for meta_element in soup.find_all('div', class_='ReviewerProfile__meta'):
    span_elements = meta_element.find_all('span')

    # The first span holds the review count.
    review_count = _parse_count(span_elements[0].text) if span_elements else None
    review_count_list.append(review_count)

    # The follower count is in the first span whose text mentions 'follower'
    # (this also matches 'followers').
    followers_span = next(
        (span for span in span_elements if 'follower' in span.text), None)
    if followers_span is not None:
        followers_count = _parse_count(followers_span.text)
    else:
        # No follower information for this reviewer.
        followers_count = None
    followers_list.append(followers_count)

# Build an lxml tree from the captured HTML so XPath can be used.
root = html.fromstring(html_content)
# One 'ReviewCard__row' section is selected per review.
reviews = root.xpath('//section[@class="ReviewCard__row"]')
for review in reviews:
    # Default when the review shows no usable star rating.
    rating = None
    stars = review.xpath('.//span[@class="RatingStars RatingStars__small"]')
    if stars:
        # The rating is read from the 'aria-label' attribute of the stars span.
        label = stars[0].attrib.get('aria-label', '')
        if 'Rating' in label:
            # The rating value is the second whitespace-separated word.
            rating = label.split()[1]
    rating_list.append(rating)

# Review dates: the link text inside 'Text Text__body3' spans.
for candidate in soup.find_all('span', class_='Text Text__body3'):
    date_link = candidate.a
    if date_link is None:
        # Spans of this class without a link are skipped.
        continue
    date_list.append(date_link.text)

# Build an lxml tree from the captured HTML so XPath can be used.
root = lxml.html.fromstring(html_content)
# One 'ReviewCard__tags' section is selected per tag area.
html_tags = root.xpath('//section[@class="ReviewCard__tags"]')
for html_tag in html_tags:
    # Label text of every inline tag button in this section.
    tags = html_tag.xpath('.//a[contains(@class, "Button--tag-inline")]/span[@class="Button__labelItem"]/text()')
    # Join the tags with a fullwidth semicolon (U+FF1B). The separator literal
    # had been corrupted into the mis-decoded sequence 'Ôºõ'.
    tags_string = "\uff1b".join(tags)
    # A section without tags contributes an empty string.
    tag_list.append(tags_string)

# Build an lxml tree from the captured HTML so XPath can be used.
root = lxml.html.fromstring(html_content)
# One 'SocialFooter' footer is selected per entry.
html_likes = root.xpath('//footer[@class="SocialFooter"]')
# Relative XPath of the button label text that carries the like count.
like_label_xpath = './/button[contains(@class, "Button Button--inline Button--small Button--subdued")]/span[@class="Button__labelItem"]/text()'
for footer in html_likes:
    labels = footer.xpath(like_label_xpath)
    # 'like' also matches 'likes', so one substring test covers both forms.
    if labels and 'like' in labels[0]:
        # Keep only the digit characters of the label.
        likes_count = ''.join(ch for ch in labels[0] if ch.isdigit())
    else:
        # No like information in this footer.
        likes_count = None
    likes_list.append(likes_count)

# Build an lxml tree from the captured HTML so XPath can be used.
root = lxml.html.fromstring(html_content)
# One 'ReviewText__content' section is selected per review body.
html_contents = root.xpath('//section[@class="ReviewText__content"]')
# Relative XPath of the text nodes that make up the review body.
# NOTE(review): '/text()' selects only text nodes that are direct children of
# the 'Formatted' span; text inside nested elements is presumably left out -
# confirm whether that is intended.
review_text_xpath = './/div[contains(@class, "TruncatedContent__text TruncatedContent__text--large")]/span[@class="Formatted"]/text()'
# Concatenate the text nodes of each section and trim surrounding whitespace.
review_texts_list.extend(
    ''.join(section.xpath(review_text_xpath)).strip()
    for section in html_contents
)

# Combine the per-field lists into one record per review; zip() stops at the
# shortest list.
review_rows = zip(name_list, review_count_list, followers_list, rating_list,
                  date_list, tag_list, likes_list, review_texts_list)
for index, row in enumerate(review_rows):
    name, review_count, followers, rating, date, tags, likes, div_content = row
    # Print one line per review, numbered from 1.
    print(f"Review {index + 1}: Reviewer: {name}, Review Count: {review_count}, Followers: {followers}, Rating: {rating}, Date: {date}, Tags: {tags}, Likes: {likes}, Content: {div_content}")


Review 1: Reviewer: Mara, Review Count: 1785, Followers: 4115, Rating: 3, Date: March 5, 2023, Tags: subscription, Likes: 16, Content: The weakest of the series, unfortunately, but still delightful and I liked the way the macro plot wrapped up. I sincerely hope we get more in this world!!
Review 2: Reviewer: Erin, Review Count: 3238, Followers: 476, Rating: 4, Date: April 28, 2023, Tags: , Likes: 11, Content: This is the 7th installment in the series. Each book follows another couple and so they can be read as standalone. I accessed the series with my Kindle Unlimited membership. This is Gilmat and Julie's story. Julie Watson and her mother were referred to in several other books in the series and so I was familiar with her back story. Her mother was trying to marry her off to an eligible bachelor. Gilmat is mentioned as the ranch gardener and keeps to himself. Julie is kidnapped by his brothers and delivered to his doorstep. I think it is safe to say that the connection between the tw

## 3. Storing book reviews locally

In [None]:
# Destination CSV file for the scraped review data.
csv_filename = "reviews-gilmat.csv"

# Column titles, in the same order as the fields zipped together below.
header = ['Reviewer', 'Review Count', 'Followers', 'Rating', 'Date', 'Tags', 'Likes', 'Content']

# Mode 'w' creates the file or overwrites an existing one. newline='' leaves
# line-ending handling to the csv module, and the text is written as UTF-8.
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    # One CSV row per review; zip() stops at the shortest of the lists.
    for record in zip(name_list, review_count_list, followers_list, rating_list,
                      date_list, tag_list, likes_list, review_texts_list):
        writer.writerow(list(record))

# Confirm that the export finished.
print("CSV file has been successfully created!")


CSV file has been successfully created!
