<a href="https://colab.research.google.com/github/Ojochideee/Data_Scraping/blob/main/NovelNuggets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time

In [17]:
# Goodreads "Best Books Ever" list: the page this notebook scrapes
bestbook_url = "https://www.goodreads.com/list/show/1.Best_Books_Ever"

# Browser-like User-Agent header passed along with the HTTP requests
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

In [18]:
# Fetch the list page. requests applies no timeout by default, so pass one
# explicitly to keep this cell from hanging forever on an unresponsive server.
page = requests.get(bestbook_url, headers=headers, timeout=30)
page.raise_for_status()  # Fail fast on 4xx/5xx responses

In [19]:
# Parse the downloaded HTML
soup = BeautifulSoup(page.text, 'html.parser')

# Locate the table element the following cells extract book data from
table = soup.find('table', class_='tableList js-dataTooltip')

# soup.find returns None when nothing matches (layout change, block page, ...).
# Fail here with a clear message instead of an AttributeError in a later cell.
if table is None:
    raise RuntimeError(
        "Could not find the book list table ('tableList js-dataTooltip') "
        f"in the response from {bestbook_url}"
    )

In [20]:
# Pull the number, author-name and book-title values out of the list table
numbers = [int(cell.get_text()) for cell in table.find_all('td', class_="number")]
authors = [anchor.get_text().strip() for anchor in table.find_all('a', class_="authorName")]
bookTitle = [anchor.get_text().strip() for anchor in table.find_all('a', class_="bookTitle")]

In [21]:
def _parse_minirating(text):
    """Parse a minirating string into ``(average_rating, rating_count)``.

    Expects text shaped like ``"4.34 avg rating — 9,232,450 ratings"``
    (the singular ``"rating"`` is accepted too).

    Returns:
        A pair of floats. Both are NaN when the text does not match, so the
        caller can still append one value per book and keep columns aligned.
    """
    match = re.search(r'(\d+\.\d+)\s+avg rating\s*—\s*(\d[\d,]*)\s+ratings?', text)
    if match is None:
        return float('nan'), float('nan')
    return float(match.group(1)), float(match.group(2).replace(',', ''))


# Append exactly one entry to each list per book: a conditional append would
# let the two lists drift out of step and break the DataFrame construction.
avrRatings = []
ratings = []
for rating in table.find_all('span', class_="minirating"):
    average, count = _parse_minirating(rating.text)
    avrRatings.append(average)
    ratings.append(count)

# "score: 4,024,922" -> 4024922
scores = [
    int(score.text.strip().split(': ')[-1].replace(',', ''))
    for score in table.find_all('a', string=lambda text: text and 'score:' in text.lower())
]

# "40,937 people voted" -> 40937 (keep only the digits)
peopleVotes = [
    int(''.join(filter(str.isdigit, vote.text.strip().split(': ')[-1])))
    for vote in table.find_all('a', string=lambda text: text and 'people voted' in text.lower())
]

In [26]:
# Turn each title anchor's href into a full URL by prefixing the site root
base_url = 'https://www.goodreads.com'
title_anchors = table.find_all('a', {'class': 'bookTitle'})
book_links = [base_url + anchor.get('href') for anchor in title_anchors]

In [30]:
# Accumulators for the per-book details: one independent list per output column
additional_details = {
    column: [] for column in ('Publication Date', 'Number of Pages')
}

def extract_additional_details(book_url, timeout=10):
    """Scrape the publication date and page count from a single book page.

    The request is sent with the module-level ``headers``.

    Args:
        book_url: URL of the book page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout one stalled connection would block the whole scraping
            loop, because requests waits indefinitely by default.

    Returns:
        A ``(pub_date, pages)`` tuple of strings. An element is ``'N/A'``
        when its tag is absent from the page; both are ``'N/A'`` when the
        request or the parsing raises.
    """
    try:
        response = requests.get(book_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        book_soup = BeautifulSoup(response.text, 'html.parser')

        # Page count: keep only the first space-separated token of the text
        pages_tag = book_soup.find('p', {'data-testid': 'pagesFormat'})
        pages = pages_tag.get_text(strip=True).split(' ')[0] if pages_tag else 'N/A'

        # Publication date: drop the leading "First published " label
        pub_date_tag = book_soup.find('p', {'data-testid': 'publicationInfo'})
        pub_date = pub_date_tag.get_text(strip=True).replace('First published ', '') if pub_date_tag else 'N/A'

        return pub_date, pages
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the row as N/A
        # so one bad page does not abort the whole scrape.
        print(f"Error processing {book_url}: {e}")
        return 'N/A', 'N/A'

# Visit every book page in turn and record its details column by column
for link in book_links:
    pub_date, pages = extract_additional_details(link)
    for column, value in (('Publication Date', pub_date), ('Number of Pages', pages)):
        additional_details[column].append(value)
    time.sleep(1)  # Pause between requests so the server is not hammered

In [31]:
# Bin edges and labels for the rating category:
# [0, 3] -> Low, (3, 4] -> Medium, (4, 5] -> High
bins_category = [0, 3, 4, 5]
bin_labels = ['Low', 'Medium', 'High']

# One column per scraped list
data = {
    'Number': numbers,
    'Author name': authors,
    'Book Title': bookTitle,
    'Average Rating': avrRatings,
    'Ratings': ratings,
    'Score': scores,
    'Votes': peopleVotes,
    'Book Link': book_links,
    'Publication Date': additional_details['Publication Date'],
    'Number of Pages': additional_details['Number of Pages'],
}

# Build the frame and derive the category column in a single expression
df = pd.DataFrame(data).assign(**{
    'Rating Category': lambda frame: pd.cut(
        frame['Average Rating'],
        bins=bins_category,
        labels=bin_labels,
        include_lowest=True,
    ),
})

In [32]:
# Write the scraped table to disk, leaving out the row index
df.to_csv(path_or_buf='best_books.csv', index=False)

In [33]:
# Show the assembled DataFrame as the cell's rich output
df

Unnamed: 0,Number,Author name,Book Title,Average Rating,Ratings,Score,Votes,Book Link,Publication Date,Number of Pages,Rating Category
0,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.34,9232450.0,4024922,40937,https://www.goodreads.com/book/show/2767052-th...,"September 14, 2008",374,High
1,2,J.K. Rowling,Harry Potter and the Order of the Phoenix (Har...,4.50,3585721.0,3197636,32665,https://www.goodreads.com/book/show/2.Harry_Po...,"June 21, 2003",912,High
2,3,Jane Austen,Pride and Prejudice,4.29,4481431.0,2835937,29072,https://www.goodreads.com/book/show/1885.Pride...,"January 28, 1813",279,High
3,4,Harper Lee,To Kill a Mockingbird,4.26,6510108.0,2504431,25577,https://www.goodreads.com/book/show/2657.To_Ki...,"July 11, 1960",323,High
4,5,Markus Zusak,The Book Thief,4.39,2723259.0,1886139,19382,https://www.goodreads.com/book/show/19063.The_...,"September 1, 2005",592,High
...,...,...,...,...,...,...,...,...,...,...,...
95,96,Frank McCourt,"Angela’s Ashes (Frank McCourt, #1)",4.15,643810.0,306736,3425,https://www.goodreads.com/book/show/252577.Ang...,"September 5, 1996",452,High
96,97,Anne Rice,Interview with the Vampire (The Vampire Chroni...,4.02,620460.0,304198,3395,https://www.goodreads.com/book/show/43763.Inte...,"April 12, 1976",346,High
97,98,Miguel de Cervantes Saavedra,Don Quixote,3.90,287698.0,300558,3405,https://www.goodreads.com/book/show/3836.Don_Q...,"January 1, 1605",1023,Medium
98,99,Ernest Hemingway,The Old Man and the Sea,3.81,1226062.0,297572,3409,https://www.goodreads.com/book/show/2165.The_O...,"September 1, 1952",96,Medium
