In [1]:
import pandas as pd
import numpy as np

In [2]:
from selenium import webdriver  
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException  
from selenium.webdriver.common.keys import Keys  
from bs4 import BeautifulSoup as soup

# Initialize the WebDriver correctly
service = Service(ChromeDriverManager().install())  # Automatically downloads/upgrades ChromeDriver
browser = webdriver.Chrome(service=service)
browser.get("https://books.toscrape.com/catalogue/category/books/science_22/index.html") 
# The driver.page_source will return the full page HTML code.
html_source = browser.page_source  
browser.quit()

In [3]:
page_soup = soup(html_source,'html.parser')
page_soup

<html class="no-js" lang="en-us"><!--<![endif]--><head>
<title>
    Science | 
     Books to Scrape - Sandbox

</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:29" name="created"/>
<meta content="
    
" name="description"/>
<meta content="width=device-width" name="viewport"/>
<meta content="NOARCHIVE,NOCACHE" name="robots"/>
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
<link href="../../../../static/oscar/favicon.ico" rel="shortcut icon"/>
<link href="../../../../static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
<link href="../../../../static/oscar/js/bootstrap-datetimepicker/bootstrap-datetimepicker.css" rel="stylesheet"/>
<link href="../../../../static/oscar/css/datetimepicker.css" rel="stylesheet" type="text/css"/>
</head>
<body class="default" id="default">
<header class="header

In [15]:
# Now let's see how many HTML product are present in this link:
product_container = page_soup.findAll("article", { "class": "product_pod"})
print(len(product_container))

14


In [24]:
# Let get the image of the product
product_container[0].findAll("div", {"class": "image_container"})[0].img['src']

'../../../../media/cache/d4/8d/d48d5122a15347e9fe2b15ad354d69bf.jpg'

In [21]:
# Let get the price of the product
product_container[0].findAll("p", {"class": "price_color"})[0].text.strip()

'£42.96'

In [22]:
# Let get the availability of the product
product_container[0].findAll("p", {"class": "instock availability"})[0].text.strip()

'In stock'

In [26]:
# Let get the rating of the product
product_container[0].findAll("p", {"class": "star-rating"})[0].text.strip()

''

In [28]:
# Let get the title of the product
product_container[0].findAll("div", {"class": "image_container"})[0].img['alt']

"The Most Perfect Thing: Inside (and Outside) a Bird's Egg"

In [55]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as soup
import pandas as pd
import time

# Configure Selenium WebDriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Website URL
base_url = "https://books.toscrape.com/catalogue/category/books/science_22/index.html"
browser.get(base_url)

# Initialize lists
products = []

# Mapping rating words to numbers
rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

# Scrape Data
try:
    html_source = browser.page_source
    page_soup = soup(html_source, "html.parser")
    product_container = page_soup.findAll("article", {"class": "product_pod"})
    
    for product in product_container:
        img_tag = product.find("img")
        price = product.find("p", {"class": "price_color"})
        availability = product.find("p", {"class": "instock availability"})
        rating = product.find("p", {"class": "star-rating"})
        
        products.append({
            "Title": img_tag["alt"] if img_tag and "alt" in img_tag.attrs else None,
            "Image": img_tag["src"].replace("../../", base_url + "media/") if img_tag else None,
            "Price (£)": float(price.text.strip().replace("£", "")) if price else None,
            "Availability": availability.text.strip() if availability else None,
            "Rating": rating_map.get(rating["class"][1], None) if rating and "class" in rating.attrs else None,
        })

except TimeoutException:
    print("Timeout while loading page.")
except Exception as e:
    print(f"Error occurred: {e}")
finally:
    browser.quit()

# Convert to DataFrame
df = pd.DataFrame(products)
df.drop_duplicates(inplace=True)
df.to_csv("BooksList.csv", index=False)

print("Scraping completed. Data saved to BooksList.csv.")

Scraping completed. Data saved to BooksList.csv.


In [56]:
from fpdf import FPDF
import pandas as pd

# Load the CSV file
file_path = "BookToScrape.csv"
df = pd.read_csv(file_path)

# Ensure column names match the CSV
title_col = "Title"
rating_col = "Rating"
price_col = "Price (£)"  # Fixed column name
availability_col = "Availability"

# Check if necessary columns exist
missing_columns = [col for col in [title_col, rating_col, price_col, availability_col] if col not in df.columns]
if missing_columns:
    raise KeyError(f"Missing columns in CSV: {missing_columns}")

# Initialize PDF
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
pdf.set_font("Arial", style='B', size=14)
pdf.cell(200, 10, "Book List", ln=True, align='C')
pdf.ln(10)

# Set column widths
col_widths = [80, 20, 30, 40]

# Add table headers
headers = ["Title", "Rating", "Price", "Availability"]
pdf.set_font("Arial", style='B', size=12)
for i, header in enumerate(headers):
    pdf.cell(col_widths[i], 10, header, border=1, align='C')
pdf.ln()

# Add rows to the table
pdf.set_font("Arial", size=10)
row_color = False  # Alternate row colors

for _, row in df.iterrows():
    if row_color:
        pdf.set_fill_color(240, 240, 240)  # Light gray background
    else:
        pdf.set_fill_color(255, 255, 255)  # White background
    row_color = not row_color  # Toggle row color

    # Title (multi-cell for wrapping)
    pdf.multi_cell(col_widths[0], 10, row['Title'], border=1, fill=True)

    # Move cursor back to next column
    pdf.set_x(pdf.l_margin + col_widths[0])

    # Other columns
    pdf.cell(col_widths[1], 10, str(row['Rating']), border=1, align='C', fill=True)
    pdf.cell(col_widths[2], 10, f"£{row['Price (£)']:.2f}", border=1, align='C', fill=True)
    pdf.cell(col_widths[3], 10, str(row['Availability']), border=1, align='C', fill=True)
    pdf.ln()

# Save the PDF
pdf.output("BooksList.pdf")
print("PDF successfully created as BooksList.pdf")


PDF successfully created as BooksList.pdf
