# Books to Scrape - Scraping Using BeautifulSoup: Advanced Version (V2)
___

## Import Library

In [1]:
import requests
from bs4 import BeautifulSoup as BfS4
import wget
import os
import re
from pathlib import Path

print("Libraries imported successfully.")

Libraries imported successfully.


## Scraping Image

In [2]:
# Fungsi untuk sanitasi nama folder atau file (menghilangkan karakter yang tidak valid)
def sanitize_filename(name):
    return re.sub(r'[<>:"/\|?*]', '_', name)

print("Sanitization function loaded.")

Sanitization function loaded.


In [3]:
# Fungsi untuk menyimpan gambar dalam folder kategori
def save_image(title, image_url, category):
    # Tentukan path folder berdasarkan kategori
    path = f"images/category/{category}/"
    
    # Membuat folder kategori jika belum ada
    Path(path).mkdir(parents=True, exist_ok=True)

    try:
        # Debugging kategori
        print(f"Category: {category}")  # Memastikan kategori yang diambil benar
        
        # Sanitasi nama file untuk menghindari karakter tidak valid
        sanitized_title = sanitize_filename(title)
        image_filename = f"{path}{sanitized_title}.jpg"  # Menentukan nama file gambar yang disimpan
        
        # Cek apakah gambar sudah ada di folder kategori
        if not os.path.exists(image_filename):
            # Mengunduh gambar menggunakan wget
            wget.download(image_url, image_filename, bar=None)
            print(f"Image for '{sanitized_title}' saved in '{category}' folder.")
        else:
            print(f"Image for '{sanitized_title}' already exists, skipping download.")
    
    except Exception as e:
        print(f"Error downloading image for '{title}': {e}")

print("Image saving function loaded.")

Image saving function loaded.


In [4]:
# scrape all links of the categories even for multiple pages:
def scraping_category():
    print("---------- Starting category scraping ----------")
    url = "http://books.toscrape.com/"
    response = requests.get(url)
    if response.ok:
        links_of_categories_all = []
        soup = BfS4(response.content, "html.parser")
        categories = soup.select(".side_categories a")
        for category in categories:
            href = category["href"]
            link = f"http://books.toscrape.com/{href}"
            links_of_categories_all.append(link)

            # Cek jika ada halaman tambahan
            if not href == "catalogue/category/books_1/index.html":
                response = requests.get(link)
                if response.ok:
                    soup = BfS4(response.content, "html.parser")
                    next_page = soup.findAll('ul', class_='pager')
                    if next_page:
                        for page in next_page:
                            all_num_page = page.find("li", class_="current").text
                            num_page = int(all_num_page.strip()[10:])
                            counter = 2
                            while num_page > 1:
                                link_next_page = f"{link.replace('index.html', '')}page-{counter}.html"
                                links_of_categories_all.append(link_next_page)
                                num_page -= 1
                                counter += 1

        links_of_categories = links_of_categories_all[1:]
        print(f"Found {len(links_of_categories)} category links.")
        return links_of_categories

print("Category scraping function loaded.")

Category scraping function loaded.


In [5]:
# get all links of the books in one category:
def scrape_links_of_books_in_category(category_links):
    print("---------- Starting book scraping in categories ----------")
    books_in_category = []
    for link in category_links:
        response = requests.get(link.strip())
        if response.ok:
            soup = BfS4(response.content, "html.parser")
            articles = soup.find_all("article", class_="product_pod")
            for article in articles:
                a = article.find("a")
                a_link = a["href"]
                books_in_category.append(
                    f'http://books.toscrape.com/catalogue/{a_link.replace("../../../", "")}'
                )
    print(f"Found {len(books_in_category)} book links in categories.")
    return books_in_category

print("Book link scraping function loaded.")


Book link scraping function loaded.


In [6]:
# Functions for scraping data from book pages in a single category
def scrape_books_from_category_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return []

    soup = BfS4(response.text, 'html.parser')
    books = soup.find_all('article', class_='product_pod')
    book_data = []

    for book in books:
        title = book.h3.a['title']
        image = book.find('img')['src']
        image_url = image.replace("../../", "http://books.toscrape.com/")
        book_link = 'https://books.toscrape.com/catalogue/' + book.h3.a['href'].replace('../../../', '')
        category = soup.find("a", attrs={"href": re.compile("/category/books/")}).string.strip()
        
        book_data.append({
            'Title': title,
            'Category': category,
            'Image URL': image_url
        })
        save_image(title, image_url, category)

    print(f"Scraped {len(book_data)} books from category page.")
    return book_data

print("Book data scraping function loaded.")

Book data scraping function loaded.


In [7]:
# Retrieving category information
def category_info(links):
    information = []
    for link in links:
        book_info = scrape_books_from_category_page(link)
        information.append(book_info)
    print(f"Collected information for {len(information)} categories.")
    display(book_info)
    return information

print("Category information function loaded.")

Category information function loaded.


## Main Process

In [8]:
all_categories = scraping_category()
display(all_categories)

---------- Starting category scraping ----------
Found 80 category links.


['http://books.toscrape.com/catalogue/category/books/travel_2/index.html',
 'http://books.toscrape.com/catalogue/category/books/mystery_3/index.html',
 'http://books.toscrape.com/catalogue/category/books/mystery_3/page-2.html',
 'http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html',
 'http://books.toscrape.com/catalogue/category/books/historical-fiction_4/page-2.html',
 'http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html',
 'http://books.toscrape.com/catalogue/category/books/sequential-art_5/page-2.html',
 'http://books.toscrape.com/catalogue/category/books/sequential-art_5/page-3.html',
 'http://books.toscrape.com/catalogue/category/books/sequential-art_5/page-4.html',
 'http://books.toscrape.com/catalogue/category/books/classics_6/index.html',
 'http://books.toscrape.com/catalogue/category/books/philosophy_7/index.html',
 'http://books.toscrape.com/catalogue/category/books/romance_8/index.html',
 'http://books.toscrape.com/ca

In [9]:
links = scrape_links_of_books_in_category(all_categories)
display(links)

---------- Starting book scraping in categories ----------
Found 1000 book links in categories.


['http://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html',
 'http://books.toscrape.com/catalogue/full-moon-over-noahs-ark-an-odyssey-to-mount-ararat-and-beyond_811/index.html',
 'http://books.toscrape.com/catalogue/see-america-a-celebration-of-our-national-parks-treasured-sites_732/index.html',
 'http://books.toscrape.com/catalogue/vagabonding-an-uncommon-guide-to-the-art-of-long-term-world-travel_552/index.html',
 'http://books.toscrape.com/catalogue/under-the-tuscan-sun_504/index.html',
 'http://books.toscrape.com/catalogue/a-summer-in-europe_458/index.html',
 'http://books.toscrape.com/catalogue/the-great-railway-bazaar_446/index.html',
 'http://books.toscrape.com/catalogue/a-year-in-provence-provence-1_421/index.html',
 'http://books.toscrape.com/catalogue/the-road-to-little-dribbling-adventures-of-an-american-in-britain-notes-from-a-small-island-2_277/index.html',
 'http://books.toscrape.com/catalogue/neither-here-nor-there-travels-in-europe_198/index.html',
 'h

In [10]:
book_data = category_info(links)
display(book_data)

Category: Travel
Image for 'Libertarianism for Beginners' saved in 'Travel' folder.
Category: Travel
Image for 'Mesaerion_ The Best Science Fiction Stories 1800-1849' saved in 'Travel' folder.
Category: Travel
Image for 'Olio' saved in 'Travel' folder.
Category: Travel
Image for 'Our Band Could Be Your Life_ Scenes from the American Indie Underground, 1981-1991' saved in 'Travel' folder.
Category: Travel
Image for 'Rip it Up and Start Again' saved in 'Travel' folder.
Category: Travel
Image for 'Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)' saved in 'Travel' folder.
Scraped 6 books from category page.
Category: Travel
Image for 'It's Only the Himalayas' saved in 'Travel' folder.
Category: Travel
Image for 'Libertarianism for Beginners' already exists, skipping download.
Category: Travel
Image for 'Mesaerion_ The Best Science Fiction Stories 1800-1849' already exists, skipping download.
Category: Travel
Image for 'Olio' already exists, skipping download.
Category: Travel
Image

[{'Title': 'Dark Notes',
  'Category': 'Crime',
  'Image URL': 'http://books.toscrape.com/media/cache/6e/4e/6e4e8f4f4abd94356a9be840e4681e65.jpg'},
 {'Title': 'Amid the Chaos',
  'Category': 'Crime',
  'Image URL': 'http://books.toscrape.com/media/cache/52/46/524655fade1d9fe1475395a3eaff827a.jpg'},
 {'Title': "Equal Is Unfair: America's Misguided Fight Against Income Inequality",
  'Category': 'Crime',
  'Image URL': 'http://books.toscrape.com/media/cache/00/11/001153d2a22d889837efac1703e10a5e.jpg'},
 {'Title': 'Why the Right Went Wrong: Conservatism--From Goldwater to the Tea Party and Beyond',
  'Category': 'Crime',
  'Image URL': 'http://books.toscrape.com/media/cache/db/1b/db1babd3c09b84da800b0e9897fe0097.jpg'},
 {'Title': 'The Art and Science of Low Carbohydrate Living',
  'Category': 'Crime',
  'Image URL': 'http://books.toscrape.com/media/cache/4b/d4/4bd43108fb070ad8ebba9cdb00b14069.jpg'},
 {'Title': '10-Day Green Smoothie Cleanse: Lose Up to 15 Pounds in 10 Days!',
  'Category'

[[{'Title': 'Libertarianism for Beginners',
   'Category': 'Travel',
   'Image URL': 'http://books.toscrape.com/media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg'},
  {'Title': 'Mesaerion: The Best Science Fiction Stories 1800-1849',
   'Category': 'Travel',
   'Image URL': 'http://books.toscrape.com/media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg'},
  {'Title': 'Olio',
   'Category': 'Travel',
   'Image URL': 'http://books.toscrape.com/media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg'},
  {'Title': 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
   'Category': 'Travel',
   'Image URL': 'http://books.toscrape.com/media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg'},
  {'Title': 'Rip it Up and Start Again',
   'Category': 'Travel',
   'Image URL': 'http://books.toscrape.com/media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg'},
  {'Title': "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
   'Category': 'Travel',
  