# Books to Scrape - Scraping: Advanced Version (V2)
___

## Import Libraries

In [1]:
import csv
import os
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
import wget
from bs4 import BeautifulSoup as BfS4

print("Libraries imported successfully.")

Libraries imported successfully.


## Scraping Images

In [2]:
# Sanitize a folder or file name by replacing characters that are not valid in file names
def sanitize_filename(name):
    """Return ``name`` with file-name-unsafe characters replaced by ``_``.

    The replaced characters are ``< > : " / \\ | ? *``. The backslash is
    included explicitly: inside a character class ``\\|`` is only an escaped
    pipe, so a pattern written as ``[...\\|...]`` leaves literal backslashes
    untouched.

    Args:
        name: The raw text (for example a book title) to turn into a name.

    Returns:
        A string of the same length with every unsafe character replaced by
        an underscore.
    """
    return re.sub(r'[<>:"/\\|?*]', '_', name)

print("Sanitization function loaded.")

Sanitization function loaded.


In [3]:
# Save a book cover image inside a folder named after its category
def save_image(title, image_url, category):
    """Download ``image_url`` to ``images/category/<category>/<title>.jpg``.

    Both the category and the title are passed through ``sanitize_filename``
    before they become part of the path, because they come from scraped HTML
    and may contain path separators or other characters that are not valid in
    file names. An image that already exists on disk is not downloaded again.

    Args:
        title: Book title; used (sanitized) as the file name.
        image_url: Absolute URL of the image to download.
        category: Category name; used (sanitized) as the folder name.

    Returns:
        None. Any exception raised while preparing the folder or downloading
        is caught and reported with ``print`` so that one failing image does
        not abort the caller's loop.
    """
    try:
        # Debug output to confirm which category was extracted
        print(f"Category: {category}")

        # Build the category folder from a sanitized name and create it if needed.
        # This happens inside the try block so that a folder error is reported
        # in the same best-effort way as a download error.
        path = f"images/category/{sanitize_filename(category)}/"
        Path(path).mkdir(parents=True, exist_ok=True)

        # Sanitize the file name to avoid invalid characters
        sanitized_title = sanitize_filename(title)
        image_filename = f"{path}{sanitized_title}.jpg"

        # Skip the download when the image is already present in the category folder
        if not os.path.exists(image_filename):
            wget.download(image_url, image_filename, bar=None)
            print(f"Image for '{sanitized_title}' saved in '{category}' folder.")
        else:
            print(f"Image for '{sanitized_title}' already exists, skipping download.")

    except Exception as e:
        print(f"Error downloading image for '{title}': {e}")

print("Image saving function loaded.")

Image saving function loaded.


In [4]:
# Scrape the links of all categories, including the extra pages of multi-page categories
def scraping_category():
    """Collect the URL of every category page linked from the site sidebar.

    The home page is fetched and every ``.side_categories a`` link is turned
    into an absolute URL. Each category (except the ``books_1`` index) is then
    fetched once to read its pager; for a pager reporting ``N`` pages the URLs
    ``page-2.html`` .. ``page-N.html`` are appended after the category's
    ``index.html`` URL.

    Returns:
        A list of category page URLs with the first collected link dropped
        (presumably the all-books ``books_1`` index, which is also the href
        skipped during pagination - verify against the live page). An empty
        list is returned when the home page cannot be loaded, so callers can
        always iterate over the result.
    """
    print("---------- Starting category scraping ----------")
    url = "http://books.toscrape.com/"
    response = requests.get(url)
    if not response.ok:
        print(f"Failed to load {url} (HTTP {response.status_code}).")
        return []

    links_of_categories_all = []
    soup = BfS4(response.content, "html.parser")
    for category in soup.select(".side_categories a"):
        href = category["href"]
        link = f"http://books.toscrape.com/{href}"
        links_of_categories_all.append(link)

        # The all-books index is not inspected for additional pages
        if href == "catalogue/category/books_1/index.html":
            continue

        # Separate names keep the home-page response/soup intact while looping
        category_response = requests.get(link)
        if not category_response.ok:
            continue
        category_soup = BfS4(category_response.content, "html.parser")

        for pager in category_soup.find_all("ul", class_="pager"):
            current = pager.find("li", class_="current")
            if current is None:
                continue
            # The pager text is expected to end in "of <total>"; parse the total
            # with a regex instead of relying on a fixed character offset.
            match = re.search(r"of\s+(\d+)", current.get_text())
            if match is None:
                continue
            total_pages = int(match.group(1))
            page_base = link.replace('index.html', '')
            for page_number in range(2, total_pages + 1):
                links_of_categories_all.append(f"{page_base}page-{page_number}.html")

    links_of_categories = links_of_categories_all[1:]
    print(f"Found {len(links_of_categories)} category links.")
    return links_of_categories

print("Category scraping function loaded.")

Category scraping function loaded.


In [5]:
# Collect the detail-page links of every book listed on the given category pages
def scrape_links_of_books_in_category(category_links):
    """Return the book links found on each page in ``category_links``.

    Every URL is stripped of surrounding whitespace and fetched; pages whose
    response is not OK are skipped. For each ``article.product_pod`` on a
    page, the href of its first ``<a>`` has every ``../../../`` removed and is
    appended to the fixed catalogue prefix.

    Args:
        category_links: Iterable of category page URLs (strings).

    Returns:
        A list of book URLs, in the order the pages and articles were visited.
    """
    print("---------- Starting book scraping in categories ----------")
    catalogue_prefix = "http://books.toscrape.com/catalogue/"
    collected = []
    for page_url in category_links:
        page = requests.get(page_url.strip())
        if not page.ok:
            continue
        parsed = BfS4(page.content, "html.parser")
        collected.extend(
            catalogue_prefix + pod.find("a")["href"].replace("../../../", "")
            for pod in parsed.find_all("article", class_="product_pod")
        )
    print(f"Found {len(collected)} book links in categories.")
    return collected

print("Book link scraping function loaded.")


Book link scraping function loaded.


In [6]:
# Scrape title, category and cover image of every book listed on a single page
def scrape_books_from_category_page(url):
    """Scrape the ``product_pod`` entries of one page and download their images.

    For every ``article.product_pod`` found on the page, a record with the
    keys ``'Title'``, ``'Category'`` and ``'Image URL'`` is built and the image
    is stored through ``save_image``.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of record dicts (possibly empty). An empty list is also
        returned when the request fails; the error is printed.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return []

    # Parse from bytes, as the other scraping cells in this notebook do, so that
    # BeautifulSoup determines the encoding itself.
    soup = BfS4(response.content, 'html.parser')
    books = soup.find_all('article', class_='product_pod')

    # The category is a property of the page, not of a single book, so it is
    # looked up once. It is taken from the first link whose href contains
    # "/category/books/"; when no such link exists a placeholder is used
    # instead of failing on None.
    # NOTE(review): which pages actually carry such a link cannot be verified
    # from this notebook - confirm against the pages passed by the caller.
    category_anchor = soup.find("a", attrs={"href": re.compile("/category/books/")})
    category = category_anchor.get_text(strip=True) if category_anchor else ""
    if not category:
        category = "Unknown"

    book_data = []
    for book in books:
        title = book.h3.a['title']
        # Resolve the relative src against the page URL. A plain
        # str.replace("../../", base) substitutes every occurrence, so a src
        # with more than one "../../" prefix would get the base URL twice.
        image_url = urljoin(response.url, book.find('img')['src'])

        book_data.append({
            'Title': title,
            'Category': category,
            'Image URL': image_url
        })
        save_image(title, image_url, category)

    print(f"Scraped {len(book_data)} books from category page.")
    return book_data

print("Book data scraping function loaded.")

Book data scraping function loaded.


In [7]:
# Collect the scraped book information for every given page link
def category_info(links):
    """Run ``scrape_books_from_category_page`` on each link and gather the results.

    Args:
        links: Iterable of page URLs to scrape.

    Returns:
        A list with one entry per link, each entry being the list returned by
        ``scrape_books_from_category_page`` for that link. The result for the
        last link is also shown with ``display``; nothing is displayed when
        ``links`` is empty (there is no "last result" to show in that case).
    """
    information = [scrape_books_from_category_page(link) for link in links]
    print(f"Collected information for {len(information)} categories.")
    if information:
        display(information[-1])
    return information

print("Category information function loaded.")

Category information function loaded.


## Scraping Data

In [8]:
# Function to retrieve the description and price breakdown from a book's detail page
def scrape_book_details(book_link):
    """Fetch a book page and extract its description and price table values.

    Args:
        book_link: URL of the book's detail page.

    Returns:
        A dict with the keys ``'description'``, ``'price_incl_tax'``,
        ``'price_excl_tax'`` and ``'price_tax'``. A price whose table row is
        missing is reported as ``'N/A'``; a missing description meta tag as
        ``'No description available'``. An empty dict is returned when the
        request fails (the error is printed).
    """
    try:
        response = requests.get(book_link)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return {}

    # Parse from bytes, as the other scraping cells in this notebook do, so that
    # BeautifulSoup determines the encoding itself.
    # NOTE(review): this matters for non-ASCII price symbols if the server
    # does not declare a charset - confirm against the live response.
    soup = BfS4(response.content, 'html.parser')
    meta = soup.find('meta', {'name': 'description'})
    if meta is not None and meta.has_attr('content'):
        description = meta['content']
    else:
        description = 'No description available'

    # Explicit mapping from the table header label to the result key. Writing
    # into locals() (or calling setattr on it) cannot assign function locals,
    # and names derived from the labels would not match the result keys.
    price_fields = {
        'Price (incl. tax)': 'price_incl_tax',
        'Price (excl. tax)': 'price_excl_tax',
        'Tax': 'price_tax',
    }
    details = {'description': description}
    for label, key in price_fields.items():
        header = soup.find('th', string=label)
        value_cell = header.find_next_sibling('td') if header else None
        details[key] = value_cell.get_text(strip=True) if value_cell else 'N/A'

    return details

print("Book detail scraping function loaded.")

Book detail scraping function loaded.


In [None]:
# Function to retrieve book links from a catalog page
def scrape_links_of_books_from_page(page_url):
    """Return the absolute detail-page URL of every book listed on ``page_url``.

    Each ``article.product_pod`` contributes the href of its first ``<a>``.
    The href is resolved against the URL of the fetched page, so it is
    handled correctly whether it is relative to the site root, to the
    ``catalogue/`` folder, or uses ``../`` segments. Prefixing a fixed
    ``/catalogue/`` instead yields a doubled ``catalogue/catalogue/`` path
    when the page is the site root and its hrefs already start with
    ``catalogue/``.

    Args:
        page_url: URL of the listing page to scan.

    Returns:
        A list of absolute book URLs; empty when the response is not OK.
    """
    books_in_page = []
    response = requests.get(page_url)
    if response.ok:
        soup = BfS4(response.content, "html.parser")
        # Every article with the "product_pod" class holds one book's information
        for article in soup.find_all("article", class_="product_pod"):
            anchor = article.find("a")
            if anchor is None or not anchor.get("href"):
                continue  # nothing to link to in this article
            # Build the full link to the book's detail page
            books_in_page.append(urljoin(response.url, anchor["href"]))
    return books_in_page

print("Retrieve book links from catalogs function loaded.")

In [None]:
# Function to retrieve the detailed data of a single book
def scrape_book_data(book_link):
    """Scrape one book's detail page into a flat record.

    The title and image URL come from the first ``<img>`` on the page, the
    price from ``p.price_color``, the availability from the table row headed
    "Availability" and the rating from the second CSS class of
    ``p.star-rating``. Description and tax-related prices are obtained from
    ``scrape_book_details``, which fetches the same page again.

    Args:
        book_link: URL of the book's detail page.

    Returns:
        A dict with the keys "Title", "Price", "Price including tax",
        "Price excluding tax", "Price Tax", "Availability",
        "Product Description", "Rating", "Image URL" and "Link", or ``None``
        when the response is not OK.
    """
    print(f"Scraping {book_link} ...")
    response = requests.get(book_link)
    if not response.ok:
        return None

    soup = BfS4(response.content, "html.parser")
    image = soup.find("img")
    # Turn the relative image path into an absolute URL based on the page URL
    image_url = urljoin(response.url, image["src"])
    title = image["alt"]
    price = soup.find('p', class_='price_color').text
    availability = (
        soup.find("th", string="Availability")
        .find_next_sibling("td")
        .get_text(strip=True)
    )
    rating = soup.find("p", attrs={'class': 'star-rating'}).get("class")[1]

    # scrape_book_details returns an empty dict when its own request fails, so
    # read the values with defaults instead of indexing (which raised KeyError).
    details = scrape_book_details(book_link)

    return {
        "Title": title,
        "Price": price,
        "Price including tax": details.get('price_incl_tax', 'N/A'),
        "Price excluding tax": details.get('price_excl_tax', 'N/A'),
        "Price Tax": details.get('price_tax', 'N/A'),
        "Availability": availability,
        "Product Description": details.get('description', 'No description available'),
        "Rating": rating,
        "Image URL": image_url,
        "Link": book_link
    }

print("Retrieve the detailed data of a single book function loaded.")

In [None]:
# Scrape the books listed on the first ``total_pages`` catalog pages
def scrape_books_from_pages(base_url, total_pages):
    """Scrape every book found on catalog pages 1 .. ``total_pages``.

    Page 1 is ``base_url`` itself; page ``k > 1`` is
    ``{base_url}catalogue/page-{k}.html``. For each page the book links are
    collected first, then each link is scraped; results that are falsy
    (for example ``None``) are dropped. The function sleeps one second after
    every page.

    Args:
        base_url: Root URL of the catalog.
        total_pages: Number of pages to process, counted from page 1.

    Returns:
        A list with one record per successfully scraped book.
    """
    collected = []
    for page in range(1, total_pages + 1):
        url = base_url if page == 1 else f"{base_url}catalogue/page-{page}.html"
        print(f"Scraping page {page}: {url}")

        # Fetch the page's link list up front, then scrape the books one by one
        page_links = scrape_links_of_books_from_page(url)
        scraped = (scrape_book_data(link) for link in page_links)
        collected.extend(record for record in scraped if record)

        # Pause between pages to avoid sending too many requests
        time.sleep(1)

    return collected

print("Scraping books from multiple catalog pages function loaded.")

In [12]:
# Function to save scraping results to a CSV file
def save_to_csv(data, filename):
    """Write a list of dict records to ``filename`` as UTF-8 CSV.

    The header is the union of the keys of all records, in first-seen order,
    so records that do not all share the same keys can still be written; a
    key missing from a record is written as an empty cell. When every record
    has the same keys the output is identical to using the first record's
    keys as the header.

    Args:
        data: Sequence of dicts to write. When empty (or ``None``) nothing is
            written and a message is printed.
        filename: Path of the CSV file to create or overwrite.

    Returns:
        None.
    """
    if not data:
        print("No data to save.")
        return

    # dict.fromkeys keeps insertion order and removes duplicate keys
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(data)
    print(f"Data saved to {filename}")

print("CSV saving function loaded.")

CSV saving function loaded.


## Main Process

### Scrape Image

In [None]:
# Collect the category page links returned by scraping_category() and show them
all_categories = scraping_category()
display(all_categories)

In [None]:
# Gather the book links found on every collected category page and show them
links = scrape_links_of_books_in_category(all_categories)
display(links)

In [None]:
# Scrape each link with category_info(); the scraping function it calls also
# downloads the images through save_image()
book_data = category_info(links)
display(book_data)

### Scrape Data

In [None]:
base_url = 'http://books.toscrape.com/'  # Base URL of the book catalogue
total_pages = 3  # Number of pages to scrape; change this as needed

# Scrape the books from several catalogue pages
books_data = scrape_books_from_pages(base_url, total_pages)
display(books_data)

In [None]:
# Write the scraped book records to books_data.csv in the working directory
save_to_csv(books_data, 'books_data.csv')

In [None]:
# Final status message; printed unconditionally whenever this cell is run
print("Main scraping process Success.")

In [None]:
# def main():
#     # Scrape Image
#     all_categories = scraping_category()
#     display(all_categories)
#     print("=================")
#     links = scrape_links_of_books_in_category(all_categories)
#     display(links)
#     print("=================")
#     book_data = category_info(links)
#     display(book_data)
#     print("=================")
    
#     # Scrape Data
#     base_url = 'http://books.toscrape.com/'  # Base URL of the book catalogue
#     total_pages = 3  # Number of pages to scrape; change this as needed
#     # Scrape the books from several catalogue pages
#     books_data = scrape_books_from_pages(base_url, total_pages)
#     save_to_csv(books_data, 'books_data.csv')

# print("Main scraping process loaded.")

# # Running the main function to execute scraping
# main()

In [None]:
# Read the CSV file and display the first few rows
df = pd.read_csv('books_data.csv')
df.head()