# Books to Scrape - Scraping: Advanced Version (V2)
___

## Import Libraries

In [None]:
import csv
import os
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
import wget
from bs4 import BeautifulSoup as BfS4

print("Libraries imported successfully.")

## Scraping Images

In [None]:
# Sanitize a folder or file name by replacing characters that are not valid in file names.
def sanitize_filename(name):
    """Return *name* with invalid file-name characters replaced by ``_``.

    The replaced characters are ``<``, ``>``, ``:``, ``"``, ``/``, the
    backslash, ``|``, ``?`` and ``*``.

    Args:
        name: The raw folder or file name.

    Returns:
        The sanitized name; every matched character becomes one underscore.
    """
    # The backslash must be escaped inside the character class; a lone
    # backslash before the pipe would only escape the pipe and leave literal
    # backslashes untouched.
    return re.sub(r'[<>:"/\\|?*]', '_', name)


print("Sanitization function loaded.")

In [None]:
# Save a book image inside a folder named after its category.
def save_image(title, image_url, category):
    """Download the image at *image_url* into ``images/category/<category>/``.

    The file is named after the sanitized *title* with a ``.jpg`` extension.
    If a file with that name already exists, the download is skipped. Errors
    raised while downloading are printed rather than propagated.

    Args:
        title: Book title, used (after sanitizing) as the image file name.
        image_url: URL of the image to download.
        category: Category name, used (after sanitizing) as the folder name.

    Returns:
        None.
    """
    # Sanitize the category the same way as the title, so a category name
    # containing path separators or other invalid characters cannot produce
    # an invalid or unintended directory.
    safe_category = sanitize_filename(category)
    path = f"images/category/{safe_category}/"

    # Create the category folder if it does not exist yet.
    Path(path).mkdir(parents=True, exist_ok=True)

    try:
        # Debug output: confirm which category was picked up.
        print(f"Category: {category}")

        # Sanitize the file name to avoid invalid characters.
        sanitized_title = sanitize_filename(title)
        image_filename = f"{path}{sanitized_title}.jpg"

        # Only download when the image is not already in the category folder.
        if not os.path.exists(image_filename):
            wget.download(image_url, image_filename, bar=None)
            print(f"Image for '{sanitized_title}' saved in '{category}' folder.")
        else:
            print(f"Image for '{sanitized_title}' already exists, skipping download.")

    except Exception as e:
        # Best effort: one failed download must not stop the whole scrape.
        print(f"Error downloading image for '{title}': {e}")


print("Image saving function loaded.")

In [None]:
# Scrape the links of all categories, including the extra pages of multi-page categories.
def scraping_category():
    """Collect category page links from the books.toscrape.com home page.

    Every link in the ``.side_categories`` sidebar is collected. For each
    category except the top-level ``books_1`` entry, the category page is
    fetched and, when it has a pager, links for ``page-2.html`` up to the
    last page are appended as well.

    Returns:
        A list of category page URLs without the first collected link, or an
        empty list when the home page cannot be fetched.
    """
    print("---------- Starting category scraping ----------")
    url = "http://books.toscrape.com/"
    response = requests.get(url)
    if not response.ok:
        # Return an empty list instead of an implicit None so callers can
        # iterate over the result safely.
        print(f"Failed to fetch {url} (status {response.status_code}).")
        return []

    links_of_categories_all = []
    soup = BfS4(response.content, "html.parser")
    for category in soup.select(".side_categories a"):
        href = category["href"]
        link = f"http://books.toscrape.com/{href}"
        links_of_categories_all.append(link)

        # The top-level entry is not expanded into its pages.
        if href == "catalogue/category/books_1/index.html":
            continue

        # Check whether this category has additional pages.
        category_response = requests.get(link)
        if not category_response.ok:
            continue

        category_soup = BfS4(category_response.content, "html.parser")
        page_prefix = link.replace("index.html", "")
        for pager in category_soup.find_all("ul", class_="pager"):
            current = pager.find("li", class_="current")
            if current is None:
                continue
            # The pager text is expected to read like "Page 1 of 3"; the last
            # token is the total page count. Taking the last token avoids
            # relying on a fixed character offset.
            num_page = int(current.text.split()[-1])
            links_of_categories_all.extend(
                f"{page_prefix}page-{page_number}.html"
                for page_number in range(2, num_page + 1)
            )

    # Drop the first collected link (the sidebar entry listed first).
    links_of_categories = links_of_categories_all[1:]
    print(f"Found {len(links_of_categories)} category links.")
    return links_of_categories


print("Category scraping function loaded.")

In [None]:
# Gather the links of every book listed on the given category pages.
def scrape_links_of_books_in_category(category_links):
    """Return the book page URLs found on each page in *category_links*.

    Each link is stripped of surrounding whitespace and fetched. Pages whose
    response is not OK are skipped. For every ``article.product_pod`` on a
    page, the ``href`` of its first anchor has its ``../../../`` prefix
    removed and is appended to the catalogue base URL.

    Args:
        category_links: Iterable of category page URLs.

    Returns:
        A list of book page URLs, in the order they were encountered.
    """
    print("---------- Starting book scraping in categories ----------")
    books_in_category = []
    for category_url in category_links:
        page = requests.get(category_url.strip())
        if not page.ok:
            continue
        parsed = BfS4(page.content, "html.parser")
        for pod in parsed.find_all("article", class_="product_pod"):
            a_link = pod.find("a")["href"]
            book_url = f'http://books.toscrape.com/catalogue/{a_link.replace("../../../", "")}'
            books_in_category.append(book_url)
    print(f"Found {len(books_in_category)} book links in categories.")
    return books_in_category


print("Book link scraping function loaded.")


In [None]:
# Scrape title, category and image URL for the books listed on a single page.
def scrape_books_from_category_page(url):
    """Scrape every ``article.product_pod`` on the page at *url*.

    For each product found, the title, the page's category and the absolute
    image URL are recorded, and the image is downloaded through
    ``save_image``.

    Args:
        url: URL of the page to scrape.

    Returns:
        A list of dicts with the keys ``'Title'``, ``'Category'`` and
        ``'Image URL'``. The list is empty when the request fails or the
        page has no products.
    """
    try:
        # A timeout keeps a stalled connection from hanging the whole run;
        # timeouts are RequestException subclasses and are handled below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return []

    soup = BfS4(response.text, 'html.parser')
    books = soup.find_all('article', class_='product_pod')

    # The category link belongs to the page, not to a single product, so it
    # is looked up once. Fall back to a placeholder instead of crashing when
    # no link matches.
    category_link = soup.find("a", attrs={"href": re.compile("/category/books/")})
    category = category_link.get_text(strip=True) if category_link else "Unknown"

    book_data = []
    for book in books:
        title = book.h3.a['title']
        # Resolve the relative image path against the page URL. A plain
        # string replace of "../../" produces a malformed URL when the path
        # climbs more than two directory levels.
        image_url = urljoin(response.url, book.find('img')['src'])

        book_data.append({
            'Title': title,
            'Category': category,
            'Image URL': image_url
        })
        save_image(title, image_url, category)

    print(f"Scraped {len(book_data)} books from category page.")
    return book_data


print("Book data scraping function loaded.")

In [None]:
# Retrieve the scraped book information for every link.
def category_info(links):
    """Scrape each link and collect the per-link results.

    Args:
        links: Iterable of page URLs passed one by one to
            ``scrape_books_from_category_page``.

    Returns:
        A list with one entry per link, each entry being the list of book
        dicts scraped from that link.
    """
    information = [scrape_books_from_category_page(link) for link in links]
    print(f"Collected information for {len(information)} categories.")
    # Show the result of the last scraped link. The guard avoids referencing
    # a result that does not exist when `links` is empty.
    if information:
        display(information[-1])
    return information


print("Category information function loaded.")

## Main Process

In [None]:
# Collect the category page links and show them in the notebook output.
all_categories = scraping_category()
display(all_categories)

In [None]:
# Collect the link of every book found on the category pages and show them.
links = scrape_links_of_books_in_category(all_categories)
display(links)

In [None]:
# Scrape the book data for every link; images are downloaded along the way.
# NOTE(review): `links` holds book page URLs, while the scraper called by
# category_info is named for category pages — confirm this is intended.
book_data = category_info(links)
display(book_data)

In [None]:
# Read the CSV file and display the first few rows.
# NOTE(review): nothing visible in this notebook writes 'books_data.csv' —
# confirm the file is produced elsewhere before this cell runs.
df = pd.read_csv('books_data.csv')
df.head()