In [None]:
import csv
import os
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import requests
import wget
from bs4 import BeautifulSoup as BfS4

In [None]:
# Make a string safe to use as a file or folder name.
def sanitize_filename(name):
    """Return *name* with each of ``< > : " / \\ | ? *`` replaced by ``_``."""
    forbidden_chars = re.compile(r'[<>:"/\\|?*]')
    return forbidden_chars.sub('_', name)

In [None]:
# Save a cover image inside a per-category folder.
def save_image(title, image_url, category):
    """Download *image_url* to ``images/category/<category>/<title>.jpg``.

    Args:
        title: Book title; sanitised with ``sanitize_filename`` and used as
            the file name.
        image_url: URL of the image to download.
        category: Category name; sanitised with ``sanitize_filename`` and
            used as the folder name.

    Returns:
        None. Every failure is reported on stdout instead of being raised,
        so one bad image does not abort a long scraping run.
    """
    # Debug output: confirm which category was detected for this book.
    print(f"Category: {category}")

    sanitized_title = sanitize_filename(title)
    # The category is scraped text too, so a path separator or another
    # invalid character in it must not leak into the filesystem path.
    folder = Path("images") / "category" / sanitize_filename(category)
    image_path = folder / f"{sanitized_title}.jpg"

    try:
        # Creating the folder is inside the try so that a filesystem error
        # is reported like any other download problem.
        folder.mkdir(parents=True, exist_ok=True)

        if image_path.exists():
            print(f"Image for {sanitized_title} already exists, skipping download.")
            return

        wget.download(image_url, str(image_path), bar=None)
        print(f"Image for {sanitized_title} saved in {category} folder.")
    except Exception as e:
        # Deliberately broad: this is a best-effort step and the download
        # can fail with network, URL or filesystem errors alike.
        print(f"Error downloading image for {title}: {e}")

In [None]:
# Collect the URL of every category listing page, including paginated ones.
def scraping_category():
    """Return the listing-page URLs of all categories in the site sidebar.

    For every sidebar link the first page URL is recorded. Each category page
    (except the one whose href is ``catalogue/category/books_1/index.html``)
    is then fetched to read its pager, and ``page-2.html`` .. ``page-N.html``
    URLs are appended right after that category's first page.

    Returns:
        list[str]: All collected URLs except the first sidebar entry
        (expected to be the catch-all "Books" category). An empty list is
        returned when the home page cannot be fetched.
    """
    print("----------start category----------")
    print(" Please wait ... ")
    url = "http://books.toscrape.com/"
    response = requests.get(url)
    if not response.ok:
        # Always hand back a list so callers can iterate the result safely.
        return []

    links_of_categories_all = []
    soup = BfS4(response.content, "html.parser")
    for category in soup.select(".side_categories a"):
        href = category["href"]
        link = f"http://books.toscrape.com/{href}"
        links_of_categories_all.append(link)

        # The catch-all category is dropped from the result below, so its
        # pagination is never needed.
        if href == "catalogue/category/books_1/index.html":
            continue

        response = requests.get(link)
        if not response.ok:
            continue

        category_soup = BfS4(response.content, "html.parser")
        page_prefix = link.replace("index.html", "")
        for pager in category_soup.find_all("ul", class_="pager"):
            current = pager.find("li", class_="current")
            if current is None:
                continue
            # The label reads "Page 1 of N"; the last token is the page
            # count regardless of how many digits either number has.
            try:
                num_page = int(current.get_text().split()[-1])
            except (IndexError, ValueError):
                continue
            links_of_categories_all.extend(
                f"{page_prefix}page-{number}.html"
                for number in range(2, num_page + 1)
            )

    # Skip the first collected link (the first sidebar entry).
    return links_of_categories_all[1:]

In [None]:
# Gather the detail-page URL of every book listed on the given pages.
def scrape_links_of_books_in_category(category_links):
    """Return the book URLs found on each page in *category_links*.

    Every link is stripped of surrounding whitespace and fetched; pages that
    do not answer with an OK status are skipped. For each
    ``<article class="product_pod">`` the href of its first ``<a>`` has any
    ``../../../`` removed and is appended to
    ``http://books.toscrape.com/catalogue/``.
    """
    print("----------start books in category----------")
    print(" Please wait ... ")
    collected = []
    for raw_link in category_links:
        page = requests.get(raw_link.strip())
        if not page.ok:
            continue
        parsed = BfS4(page.content, "html.parser")
        for pod in parsed.find_all("article", class_="product_pod"):
            relative = pod.find("a")["href"]
            collected.append(
                f'http://books.toscrape.com/catalogue/{relative.replace("../../../", "")}'
            )
    return collected

In [None]:
# Scrape every book shown on one listing page and download its cover.
def scrape_books_from_category_page(url):
    """Return title, category and cover URL for each book listed at *url*.

    Each cover is also downloaded through ``save_image``.

    Args:
        url: URL of a page containing ``<article class="product_pod">``
            entries.

    Returns:
        list[dict]: One dict per book with the keys ``'Title'``,
        ``'Category'`` and ``'Image URL'``. Empty when the request fails or
        the page lists no books.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Treat HTTP error statuses as failures.
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return []

    # Parse the raw bytes, like the other scrapers in this file, so that
    # BeautifulSoup detects the encoding itself instead of relying on the
    # charset Requests guesses from the HTTP headers.
    soup = BfS4(response.content, 'html.parser')

    # The category belongs to the page, not to a single book, so it is
    # looked up once. A page without a matching link gets a placeholder
    # name rather than aborting the run.
    category_anchor = soup.find("a", attrs={"href": re.compile("/category/books/")})
    category = category_anchor.get_text(strip=True) if category_anchor is not None else ""
    if not category:
        category = "Unknown"

    book_data = []
    for book in soup.find_all('article', class_='product_pod'):
        title = book.h3.a['title']

        # Resolve the cover path against the page URL. A plain
        # replace("../../", ...) substitutes every occurrence, which breaks
        # paths that climb more than two levels and leaves shallower paths
        # relative.
        image_url = urljoin(url, book.find('img')['src'])

        book_data.append({
            'Title': title,
            'Category': category,
            'Image URL': image_url
        })

        save_image(title, image_url, category)

    return book_data

In [None]:
# Scrape several consecutive catalogue pages.
def scrape_multiple_pages(base_url, total_pages):
    """Scrape pages 1..*total_pages* and return all book dicts in page order.

    Page 1 is *base_url* itself; page N (N > 1) is
    ``<base_url>catalogue/page-N.html``. The function sleeps one second
    after each page to avoid sending too many requests.
    """
    collected = []

    for page_number in range(1, total_pages + 1):
        url = base_url if page_number == 1 else f"{base_url}catalogue/page-{page_number}.html"
        print(f"Scraping page {page_number}: {url}")

        page_books = scrape_books_from_category_page(url)
        if page_books:
            collected.extend(page_books)

        time.sleep(1)  # Pause between pages.

    return collected

In [None]:
def category_info(links):
    """Run ``scrape_books_from_category_page`` on each link.

    Returns a list holding one result list per input link, in input order.
    """
    return [scrape_books_from_category_page(link) for link in links]

In [None]:
# Fetch a book detail page and extract its description and price fields.
def scrape_book_details(book_link):
    """Return the description and the price fields of one book.

    Args:
        book_link: URL of the book detail page.

    Returns:
        dict: Keys ``'description'``, ``'price_incl_tax'``,
        ``'price_excl_tax'`` and ``'price_tax'``. A field that cannot be
        found keeps its default (``'No description available'`` for the
        description, ``'N/A'`` for the prices); the same defaults are
        returned when the request fails.
    """
    details = {
        'description': 'No description available',
        'price_incl_tax': 'N/A',
        'price_excl_tax': 'N/A',
        'price_tax': 'N/A',
    }

    try:
        response = requests.get(book_link)
        response.raise_for_status()  # Treat HTTP error statuses as failures.
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return details

    # Parse the raw bytes, like the other scrapers in this file, so that
    # BeautifulSoup detects the encoding itself; decoding with the charset
    # Requests guesses from the headers can garble characters such as the
    # currency sign.
    soup = BfS4(response.content, 'html.parser')

    meta = soup.find('meta', {'name': 'description'})
    if meta is not None and meta.get('content') is not None:
        details['description'] = meta['content']

    # Result key -> label of the <th> whose sibling <td> holds the value.
    table_headers = {
        'price_incl_tax': 'Price (incl. tax)',
        'price_excl_tax': 'Price (excl. tax)',
        'price_tax': 'Tax',
    }
    for key, header in table_headers.items():
        header_cell = soup.find('th', string=header)
        if header_cell is None:
            continue
        value_cell = header_cell.find_next_sibling('td')
        if value_cell is not None:
            details[key] = value_cell.text.strip()

    return details

In [None]:
# Collect the detail-page URL of every book listed on one catalogue page.
def scrape_links_of_books_from_page(page_url):
    """Return absolute book URLs for every product on *page_url*.

    Args:
        page_url: URL of a listing page; it may sit at any depth of the
            site (site root, ``catalogue/page-N.html``, a category page).

    Returns:
        list[str]: One absolute URL per ``<article class="product_pod">``
        that contains a link. Empty when the page does not answer with an
        OK status.
    """
    books_in_page = []
    response = requests.get(page_url)
    if not response.ok:
        return books_in_page

    soup = BfS4(response.content, "html.parser")
    for article in soup.find_all("article", class_="product_pod"):
        anchor = article.find("a", href=True)
        if anchor is None:
            continue
        # Hrefs are relative to the page they appear on, so resolve them
        # against that page. Prepending a fixed "catalogue/" prefix is only
        # right for pages at one specific depth and yields wrong URLs
        # elsewhere (e.g. when page_url is the site root).
        books_in_page.append(urljoin(page_url, anchor["href"]))
    return books_in_page

In [None]:
# Scrape all data of a single book from its detail page.
def _book_table_value(soup, header, default='N/A'):
    """Return the stripped text of the <td> next to the <th> labelled *header*."""
    header_cell = soup.find("th", string=header)
    value_cell = header_cell.find_next_sibling("td") if header_cell is not None else None
    return value_cell.text.strip() if value_cell is not None else default


def scrape_book_data(book_link):
    """Return one dict describing the book at *book_link*.

    The page is downloaded once and every field is read from that single
    response; the result has the same keys and defaults as combining this
    page's basic fields with ``scrape_book_details``, without requesting
    the same URL a second time.

    Args:
        book_link: URL of the book detail page.

    Returns:
        dict | None: Keys ``"Title"``, ``"Price"``, ``"Price including
        tax"``, ``"Price excluding tax"``, ``"Price Tax"``,
        ``"Availability"``, ``"Product Description"``, ``"Rating"``,
        ``"Image URL"`` and ``"Link"``. A field missing from the page is
        ``'N/A'`` (``'No description available'`` for the description).
        ``None`` when the page cannot be fetched.
    """
    print(f"Scraping {book_link} ...")
    try:
        response = requests.get(book_link)
    except requests.exceptions.RequestException as e:
        # Same contract as a non-OK status: report and skip this book.
        print(f"Error: {e}")
        return None
    if not response.ok:
        return None

    soup = BfS4(response.content, "html.parser")

    # The first <img> of the page carries the cover and, in its alt text,
    # the title.
    image = soup.find("img")
    title = 'N/A'
    image_url = 'N/A'
    if image is not None:
        title = image.get("alt", 'N/A')
        if image.get("src"):
            # Resolve the relative path against the page URL.
            image_url = urljoin(book_link, image["src"])

    price_tag = soup.find('p', class_='price_color')
    price = price_tag.text if price_tag is not None else 'N/A'

    # The rating is the second CSS class, e.g. class="star-rating Three".
    rating_tag = soup.find("p", attrs={'class': 'star-rating'})
    rating_classes = rating_tag.get("class", []) if rating_tag is not None else []
    rating = rating_classes[1] if len(rating_classes) > 1 else 'N/A'

    meta = soup.find("meta", {"name": "description"})
    description = meta.get("content") if meta is not None else None
    if description is None:
        description = 'No description available'

    return {
        "Title": title,
        "Price": price,
        "Price including tax": _book_table_value(soup, "Price (incl. tax)"),
        "Price excluding tax": _book_table_value(soup, "Price (excl. tax)"),
        "Price Tax": _book_table_value(soup, "Tax"),
        "Availability": _book_table_value(soup, "Availability"),
        "Product Description": description,
        "Rating": rating,
        "Image URL": image_url,
        "Link": book_link
    }

In [None]:
# Scrape the full data of every book on several catalogue pages.
def scrape_books_from_pages(base_url, total_pages):
    """Return the book dicts of pages 1..*total_pages*, in page order.

    Page 1 is *base_url* itself; page N (N > 1) is
    ``<base_url>catalogue/page-N.html``. Books for which
    ``scrape_book_data`` returns a falsy value are left out. The function
    sleeps one second after each page to avoid sending too many requests.
    """
    collected = []
    for page_number in range(1, total_pages + 1):
        url = base_url if page_number == 1 else f"{base_url}catalogue/page-{page_number}.html"

        print(f"Scraping page {page_number}: {url}")

        # Visit every book linked from this page, keeping successful results.
        for link in scrape_links_of_books_from_page(url):
            record = scrape_book_data(link)
            if not record:
                continue
            collected.append(record)

        time.sleep(1)  # Pause between pages.

    return collected

In [None]:
# Write the scraped rows to a CSV file.
def save_to_csv(data, filename):
    """Write a list of dicts to *filename* as UTF-8 CSV with a header row.

    Args:
        data: List of dicts, one per row. When it is empty nothing is
            written and a message is printed instead.
        filename: Path of the CSV file to create or overwrite.

    The columns are the union of the keys of all rows, in first-seen order,
    so a row carrying a key the first row lacks no longer makes
    ``csv.DictWriter`` raise ``ValueError``. Keys missing from a row are
    written as empty cells.
    """
    if not data:
        print("No data to save.")
        return

    # dict.fromkeys de-duplicates while preserving insertion order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
    print(f"Data saved to {filename}")

In [None]:
# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    # Step 1: collect the listing-page URLs of all categories.
    all_categories = scraping_category()
    
    # No explicit save_image call is needed here: images are saved from
    # inside scrape_books_from_category_page.
    
    # Step 2: collect the detail-page URL of every book on those pages.
    links = scrape_links_of_books_in_category(all_categories)
    
    # Step 3: scrape each collected link; the returned data is discarded.
    # NOTE(review): `links` holds book detail URLs, while category_info
    # passes each one to scrape_books_from_category_page, which looks for
    # `product_pod` listing entries - confirm that `links` (rather than
    # `all_categories`) is the intended argument.
    category_info(links)

In [None]:
def main():
    """Scrape the first catalogue pages and store them in ``books_data.csv``."""
    # Site root of the catalogue and the number of pages to scrape; adjust
    # the page count as needed.
    site_root = 'http://books.toscrape.com/'
    page_count = 3

    # Scrape the pages and write the collected rows straight to the CSV file.
    save_to_csv(scrape_books_from_pages(site_root, page_count), 'books_data.csv')


if __name__ == "__main__":
    main()

In [None]:
# Main routine: scrape every book of every category and save it as CSV.
def main():
    """Scrape all books reachable from the category pages into a CSV file.

    The category listing pages are collected first, then the detail-page
    link of every book on them; each book is scraped with
    ``scrape_book_data`` and the successful results are written to
    ``books_data.csv``.

    NOTE: this definition replaces any ``main`` defined earlier in the
    module.
    """
    # Fall back to an empty list so that a failed category lookup leads to
    # the "no data" path instead of a TypeError while iterating None.
    all_categories = scraping_category() or []
    links = scrape_links_of_books_in_category(all_categories)

    # Keep only the books that could actually be scraped.
    books_data = []
    for link in links:
        book_data = scrape_book_data(link)
        if book_data:
            books_data.append(book_data)

    # Write the result to the CSV file.
    save_to_csv(books_data, 'books_data.csv')
    print("Scraping selesai.")


if __name__ == "__main__":
    main()
