In [None]:
# BLOQUE 1: Importaciones y configuración inicial

import requests
from bs4 import BeautifulSoup
import sqlite3
import pandas as pd
import time
import re
from urllib.parse import urljoin, quote
import json

# Startup banner: two status lines emitted once the imports above have run.
print(
    "‚úÖ Librer√≠as importadas correctamente",
    "üï∑Ô∏è Comenzando la infiltraci√≥n en Books To Scrape...",
    sep="\n",
)


# BLOQUE 2: Funciones de Web Scraping 

def get_all_categories ():
    """Collect the book categories listed in the home page sidebar.

    Downloads the Books To Scrape home page, locates the
    ``<ul class="nav nav-list">`` navigation list and reads every link in
    it except the first one. The number of categories found is printed.

    Returns:
        list[dict]: One ``{'name': ..., 'url': ...}`` dict per category
        link, with ``url`` resolved against the site root. The list is
        empty when the navigation list is not present in the page.
    """
    url = "https://books.toscrape.com"
    respuesta = requests.get(url)  # download the home page
    # Turn the raw HTML into a tree that can be searched and navigated.
    soup = BeautifulSoup(respuesta.content, 'html.parser')

    categories = []
    nav_list = soup.find('ul', class_='nav nav-list')
    if nav_list:
        # Search for <a> tags first, THEN slice the resulting list: the
        # first link is the generic "Books" entry and is skipped.
        category_links = nav_list.find_all('a')[1:]
        for link in category_links:
            categories.append({
                'name': link.text.strip(),
                'url': urljoin(url, link['href']),
            })

    print (f"üéØ Encontradas {len(categories)} categorias")
    return categories


def book_quantity (book_url):
    """Return the stock quantity shown on a book's detail page.

    Downloads ``book_url``, reads the text of the element matching
    ``p.instock.availability`` and extracts the integer that follows the
    first opening parenthesis.

    Args:
        book_url: URL of the book's detail page.

    Returns:
        int: The parsed quantity, or 0 when the text contains no
        parenthesised number or when any step fails (download, missing
        element, parsing). Failures are reported with ``print``.
    """
    try:
        soup_quantity = BeautifulSoup(requests.get(book_url).content, 'html.parser')
        quantity_text = soup_quantity.select_one('p.instock.availability').get_text(strip=True)
        # A literal "(" followed by the captured digits; presumably the text
        # looks like "In stock (22 available)" -- verify against the site.
        match = re.search(r'\((\d+)', quantity_text)
        if match:
            return int(match.group(1))  # quantity found
        return 0  # no quantity in the availability text
    except Exception as e :
        # Deliberate best-effort: one failing detail page must not abort the
        # whole crawl, so report the problem and fall back to 0.
        print (f"‚ùå Erorr obteniendo cantidad para {book_url}:{e}")
        return 0
    


def scrape_books_from_page(page_url):
    """Scrape every book listed on a single catalogue page.

    Downloads ``page_url`` and reads each ``<article class="product_pod">``
    container. For every book it also calls ``book_quantity()`` on the
    book's detail URL, which triggers one extra request per book.

    Args:
        page_url: URL of the listing page; relative book links are resolved
            against it.

    Returns:
        list[dict]: One dict per successfully parsed book with the keys
        ``title``, ``price`` (float), ``rating`` (int, 0 when unknown),
        ``in_stock`` (bool), ``quantity`` (int) and ``url``. A book whose
        parsing raises is reported with ``print`` and skipped.
    """
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The star rating is spelled out as a word in the element's class list.
    # Built once here because it does not change from book to book.
    rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5, 'Zero': 0}

    books = []
    book_containers = soup.find_all('article', class_='product_pod')

    for book in book_containers:
        try:
            # TITLE
            title_element = book.find('h3').find('a')
            title = title_element['title']

            # Absolute URL of the book's detail page
            book_url = urljoin(page_url, title_element['href'])

            # PRICE
            price_element = book.find('p', class_='price_color')
            if price_element:
                # Drop everything before the first character that can start
                # a number, so the currency symbol is removed no matter how
                # it was encoded or decoded.
                price = float(re.sub(r'^[^\d.+-]+', '', price_element.text.strip()))
            else:
                price = 0.0

            # RATING
            rating_element = book.find('p', class_='star-rating')
            rating_class = rating_element['class'][1] if rating_element else 'Zero'
            rating = rating_map.get(rating_class, 0)

            # STOCK
            stock_element = book.find('p', class_='instock availability')
            in_stock = 'In stock' in stock_element.text if stock_element else False
            quantity = book_quantity(book_url)

            books.append({
                'title': title,
                'price': price,
                'rating': rating,
                'in_stock': in_stock,
                'quantity': quantity,
                'url': book_url,
            })

        except Exception as e:
            # Best-effort: a malformed entry is reported and skipped so the
            # rest of the page is still collected.
            print(f"‚ùå Error procesando libro: {e}")
            continue
    return books

def scrape_all_books ():
    """Walk every category, page by page, and gather all listed books.

    Categories come from ``get_all_categories()``. Each listing page is
    scraped with ``scrape_books_from_page()``, every returned book dict is
    tagged with a ``'category'`` key holding the category name, and the
    same page is then downloaded again to look for an ``<li class="next">``
    pagination link. The crawl sleeps 0.5 s after each page and prints its
    progress.

    Returns:
        list[dict]: All collected book dicts, in crawl order.
    """
    collected = []
    categories = get_all_categories()

    for position, category in enumerate(categories, start=1):
        print (f"Procesando categoria {position}/ {len(categories)}: {category['name']}")

        page_num = 1
        page_url = category['url']

        while page_url:
            print (f" Pagina {page_num}")

            # Scrape the page and label each book with its category.
            for book in scrape_books_from_page(page_url):
                book['category'] = category['name']
                collected.append(book)

            # Fetch the listing again to find the link to the next page.
            listing = BeautifulSoup(requests.get(page_url).content, 'html.parser')
            next_item = listing.find('li', class_='next')
            next_anchor = next_item.find('a') if next_item else None

            if next_anchor:
                page_url = urljoin(page_url, next_anchor['href'])
                page_num += 1
            else:
                # No further page: leave the loop for this category.
                page_url = None

            time.sleep (0.5)  # be polite to the server

    print (f" üéâ Scraping completado: {len(collected)} libros encontrados")
    return collected


# BLOQUE 3: EJECUTAR EL SCRAPING 
# Run the crawl. The function has to be called here: the loop below slices
# books_data, which only works on the returned list of book dicts.
books_data = scrape_all_books()

# Show a sample of the data
print ("\n Muestra de los primeros 3 libros: ")
for i, book in enumerate(books_data[:3]):
    print(f"{i+1}. {book['title']} - {book['price']} - ‚≠ê{book['rating']} - {book['category']} - {book['quantity']}")


In [None]:
# Creación de la base de datos (DDL)