In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Paso 1: Realizar el web scraping para obtener los datos
url = 'http://books.toscrape.com/'
response = requests.get(url)

In [7]:
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    books = soup.find_all('article', class_='product_pod')

    # Paso 2: Almacenar los datos en listas
    titles = []
    prices = []

    for book in books:
        title = book.h3.a['title']
        price = book.find('p', class_='price_color').text
        titles.append(title)
        prices.append(price)
else:
    print('Error al obtener la página')

In [8]:
#paso 3: convertir las listas en un dataFrame 
data = {
    'title': titles, 
    'price': prices
}

df_book = pd.DataFrame(data)

In [10]:
df_book

Unnamed: 0,title,price
0,A Light in the Attic,Â£51.77
1,Tipping the Velvet,Â£53.74
2,Soumission,Â£50.10
3,Sharp Objects,Â£47.82
4,Sapiens: A Brief History of Humankind,Â£54.23
5,The Requiem Red,Â£22.65
6,The Dirty Little Secrets of Getting Your Dream...,Â£33.34
7,The Coming Woman: A Novel Based on the Life of...,Â£17.93
8,The Boys in the Boat: Nine Americans and Their...,Â£22.60
9,The Black Maria,Â£52.15


Leer varias paginas del sitio web

In [16]:
# Función para extraer datos de una página
def extract_data(url):
    response = requests.get(url)
    titles = []
    prices = []

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        books = soup.find_all('article', class_='product_pod')

        for book in books:
            title = book.h3.a['title']
            price = book.find('p', class_='price_color').text
            titles.append(title)
            prices.append(price)
    else:
        print(f'Error al obtener la página {url}')
    
    return titles, prices

# Lista para almacenar todos los datos
all_titles = []
all_prices = []

In [19]:
paginas = ['page-1', 'page-2','page-3','page-4','page-5','page-6','page-7','page-8','page-9','page-10']
base_url = 'http://books.toscrape.com/catalogue/'

for pagina in paginas:
    url = base_url + pagina + '.html'
    titles, prices = extract_data(url)
    all_titles.extend(titles)
    all_prices.extend(prices)

In [20]:
# Convertir las listas en un DataFrame de pandas
data = {
    'Title': all_titles,
    'Price': all_prices
}

df_books = pd.DataFrame(data)

In [21]:
df_books

Unnamed: 0,Title,Price
0,A Light in the Attic,Â£51.77
1,Tipping the Velvet,Â£53.74
2,Soumission,Â£50.10
3,Sharp Objects,Â£47.82
4,Sapiens: A Brief History of Humankind,Â£54.23
...,...,...
195,Eureka Trivia 6.0,Â£54.59
196,Drive: The Surprising Truth About What Motivat...,Â£34.95
197,Done Rubbed Out (Reightman & Bailey #1),Â£37.72
198,Doing It Over (Most Likely To #1),Â£35.61


In [23]:
df_books.to_csv("Data_books.csv")

In [24]:
df_books.dtypes

Title    object
Price    object
dtype: object

In [25]:
#funcion para la limpieza de la data de precios
def limpieza_precios(numero):
    #verificando si el texto es de tipo flotante
    if isinstance(numero, float):
        numero = str(numero)
    
    numeros = "0123456789."
    numero_actual = ""
    
    #convertir texto a cadena de texto antes de iterar sobre sus caracteres
    for caracter in str(numero):
        #verificando si el caracter es un digito o un punto decimal
        if caracter == numeros:
            numero_actual += caracter #agregamos el caracter a numero_actual
    
    #convertimos el numero resultante a float si no esta vacio
    return float(numero_actual) if numero_actual else None
            
        
        

In [26]:
#aplicando funcion de limpieza al dataframe
df_books['Price'] = df_books['Price'].apply(limpieza_precios)
df_books

Unnamed: 0,Title,Price
0,A Light in the Attic,
1,Tipping the Velvet,
2,Soumission,
3,Sharp Objects,
4,Sapiens: A Brief History of Humankind,
...,...,...
195,Eureka Trivia 6.0,
196,Drive: The Surprising Truth About What Motivat...,
197,Done Rubbed Out (Reightman & Bailey #1),
198,Doing It Over (Most Likely To #1),
