## Web scraping en inglés

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the website
url = 'https://www.goodhousekeeping.com/home/craft-ideas/g1389/diy-kids-activities/'

# Send a GET request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    # Parse the page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # List to hold the data
    activities = []

    # Find all activity blocks
    articles = soup.find_all('div', class_='simple-item')

    for article in articles:
        title = article.find('h2')
        description = article.find('p')

        activity_data = {
            'activity': title.get_text(strip=True) if title else None,
            'description': description.get_text(strip=True) if description else None,
            'age': None,               # Will try to extract later if possible
            'how_many_people': None,   # Will try to infer
            'price': None              # Will try to infer
        }
        
        activities.append(activity_data)
    
    # Create a DataFrame
    df = pd.DataFrame(activities)
    
    # Show the DataFrame
    print(df)

else:
    print('Failed to retrieve the page:', response.status_code)


Empty DataFrame
Columns: []
Index: []


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL
url = 'https://www.goodhousekeeping.com/home/craft-ideas/g1389/diy-kids-activities/'

# Headers to avoid getting blocked
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

# Get the page
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all slides (each activity)
slides = soup.find_all('div', class_='slide')

activities = []

for slide in slides:
    # Extract title
    title_tag = slide.find(['h2', 'h3'])
    title = title_tag.get_text(strip=True) if title_tag else None
    
    # Extract description
    description_tag = slide.find('p')
    description = description_tag.get_text(strip=True) if description_tag else None
    
    # Append to the list
    activities.append({
        'activity': title,
        'description': description,
        'age': None,               # To infer later
        'how_many_people': None,   # To infer later
        'price': None              # To infer later
    })

# Create a DataFrame
df = pd.DataFrame(activities)

# Show the DataFrame
print(df.head())


Empty DataFrame
Columns: []
Index: []


In [None]:
# BASH
#pip install selenium pandas webdriver-manager


In [8]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Setup Selenium
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # run in background
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Open the page
url = 'https://www.goodhousekeeping.com/home/craft-ideas/g1389/diy-kids-activities/'
driver.get(url)

# Give it time to load JavaScript
time.sleep(5)

# Find all slides (each activity)
slides = driver.find_elements(By.CLASS_NAME, 'slide')

activities = []

for slide in slides:
    # Extract title
    try:
        title = slide.find_element(By.TAG_NAME, 'h2').text
    except:
        try:
            title = slide.find_element(By.TAG_NAME, 'h3').text
        except:
            title = None
    
    # Extract description
    try:
        description = slide.find_element(By.TAG_NAME, 'p').text
    except:
        description = None
    
    activities.append({
        'activity': title,
        'description': description,
        'age': None,
        'how_many_people': None,
        'price': None
    })

# Close browser
driver.quit()

# Create DataFrame
df = pd.DataFrame(activities)

# Show
df.head()


### https://indyschild.com/70-things-to-do-with-kids-now-that-were-all-stuck-at-home/

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Fetch the webpage content
url = 'https://indyschild.com/70-things-to-do-with-kids-now-that-were-all-stuck-at-home/'
headers = {
    'User-Agent': 'Mozilla/5.0'
}
response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    # Step 2: Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Step 3: Extract activities
    # Find the main content area
    content_div = soup.find('div', class_='entry-content')
    activities = []

    if content_div:
        # Find all list items within the content
        list_items = content_div.find_all('li')
        for item in list_items:
            activity_text = item.get_text(strip=True)
            activities.append({
                'activity': activity_text,
                'age': None,
                'how_many_people': None,
                'price': None
            })

    # Step 4: Create a DataFrame
    df = pd.DataFrame(activities)

    # Display the first few rows
    print(df.head())
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Empty DataFrame
Columns: []
Index: []


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_URL = 'https://indyschild.com/70-things-to-do-with-kids-now-that-were-all-stuck-at-home/'
MAIN_URL = BASE_URL + "/madrid/es/que-hacer"

headers = {
    "User-Agent": "Mozilla/5.0"
}

def get_event_links():
    res = requests.get(MAIN_URL, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    cards = soup.find_all("a", href=True)

    # Filtrar solo URLs de eventos
    event_links = [BASE_URL + a["href"] for a in cards if "/madrid/es/que-hacer/" in a["href"]]
    return list(set(event_links))  # Eliminar duplicados

def parse_event(url):
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")

    def safe_text(selector):
        tag = soup.select_one(selector)
        return tag.get_text(strip=True) if tag else "N/A"

    nombre_evento = safe_text("h1, h2")
    descripcion = soup.get_text(" ", strip=True).lower()

    # Reglas básicas por palabras clave
    discapacidad = "visual" if "lengua de signos" in descripcion else "ninguna"
    modalidad = "exterior" if "aire libre" in descripcion else "interior"
    costo = "gratis" if "gratis" in descripcion else "pago"

    # Valores manuales o heurísticos
    edad_dirigida = "todas las edades" if "familiar" in descripcion or "niños" in descripcion else "N/A"
    min_integrantes = "N/A"
    ubicacion = safe_text("[data-testid*=location]") or "N/A"
    categoria = safe_text("ul.breadcrumbs li:nth-last-child(1)") or "N/A"

    return {
        "nombre_evento": nombre_evento,
        "categoría": categoria,
        "discapacidad": discapacidad,
        "ubicación": ubicacion,
        "costo": costo,
        "edad_dirigida": edad_dirigida,
        "min_integrantes": min_integrantes,
        "modalidad": modalidad,
        "url": url
    }

# Extraer eventos
eventos = []
for link in get_event_links()[:10]: 
    try:
        print(f"Procesando: {link}")
        evento = parse_event(link)
        eventos.append(evento)
    except Exception as e:
        print(f"Error en {link}: {e}")

df = pd.DataFrame(eventos)

df

# Opcional: Guardar en CSV
#df.to_csv("eventos_timeout_madrid.csv", index=False)