In [1]:
from bs4 import BeautifulSoup
import requests
import time
import random

In [4]:
import pandas as pd
import re

In [3]:
# Abre el archivo 'cities' y lee su contenido
with open(r'tests\cities.csv', 'r') as file:
    # Lee todas las líneas del archivo y elimina los saltos de línea
    cities_list = [line.strip() for line in file]

# Imprime la lista de ciudades
print(cities_list)

['Altamonte-Springs', 'Apopka', 'Bartow', 'Celebration', 'Clermont', 'Dunnellon', 'Eatonville', 'Kissimmee', 'Lake-Buena-Vista', 'Lake-Mary', 'Lake-Placid', 'Lake-Wales', 'Lakeland', 'Mount-Dora', 'Ocala', 'Orlando', 'Oviedo', 'Sanford', 'Sebring', 'St-Cloud', 'Tavares', 'Umatilla', 'Wauchula', 'Wildwood', 'Winter-Garden', 'Winter-Haven', 'Winter-Park']


In [7]:
def extract_text_with_spaces(element):
    """
    Extrae texto de un elemento HTML manteniendo espacios correctos.
    
    Args:
        element: Elemento BeautifulSoup a procesar
        
    Returns:
        str: Texto limpio con espacios preservados
    """
    def process_node(node):
        if node.name is None:  # Nodo de texto
            return node.string
        
        processed_parts = []
        for child in node.children:
            text = process_node(child)
            if text:
                processed_parts.append(text.strip())
        
        return ' '.join(processed_parts)
    
    # Obtener texto procesado y limpiar espacios múltiples
    text = process_node(element)
    return re.sub(r'\s+', ' ', text).strip()

In [8]:
data_list = []

# Itera sobre cada ciudad en la lista
for city in cities_list:
    # URL de la página web para la ciudad actual
    url = f'https://www.visitflorida.com/places-to-go/central/{city}/'
    
    # Realiza una solicitud HTTP a la página web
    response = requests.get(url)
    
    # Verifica que la solicitud fue exitosa
    if response.status_code == 200:
        # Analiza el contenido HTML de la página
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Encuentra los elementos con la clase 'left-container sv-col'
        elements = soup.find_all(class_='left-container sv-col')
        
        # Extrae y almacena el contenido de esos elementos en un diccionario
        for element in elements:
            data = {
                'city': city,
                'content': extract_text_with_spaces(element)
            }
            data_list.append(data)
    else:
        print(f'Error al acceder a la página de {city}: {response.status_code}')
    
    # Espera un tiempo aleatorio entre 1 y 5 segundos antes de la siguiente solicitud
    time.sleep(random.uniform(1, 5))


In [9]:
print(data_list[:3])

[{'city': 'Altamonte-Springs', 'content': 'Only 15 minutes north of downtown Orlando , 30 minutes north of Walt Disney World , and less than an hour from Daytona Beach , the city of Altamonte Springs is conveniently located near Central Florida’s major attractions while offering plenty to see and do within the city. A wide variety of shopping and dining choices can be found at the business and urban residential district of Uptown Altamonte , home to the Altamonte Mall . Its focal point, lakefront Cranes Roost Park , features an amphitheater , a 61-jet fountain that displays choreographed shows, a European-style plaza , and a 19-foot-tall steel crane sculpture . At the 45-acre park, visitors can stroll or jog along the waterfront, fish , picnic, and attend special events , which include Battle of the Bands , Red Hot & Boom , and Light up the Holidays . Altamonte Springs also offers a cutting-edge way to get around town with CraneRIDES , its autonomous vehicle shuttle. The service runs t

In [10]:
import json

In [11]:
ruta_archivo = 'tests/data.json'

In [12]:
with open(ruta_archivo, "w", encoding="utf-8") as archivo:
    json.dump(data_list, archivo, ensure_ascii=False, indent=4)