In [19]:
import pandas as pd 

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

from time import sleep

from datetime import datetime


In [20]:
# Accor search-results URL for hotels in Madrid (check-in 2025-03-01, 1 night), not a Wikipedia page
url = "https://all.accor.com/ssr/app/accor/hotels/madrid-spain/open/index.es.shtml?compositions=1&stayplus=false&snu=false&hideWDR=false&accessibleRooms=false&hideHotelDetails=false&dateIn=2025-03-01&nights=1&destination=madrid-spain"


In [21]:
def _value_or_default(getter, default="No disponible"):
    """Return ``getter()``, or ``default`` if it raises an ordinary exception.

    Only ``Exception`` subclasses are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and the scrape can be interrupted.
    """
    try:
        return getter()
    except Exception:
        return default


def hotel_scrapper(url, wait_seconds=10):
    """Scrape hotel name, rating and nightly price from a hotel listing page.

    Starts a Chrome WebDriver session, loads ``url``, waits ``wait_seconds``
    for the page to render, and reads the elements with CSS classes
    ``title``, ``ratings__score`` and ``rate-details__price-wrapper``.
    The three element lists are paired by position.

    Parameters
    ----------
    url : str
        Address of the listing page to load.
    wait_seconds : int or float, default 10
        Fixed pause after loading the page, before elements are read.

    Returns
    -------
    pandas.DataFrame
        One row per ``title`` element, with columns ``nombre`` (first line of
        the title text), ``rating`` (float taken from the text before "/"),
        ``precio_noche`` (int taken from the second line of the price text,
        before "€") and ``fecha_reserva`` (today's date as a midnight
        timestamp). A field that cannot be read or parsed is stored as the
        string "No disponible".

    Raises
    ------
    Exception
        Anything raised while creating the driver, loading the page or
        querying the element lists propagates to the caller; the browser
        session is shut down first if it had been started.
    """
    # Initialise the WebDriver
    service = Service(ChromeDriverManager().install())
    options = Options()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(service=service, options=options)

    dictio_scrap_hotel = {
        "nombre": [],
        "rating": [],
        "precio_noche": [],
        "fecha_reserva": []
    }

    try:
        driver.get(url)
        sleep(wait_seconds)

        # Today's date, truncated to the day and stored as a timestamp
        fecha = pd.Timestamp(datetime.today()).strftime('%Y-%m-%d')
        fecha = pd.to_datetime(fecha)

        # Query each element list once instead of once per loop iteration.
        # NOTE(review): rows are matched purely by position; if a hotel lacks a
        # rating or price element the later rows would shift - confirm on the page.
        nombres_hoteles = driver.find_elements(By.CLASS_NAME, "title")
        ratings_hoteles = driver.find_elements(By.CLASS_NAME, "ratings__score")
        precios_hoteles = driver.find_elements(By.CLASS_NAME, "rate-details__price-wrapper")

        for i in range(len(nombres_hoteles)):
            dictio_scrap_hotel["nombre"].append(
                _value_or_default(lambda: nombres_hoteles[i].text.split("\n")[0])
            )
            dictio_scrap_hotel["rating"].append(
                _value_or_default(lambda: float(ratings_hoteles[i].text.split("/")[0]))
            )
            dictio_scrap_hotel["precio_noche"].append(
                _value_or_default(
                    lambda: int(precios_hoteles[i].text.split("\n")[1].split("€")[0])
                )
            )
            dictio_scrap_hotel["fecha_reserva"].append(fecha)
    finally:
        # Always shut the browser session down, even when loading or parsing fails
        driver.quit()

    df_hoteles_competencia = pd.DataFrame(dictio_scrap_hotel)

    return df_hoteles_competencia
    

In [22]:
# Run the scraper against the configured URL and display the resulting DataFrame
df = hotel_scrapper(url)
df

Unnamed: 0,nombre,rating,precio_noche,fecha_reserva
0,Novotel Madrid Center,4.6,255,2025-02-26
1,ibis budget Madrid Calle 30,4.4,110,2025-02-26
2,ibis budget Madrid Centro las Ventas,4.3,119,2025-02-26
3,Novotel Madrid City Las Ventas,4.6,173,2025-02-26
4,ibis budget Madrid Calle Alcalá,4.3,95,2025-02-26
5,ibis Madrid Calle Alcalá,4.5,119,2025-02-26
6,ibis budget Madrid Vallecas,4.3,107,2025-02-26
7,Pullman Madrid Airport & Feria,4.2,129,2025-02-26
8,Novotel Madrid Campo de las Naciones,4.5,154,2025-02-26
9,ibis Madrid Aeropuerto Barajas,4.4,116,2025-02-26


In [23]:
# Inspect column dtypes and non-null counts of the scraped data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   nombre         10 non-null     object        
 1   rating         10 non-null     float64       
 2   precio_noche   10 non-null     int64         
 3   fecha_reserva  10 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 452.0+ bytes


In [24]:
# Persist the scraped DataFrame as a pickle file (path is relative to the working directory)
df.to_pickle("../data/web_scrapping/hoteles_competencia_info_scrapping.pkl")

# -- Pruebas --

In [None]:
# Initialise the WebDriver (scratch session used by the exploratory cells below)
service = Service(ChromeDriverManager().install())
options = Options()
options.add_argument("--start-maximized")  
driver = webdriver.Chrome(service=service, options=options)

In [None]:
# Navigate to the page and pause for a fixed 10 seconds before reading elements
driver.get(url)
sleep(10)

In [None]:
# Empty accumulator: one independent list per scraped field
dictio_scrap = {
    campo: [] for campo in ("nombre", "rating", "precio_noche", "fecha_reserva")
}

In [None]:
# Hotel name: first line of the text of the first "title__link" match
nombre_hotel = driver.find_element(By.CLASS_NAME, "title__link").text.split("\n")[0]
dictio_scrap["nombre"].append(nombre_hotel)

In [None]:
# Rating: text of the first "ratings__score" match, cut at the "/\n" separator
rating_hotel = driver.find_element(By.CLASS_NAME, "ratings__score").text.split("/\n")[0]
dictio_scrap["rating"].append(rating_hotel)

In [None]:
# Price: second line of the price wrapper's text, keeping what precedes the "€" sign
texto_precio = driver.find_element(By.CLASS_NAME, "rate-details__price-wrapper").text
precio_hotel = texto_precio.split("\n")[1].split("€")[0]
dictio_scrap["precio_noche"].append(precio_hotel)


In [None]:
# Today's date formatted as a 'YYYY-MM-DD' string, recorded as the scrape date
fecha = pd.Timestamp(datetime.today()).strftime('%Y-%m-%d')
dictio_scrap["fecha_reserva"].append(fecha)


In [None]:
# Display the single-hotel record collected by the cells above
dictio_scrap

{'nombre': ['Novotel Madrid Center'],
 'rating': ['4.6'],
 'precio_noche': ['255'],
 'fecha_reserva': ['2025-02-25']}

In [None]:
nombres_hoteles = driver.find_elements(By.CLASS_NAME, "title") # With "title__link" I got 20 values --> it also returned the "Ver el hotel" text for each hotel name
nombres_hoteles

[<selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.73")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.74")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.75")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.76")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.77")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d57

In [None]:
# Number of "title" elements found on the page
len(nombres_hoteles)

10

In [None]:
# Collect every element carrying the "ratings__score" class and display the list
ratings_hoteles = driver.find_elements(By.CLASS_NAME, "ratings__score")
ratings_hoteles

[<selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.71")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.83")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.84")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.85")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.86")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d57

In [None]:
# Number of rating elements found, to compare against the number of names
len(ratings_hoteles)

10

In [None]:
# Collect every element carrying the "rate-details__price-wrapper" class and display the list
precios_hoteles = driver.find_elements(By.CLASS_NAME, "rate-details__price-wrapper")
precios_hoteles

[<selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.72")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.92")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.93")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.94")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d574269f8f", element="f.CE14995718BA26FFC8405D358FA78D23.d.A8E478354CB591B5C895BE7C96BAD8B3.e.95")>,
 <selenium.webdriver.remote.webelement.WebElement (session="3ce9c1c8ffabaad3c787c1d57

In [None]:
# Number of price elements found, to compare against the number of names
len(precios_hoteles)

10

In [None]:
# Accumulator for the scraped fields: one list per output column
dictio_scrap_hotel = {
    "nombre": [],
    "rating": [],
    "precio_noche": [],
    "fecha_reserva": []
}

# Today's date as a 'YYYY-MM-DD' string
fecha = pd.Timestamp(datetime.today()).strftime('%Y-%m-%d')

# Query each element list once; the loop below pairs them by position
nombres_hoteles = driver.find_elements(By.CLASS_NAME, "title")
ratings_hoteles = driver.find_elements(By.CLASS_NAME, "ratings__score")
precios_hoteles = driver.find_elements(By.CLASS_NAME, "rate-details__price-wrapper")

for i in range(len(nombres_hoteles)):
    # Each field falls back to "No disponible" when it cannot be read or parsed.
    # Catching Exception (not a bare except) keeps the cell interruptible.
    try:
        nombre_hotel = nombres_hoteles[i].text
        nombre_hotel = nombre_hotel.split("\n")[0]
        dictio_scrap_hotel["nombre"].append(nombre_hotel)
    except Exception:
        dictio_scrap_hotel["nombre"].append("No disponible")

    try:
        rating_hotel = ratings_hoteles[i].text
        rating_hotel = rating_hotel.split("/")[0]
        dictio_scrap_hotel["rating"].append(rating_hotel)
    except Exception:
        dictio_scrap_hotel["rating"].append("No disponible")

    try:
        precio_hotel = precios_hoteles[i].text
        precio_hotel = precio_hotel.split("\n")[1].split("€")[0]
        dictio_scrap_hotel["precio_noche"].append(precio_hotel)
    except Exception:
        dictio_scrap_hotel["precio_noche"].append("No disponible")

    dictio_scrap_hotel["fecha_reserva"].append(fecha)


{'nombre': ['Novotel Madrid Center', 'ibis budget Madrid Calle 30', 'ibis Madrid Centro las Ventas', 'ibis budget Madrid Centro las Ventas', 'Novotel Madrid City Las Ventas', 'ibis budget Madrid Calle Alcalá', 'ibis Madrid Calle Alcalá', 'ibis budget Madrid Vallecas', 'Pullman Madrid Airport & Feria', 'Novotel Madrid Campo de las Naciones'], 'rating': ['4.6', '4.4', '4.5', '4.3', '4.6', '4.3', '4.5', '4.3', '4.2', '4.5'], 'precio_noche': ['255', '110', '172', '119', '173', '95', '119', '107', '129', '154'], 'fecha_reserva': ['2025-02-25', '2025-02-25', '2025-02-25', '2025-02-25', '2025-02-25', '2025-02-25', '2025-02-25', '2025-02-25', '2025-02-25', '2025-02-25']}


In [None]:
# Display the records collected by the loop above
dictio_scrap_hotel

In [None]:
# Close the browser window and quit the WebDriver session opened for these tests
driver.close()
driver.quit()