In [6]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from time import sleep
from time import time

In [53]:
def scraping_airbnb(url):
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-search-engine-choice-screen")
    options.add_argument("--headless")
    options.add_argument('--lang=en')
    driver = webdriver.Chrome(options=options)
    driver.get(url)

    # Esperar que la página cargue y gestionar popup de traducción
    try:
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//div[@class='c1lbtiq8 atm_mk_stnw88 atm_9s_1txwivl atm_fq_1tcgj5g atm_wq_kb7nvz atm_tk_1tcgj5g dir dir-ltr']"))
        ).click()
    except:
        pass

        # Obtener datos del elemento de reseñas
    try:
        data = driver.find_element(By.XPATH, "//a[contains(@href, 'reviews')]")
        sleep(2)

        # Verificar si el elemento de reseñas tiene texto
        if data.text.strip() == "":
            # Si no hay reseñas, devolver valores predeterminados
            print("No hay reseñas disponibles.")
            return None, None, 0, None, None, None, [], None

        data_list = data.text.split(" ")
        
        # Verificar que la lista tiene suficientes elementos
        if len(data_list) < 2:
            print("Formato inesperado en el texto de reseñas.")
            return None, None, 0, None, None, None, [], None

        # Intentar extraer el número de reseñas
        number_reviews = data_list[-2].split("\n")[-1]
        
        # Si el número de reseñas es mayor a 6, proceder a extraer las reseñas
        if int(number_reviews) > 6:
            return extract_reviews(driver, data_list, number_reviews)
        
        # Extraer la información base (favorito del huésped, rating, etc.)
        guest_favorite, rating, type_host, hosting_time, price, title = extract_base_info(driver, data_list)
        
        # Extraer las reseñas si están disponibles
        reviews_list = driver.find_elements(By.XPATH, "//div[@class='r1bctolv atm_c8_1sjzizj atm_g3_1dgusqm atm_26_lfmit2_13uojos atm_5j_1y44olf_13uojos dir dir-ltr']")
        all_reviews = [review.text for review in reviews_list if len(review.text) > 3]

    except Exception as e:
        guest_favorite = False
        rating = np.nan
        number_reviews = 0
        type_host = "new_host"
        hosting_time = "new"
        price = extract_price(driver)
        all_reviews = []
        title = extract_title(driver)
        return guest_favorite, rating, number_reviews, type_host, hosting_time, price, list(set(all_reviews)), title

    return guest_favorite, rating, number_reviews, type_host, hosting_time, price, list(set(all_reviews)), title



def extract_base_info(driver, data_list):
    # if data_list = 
    guest_favorite = data_list[0] == "Guest\nfavorite\nRated"
    rating = data_list[1] if guest_favorite else np.nan
    type_host = "new_host"
    hosting_time = "new"
    price = extract_price(driver)
    title = extract_title(driver)

    return guest_favorite, rating, type_host, hosting_time, price, title


def extract_price(driver):
    try:
        price_element = driver.find_element(By.XPATH, "//span[@class='a8jt5op atm_3f_idpfg4 atm_7h_hxbz6r atm_7i_ysn8ba atm_e2_t94yts atm_ks_zryt35 atm_l8_idpfg4 atm_vv_1q9ccgz atm_vy_t94yts aze35hn atm_mk_stnw88 atm_tk_idpfg4 dir dir-ltr']")
        return price_element.get_attribute('textContent')
    except:
        try:
            return driver.find_element(By.XPATH, "//span[@class = '_11jcbg2']").text
        except:
            return np.nan


def extract_title(driver):
    try:
        return driver.find_element(By.XPATH, "//div[@class = '_1czgyoo']").text
    except:
        return np.nan


def extract_reviews(driver, data_list, number_reviews):
    guest_favorite = data_list[0] == "Guest\nfavorite\nRated"
    rating = data_list[1]
    type_host, hosting_time = extract_host_info(driver)
    price = extract_price(driver)
    title = extract_title(driver)

    show_more_reviews(driver, number_reviews)
    all_reviews = scroll_reviews(driver)

    return guest_favorite, rating, number_reviews, type_host, hosting_time, price, all_reviews, title


def extract_host_info(driver):
    try:
        superhost_list = driver.find_elements(By.XPATH, "//li[@class='l7n4lsf atm_9s_1o8liyq_keqd55 dir dir-ltr']")
        type_host = superhost_list[-2].text if not any(char.isdigit() for char in superhost_list[-2].text) else np.nan
        hosting_time = superhost_list[-1].text
    except:
        type_host = hosting_time = np.nan
    return type_host, hosting_time


def show_more_reviews(driver, number_reviews):
    try:
        more_reviews_button_xpath = f"//button[contains(text(), 'Show all {number_reviews} reviews')]"
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, more_reviews_button_xpath))
        ).click()
        sleep(2)
    except (TimeoutException, NoSuchElementException):
        print("No 'Show More Reviews' button found, proceeding.")


def scroll_reviews(driver):
    scroll_pause_time = 2
    all_reviews = []

    try:
        review_popup = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "_17itzz4")))
        last_height = driver.execute_script("return arguments[0].scrollHeight", review_popup)
    except:
        return all_reviews

    while len(list(set(all_reviews))) < 100:
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", review_popup)
        sleep(scroll_pause_time)
        reviews_list = driver.find_elements(By.XPATH, "//div[@class='r1bctolv atm_c8_1sjzizj atm_g3_1dgusqm atm_26_lfmit2_13uojos atm_5j_1y44olf_13uojos atm_l8_1s2714j_13uojos dir dir-ltr']")
        all_reviews.extend([review.text for review in reviews_list if len(review.text) > 3])

        new_height = driver.execute_script("return arguments[0].scrollHeight", review_popup)
        if new_height == last_height:
            break
        last_height = new_height
    return list(set(all_reviews))


In [20]:
def scraping_urls(city_url):
    # WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-search-engine-choice-screen")
    options.add_argument("--headless")
    browser = webdriver.Chrome(options=options)

    # Airbnb Nottingham
    # url = 'https://www.airbnb.com/s/Nottingham--England--United-Kingdom/homes'
    browser.get(city_url)

    all_urls: list = []

    # Scraping para la pagina en la que está
    def scrape_urls():
        sleep(5) # espera a que carge
        listings = browser.find_elements(By.XPATH, '//a[contains(@href, "/rooms/")]')
        urls = [listing.get_attribute('href') for listing in listings]
        return urls

    # Loop para todas las paginas
    while True:
        page_urls = scrape_urls() # Esto funciona porque cada vez que vamos a una pagina nueva la url principal no cambia
        all_urls.extend(page_urls)

        # hacer click en el boton "next"
        try:
            next_button = browser.find_element(By.XPATH, '//a[@aria-label="Next"]')
            next_button.click()
            sleep(5)  # espera a que carge la pagina
        except NoSuchElementException:
            break

    all_urls = set(all_urls)
    all_urls = list(all_urls)
    return all_urls

In [21]:
url_nottingham = 'https://www.airbnb.com/s/Nottingham--England--United-Kingdom/homes'
url_san_francisco = "https://www.airbnb.com/s/San-Francisco--California--United-States/homes"
urls = scraping_urls(url_san_francisco)

In [50]:
url_guest_favorite = "https://www.airbnb.com/rooms/17618950?adults=1&category_tag=Tag%3A8678&children=0&enable_m3_private_room=true&infants=0&pets=0&photo_id=1258742482&search_mode=regular_search&check_in=2024-10-30&check_out=2024-11-04&source_impression_id=p3_1727423252_P3uSk5kdl3FQVL7_&previous_page_section_name=1000&federated_search_id=29bac5dc-6e88-4a96-9f0f-fae6c05a3597"
url_no_guest_favorite = "https://www.airbnb.com/rooms/1226376809052320405?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-10-21&check_out=2024-10-26&source_impression_id=p3_1727424454_P3wkNr7x-Ud7bO04&previous_page_section_name=1000&federated_search_id=f573770f-c781-45af-a464-dd123435c95f"
url_new = "https://www.airbnb.com/rooms/27701970?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-11-14&check_out=2024-11-19&source_impression_id=p3_1727424233_P3x3d1pTlPM2C8Zb&previous_page_section_name=1000&federated_search_id=b1b3ca1e-4c9f-42f9-9a32-d8fbd3b10aa3"
url_2 = "https://www.airbnb.com/rooms/5413930?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-12-04&check_out=2024-12-09&source_impression_id=p3_1727425011_P3toaJxneSLX1aej&previous_page_section_name=1000&federated_search_id=f573770f-c781-45af-a464-dd123435c95f"
url_problema = "https://www.airbnb.com/rooms/41538482?adults=1&category_tag=Tag%3A8678&children=0&enable_m3_private_room=true&infants=0&pets=0&photo_id=1076432064&search_mode=regular_search&check_in=2024-10-30&check_out=2024-11-04&source_impression_id=p3_1727690290_P34aOaVgGAgAFFpB&previous_page_section_name=1000&federated_search_id=ad80992d-5bc2-4687-b24d-0d1f6c79cb68"
url_3 = "https://www.airbnb.com/rooms/1199915616176105267?adults=1&category_tag=Tag%3A8678&children=0&enable_m3_private_room=true&infants=0&pets=0&photo_id=1955356580&search_mode=regular_search&check_in=2024-11-04&check_out=2024-11-09&source_impression_id=p3_1727690322_P3f_su6KFa0YS6tG&previous_page_section_name=1000&federated_search_id=04476faa-a009-4449-8a1e-e7dfbf28f27a"
url_900_reviews = "https://www.airbnb.com/rooms/14996188?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-11-09&check_out=2024-11-14&source_impression_id=p3_1727696700_P31z7erJwV7ODI40&previous_page_section_name=1000&federated_search_id=64ab16be-1707-46e7-bce7-955ee4d6bd89"
url_4 = "https://www.airbnb.com/rooms/47476769?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-11-18&check_out=2024-11-23&source_impression_id=p3_1727697208_P3G-OrMWYXvW785E&previous_page_section_name=1000&federated_search_id=64ab16be-1707-46e7-bce7-955ee4d6bd89"
url_5 = "https://www.airbnb.com/rooms/3712191?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-10-28&check_out=2024-11-02&source_impression_id=p3_1727697705_P3VtOyl8YBojuvt-&previous_page_section_name=1000&federated_search_id=a3da65d1-f655-4932-b203-00d7ea5e1447"
url_no_reviews = "https://www.airbnb.com/rooms/1254652310065298533?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-10-01&check_out=2024-10-06&source_impression_id=p3_1727697624_P3xRvRTXu_LjJG4n&previous_page_section_name=1000&federated_search_id=33e9c87d-3ce4-41b0-9447-3f9b75712890"
guest_favorite, rating, number_reviews, type_host, hosting_time, price, all_reviews, title = scraping_airbnb(url_no_reviews)

print(f"guest_favorite: {guest_favorite}")
print(f"rating: {rating}")
print(f"number_of_reviews: {number_reviews}")
print(f"type_host: {type_host}")
print(f"hosting_time: {hosting_time}")
print(f"Price: {price}")
print(all_reviews)
print(f"title: {title}")
print(len(all_reviews))

print(f"guest_favorite: {type(guest_favorite)}")
print(f"rating: {type(rating)}")
print(f"number_of_reviews: {type(number_reviews)}")
print(f"type_host: {type(type_host)}")
print(f"hosting_time: {type(hosting_time)}")
print(f"Price: {type(price)}")
print(type(all_reviews))
# print(len(all_reviews))

guest_favorite: False
rating: nan
number_of_reviews: 0
type_host: new_host
hosting_time: new
Price: € 46 per night
[]
title: 1-C Private Room near Freeway & Commercial Area
0
guest_favorite: <class 'bool'>
rating: <class 'float'>
number_of_reviews: <class 'int'>
type_host: <class 'str'>
hosting_time: <class 'str'>
Price: <class 'str'>
<class 'list'>


In [51]:
df = pd.DataFrame(columns=['guest_favorite', 'rating', 'number_reviews', 'type_host', 'hosting_time', 'price', 'all_reviews', 'title'])

In [54]:
# df = pd.DataFrame()
n = 0
for url in urls:
    n = n + 1
    print(f"{n} {url}")
    guest_favorite, rating, number_reviews, type_host, hosting_time, price, all_reviews, title = scraping_airbnb(url)
    new_row = pd.DataFrame({
                'guest_favorite': [guest_favorite],
                'rating': [rating],
                'number_reviews': [number_reviews],
                'type_host': [type_host],
                'hosting_time': [hosting_time],
                'price': [price],
                'all_reviews': [all_reviews],
                'title' : [title] 
            })

    df = pd.concat([df, new_row], ignore_index=True)
    # if n > 5:
    #     break

df.to_csv("SanFrancisco1.csv")
df

1 https://www.airbnb.com/rooms/825620580149284302?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-10-14&check_out=2024-10-19&source_impression_id=p3_1727697718_P3VBM12cUxZlwvGM&previous_page_section_name=1000&federated_search_id=c05d25ac-6301-4fb8-99d9-58c001b9e728
2 https://www.airbnb.com/rooms/6884454?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-10-03&check_out=2024-10-08&source_impression_id=p3_1727697554_P3cRyCMtmMl-vvOx&previous_page_section_name=1000&federated_search_id=f68223e9-b6ba-4e7e-b158-452486a08239
3 https://www.airbnb.com/rooms/602128451319116463?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-10-15&check_out=2024-10-20&source_impression_id=p3_1727697611_P35LAI25Y7u-jk6U&previous_page_section_name=1000&federated_search_id=b54e66a4-3091-4bac-8a6a-1b84d1ed61f7
4 https://www.airbnb.com/rooms/88929677

Unnamed: 0,guest_favorite,rating,number_reviews,type_host,hosting_time,price,all_reviews,title
0,True,4.94,67,Superhost,· 2 years hosting,€ 178 per night,[Highly recommend Christine’s place!! We had a...,Entire Guest Suite in Noe Valley
1,True,4.98,48,Superhost,· 10 years hosting,,[Had wonderful time at this place and Carly is...,"Lux Large 2BR, Separate Entry, New Renovation"
2,True,4.95,112,Superhost,· 9 years hosting,,"[Awesome place to stay! Clean, cozy, and safe!...",Cozy Guest Suite Mission District
3,True,4.97,62,Superhost,· 1 year hosting,€ 117 per night,[Angella is a friendly host who made us feel v...,Nob Hill Studio
4,False,reviews,107,,7 months hosting,€ 82 per night,[The hotel is really nice and beautifully deco...,Building boasts 100-yr-old façade & arched win...
...,...,...,...,...,...,...,...,...
265,False,reviews,129,Superhost,· 1 year hosting,€ 67 per night,"[Eric’s place was great, the only issues I had...",Sunset in the City 2
266,True,4.91,46,Superhost,· 12 years hosting,€ 49 per night,[Jia was a wonderful guest. She goes out her w...,Female host Laminate floor spacious lockable
267,False,,0,new_host,new,€ 85 per night,[],Monroe Studio 303
268,False,reviews,114,Superhost,· 2 years hosting,,"[Had a nice, quiet stay during a recent busine...","Rm 2 Close to Bart, Bus, cafe, anyway great lo..."
