In [54]:
!pip install selenium



In [55]:
# all imports
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import json
import time

import pandas as pd

from multiprocessing import Pool

import os

In [56]:
airbnb_url = 'https://www.airbnb.es/s/Madrid--Espa%C3%B1a/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-02-01&monthly_length=3&price_filter_input_type=0&channel=EXPLORE&query=Madrid&place_id=ChIJgTwKgJcpQg0RaSKMYcHeNsQ&date_picker_type=calendar&source=structured_search_input_header&search_type=autocomplete_click'

In [57]:
soup = BeautifulSoup(requests.get(airbnb_url).content, 'html.parser')

In [58]:
listings = soup.find_all('div', 'cy5jw6o')

In [59]:
# First Generation :)
def extract_basic_features(listing_html):
    features_dict = {}


    name = listing_html.find("div", {"class": "t1jojoys"}).get_text()
    header = listing_html.find("div", {"class": "fb4nyux"}).get_text()
    url = listing_html.find('a').get('href')


    features_dict['name'] = name
    features_dict['header'] = header
    features_dict['url'] = url

    return features_dict

In [60]:
extract_basic_features(listings[0])

{'name': 'Apartamento en Centro Madrid',
 'header': 'Alojamiento entero en el centro de Madrid',
 'url': '/rooms/1120220796513899602?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&check_in=2024-05-11&check_out=2024-05-16&source_impression_id=p3_1711642001_vhtsToNCbc8IBTCs&previous_page_section_name=1000&federated_search_id=5b410bee-8669-42b2-b849-42bce544f195'}

In [61]:
# too many separate extractions
RULES_SEARCH_PAGE = {
    'url': {'tag': 'a', 'get': 'href'},
    'name': {'tag': 'div', 'class': 't1jojoys'},
    'header': {'tag': 'span', 'class': 't6mzqp7'},
    'rating_n_reviews': {'tag': 'span', 'class': 'ru0q88m'},
    'price': {'tag': 'span', 'class': 'a8jt5op'},
    'superhost': {'tag': 'span', 'class': 'dir dir-ltr'},
}

In [62]:
def extract_element(listing_html, params):
    # 1. Find the right tag
    if 'class' in params:
        elements_found = listing_html.find_all(params['tag'], params['class'])
    else:
        elements_found = listing_html.find_all(params['tag'])

    # 2. Extract the right element
    tag_order = params.get('order', 0)
    element = elements_found[tag_order]

    # 3. Get text
    if 'get' in params:
        output = element.get(params['get'])
    else:
        output = element.get_text()

    return output

In [63]:
print(extract_element(listings[13], RULES_SEARCH_PAGE['header']))
print(extract_element(listings[13], RULES_SEARCH_PAGE['url']))
print(extract_element(listings[13], RULES_SEARCH_PAGE['name']))
print(extract_element(listings[13], RULES_SEARCH_PAGE['rating_n_reviews']))
print(extract_element(listings[13], RULES_SEARCH_PAGE['price']))
print(extract_element(listings[13], RULES_SEARCH_PAGE['superhost']))

PUERTA DEL SOL, ESTUDIO LUMINOSO, DISEÑO Y BALCÓN
/rooms/16872465?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&check_in=2024-12-15&check_out=2024-12-20&source_impression_id=p3_1711642001_HSGxHnoVxincOgc7&previous_page_section_name=1000&federated_search_id=5b410bee-8669-42b2-b849-42bce544f195
Apartamento en Centro Madrid
4,96 (494)
Recomendación del viajero
Anfitrión particular


In [64]:
# for feature in RULES_SEARCH_PAGE:
 #   print(f"{feature}: {extract_element(listings[13], RULES_SEARCH_PAGE[feature])}")

In [65]:
for feature in RULES_SEARCH_PAGE:
    try:
        print(f"{feature}: {extract_element(listings[0], RULES_SEARCH_PAGE[feature])}")
    except:
        print(f"{feature}: empty")

url: /rooms/1120220796513899602?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&check_in=2024-05-11&check_out=2024-05-16&source_impression_id=p3_1711642001_vhtsToNCbc8IBTCs&previous_page_section_name=1000&federated_search_id=5b410bee-8669-42b2-b849-42bce544f195
name: Apartamento en Centro Madrid
header: Alojamiento entero en el centro de Madrid
rating_n_reviews: Nuevo
price: Anfitrión particular
superhost: Anfitrión particular


In [66]:
airbnb_url

'https://www.airbnb.es/s/Madrid--Espa%C3%B1a/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-02-01&monthly_length=3&price_filter_input_type=0&channel=EXPLORE&query=Madrid&place_id=ChIJgTwKgJcpQg0RaSKMYcHeNsQ&date_picker_type=calendar&source=structured_search_input_header&search_type=autocomplete_click'

In [67]:
# let's finally write this function
def get_listings(search_page):
    soup = BeautifulSoup(requests.get(search_page).content, 'html.parser')
    listings = soup.find_all('div', 'cy5jw6o')

    return listings

In [68]:
# it works
len(get_listings(airbnb_url))

18

In [69]:
# let's try next page
new_url = airbnb_url + '&items_offset=20'
len(get_listings(new_url))

18

In [70]:
# checking the content, if the data is there
print(extract_element(get_listings(airbnb_url)[0], RULES_SEARCH_PAGE['name']))
print(extract_element(get_listings(new_url)[0], RULES_SEARCH_PAGE['name']))

Apartamento en Centro Madrid
Loft en Centro Madrid


In [71]:
# let's iterate through all 15 pages
all_listings = []
for i in range(15):
    offset = 20 * i
    new_url = airbnb_url + f'&items_offset={offset}'
    new_listings = get_listings(new_url)
    all_listings.extend(new_listings)

    # let's check if it's scraping
    print(len(all_listings))

18
36
54
72
90
108
126
144
162
180
198
216
234
252
270


In [72]:
# why? maybe Airbnb tries to prevent scraping
# let's wait a couple of seconds after every iteration
import time

all_listings = []
for i in range(15):
    offset = 20 * i
    new_url = airbnb_url + f'&items_offset={offset}&section_offset=3'
    new_listings = get_listings(new_url)
    all_listings.extend(new_listings)

    # let's check if it's scraping
    print(len(all_listings))

    time.sleep(2)

18
36
54
72
90
108
126
144
162
180
198
216
234
252
270


In [73]:
# another random check, if the data is there
print(extract_element(all_listings[113], RULES_SEARCH_PAGE['name']))

Loft en Salamanca


In [74]:
# 1. build all urls
def build_urls(main_url, listings_per_page=20, pages_per_location=15):
    url_list = []
    for i in range(pages_per_location):
        offset = listings_per_page * i
        url_pagination = main_url + f'&items_offset={offset}'
        url_list.append(url_pagination)

    return url_list

In [75]:
# build a list of URLs
url_list = build_urls(airbnb_url)

In [76]:
url_list

['https://www.airbnb.es/s/Madrid--Espa%C3%B1a/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-02-01&monthly_length=3&price_filter_input_type=0&channel=EXPLORE&query=Madrid&place_id=ChIJgTwKgJcpQg0RaSKMYcHeNsQ&date_picker_type=calendar&source=structured_search_input_header&search_type=autocomplete_click&items_offset=0',
 'https://www.airbnb.es/s/Madrid--Espa%C3%B1a/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-02-01&monthly_length=3&price_filter_input_type=0&channel=EXPLORE&query=Madrid&place_id=ChIJgTwKgJcpQg0RaSKMYcHeNsQ&date_picker_type=calendar&source=structured_search_input_header&search_type=autocomplete_click&items_offset=20',
 'https://www.airbnb.es/s/Madrid--Espa%C3%B1a/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-02-01&monthly_length=3&price_filter_input_type=0&channel=EXPLO

In [77]:
# safe function to extract all features from one page
def extract_page_features(soup, rules):
    features_dict = {}
    for feature in rules:
        try:
            features_dict[feature] = extract_element(soup, rules[feature])
        except:
            features_dict[feature] = 'empty'

    return features_dict

In [78]:
extract_page_features(listings[0], RULES_SEARCH_PAGE)

{'url': '/rooms/1120220796513899602?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&check_in=2024-05-11&check_out=2024-05-16&source_impression_id=p3_1711642001_vhtsToNCbc8IBTCs&previous_page_section_name=1000&federated_search_id=5b410bee-8669-42b2-b849-42bce544f195',
 'name': 'Apartamento en Centro Madrid',
 'header': 'Alojamiento entero en el centro de Madrid',
 'rating_n_reviews': 'Nuevo',
 'price': 'Anfitrión particular',
 'superhost': 'Anfitrión particular'}

In [79]:
# 2. Iteratively scrape pages
def process_search_pages(url_list):
    features_list = []
    for page in url_list:
        listings = get_listings(page)
        for listing in listings:
            features = extract_page_features(listing, RULES_SEARCH_PAGE)
            features_list.append(features)

    return features_list

In [80]:
# try for one page
base_features = process_search_pages(url_list)

In [81]:
base_features

[{'url': '/rooms/46416127?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&check_in=2024-04-11&check_out=2024-04-16&source_impression_id=p3_1711642076_guK%2FG%2FWzhAw1901a&previous_page_section_name=1000&federated_search_id=1cd3a62c-ca4b-440d-9b0b-59ddc53367c1',
  'name': 'Habitación compartida en Centro Madrid',
  'header': 'Madrid Centro: Puerta del Sol, Tirso de Molina',
  'rating_n_reviews': '4,95 (83)',
  'price': 'Recomendación del viajero',
  'superhost': 'Anfitrión profesional'},
 {'url': '/rooms/1120220796513899602?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&check_in=2024-05-11&check_out=2024-05-16&source_impression_id=p3_1711642076_Kd3%2FENtX4B3eX4YA&previous_page_section_name=1000&federated_search_id=1cd3a62c-ca4b-440d-9b0b-59ddc53367c1',
  'name': 'Apartamento en Centro Madrid',
  'header': 'Alojamiento entero en el centro de Madrid',
  'rating_n_reviews': 'Nuevo',
  'price': 'Anfitrión particular',
  'superhost': 'Anfitrión particular'}

In [82]:
df = pd.DataFrame(base_features)
df

Unnamed: 0,url,name,header,rating_n_reviews,price,superhost
0,/rooms/46416127?adults=1&children=0&enable_m3_...,Habitación compartida en Centro Madrid,"Madrid Centro: Puerta del Sol, Tirso de Molina","4,95 (83)",Recomendación del viajero,Anfitrión profesional
1,/rooms/1120220796513899602?adults=1&children=0...,Apartamento en Centro Madrid,Alojamiento entero en el centro de Madrid,Nuevo,Anfitrión particular,Anfitrión particular
2,/rooms/1019679548854540460?adults=1&children=0...,Alojamiento en Centro Madrid,Habitación céntrica en Chamberí,"4,74 (23)",Anfitrión particular,Anfitrión particular
3,/rooms/1120093500114698390?adults=1&children=0...,Apartamento en Centro Madrid,GuestReady - Madrid Urban Retreat,Nuevo,3 camas «queen»,3 camas «queen»
4,/rooms/1119750870732308638?adults=1&children=0...,Apartamento en Centro Madrid,Patio de La Latina,Nuevo,Anfitrión particular,Anfitrión particular
...,...,...,...,...,...,...
265,/rooms/5285533?adults=1&children=0&enable_m3_p...,Apartamento en Centro Madrid,Apartament entre P. Real y Gran Via,"4,83 (174)",Recomendación del viajero,Anfitrión particular
266,/rooms/1090694911446906109?adults=1&children=0...,Apartamento en Centro Madrid,Super Loft Malasaña Gran Via,"4,67 (3)",Anfitrión particular,Anfitrión particular
267,/rooms/846426208549575549?adults=1&children=0&...,Suite con entrada independiente en Centro Madrid,"diseñado, arte y vista en la latina","4,81 (54)",Anfitrión particular,Anfitrión particular
268,/rooms/771551626281396323?adults=1&children=0&...,Apartamento en Centro Madrid,Maravilloso piso estudio,"4,64 (118)",Anfitrión particular,Anfitrión particular


In [83]:
df['ciudad'] = df['name'].str.replace('Villa en |Cueva en |Vivienda en |Apartamento en |Habitación en |Loft en ', '', regex=True)
df

Unnamed: 0,url,name,header,rating_n_reviews,price,superhost,ciudad
0,/rooms/46416127?adults=1&children=0&enable_m3_...,Habitación compartida en Centro Madrid,"Madrid Centro: Puerta del Sol, Tirso de Molina","4,95 (83)",Recomendación del viajero,Anfitrión profesional,Habitación compartida en Centro Madrid
1,/rooms/1120220796513899602?adults=1&children=0...,Apartamento en Centro Madrid,Alojamiento entero en el centro de Madrid,Nuevo,Anfitrión particular,Anfitrión particular,Centro Madrid
2,/rooms/1019679548854540460?adults=1&children=0...,Alojamiento en Centro Madrid,Habitación céntrica en Chamberí,"4,74 (23)",Anfitrión particular,Anfitrión particular,Alojamiento en Centro Madrid
3,/rooms/1120093500114698390?adults=1&children=0...,Apartamento en Centro Madrid,GuestReady - Madrid Urban Retreat,Nuevo,3 camas «queen»,3 camas «queen»,Centro Madrid
4,/rooms/1119750870732308638?adults=1&children=0...,Apartamento en Centro Madrid,Patio de La Latina,Nuevo,Anfitrión particular,Anfitrión particular,Centro Madrid
...,...,...,...,...,...,...,...
265,/rooms/5285533?adults=1&children=0&enable_m3_p...,Apartamento en Centro Madrid,Apartament entre P. Real y Gran Via,"4,83 (174)",Recomendación del viajero,Anfitrión particular,Centro Madrid
266,/rooms/1090694911446906109?adults=1&children=0...,Apartamento en Centro Madrid,Super Loft Malasaña Gran Via,"4,67 (3)",Anfitrión particular,Anfitrión particular,Centro Madrid
267,/rooms/846426208549575549?adults=1&children=0&...,Suite con entrada independiente en Centro Madrid,"diseñado, arte y vista en la latina","4,81 (54)",Anfitrión particular,Anfitrión particular,Suite con entrada independiente en Centro Madrid
268,/rooms/771551626281396323?adults=1&children=0&...,Apartamento en Centro Madrid,Maravilloso piso estudio,"4,64 (118)",Anfitrión particular,Anfitrión particular,Centro Madrid


In [84]:
ruta = 'Alojamientos_Madrid_Airbnb.csv'
df.to_csv(ruta,index=False)