In [25]:
import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup

## Definitions

In [47]:
class WebScraper:
    """
    HTTP scraper that fetches pages with retries and parses them with BeautifulSoup.

    Besides generic helpers (title/link extraction, concurrent multi-page
    scraping) it contains parsers for the rental listing pages addressed as
    ``<base_url>/Resumen_Ciudad_arriendos.asp`` and ``<base_url>/Ficha.asp``.
    """

    def __init__(self, base_url: str, max_workers: int = 5, retries: int = 3, timeout: int = 10):
        """
        Initialize the web scraper.

        :param base_url: The base URL to scrape (no trailing slash; paths are appended with '/').
        :param max_workers: The number of threads used by scrape_pages.
        :param retries: Number of attempts made per request; at least one attempt is always made.
        :param timeout: Timeout for each request in seconds.
        """
        self.base_url = base_url
        self.max_workers = max_workers
        self.retries = retries
        self.timeout = timeout
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    @staticmethod
    def _int_or_na(text: str):
        """Return ``text`` as an int when it consists only of digits, otherwise 'N/A'."""
        return int(text) if text.isdigit() else 'N/A'

    def fetch_page(self, url: str) -> str:
        """
        Fetch a page and return its HTML content.

        Request errors are printed and retried; they are never raised to the caller.

        :param url: The URL of the page.
        :return: The HTML content of the page, or "" when every attempt failed.
        """
        # Always try at least once, even if retries was configured as 0 or less.
        attempts = max(1, self.retries)
        for attempt in range(1, attempts + 1):
            try:
                response = requests.get(url, headers=self.headers, timeout=self.timeout)
                response.raise_for_status()
                return response.text
            except requests.exceptions.RequestException as e:
                print(f"Attempt {attempt} failed for URL: {url}, Error: {e}")
                # Pause only when another attempt follows; sleeping after the
                # last failure would just delay the empty result.
                if attempt < attempts:
                    time.sleep(1)
        return ""

    def parse_page(self, html_content: str) -> Dict:
        """
        Parse the HTML content and extract the page title and all links.

        :param html_content: The HTML content of the page.
        :return: A dictionary with 'title' (str) and 'links' (list of (href, text) tuples).
        """
        soup = BeautifulSoup(html_content, 'lxml')
        title_tag = soup.title
        # .string is None for an empty <title> or one with nested markup;
        # fall back to the same sentinel used when the tag is missing.
        if title_tag is not None and title_tag.string:
            title = title_tag.string
        else:
            title = 'No title'
        return {
            'title': title,
            'links': [(a.get('href'), a.text) for a in soup.find_all('a', href=True)]
        }

    def scrape_pages(self, urls: List[str]) -> List[Dict]:
        """
        Scrape multiple pages concurrently.

        Pages are fetched on a thread pool of ``max_workers`` threads and parsed
        as each fetch completes, so results are in completion order rather than
        input order. URLs whose fetch failed are omitted.

        :param urls: A list of URLs to scrape.
        :return: A list of single-entry dictionaries mapping each URL to its parsed data.
        """
        results = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_url = {executor.submit(self.fetch_page, url): url for url in urls}
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    html_content = future.result()
                    if html_content:
                        data = self.parse_page(html_content)
                        results.append({url: data})
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
        return results

    def scrape_single_page(self, url: str) -> Dict:
        """
        Scrape a single page.

        :param url: The URL of the page.
        :return: The dictionary produced by parse_page, or {} when the fetch failed.
        """
        html_content = self.fetch_page(url)
        if html_content:
            return self.parse_page(html_content)
        return {}

    def parse_city_page(self, html_content: str) -> list:
        """
        Parses the page to extract all property listings.

        Every <div class="row"> is handed to parse_listing; only rows for which
        parse_listing returned an empty dict (a parsing error) are dropped.

        :param html_content: The HTML content of the page.
        :return: A list of dictionaries containing property details.
        """
        soup = BeautifulSoup(html_content, 'lxml')
        listings = []

        # Find all divs with class "row" which represent listings.
        # scrape_city uses the length of this list to detect the last page.
        for listing in soup.find_all('div', class_='row'):
            parsed_listing = self.parse_listing(listing)
            if parsed_listing:
                listings.append(parsed_listing)

        return listings

    def parse_listing(self, listing) -> dict:
        """
        Parses a single property listing and extracts details.

        Fields that cannot be found are reported as the string 'N/A'.

        :param listing: A BeautifulSoup element containing the listing HTML.
        :return: A dictionary with the extracted information, or {} when parsing raised.
        """
        try:
            # The listing ID is taken from the link to the detail page.
            id_match = re.search(r'Ficha\.asp\?xId=(\d+)', str(listing))
            listing_id = id_match.group(1) if id_match else 'N/A'
            # Only build a detail URL when there is a real ID to point at.
            listing_url = f"{self.base_url}/Ficha.asp?xId={listing_id}" if id_match else 'N/A'

            # .get() rather than ['src']: an <img> without a src attribute must
            # not discard the whole listing through the except branch below.
            img_tag = listing.find('img')
            img_url = img_tag.get('src', 'N/A') if img_tag is not None else 'N/A'

            # Extract neighborhood from the subtitle (first <p>), which looks
            # like "Anuncio <number> - <neighborhood>".
            subtitle_tag = listing.find('p')
            subtitle = subtitle_tag.text.strip() if subtitle_tag is not None else 'N/A'
            neighborhood_match = re.search(r'Anuncio \d+ - (.+)', subtitle)
            neighborhood = neighborhood_match.group(1) if neighborhood_match else 'N/A'

            # Extract price: the <h3> whose text contains '$', reduced to its digits.
            price_tag = listing.find('h3', string=re.compile(r'\$'))
            price_text = price_tag.text.strip() if price_tag is not None else 'N/A'
            price_numeric = re.sub(r'[^\d]', '', price_text)

            # Rooms, bathrooms, parking and area are the <span>s of the second <h3>.
            h3_tags = listing.find_all('h3')
            spans = h3_tags[1].find_all('span') if len(h3_tags) > 1 else []

            rooms = spans[0].text.strip() if len(spans) > 0 else 'N/A'
            bathrooms = spans[1].text.strip() if len(spans) > 1 else 'N/A'
            parkings = spans[2].text.strip() if len(spans) > 2 else 'N/A'
            # Area: keep only the digits that precede the 'M2' unit marker.
            area = re.sub(r'[^\d]', '', spans[3].text.strip().split('M2')[0]) if len(spans) > 3 else 'N/A'

            # The longest non-empty <p> is taken as the description.
            p_tags = listing.find_all('p')
            description = max((p.text.strip() for p in p_tags if p.text.strip()), key=len, default='N/A')

            return {
                'id': listing_id,
                'url': listing_url,
                'neighborhood': neighborhood,
                'img_url': img_url,
                'price': self._int_or_na(price_numeric),
                'rooms': self._int_or_na(rooms),
                'bathrooms': self._int_or_na(bathrooms),
                'parkings': self._int_or_na(parkings),
                'area': self._int_or_na(area),
                'description': description
            }
        except Exception as e:
            print(f"Error parsing listing: {e}")
            return {}

    def scrape_city(self, pCiudad: int, start_offset: int = 0) -> list:
        """
        Iterates through all pages for a given city and scrapes all listings.
        Stops after the first page that yields fewer than 50 parsed rows
        (divs with class "row"), or as soon as a page cannot be fetched.

        :param pCiudad: The city code to scrape.
        :param start_offset: The starting offset for pagination.
        :return: A list of all scraped listings.
        """
        all_listings = []
        offset = start_offset
        max_listings_per_page = 50
        rental_base_url = 'Resumen_Ciudad_arriendos.asp'

        while True:
            # Construct the URL with the current offset
            url = f"{self.base_url}/{rental_base_url}?pCiudad={pCiudad}&pTipoInmueble=&offset={offset}"
            print(f"Scraping: {url}")

            html_content = self.fetch_page(url)
            if not html_content:
                # Every attempt failed: nothing to parse, and no way to tell
                # whether more pages exist, so stop here.
                break

            listings = self.parse_city_page(html_content)
            all_listings.extend(listings)

            # A short page means the end of the pagination was reached.
            if len(listings) < max_listings_per_page:
                break

            # Increase offset for the next page
            offset += max_listings_per_page

        return all_listings


## Initialization

In [48]:
# Root URL of the site being scraped.
base_url = 'https://www.espaciourbano.com'

# max_workers=1: scrape_pages will run its thread pool with a single worker.
scraper = WebScraper(max_workers=1, base_url=base_url)

## 1. Initial extraction of cities

First, we fetch the landing page, which lists all the cities we can extract information for.

In [None]:
# Single page scrape: fetch the landing page whose links are later filtered
# for city pages. The URL is built from base_url so the host is defined in
# exactly one place.
data_fetched = scraper.scrape_single_page(f"{base_url}/listado_arriendos.asp")

We define a function that extracts the code and name of each city available on the page.

In [11]:
def extract_city_info_urllib(url: str) -> dict:
    """
    Extract the city code (pCiudad) and city name (nCiudad) using urllib.

    :param url: The URL string containing the city information in its query string.
    :return: A dictionary with keys 'name' (value of nCiudad) and 'code'
             (value of pCiudad); a missing parameter yields an empty string.
    """
    # parse_qs maps every query parameter to a list of its values.
    query_params = parse_qs(urlparse(url).query)

    # pCiudad carries the city code and nCiudad the city name; take the
    # first value of each.
    return {
        'name': query_params.get('nCiudad', [''])[0],
        'code': query_params.get('pCiudad', [''])[0]
    }


In [14]:
def save_to_json(data: dict, file_name: str) -> None:
    """
    Save the extracted city info into a JSON file.

    Missing parent directories of ``file_name`` are created first, so the
    call also works when the target folder does not exist yet.

    :param data: The JSON-serializable object to save (this notebook passes
                 a list of city dictionaries).
    :param file_name: The path of the JSON file to save the data.
    """
    parent_dir = os.path.dirname(file_name)
    if parent_dir:  # empty when file_name has no directory component
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding so the file does not depend on the platform default.
    with open(file_name, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)
    print(f"Data saved to {file_name}")
  

In [15]:
# Links to a city's rental summary page all start with this path prefix.
cities_base_link = '/Resumen_Ciudad_arriendos'

# data_fetched['links'] holds (href, text) tuples; keep the city ones only.
cities_links = [
    (href, text)
    for href, text in data_fetched['links']
    if href.startswith(cities_base_link)
]

# Turn each city href into a {'name': ..., 'code': ...} dictionary.
cities = [extract_city_info_urllib(href) for href, _text in cities_links]

save_to_json(cities, 'data/cities.json')

Data saved to data/cities.json


## 2. Then we need to iterate through the cities to extract announcements

In [49]:
def extract_rentals_info():
  for city in cities:
    

Scraping: https://www.espaciourbano.com/Resumen_Ciudad_arriendos.asp?pCiudad=10000&pTipoInmueble=&offset=0
Scraping: https://www.espaciourbano.com/Resumen_Ciudad_arriendos.asp?pCiudad=10000&pTipoInmueble=&offset=50
Scraping: https://www.espaciourbano.com/Resumen_Ciudad_arriendos.asp?pCiudad=10000&pTipoInmueble=&offset=100
Scraping: https://www.espaciourbano.com/Resumen_Ciudad_arriendos.asp?pCiudad=10000&pTipoInmueble=&offset=150
Scraping: https://www.espaciourbano.com/Resumen_Ciudad_arriendos.asp?pCiudad=10000&pTipoInmueble=&offset=200
Scraping: https://www.espaciourbano.com/Resumen_Ciudad_arriendos.asp?pCiudad=10000&pTipoInmueble=&offset=250
Scraping: https://www.espaciourbano.com/Resumen_Ciudad_arriendos.asp?pCiudad=10000&pTipoInmueble=&offset=300
Scraping: https://www.espaciourbano.com/Resumen_Ciudad_arriendos.asp?pCiudad=10000&pTipoInmueble=&offset=350
Scraping: https://www.espaciourbano.com/Resumen_Ciudad_arriendos.asp?pCiudad=10000&pTipoInmueble=&offset=400
Scraping: https://www.