# Web Scraping METROCUADRADO página individual

## Librerías

In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import pandas as pd

### Selección de grilla y extracción

Código base

In [13]:
# Exploratory scrape of a single hard-coded listing. Every step is kept at
# module level so the intermediate values can be inspected in the notebook.
driver = webdriver.Chrome()
try:
    driver.get('https://www.metrocuadrado.com/inmueble/venta-casa-chia-na-3-habitaciones-3-banos-2-garajes/11369-M4772309')
    # Snapshot the HTML as the browser currently holds it.
    page_source = driver.page_source
finally:
    # Close the browser session even when loading the page fails; otherwise
    # the Chrome window opened above would be left running.
    driver.quit()

# Parse the captured HTML with Beautiful Soup (lxml parser).
soup = BeautifulSoup(page_source, 'lxml')

# All lookups below are scoped to the page's main container.
house_grill = soup.find('div', class_ = 'page-container')

info1_list = []

# Estrato: the fourth `h2.card-text` of the feature list is read, the word
# "Estrato" is removed and the remainder is converted to an integer.
estrato = int(house_grill.find('ul', class_ = 'list-feature-detail').find_all('h2', class_ = 'card-text')[3].text.replace('Estrato',''))
info1_list.append(estrato)

# Description: the first `p.card-text` inside the container.
descripcion = house_grill.find('p', class_ = 'card-text').text
info1_list.append(descripcion)

# Main details: one text value per `col-lg-3` column of the details card.
info_principal = house_grill.find('div', class_ = 'card-details').find_all('div', class_ = 'col-lg-3')
info1_list.extend(info.find('p', class_ = 'card-text').text for info in info_principal)

# Notarial fees as displayed by the fee simulator.
gastos_notariales = house_grill.find('p', class_ = 'notary-fees-simulator-container__info__price').text
info1_list.append(gastos_notariales)

# Characteristics: every `col-md-3` entry of the first feature accordion,
# stored as a single comma-separated string.
caracteristicas = house_grill.find_all('div', class_ = 'featureacordion')[0].find_all('div', class_ = 'col-md-3')
caracteristica_list = [caracteristica.text for caracteristica in caracteristicas]

info1_list.append(', '.join(caracteristica_list))

# Ejecución

In [23]:
def scrape_property_info(url):
    """
    Scrape the details of one Metrocuadrado listing page.

    A Chrome session is opened, the page at ``url`` is loaded and its HTML is
    captured; the session is then closed, also when loading the page fails.
    The individual fields are read from the captured HTML with Beautiful Soup.

    Parameters
    ----------
    url : str
        The URL of the Metrocuadrado property listing page.

    Returns
    -------
    list
        The property details, in this order:

        1. estrato (``int``), read from the fourth ``h2.card-text`` of the
           ``list-feature-detail`` list;
        2. the description text (first ``p.card-text`` of the container);
        3. one text value per ``col-lg-3`` column of the ``card-details``
           block;
        4. the notarial fees text;
        5. the entries of the first ``featureacordion`` section joined with
           ``', '``;
        6. ``url`` itself.

    Raises
    ------
    AttributeError
        If an expected element is missing, i.e. a ``find`` lookup returned
        ``None``.
    IndexError
        If the feature list has fewer than four ``h2.card-text`` items or the
        page has no ``featureacordion`` section.
    ValueError
        If the estrato text is not an integer once "Estrato" is removed.

    Examples
    --------
    >>> url = 'https://www.metrocuadrado.com/inmueble/venta-casa-chia-na-3-habitaciones-3-banos-2-garajes/11369-M4772309'
    >>> info = scrape_property_info(url)  # doctest: +SKIP
    >>> info[-1] == url  # doctest: +SKIP
    True
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Snapshot the HTML as the browser currently holds it.
        page_source = driver.page_source
    finally:
        # Close the browser session even if navigation raised; without this a
        # failed listing would leave its Chrome window open.
        driver.quit()

    soup = BeautifulSoup(page_source, 'lxml')

    # All lookups below are scoped to the page's main container.
    house_grill = soup.find('div', class_='page-container')

    info1_list = []

    # Estrato: fourth `h2.card-text` of the feature list, with the word
    # "Estrato" removed and the remainder converted to an integer.
    estrato_items = house_grill.find('ul', class_='list-feature-detail').find_all('h2', class_='card-text')
    info1_list.append(int(estrato_items[3].text.replace('Estrato', '')))

    # Description: the first `p.card-text` inside the container.
    info1_list.append(house_grill.find('p', class_='card-text').text)

    # Main details: one text value per `col-lg-3` column of the details card.
    info_principal = house_grill.find('div', class_='card-details').find_all('div', class_='col-lg-3')
    info1_list.extend(info.find('p', class_='card-text').text for info in info_principal)

    # Notarial fees as displayed by the fee simulator.
    info1_list.append(house_grill.find('p', class_='notary-fees-simulator-container__info__price').text)

    # Characteristics: every `col-md-3` entry of the first feature accordion,
    # collapsed into one comma-separated string so it fits a single column.
    caracteristicas = house_grill.find_all('div', class_='featureacordion')[0].find_all('div', class_='col-md-3')
    info1_list.append(', '.join(caracteristica.text for caracteristica in caracteristicas))

    # Keep the source link so each row can be traced back to its listing.
    info1_list.append(url)

    return info1_list


## Web Scraping individual

In [28]:
# Load the listing index; its `Link` column supplies the URLs scraped below.
# NOTE(review): absolute Windows path - this cell only runs on a machine where
# the file exists at this exact location.
ruta_datos_principal = r'D:\visualization\web_scraping_metrocuadrado\data\metrocuadrado_principal.csv'
data_principal = pd.read_csv(ruta_datos_principal)

In [24]:
# Scrape every listing referenced in the index and collect one row per page.
info_out = []
# `to_list` has to be called: without the parentheses `url_list` is a bound
# method, and both `len()` and the loop below fail on it.
url_list = data_principal.Link.to_list()

total_paginas = len(url_list)  # Total number of pages to process

for i, url in enumerate(url_list):
    print(f"Procesando: {url}")
    # One list of property details per listing, in the column order used below.
    info_out.append(scrape_property_info(url))

    # Report how many pages are still pending.
    paginas_restantes = total_paginas - (i + 1)
    print(f"Quedan {paginas_restantes} páginas por procesar.")

    # Pause 3 seconds between requests, except after the last page.
    if paginas_restantes > 0:
        time.sleep(3)

# NOTE(review): assumes every scraped row has exactly these 13 fields (i.e. the
# listing's details card yields 8 values) - pandas raises otherwise.
df = pd.DataFrame(info_out, columns=[
    'Estrato',
    'Descripción',
    'Codigo',
    'Barrio',
    'Precio',
    'Antiguedad',
    'Area_construida',
    'Area_privada',
    'Valor_administracion',
    'Parqueaderos',
    'Gastos_notariales',
    'Caracteristicas',
    'Link'
])

df.to_csv('metrocuadrado_individual.csv', index=False)

Procesando: https://www.metrocuadrado.com/inmueble/venta-casa-chia-na-3-habitaciones-3-banos-2-garajes/11369-M4772309
Quedan 1 páginas por procesar.
Procesando: https://www.metrocuadrado.com/inmueble/venta-casa-turbaco-prado-verde-3-habitaciones-3-banos-1-garajes/17119-M5011824
Quedan 0 páginas por procesar.
