## Scraping data with Selenium

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
import pathlib
import pickle

In [2]:
# Real-estate listing page for Málaga; this is the first URL the browser opens.
website = "https://spainhomes.com/real-estate/malaga"

In [3]:
# ChromeDriverManager().install() supplies the chromedriver location handed to
# Service; the browser is then maximised and pointed at the listing page.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path))
driver.maximize_window()
driver.get(website)

In [4]:
#driver.quit()

## Get info from one ad
We want to get: <br>
1. Title 
2. Description
3. Number of bedrooms
4. Number of bathrooms
5. Price
6. URL for more details

In [14]:
# Collect the ad cards first: every cell in this section indexes into `ads`,
# which is otherwise only assigned much further down the notebook, so a fresh
# Restart & Run All would stop here with a NameError.
ads = driver.find_elements(By.XPATH, '//div[@class="right"]')

# Title text of the first ad (span with class "title").
title = ads[0].find_element(By.XPATH, './/span[@class="title"]').text
title

'Modern Sea View Apartments with Spacious Terraces in Fuengirola'

In [16]:
# Description text of the first ad (span with class "desc").
desc = ads[0].find_element(by=By.XPATH, value='.//span[@class="desc"]').text
desc

'The sea-view apartments with large terraces are situated in a prestigious community in Fuengirola, Costa del Sol. The gated and secured complex has amazing facilities and social and sports clubs.'

In [20]:
# Raw price text of the first ad (span with class "fiyat"). It is kept as
# rendered, so it can span two lines with a "FROM" prefix.
price = ads[0].find_element(by=By.XPATH, value='.//span[@class="fiyat"]').text
price

'FROM\n€435.000'

In [21]:
# Container div of the first ad whose <span> children carry the remaining
# details (presumably location, bedrooms and bathrooms — confirm against the page).
details = ads[0].find_element(by=By.XPATH, value='.//div[@class="other row between-xs middle-xs nowrap"]')

In [22]:
# Bind the spans to a new name and leave the container in `details`, so this
# cell can be re-run: overwriting `details` with the list made a second run
# fail, because a list has no `find_elements`.
detail_spans = details.find_elements(By.XPATH, './/span')
[el.text for el in detail_spans]

['FUENGIROLA - MÁLAGA', '1, 2, 3', '1, 2']

In [23]:
# NOTE: `url` holds the <a class="main-emlak-link"> element itself, not the
# address string; the href is read from it in a separate step.
url = ads[0].find_element(by=By.XPATH, value='.//a[@class="main-emlak-link"]')
url

<selenium.webdriver.remote.webelement.WebElement (session="a1d2efb8398e75ba82c4f5d0a9c8bae6", element="2DD4CECFE3619B093A0C440721CBE7E8_element_347")>

In [24]:
# Read the link target from the anchor element held in `url`.
url.get_attribute("href")

'https://spainhomes.com/ad/agp-0732-new-build-apartments-with-sea-views-in-prime-area-of-fuengirola'

## Get info from all ads in one page

In [7]:
def get_info(ad):
    """Extract the fields of interest from a single ad element.

    Parameters
    ----------
    ad
        Element exposing Selenium's ``find_element`` (for example one entry
        of ``ads``). All XPath lookups are relative to it.

    Returns
    -------
    dict
        ``"title"``, ``"desc"`` and ``"price"`` hold the text of the matching
        spans; ``"details"`` is the list of texts of every span inside the
        details container; ``"url"`` is the ``href`` of the ad's main link.

    Notes
    -----
    No lookup errors are caught here, so a failing ``find_element`` call
    propagates to the caller.
    """
    def text_of(xpath):
        # Text of the first descendant of `ad` matching `xpath`.
        return ad.find_element(By.XPATH, xpath).text

    # The dict literal evaluates its values top to bottom, which keeps the
    # lookups in title -> desc -> price -> details -> url order.
    return {
        "title": text_of('.//span[@class="title"]'),
        "desc": text_of('.//span[@class="desc"]'),
        "price": text_of('.//span[@class="fiyat"]'),
        "details": [
            span.text
            for span in ad.find_element(
                By.XPATH, './/div[@class="other row between-xs middle-xs nowrap"]'
            ).find_elements(By.XPATH, './/span')
        ],
        "url": ad.find_element(
            By.XPATH, './/a[@class="main-emlak-link"]'
        ).get_attribute("href"),
    }

In [9]:
# One info dict per ad card found on the first page, in page order.
page_1_info = list(map(get_info, ads))

In [22]:
# Show only the first two records instead of dumping the whole page.
page_1_info[:2] 

[{'title': 'Modern Sea View Apartments with Spacious Terraces in Fuengirola',
  'desc': 'The sea-view apartments with large terraces are situated in a prestigious community in Fuengirola, Costa del Sol. The gated and secured complex has amazing facilities and social and sports clubs.',
  'price': 'FROM\n€435.000',
  'details': ['FUENGIROLA - MÁLAGA', '1, 2, 3', '1, 2'],
  'url': 'https://spainhomes.com/ad/agp-0732-new-build-apartments-with-sea-views-in-prime-area-of-fuengirola'},
 {'title': 'Spacious Villa with Panoramic Sea Views in Benalmadena',
  'desc': 'Villa with panoramic sea views is situated in Benalmadena, Costa del Sol. A spacious three-bedroom villa has a generous garden and a private swimming pool.',
  'price': '€1.395.000',
  'details': ['BENALMÁDENA - MÁLAGA', '3', '3'],
  'url': 'https://spainhomes.com/ad/agp-0785-villa-with-excellent-location-and-sea-view-in-benalmadena'}]

In [14]:
# End the browser session used to scrape the first page.
driver.quit()

In [12]:
# Output folder for the pickled scrape results. It is created (with any missing
# parents) on first run and left untouched when it already exists.
results_dir = pathlib.Path("results")
results_dir.mkdir(exist_ok=True, parents=True)

In [13]:
# Persist the first page's records. The context manager closes the file even
# when pickle.dump raises, which the manual open()/close() pair did not
# guarantee.
with open(results_dir/'malaga_page_1.pickle', 'wb') as file:
    pickle.dump(page_1_info, file)

### Clicking on the cookie banner

In [77]:
# Fresh browser session for the cookie-banner walkthrough: resolve the
# chromedriver location, start Chrome, maximise it and open `website`.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path))
driver.maximize_window()
driver.get(website)

In [5]:
def click_accept_cookies(driver, timeout=10):
    """Click the "yes" link of the cookie-consent banner.

    An immediate ``find_element`` right after ``driver.get`` is racy when the
    banner is rendered late (the notebook otherwise needs a fixed sleep before
    calling this). The button is therefore polled for until it is clickable.

    Parameters
    ----------
    driver
        Selenium WebDriver with the target page already requested.
    timeout : float, optional
        Maximum number of seconds to wait for the button. Defaults to 10.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException
        If the button does not become clickable within ``timeout`` seconds.
        This is the same exception type the plain ``find_element`` lookup
        raised, so existing handlers keep working.
    """
    # Function-scope imports keep this cell self-contained; selenium is
    # already a dependency of the notebook.
    from selenium.common.exceptions import NoSuchElementException, TimeoutException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    cookie_xpath = '//div[@class="cu-butons row middle-xs end-xs flex-wrap"]/a[@class="yes"]'
    try:
        button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, cookie_xpath))
        )
    except TimeoutException as exc:
        # Translate the wait timeout back to the lookup error callers expect.
        raise NoSuchElementException(
            "cookie accept button not clickable after {} s".format(timeout)
        ) from exc
    button.click()

In [6]:
# Accept the cookie banner in the browser session held in `driver`.
click_accept_cookies(driver)

### Getting other pages

In [32]:
def dump_page(i, page_info, results_dir=results_dir):
    """Pickle one page of scraped ads to ``<results_dir>/malaga_page_<i>.pickle``.

    Parameters
    ----------
    i
        Page number; interpolated into the file name.
    page_info
        Picklable object to store (the notebook passes a list of info dicts).
    results_dir : str or pathlib.Path, optional
        Target directory. The default is the module-level ``results_dir`` as
        it was bound when this cell was executed. The directory is created if
        it does not exist yet.

    Returns
    -------
    None
        The path that was written is printed.
    """
    # Accept plain strings as well as Path objects, and do not depend on an
    # earlier cell having created the folder.
    out_dir = pathlib.Path(results_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    filename = out_dir / f'malaga_page_{i}.pickle'
    # The context manager closes the handle even if pickling fails part-way.
    with open(filename, 'wb') as file:
        pickle.dump(page_info, file)
    print(filename)

In [24]:
def get_url(i):
    """Return the Málaga listing URL for page number ``i``.

    ``i`` is interpolated as-is into the ``page`` query parameter, followed by
    the literal ``.htm`` suffix.
    """
    # NOTE(review): "?page=N.htm" is an unusual query string; it is kept
    # unchanged here — confirm the site does not expect plain "?page=N".
    return f"https://spainhomes.com/real-estate/malaga?page={i}.htm"

In [25]:
# Page number to scrape next. NOTE: this rebinds `website`, which until now
# held the first-page URL.
i = 2
website = get_url(i)
website

'https://spainhomes.com/real-estate/malaga?page=2.htm'

In [26]:
# Close the browser left open by the cookie-banner section before starting a
# new one: rebinding `driver` without quitting would orphan that session.
driver.quit()

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
driver.get(website)
# Fixed pause before the next cells query the DOM (presumably to let
# late-rendered content such as the cookie banner appear).
time.sleep(3)

In [27]:
# Accept the cookie banner on the page just opened in `driver`.
click_accept_cookies(driver)

In [28]:
# Every <div class="right"> on the page is treated as one ad card; show how
# many were found.
ads = driver.find_elements(by=By.XPATH, value='//div[@class="right"]')
len(ads)

12

In [29]:
# One info dict per ad card found on page `i`, in page order.
page_i_info = list(map(get_info, ads))

In [33]:
# Pickle this page's records; dump_page prints the path it wrote.
dump_page(i, page_i_info)

results/malaga_page_2.pickle


In [40]:
# End the browser session opened for page `i`.
driver.quit()