# Idealista Web Scraping

## Goals and challenges

Before we start using Beautiful Soup, we need to explore Idealista's browsing mechanics. At first sight, Barcelona city has 15328 properties available. However, we can only see 60 pages per category, with 30 properties displayed per page. That means we can only access 30x60 (1800) properties if we choose the Barcelona category. To get around this restriction, we can subdivide our search into smaller groups of data. In this case, districts.

Barcelona consists of 10 districts:

In [1]:
#Initializing pandas and numpy
import numpy as np
import pandas as pd

In [2]:
# Loading the district and neighborhood tables from their CSV files
districts, neighborhoods = (
    pd.read_csv(csv_path) for csv_path in ("districts.csv", "neighborhoods.csv")
)

In [3]:
# Display the neighborhood table (head(74) returns at most the first 74 rows)
neighborhoods.head(74)

Unnamed: 0,district_id,district_name,neighborhood_id,neighborhood_name,idealista_link
0,0,Ciutat Vella,0,El Raval,https://www.idealista.com/en/venta-viviendas/b...
1,0,Ciutat Vella,1,El Gotic,https://www.idealista.com/en/venta-viviendas/b...
2,0,Ciutat Vella,2,La Barceloneta,https://www.idealista.com/en/venta-viviendas/b...
3,0,Ciutat Vella,3,Sant Pere Santa Caterina i la Ribera,https://www.idealista.com/en/venta-viviendas/b...
4,1,Eixample,4,El Fort Pienc,https://www.idealista.com/en/venta-viviendas/b...
...,...,...,...,...,...
68,9,Sant Marti,68,La Verneda i la Pau,https://www.idealista.com/en/venta-viviendas/b...
69,10,Badalona,69,Badalona,https://www.idealista.com/en/venta-viviendas/b...
70,11,Hospitalet de Llobregat,70,Hospitalet de Llobregat,https://www.idealista.com/en/venta-viviendas/h...
71,12,Santa Coloma de Gramanet,71,Santa Coloma de Gramanet,https://www.idealista.com/en/venta-viviendas/s...


Now that we have segmented the data into smaller groups and links, we can start scraping the data.

## Web scraping

### Obtaining Links

It's time to actually start web scraping. Let's import our libraries.

In [4]:
from bs4 import BeautifulSoup as bs
import proxy_script as ps
import time
import random
import urllib3
import concurrent.futures

# Turn off the warnings urllib3 would otherwise emit
urllib3.disable_warnings()

# Module-level list; presumably meant to hold the records built by
# get_error_page -- NOTE(review): confirm which code is expected to fill it
errorList = []

In [5]:
def get_soup(link):
    """Request *link* through ``ps.api_scrap`` and parse the response.

    The response body (``.content``) is parsed with the lxml parser and the
    resulting BeautifulSoup object is returned.
    """
    response = ps.api_scrap(link)
    return bs(response.content, 'lxml')
    

In [6]:
def get_page_ids(soup, n_id):
    """Collect the property ids listed on one results page.

    Args:
        soup: Parsed results page; must offer ``find`` and the container it
            returns must offer ``find_all`` (BeautifulSoup interface).
        n_id: Neighborhood id stored alongside every property id.

    Returns:
        A list of ``{"neighborhood_id": n_id, "property_id": <data-adid>}``
        dicts, one per ``<article class="item">`` found on the page.

    Raises:
        ValueError: If the page has no listing container. This makes the
            failure explicit instead of surfacing as an opaque
            "'NoneType' object has no attribute ..." error.
    """
    container = soup.find("section", class_="items-container items-list")
    if container is None:
        raise ValueError("listing container not found on results page")

    # find_all is the current spelling; findAll is a deprecated alias.
    return [
        {"neighborhood_id": n_id, "property_id": article.get("data-adid")}
        for article in container.find_all("article", class_="item")
    ]

In [7]:
def is_last_page(soup):
    """Return True when the page has no "next" pagination button.

    The page counts as the last one when ``soup.find("li", class_="next")``
    finds nothing. The result is compared with ``is None`` (identity) rather
    than ``!= None`` so the check does not depend on how the found element
    implements equality.
    """
    return soup.find("li", class_="next") is None

In [8]:
def get_error_page(n_id, page_index):
    """Describe a results page that could not be scraped.

    Returns a dict holding the neighborhood id and the page link, built from
    the neighborhood's ``idealista_link`` followed by
    ``pagina-<page_index>.htm``.
    """
    failed_link = neighborhoods["idealista_link"][n_id] + "pagina-{}.htm".format(page_index)
    return {"neighborhood_id": n_id, "page_link": failed_link}


In [9]:
def get_neighborhood_properties_ids(n_id, max_pages=60, max_retries=3):
    """Walk a neighborhood's result pages and collect every property id.

    Pages are requested one by one, starting at page 1, until a page without
    a "next" button is reached or ``max_pages`` pages have been visited.

    Args:
        n_id: Neighborhood id, used to look up the base link in
            ``neighborhoods["idealista_link"]``.
        max_pages: Highest page number to request. The loop is bounded by
            this value even when every request fails, so it always ends.
        max_retries: How many times a failing page is retried before it is
            given up on.

    Returns:
        A list of the dicts produced by ``get_page_ids`` for every page that
        was scraped successfully.

    Raises:
        KeyError: If ``n_id`` has no entry in ``neighborhoods``.

    Side effects:
        Pages that still fail after ``max_retries`` retries are appended (as
        built by ``get_error_page``) to the module-level ``errorList``, and
        the scan moves on to the next page. Progress is printed.
    """
    base_link = neighborhoods["idealista_link"][n_id]
    neighborhood_properties_ids = []
    page_index = 1
    page_errors = 0
    # To see progress while working
    print(n_id)
    while page_index <= max_pages:
        print(page_index)
        try:
            soup = get_soup(base_link + "pagina-{}.htm".format(page_index))
            page_ids = get_page_ids(soup, n_id)
            last_page = is_last_page(soup)
        except Exception as e:
            page_errors += 1
            print(f"an error has ocurred: {e}")
            if page_errors > max_retries:
                # Retries exhausted: record the page and skip to the next one.
                errorList.append(get_error_page(n_id, page_index))
                page_errors = 0
                page_index += 1
            # Waiting some time to avoid errors
            time.sleep(random.uniform(5, 10))
            continue

        # Only reached when the whole page was parsed, so a retry can never
        # add the same page twice.
        neighborhood_properties_ids.extend(page_ids)
        if last_page:
            break
        page_errors = 0
        page_index += 1

    return neighborhood_properties_ids

In [18]:
import threading
import queue

# Number of worker threads used to collect ids
MAX_THREADS = 20

# One separate result list per worker, addressed by the worker's index
thread_results = [list() for _ in range(MAX_THREADS)]

# Lock taken by the workers while they update thread_results
lock = threading.Lock()

In [19]:
# Work queue holding every neighborhood id that has to be scraped
q = queue.Queue()
for n_id in neighborhoods["neighborhood_id"]:
    q.put(n_id)

In [10]:
def get_all_ids(thread_index):
    """Worker loop: scrape neighborhoods taken from ``q`` until it is empty.

    Args:
        thread_index: Index of this worker's list in ``thread_results``.

    Each neighborhood id taken from the queue is passed to
    ``get_neighborhood_properties_ids`` and the ids it returns are added to
    ``thread_results[thread_index]`` while holding ``lock``. A neighborhood
    that raises is reported on stdout and skipped, so one failure does not
    stop the worker.
    """
    while True:
        try:
            # get_nowait() instead of empty() followed by get(): another
            # worker may take the last item between the two calls, and a
            # blocking get() would then wait forever.
            n_id = q.get_nowait()
        except queue.Empty:
            return

        try:
            result = get_neighborhood_properties_ids(n_id)
        except Exception as e:
            print(f"neighborhood {n_id} could not be scraped: {e}")
            continue

        with lock:
            thread_results[thread_index] += result
        time.sleep(random.uniform(1, 3))

In [None]:
# Start one worker per result list, then wait for all of them to finish so
# that thread_results is complete before anything reads it.
workers = [
    threading.Thread(target=get_all_ids, args=(i,)) for i in range(MAX_THREADS)
]
for worker in workers:
    worker.start()
for worker in workers:
    worker.join()

In [23]:
# Flatten the per-thread lists into a single list and save it as CSV
final_result = [entry for bucket in thread_results for entry in bucket]
df = pd.DataFrame(final_result)
df.to_csv("urls.csv", index=False)

### Scraping Houses

Once we have all the property links in Barcelona, we start scraping the features of each house.

In [10]:
# Table read back from urls.csv
ids = pd.read_csv("urls.csv")
# Prefix that gets a property id appended to form the page address
idealista_link = "https://www.idealista.com/en/"

In [11]:
import threading
import queue

# Number of worker threads used to scrape the houses
MAX_THREADS = 20

# One separate result list per worker, plus a list named for error records
thread_results = [list() for _ in range(MAX_THREADS)]
errors = list()

# Lock taken by the workers while they update thread_results
lock = threading.Lock()

# Work queue holding every property id that has to be scraped
q = queue.Queue()
for property_id in ids["property_id"]:
    q.put(property_id)

In [13]:
def _extract_house_features(soup, link):
    """Build the feature record of one property from its parsed page.

    Raises AttributeError when one of the expected elements is missing,
    because ``soup.find`` then returns None.
    """
    feats = [
        [feature.get_text(strip=True) for feature in feature_list.find_all("li")]
        for feature_list in soup.find_all("div", class_="details-property_features")
    ]
    return {
        "id": link,
        "title": soup.find("span", class_="main-info__title-main").get_text(),
        "neighborhood": soup.find("span", class_="main-info__title-minor").get_text(),
        "price": soup.find("span", class_="info-data-price").find("span", class_="txt-bold").get_text(),
        "info_features": [
            feature.get_text(strip=True)
            for feature in soup.find("div", class_="info-features").find_all("span")
        ],
        "features": feats,
        "description": soup.find("div", class_="comment").find("p").get_text(strip=True),
    }


def scrap_house(thread_index, max_attempts=3):
    """Worker loop: scrape the properties taken from ``q`` until it is empty.

    Args:
        thread_index: Index of this worker's list in ``thread_results``.
        max_attempts: How many times a property is tried before it is given
            up on.

    Every property id taken from the queue is requested at
    ``idealista_link + str(id)`` and parsed. A successful record is appended
    to ``thread_results[thread_index]``. An id that fails ``max_attempts``
    times is appended to the module-level ``errors`` list instead of being
    dropped silently. Both lists are updated while holding ``lock``.
    """
    while True:
        try:
            # get_nowait() instead of empty() followed by get(): another
            # worker may take the last item between the two calls, and a
            # blocking get() would then wait forever.
            link = q.get_nowait()
        except queue.Empty:
            return

        soup_link = idealista_link + str(link)
        # Retries happen here, independent of the queue state, so a pending
        # retry is not lost when the other workers drain the queue.
        for _ in range(max_attempts):
            print(link)
            try:
                record = _extract_house_features(get_soup(soup_link), link)
            except Exception as e:
                print(f"an error has ocurred: {e}")
            else:
                with lock:
                    thread_results[thread_index].append(record)
                break
            finally:
                # Runs after every attempt, including just before the break.
                time.sleep(random.uniform(1, 3))
        else:
            # Every attempt failed: remember the id.
            with lock:
                errors.append(link)

In [None]:
# Start one worker per result list, then wait for all of them to finish so
# that thread_results is complete before anything reads it.
workers = [
    threading.Thread(target=scrap_house, args=(i,)) for i in range(MAX_THREADS)
]
for worker in workers:
    worker.start()
for worker in workers:
    worker.join()
    

In [29]:
# Flatten the per-thread lists into a single list and save it as CSV
final_result = [entry for bucket in thread_results for entry in bucket]
df = pd.DataFrame(final_result)
df.to_csv("features_plus.csv", index=False)

