In [None]:
!pip install cloudscraper

In [None]:
from bs4 import BeautifulSoup
import re
import pandas as pd
import cloudscraper
from collections import defaultdict
from datetime import date
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

![](https://s2.glbimg.com/FvxyE6pbsWUIG8kOnd-ceX_db8w=/620x430/e.glbimg.com/og/ed/f/original/2017/08/30/saopaulo.jpg)

# Main code to web-scrape and generate São Paulo apartment prices

**Notes about this code**

There are still some changes I need to make. The first one is a bug that sometimes occurs with ParkingSpaces, where some values are not recognized. When that happens, we can't create a DataFrame and upload the dataset. I fixed it manually the first time (only 2 values weren't recognized, so it didn't take much time, and it rarely happens), but I intend to add a function that verifies each iteration, checking the array sizes to ensure all variables are filled. This will help prevent bugs in the future.

Furthermore, I'll set up the code to run every month to generate the dataset. Since I'm a real estate enthusiast and have plans to live in São Paulo someday, this will be a valuable tool.

Other upgrades include adding features such as CEP (a postal code that identifies the address location), neighborhood quality, and more.

In [None]:
# HTTP client configured to present itself as desktop Chrome on Windows.
scraper = cloudscraper.create_scraper(
    browser={
        "browser": "chrome",
        "platform": "windows",
        "desktop": True,
    }
)

# Patterns applied to the text of each result page.
price_r = r'"mainValue"\s*[:]\s*(\d+)\s*'
empty_r = r'"emptyValue":(true|false)'
id_r = r'"id":"\s*(\d+)"'
area_r = r'"usableAreas":"([\d\s\-]+)"'
bedrooms_r = r'"bedrooms":"([\d\s\-]+)"'
bathrooms_r = r'"bathrooms":"([\d\s\-]+)"'
parkingSpaces_r = r'"parkingSpaces":"([\d\s\-]+)"'
created_date_r = r'"createdDate":"([\d\-T:Z]+)"'
address_r = r'"(city|streetNumber|stateAcronym|street|neighborhood)":"([^"]+)"'
below_price_r = r'"belowPrice":(true|false)'

# A response containing this text is treated as a page without listings.
search_text = 'Não encontramos resultados para a busca'

# One accumulator per output column; every accepted page extends all of them.
ids = []
created_dates = []
prices = []
below_prices = []
areas = []
addresses = []
bedrooms = []
bathrooms = []
parkingSpaces = []
loc = []  # addresses of the most recently parsed page

# `data` holds references to the accumulators above, so it always reflects
# what has been collected so far and has every column even if no page is kept.
data = {
    'ID': ids,
    'created_date': created_dates,
    'Price': prices,
    'below_price': below_prices,
    'Area': areas,
    'Adress': addresses,
    'Bedrooms': bedrooms,
    'Bathrooms': bathrooms,
    'Parking_Spaces': parkingSpaces,
}


def _extract_addresses(text):
    """Return the formatted addresses found in *text*, in order of appearance."""
    pairs = [p for p in re.findall(address_r, text) if p != ('city', 'sao-paulo')]

    found = []
    bloco = defaultdict(str)  # address parts gathered for the current listing
    for chave, valor in pairs:
        bloco[chave] = valor

        # An address is emitted as soon as these three parts are known;
        # street and streetNumber default to '' when they were not seen.
        if {'city', 'stateAcronym', 'neighborhood'}.issubset(bloco.keys()):
            full_address = (
                f"{bloco['street']} {bloco['streetNumber']}, "
                f"{bloco['neighborhood']} - {bloco['city']}/{bloco['stateAcronym']}"
            ).strip(", ")
            found.append(full_address)
            bloco.clear()
    return found


def _extract_prices(text):
    """Return ``(prices, below_price_flags)`` with one entry per "emptyValue" match.

    Prices are the digit strings captured by ``price_r``; an entry whose
    "emptyValue" is 'true', or that has no matching "mainValue", gets the
    integer 0 and the flag 'false'.
    """
    prices_raw = re.findall(price_r, text)
    below_prices_raw = re.findall(below_price_r, text)
    empty_values = re.findall(empty_r, text)

    page_prices = []
    page_below_prices = []
    for i, is_empty in enumerate(empty_values):
        if is_empty == 'true' or i >= len(prices_raw):
            page_prices.append(0)
            page_below_prices.append('false')
        else:
            page_prices.append(prices_raw[i])
            # Fall back to 'false' instead of raising IndexError when the page
            # has fewer "belowPrice" matches than "mainValue" matches.
            page_below_prices.append(
                below_prices_raw[i] if i < len(below_prices_raw) else 'false'
            )
    return page_prices, page_below_prices


# Maximum of 100 pages
for pag in range(1, 2): ## The maximum value here is 101, I used 101 to generate the database
    for rooms in range(1, 3): ## Maximum value is 5
        for baths in range(1, 3): ## Maximum value is 5
            for garage in range(1, 3): ## Maximum value is 5
                url = f"https://www.zapimoveis.com.br/venda/apartamentos/sp+sao-paulo/?__ab=sup-hl-pl:newC,exp-aa-test:control,super-high:new,off-no-hl:new,pos-zap:new,zapproppos:new,nlb-ldp:control,ltroffline:control&transacao=venda&tipos=apartamento_residencial&pagina={pag}&banheiros={baths}&quartos={rooms}&vagas={garage}"

                r = scraper.get(url)
                last_page = search_text in r.text

                if r.status_code != 200 or last_page:
                    continue

                page = BeautifulSoup(r.text, 'html5lib')
                # Extract the text once; every pattern below runs on the same string.
                page_text = page.text

                loc = _extract_addresses(page_text)
                page_prices, page_below_prices = _extract_prices(page_text)

                page_columns = {
                    'ID': re.findall(id_r, page_text),
                    'created_date': re.findall(created_date_r, page_text),
                    'Price': page_prices,
                    'below_price': page_below_prices,
                    'Area': re.findall(area_r, page_text),
                    'Adress': loc,
                    'Bedrooms': re.findall(bedrooms_r, page_text),
                    'Bathrooms': re.findall(bathrooms_r, page_text),
                    'Parking_Spaces': re.findall(parkingSpaces_r, page_text),
                }

                # pandas needs all columns to have the same length. A page where
                # some field was not recognised is skipped so that it cannot
                # misalign the rows or break the DataFrame construction later.
                sizes = {column: len(values) for column, values in page_columns.items()}
                if len(set(sizes.values())) > 1:
                    print(f"Skipping page with inconsistent field counts {sizes}: {url}")
                    continue

                for column, values in page_columns.items():
                    data[column].extend(values)

# Build the dataset from the scraped columns.
df = pd.DataFrame(data)

# Stamp every row with the extraction date. The date object is stored as-is;
# the former `today.strftime('%Y-%m-%d')` call discarded its result and has
# been removed.
today = date.today()

df['extract_date'] = today

## Getting latitude and longitude
geolocator = Nominatim(user_agent="my_geocoder")

def geocode_address(address):
    """Look up *address* with the module-level ``geolocator``.

    Returns a two-element ``pd.Series`` ``[latitude, longitude]``. Both
    values are None when the geocoder finds no match or when the lookup
    fails with a geopy service error (timeout, service unavailable,
    quota/rate limit, ...), so a single failed request does not abort a
    whole ``DataFrame.apply`` run.
    """
    # GeocoderServiceError is the base class of geopy's service failures,
    # GeocoderTimedOut included; imported here to keep the block self-contained.
    from geopy.exc import GeocoderServiceError

    try:
        location = geolocator.geocode(address, addressdetails=True, timeout=10)
    except (GeocoderTimedOut, GeocoderServiceError) as exc:
        # Report the failure instead of hiding it, then fall back to "no result".
        print(f"Geocoding failed for {address!r}: {exc}")
        return pd.Series([None, None])

    if location:
        return pd.Series([location.latitude, location.longitude])
    return pd.Series([None, None])

# Geocode every address; each call yields a [latitude, longitude] pair.
coordinates = df["Adress"].apply(geocode_address)
df[["Latitude", "Longitude"]] = coordinates

# df.to_csv('file.csv', index=False)

## Removing rows for apartments that are on pre-sale or under construction
## (their Area/Bedrooms/Bathrooms/Parking_Spaces values contain a '-').
## If you want to check opportunities to buy before the apartments are built, you shouldn't run the lines below

range_columns = ['Area', 'Bedrooms', 'Bathrooms', 'Parking_Spaces']
has_dash = df[range_columns].apply(lambda column: column.str.contains('-', regex=False))
rows_with_dash = has_dash.any(axis=1)
df = df[~rows_with_dash]

df

# Thanks for reading this far! I hope this code helps, and if you have some time, feel free to share feedback or ideas!

If you have any questions, feel free to ask me!