## Importing Useful Libs

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup

## Auxiliary code

In [2]:
def extract_numbers(text):
    """Return an int made of every decimal digit found in ``text``, in order.

    Examples: ``'3 quartos'`` -> ``3``; ``'R$ 1.080.980'`` -> ``1080980``;
    ``'86m²'`` -> ``86``.

    ``str.isdecimal`` is used instead of ``str.isdigit`` because ``isdigit``
    also accepts characters such as the superscript ``²``, which ``int()``
    cannot parse.

    Raises:
        ValueError: if ``text`` contains no decimal digit.
    """
    return int(''.join(ch for ch in text if ch.isdecimal()))

In [3]:
# maps the selling status text found in an ad to its English label
selling_status_translation = {
    'À venda': 'For sale',
    'Para alugar': 'For rent',
}

# Defining General Parameters

How many pages do we want to scrape?

In [4]:
num_of_pages = 50  # number of listing pages requested by the scraping loop

User-Agent header sent with every request so that we get access to the site's HTML

In [5]:
# sensitive data: HTTP headers passed to every requests.get call
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
}

# Creating DataFrame

Creating the dataframe in which we will register the extracted data

In [6]:
# empty frame: one column per attribute stored for each scraped ad
housing_df = pd.DataFrame(
    columns=[
        'Title', 'Status', '# Rooms', 'Area', '# Car Spots', 'Condominium Fee',
        'City', 'Neighborhood', 'Price', 'Description', 'URL',
    ]
)

# Scraping Script

In [7]:
i = 0  # index for inserting rows in the DataFrame
skipped_ads = 0  # ads dropped because an expected element was missing or malformed
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block indefinitely

for page in range(num_of_pages):

    # site's html converted to text
    # NOTE(review): `page` starts at 0 -- confirm whether the site's `o` parameter is
    # 0- or 1-based; if it is 1-based, o=0 and o=1 may return the same listing page.
    try:
        html_text = requests.get(f'https://pe.olx.com.br/grande-recife/imoveis?o={page}',
                                 headers=headers, timeout=REQUEST_TIMEOUT).text
    except requests.RequestException as err:
        # a failed page should not abort the whole run; report it and move on
        print(f'Page {page+1}/{num_of_pages} skipped: {err}')
        continue

    # main frame of the site
    main_frame = BeautifulSoup(html_text, 'html.parser').find('div', class_='sc-1fcmfeb-0 FBZzf')
    if main_frame is None:
        # find() returns None when the container is absent (layout change, blocked request, ...)
        print(f'Page {page+1}/{num_of_pages} skipped: listing container not found')
        continue

    # getting all the ads
    ads = main_frame.find_all('li', class_='sc-1fcmfeb-2 fvbmlV')

    for ad in ads:

        # this try statement prevents errors when we try to find certain attributes in
        # non-house related ads and also deals with bad HTML setups; only the lookup
        # failures that such ads produce are caught, so real problems
        # (KeyboardInterrupt, programming errors, ...) still surface
        try:
            # finding the title (commented out, possibly sensitive data)
            title = 'House_AD'
            # title = ad.find('a')['title']

            # extracting general info about the house
            general_info = ad.find('div', class_='fnmrjs-6 iNpuEh').span['title'].split(' | ')
            status, rooms, area, spots, fee = [np.nan] * 5

            for info in general_info:
                if 'venda' in info or 'alugar' in info:  # selling status
                    status = selling_status_translation[info]
                elif 'quartos' in info:  # number of rooms
                    rooms = extract_numbers(info)
                elif 'm²' in info:  # total area, kept as text
                    area = info
                elif 'vagas' in info:  # number of car spots
                    spots = extract_numbers(info)
                elif 'Cond' in info:  # condominium fee
                    fee = info.split(': ')[-1]

            # getting localization info
            loc = ad.find('div', class_='sc-7l84qu-0 gmtqTp').span['title'].split(', ')

            city = loc[0]  # city
            neighborhood = loc[1]  # neighborhood

            # price in BRL
            price = ad.find('div', class_='aoie8y-0 hRScWw').span['aria-label']
            price = price.split(': ')[-1]

            # url to the actual advertisement, since we are scraping OLX's main frame
            ad_url = ad.find('a')['href']
        except (AttributeError, TypeError, KeyError, IndexError, ValueError):
            skipped_ads += 1
            continue

        # complete description of the advertisement (replaced by a placeholder, possibly
        # sensitive data). The real text lives on the ad's own page (ad_url); that page is
        # not downloaded here because its content was never used, which saves one HTTP
        # request per ad. To restore it, fetch ad_url and read:
        # span with class 'sc-1sj3nln-1 eOSweo sc-ifAKCX cmFKIN'
        desc = f'House in {neighborhood}, {city}'

        # the insert is outside the try so that a failure here is not silently hidden
        housing_df.loc[i] = [title, status, rooms, area, spots, fee, city, neighborhood, price, desc, ad_url]
        i += 1  # we increment the index only when we correctly insert data in the dataframe

    print(f'Done {page+1}/{num_of_pages} instances')

print(f'Skipped {skipped_ads} ads with missing or unexpected fields')

Done 1/50 instances
Done 2/50 instances
Done 3/50 instances
Done 4/50 instances
Done 5/50 instances
Done 6/50 instances
Done 7/50 instances
Done 8/50 instances
Done 9/50 instances
Done 10/50 instances
Done 11/50 instances
Done 12/50 instances
Done 13/50 instances
Done 14/50 instances
Done 15/50 instances
Done 16/50 instances
Done 17/50 instances
Done 18/50 instances
Done 19/50 instances
Done 20/50 instances
Done 21/50 instances
Done 22/50 instances
Done 23/50 instances
Done 24/50 instances
Done 25/50 instances
Done 26/50 instances
Done 27/50 instances
Done 28/50 instances
Done 29/50 instances
Done 30/50 instances
Done 31/50 instances
Done 32/50 instances
Done 33/50 instances
Done 34/50 instances
Done 35/50 instances
Done 36/50 instances
Done 37/50 instances
Done 38/50 instances
Done 39/50 instances
Done 40/50 instances
Done 41/50 instances
Done 42/50 instances
Done 43/50 instances
Done 44/50 instances
Done 45/50 instances
Done 46/50 instances
Done 47/50 instances
Done 48/50 instances
D

# Visualizing and Saving DataSet

In [8]:
# show the scraped rows as the cell's output
housing_df

Unnamed: 0,Title,Status,# Rooms,Area,# Car Spots,Condominium Fee,City,Neighborhood,Price,Description,URL
0,House_AD,For rent,3,86m²,2,R$ 0,Recife,Boa Viagem,R$ 4.000.,"House in Boa Viagem, Recife",https://pe.olx.com.br/grande-recife/imoveis/ap...
1,House_AD,,,600m²,,R$ 0,Camaragibe,Aldeia dos Camarás,R$ 220.000.,"House in Aldeia dos Camarás, Camaragibe",https://pe.olx.com.br/grande-recife/terrenos/t...
2,House_AD,For sale,3,97m²,2,,Recife,Dois Irmãos,R$ 777.000.,"House in Dois Irmãos, Recife",https://pe.olx.com.br/grande-recife/imoveis/vm...
3,House_AD,For sale,4,134m²,3,R$ 655,Jaboatão dos Guararapes,Candeias,R$ 940.000.,"House in Candeias, Jaboatão dos Guararapes",https://pe.olx.com.br/grande-recife/imoveis/ap...
4,House_AD,For sale,4,124m²,2,R$ 0,Recife,Graças,R$ 1.080.980.,"House in Graças, Recife",https://pe.olx.com.br/grande-recife/imoveis/ap...
...,...,...,...,...,...,...,...,...,...,...,...
1729,House_AD,For sale,4,346m²,4,R$ 450,Gravatá,Alpes Suiços,R$ 1.600.000.,"House in Alpes Suiços, Gravatá",https://pe.olx.com.br/grande-recife/imoveis/ca...
1730,House_AD,For sale,3,306m²,,R$ 0,Gravatá,Nossa Senhora das Graças,R$ 190.000.,"House in Nossa Senhora das Graças, Gravatá",https://pe.olx.com.br/grande-recife/imoveis/ca...
1731,House_AD,For sale,2,57m²,,R$ 268,Recife,Santo Amaro,R$ 458.000.,"House in Santo Amaro, Recife",https://pe.olx.com.br/grande-recife/imoveis/ed...
1732,House_AD,For rent,,35m²,,R$ 0,Recife,Boa Viagem,R$ 1.850.,"House in Boa Viagem, Recife",https://pe.olx.com.br/grande-recife/imoveis/fl...


In [9]:
# save df; index_label=False writes the index values without a header field for them
housing_df.to_csv(
    '../DataSets/OLX_Housing.csv',
    index_label=False,
)