<a href="https://colab.research.google.com/github/WittmannF/dev-udemy-deep-learning/blob/master/1_b_Coletar_Dados_Web_Scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Imports
import pandas as pd
import numpy as np
import re
from time import sleep
import requests
from bs4 import BeautifulSoup

In [0]:
# Parameters
# Zone names substituted into the first placeholder of URL_PADRAO.
ZONAS = ['norte', 'sul', 'leste', 'oeste']
# Page suffixes substituted into the second placeholder ('' yields the bare zone URL).
PAGINAS = ['', '_Desde_49', '_Desde_97', '_Desde_145']
URL_PADRAO = 'https://imoveis.mercadolivre.com.br/casas/aluguel/sao-paulo/sao-paulo-zona-{}/{}'

# Regex patterns
# Raw strings: `\$` and `\|` are regex escapes, not valid Python string escapes,
# so a plain literal triggers an invalid-escape warning on recent interpreters.
# Captures (price, area) from text shaped like "R$ <price>  <area> m²".
RE_PRECO_M2 = r'R\$ (.*)  (.*) m²'
# Captures the text between " | " and " quarto" (the bedroom count).
RE_QUARTO = r' \| (.*) quarto'

In [0]:
# Funcoes
def pegar_codigos_fontes(timeout=30, pausa=3):
    """Download the raw body of every zone/page listing URL.

    One URL is built from URL_PADRAO for each combination of ZONAS and
    PAGINAS, iterating zones in the outer loop and pages in the inner loop,
    so the returned list keeps that order.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled server would block the notebook indefinitely.
        pausa: Seconds to sleep after each request.

    Returns:
        list: The ``.content`` of each response, in request order.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    codigo_fontes = []
    for zona in ZONAS:
        for pg in PAGINAS:
            url = URL_PADRAO.format(zona, pg)
            print(f'Carregando URL: {url}')
            codigo_fonte = requests.get(url, timeout=timeout).content
            codigo_fontes.append(codigo_fonte)
            # Pause between consecutive requests.
            sleep(pausa)
    return codigo_fontes

def html_parse(codigo_fontes):
    """Parse every raw HTML document with BeautifulSoup's 'html.parser'.

    Args:
        codigo_fontes: Iterable of HTML documents.

    Returns:
        list: One BeautifulSoup object per input document, in the same order.
    """
    return [BeautifulSoup(html, 'html.parser') for html in codigo_fontes]

def pegar_resultados(codigo_fontes_bs):
    """Collect the listing ``<li>`` elements from every parsed page.

    Args:
        codigo_fontes_bs: Iterable of parsed pages exposing ``find_all``.

    Returns:
        list: All matching elements, flattened into one list in page order.
    """
    all_results = []
    for soup in codigo_fontes_bs:
        # `find_all` is the documented name; `findAll` is only a legacy alias.
        result = soup.find_all("li", {"class": "results-item highlighted article grid "})
        all_results.extend(result)
    return all_results

def extrair_atributos(all_results):
    """Build a DataFrame of zone, bedrooms, area and price from listing elements.

    Each element's ``.text`` is matched against RE_PRECO_M2 (price and area)
    and RE_QUARTO (bedroom count). Elements without a price/area match are
    reported and skipped.

    Args:
        all_results: Sequence of elements exposing ``.text``, ordered zone by zone.

    Returns:
        pandas.DataFrame: Columns 'zona', 'quartos', 'area' and 'preco'. The
        extracted values are left as strings; a missing bedroom count is
        stored as the string 'NaN'.
    """
    # Derive the per-zone share from ZONAS rather than a literal 4, so the
    # labelling keeps working when the zone list changes.
    n_per_zonas = len(all_results) // len(ZONAS) if ZONAS else 0

    # NOTE(review): this assumes every zone contributed the same number of
    # results. If they differ, labels drift, and zip() below drops any
    # remainder at the end of all_results -- confirm against the scraped pages.
    all_zones = []
    for z in ZONAS:
        all_zones.extend([z] * n_per_zonas)

    dados = {'zona': [], 'quartos': [], 'area': [], 'preco': []}

    for r, zona in zip(all_results, all_zones):
        texto = r.text
        try:
            # findall returns [] when the pattern is absent, so [0] raises IndexError.
            preco, area = re.findall(RE_PRECO_M2, texto)[0]
        except IndexError as e:
            print("Erro, pulando elemento.")
            print("O seguinte erro foi retornado: ")
            print(e)
            continue

        quartos_encontrados = re.findall(RE_QUARTO, texto)
        # Placeholder string for listings that do not state a bedroom count.
        n_quartos = quartos_encontrados[0] if quartos_encontrados else 'NaN'

        dados['zona'].append(zona)
        dados['quartos'].append(n_quartos)
        dados['area'].append(area)
        dados['preco'].append(preco)

    df = pd.DataFrame(dados)
    return df

def converter_tipos(df):
    """Convert the 'quartos', 'area' and 'preco' columns to float, in place.

    Safe to call again on an already converted frame: price values that are
    not strings are passed through ``float`` unchanged, so re-running the
    notebook cell no longer fails with AttributeError.

    Args:
        df: DataFrame with 'quartos', 'area' and 'preco' columns.

    Returns:
        pandas.DataFrame: The same ``df`` object, with the three columns as float.
    """
    def _preco_para_float(valor):
        # Scraped prices are strings using '.' as a thousands separator ("1.100").
        if isinstance(valor, str):
            return float(valor.replace('.', ''))
        return float(valor)

    df['quartos'] = df['quartos'].astype(float)
    df['area'] = df['area'].astype(float)
    # The trailing astype keeps the dtype float even for an empty frame.
    df['preco'] = df['preco'].apply(_preco_para_float).astype(float)
    return df


In [0]:
# Execution

# 1. Fetch the raw page source of every zone/page URL
codigos_fontes = pegar_codigos_fontes()

Carregando URL: https://imoveis.mercadolivre.com.br/casas/aluguel/sao-paulo/sao-paulo-zona-norte/
Carregando URL: https://imoveis.mercadolivre.com.br/casas/aluguel/sao-paulo/sao-paulo-zona-norte/_Desde_49
Carregando URL: https://imoveis.mercadolivre.com.br/casas/aluguel/sao-paulo/sao-paulo-zona-norte/_Desde_97
Carregando URL: https://imoveis.mercadolivre.com.br/casas/aluguel/sao-paulo/sao-paulo-zona-norte/_Desde_145
Carregando URL: https://imoveis.mercadolivre.com.br/casas/aluguel/sao-paulo/sao-paulo-zona-sul/
Carregando URL: https://imoveis.mercadolivre.com.br/casas/aluguel/sao-paulo/sao-paulo-zona-sul/_Desde_49
Carregando URL: https://imoveis.mercadolivre.com.br/casas/aluguel/sao-paulo/sao-paulo-zona-sul/_Desde_97
Carregando URL: https://imoveis.mercadolivre.com.br/casas/aluguel/sao-paulo/sao-paulo-zona-sul/_Desde_145
Carregando URL: https://imoveis.mercadolivre.com.br/casas/aluguel/sao-paulo/sao-paulo-zona-leste/
Carregando URL: https://imoveis.mercadolivre.com.br/casas/aluguel/sao-

In [0]:
# 2. Parse each page source with Beautiful Soup
codigo_fontes_bs = html_parse(codigos_fontes)

In [0]:
# 3. Collect the listing elements from every parsed page
all_results = pegar_resultados(codigo_fontes_bs)

In [0]:
# 4. Extract the attributes of each listing into a DataFrame
df = extrair_atributos(all_results)

Erro, pulando elemento.
O seguinte erro foi retornado: 
list index out of range


In [0]:
# 5. Convert the attribute columns to numeric types
df = converter_tipos(df)

In [0]:
# Preview the first rows of the converted DataFrame
df.head()

Unnamed: 0,zona,quartos,area,preco
0,norte,1.0,40.0,1100.0
1,norte,2.0,90.0,1100.0
2,norte,1.0,50.0,700.0
3,norte,1.0,70.0,600.0
4,norte,2.0,90.0,1200.0


In [0]:
!pip install easycolab
import easycolab as ec
ec.mount()

Collecting easycolab
  Downloading https://files.pythonhosted.org/packages/7a/ff/017693c8f12c9b586d2bc9965ebed05b8aa1d2ba0ec9f1d88df9f0a70542/easycolab-0.1b29.tar.gz
Building wheels for collected packages: easycolab
  Building wheel for easycolab (setup.py) ... [?25l[?25hdone
  Created wheel for easycolab: filename=easycolab-0.1b29-cp36-none-any.whl size=4472 sha256=a3d0b0b59f0213f809718c692dd2989ac2ad81ca6d716570be5123ca3f5698a5
  Stored in directory: /root/.cache/pip/wheels/1b/c3/03/9d9371c4d3a117eff9caf88148e2f18ddf556543b4475055f1
Successfully built easycolab
Installing collected packages: easycolab
Successfully installed easycolab-0.1b29
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3

In [0]:
mkdir 'CalculadoraImoveis'

In [0]:
cd 'CalculadoraImoveis'

/content/drive/My Drive/CalculadoraImoveis


In [0]:
# Save the dataset as CSV in the current working directory, without the index column
df.to_csv('dados_calculadora_imoveis.csv', index=False)