##### Import libs

In [1]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
from decouple import config
from collections import defaultdict
import time
# Settings read through python-decouple's config(); per the original note
# they come from the .env file.
#   URL_PRINCIPAL : URL requested by mount_dataframe_with_ids.
#   URL_DETAILS   : URL prefix to which scrapper_ appends each property id.
#   ROOT_PATH + DATA_RAW_PATH : prefix of the CSV files written below.
URL_PRINCIPAL = config('URL_PRINCIPAL')
URL_DETAILS = config('URL_DETAILS')
ROOT_PATH = config('ROOT_PATH')
DATA_RAW_PATH = config('DATA_RAW_PATH')

In [2]:
def mount_dataframe_with_ids (URL_PRINCIPAL:str, timeout:float=30)->pd.DataFrame :
    '''
        This method receives an API URL that already contains the desired
        filter, and returns a dataframe with the ids, latitude and longitude
        of the houses/apartments listed on quintoandar.com.br .

        Parameters :
            URL_PRINCIPAL : URL of the API, containing the desired filter.
            timeout : Seconds to wait for the server before giving up, so a
                      stalled connection cannot hang the notebook forever.

        Ex :
            mount_dataframe_with_ids(URL_PRINCIPAL)
        Returns :
            One row per entry of data['hits']['hits'], e.g.
                _index  _type   _id         _score  _source
            0   house   _doc    892986759   0.0     {'location': {'lon': -46.6424942, 'lat': -23.6...
            1   house   _doc    893564058   0.0     {'location': {'lon': -46.6420724, 'lat': -23.5...
            2   house   _doc    893324382   0.0     {'location': {'lon': -46.6649749, 'lat': -23.6...
        Raises :
            requests.HTTPError : If the API answers with an error status.
            requests.Timeout : If the server does not answer within `timeout`.
            KeyError : If the payload has no data['hits']['hits'] entry.
    '''
    response = requests.get(URL_PRINCIPAL, timeout=timeout)
    # Fail with the real HTTP error instead of a confusing JSON/KeyError
    # raised later while parsing an error page.
    response.raise_for_status()
    data = response.json()

    return pd.DataFrame.from_dict(data['hits']['hits'])

In [3]:
def remove_dict_of_column (dataframe:pd.DataFrame)->tuple[list, list] :
    '''
        This method extracts the longitude and latitude stored in the dicts
        of the '_source' column and returns them as two lists.

        Each '_source' cell is read as source['location']['lon'] and
        source['location']['lat'].

        Parameters :
            dataframe : A dataframe with a '_source' column of dicts.
        Returns :
            (list_long, list_lat) : longitudes and latitudes, in row order.
        Raises :
            KeyError : If a cell lacks 'location', 'lon' or 'lat'.
    '''
    # An empty frame may have no columns at all; there is nothing to extract.
    if len(dataframe) == 0 :
        return [], []

    # Iterate over the values rather than looking rows up by label, so the
    # result follows row order even when the index is filtered or reordered.
    locations = [source['location'] for source in dataframe['_source']]
    list_long = [location['lon'] for location in locations]
    list_lat = [location['lat'] for location in locations]
    return list_long, list_lat

In [4]:
def process_dataframe_with_ids(dataframe:pd.DataFrame)->pd.DataFrame:
    '''
        This method flattens the coordinates into 'lon' and 'lat' columns,
        drops the columns that are no longer needed, renames '_index' to
        'type' and '_id' to 'id', and saves the result as a CSV file.

        The dataframe is modified in place and the same object is returned.

        Parameters :
            dataframe: The dataframe to process.
        Returns :
            The processed dataframe.
    '''
    longitudes, latitudes = remove_dict_of_column(dataframe)
    dataframe['lon'] = longitudes
    dataframe['lat'] = latitudes

    # Tidy the frame: remove raw columns, then give the rest friendly names.
    dataframe.drop(columns=['_source','_type','_score'],inplace=True)
    dataframe.rename(columns={'_index':'type','_id':'id'},inplace=True)

    output_path = ROOT_PATH+DATA_RAW_PATH+'dataframe_ids.csv'
    dataframe.to_csv(output_path,index=False)
    return dataframe

In [5]:
def scrapper_ (dataframe:pd.DataFrame, column_id:str, timeout:float=30)-> pd.DataFrame:
    '''
        This method receives a dataframe and the name of its id column, and
        scrapes the details of every id from the page at URL_DETAILS + id.
        The details are read from the JSON inside the page element whose id
        is '__NEXT_DATA__'. Ids whose page cannot be fetched or parsed are
        reported and skipped. The result is also saved as a CSV file.

        Parameters :
            dataframe : A dataframe that contains the ids of the properties.
            column_id : Name of the column that holds those ids.
            timeout : Seconds to wait for each page before skipping the id.
        Returns :
            A dataframe with one row per property that was scraped.
    '''
    # An empty frame may lack the id column; there is nothing to scrape then.
    property_ids = dataframe[column_id] if len(dataframe) else []

    results = []
    # Iterate over the values so the frame's index labels do not matter.
    for property_id in property_ids :
        try :
            # f-string instead of '+' so numeric ids (e.g. a frame read back
            # from CSV) build a valid URL instead of raising TypeError.
            response = requests.get(f'{URL_DETAILS}{property_id}', timeout=timeout)
            soup = BeautifulSoup(response.content,'html.parser')
            dict_details = json.loads(soup.find(id='__NEXT_DATA__').string)
            result = dict_details['props']['pageProps']['initialState']['house']['houseInfo']
        except Exception :
            # Best effort: skip this id. Unlike a bare except, this lets
            # KeyboardInterrupt/SystemExit stop a long-running scrape.
            print(f'Index {property_id} not found.')
            continue
        results.append(result)
        print(f'Getting index {property_id}.')

    # Build the frame once and reuse it for both the CSV and the return value.
    dataframe_properties = pd.DataFrame(results)
    dataframe_properties.to_csv(ROOT_PATH+DATA_RAW_PATH+'dataframe_raw_properties.csv',index=False)
    return dataframe_properties
        
        

In [6]:
# Step 1: fetch the raw search hits (ids and coordinates) from the API.
data_tmp = mount_dataframe_with_ids(URL_PRINCIPAL=URL_PRINCIPAL)
# Step 2: flatten the coordinates, tidy the columns and save the ids as CSV.
data_tmp = process_dataframe_with_ids(dataframe=data_tmp)
# Step 3: scrape the details of every property id into a new dataframe.
data_ = scrapper_(dataframe=data_tmp, column_id='id')

Getting index 893528710.
Getting index 892928127.
Getting index 893337768.
Getting index 893221815.
Getting index 893382732.
Getting index 893304559.
Getting index 893626403.
Getting index 893447986.
Getting index 893090155.
Getting index 893601362.
Getting index 893569518.
Getting index 893208522.
Getting index 893566291.
Getting index 892992665.
Getting index 893647695.
Getting index 893625458.
Getting index 893540483.
Getting index 893625515.
Getting index 893559378.
Getting index 893485590.
Getting index 893528709.
Getting index 892879258.
Getting index 893643704.
Getting index 893657269.
Getting index 893285200.
Getting index 893657371.
Getting index 893113259.
Getting index 893262856.
Getting index 893231214.
Getting index 893262688.
Getting index 892911477.
Getting index 893462486.
Getting index 892976330.
Getting index 893344135.
Getting index 893576268.
Getting index 893167856.
Getting index 892864102.
Getting index 892892725.
Getting index 893306038.
Getting index 893342231.
