##### Import libs

In [3]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
from decouple import config
from collections import defaultdict
import time
# Read the API URL from the project settings (.env file) via python-decouple,
# so the endpoint is not hard-coded in the notebook.
URL_PRINCIPAL = config('URL_PRINCIPAL')

In [4]:
def mount_dataframe_with_ids(URL_PRINCIPAL: str, timeout: float = 30.0) -> pd.DataFrame:
    '''
        Query the search API and return one dataframe row per hit.

        The rows carry the ids, latitude and longitude of houses/apartments
        listed on quintoandar.com.br.

        Parameters :
            URL_PRINCIPAL : URL of the API, already containing the desired filter.
            timeout : Seconds to wait for the server before giving up.

        Ex :
            mount_dataframe_with_ids(URL_PRINCIPAL)
        Returns :
               _index  _type  _id        _score  _source
            0  house   _doc   892986759  0.0     {'location': {'lon': -46.6424942, 'lat': -23.6...
            1  house   _doc   893564058  0.0     {'location': {'lon': -46.6420724, 'lat': -23.5...
            2  house   _doc   893324382  0.0     {'location': {'lon': -46.6649749, 'lat': -23.6...

        Raises :
            requests.RequestException : on network failure, timeout or a
                non-2xx HTTP status.
            KeyError : if the JSON payload has no ['hits']['hits'] entry.
    '''
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(URL_PRINCIPAL, timeout=timeout)
    # Surface HTTP errors here instead of failing later while parsing an
    # error body that has no 'hits' key.
    response.raise_for_status()
    data = response.json()

    # data['hits']['hits'] is a list of records, so the plain constructor is
    # the direct way to build the frame.
    return pd.DataFrame(data['hits']['hits'])

In [5]:
# Download the search hits (ids plus nested coordinates) for the configured URL.
data_tmp = mount_dataframe_with_ids(URL_PRINCIPAL)

In [6]:
def remove_dict_of_column(dataframe: pd.DataFrame) -> tuple[list, list]:
    '''
        Pull the longitude and latitude out of the nested '_source' column.

        Parameters :
            dataframe : A dataframe whose '_source' column holds dicts shaped
                        like {'location': {'lon': ..., 'lat': ...}}.

        Returns :
            A (longitudes, latitudes) pair of lists, in row order.

        Raises :
            KeyError : if '_source' is missing, or a row lacks 'location',
                       'lon' or 'lat'.
    '''
    list_long = []
    list_lat = []
    # Walk the values themselves instead of indexing with 0..n-1:
    # dataframe['_source'][i] is a *label* lookup and raises KeyError on any
    # frame whose index is not the default RangeIndex (e.g. after filtering).
    for source in dataframe['_source']:
        location = source['location']
        list_long.append(location['lon'])
        list_lat.append(location['lat'])
    return list_long, list_lat

In [7]:
# Flatten the nested coordinates of '_source' into plain 'lon'/'lat' columns.
list_long, list_lat = remove_dict_of_column(data_tmp)
data_tmp['lon'] = list_long
data_tmp['lat'] = list_lat

# Discard the raw search metadata, then give the remaining columns plain names.
data_tmp.drop(columns=['_source', '_type', '_score'], inplace=True)
data_tmp.rename({'_id': 'id', '_index': 'type'}, axis='columns', inplace=True)

In [8]:
def scrapper_(dataframe: pd.DataFrame, column_id: str,
              timeout: float = 30.0, delay: float = 0.0) -> pd.DataFrame:
    '''
        Scrape the quintoandar.com.br listing page of every id in a dataframe.

        For each id the page https://www.quintoandar.com.br/imovel/<id> is
        downloaded, the JSON embedded in the element with id '__NEXT_DATA__'
        is parsed, and its 'houseInfo' entry is kept. Ids whose page cannot be
        fetched or parsed are reported and skipped, so the result may have
        fewer rows than the input.

        Parameters :
            dataframe : A dataframe that contains the ids to look up.
            column_id : Name of the column that holds those ids.
            timeout : Seconds to wait for each page before giving up on it.
            delay : Seconds to pause after each id (0 disables the pause);
                    use it to avoid hammering the site.

        Returns :
            A dataframe with one row per successfully scraped listing.
    '''
    results = []
    # Iterate the column values directly: positional range(len(...)) combined
    # with dataframe[column_id][i] is a label lookup and breaks on any frame
    # that does not have the default 0..n-1 index.
    for house_id in dataframe[column_id]:
        url = f'https://www.quintoandar.com.br/imovel/{house_id}'
        try:
            response = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(response.content, 'html.parser')
            dict_details = json.loads(soup.find(id='__NEXT_DATA__').string)
            house_info = dict_details['props']['pageProps']['initialState']['house']['houseInfo']
        except Exception:
            # Best-effort scraping: one bad listing must not abort the run.
            # Unlike a bare `except:`, this still lets KeyboardInterrupt and
            # SystemExit through, so a long scrape can be stopped by the user.
            print(f'Index {house_id} not found.')
        else:
            results.append(house_info)
            print(f'Getting index {house_id}.')
        if delay > 0:
            time.sleep(delay)
    return pd.DataFrame(results)
        
        

In [9]:
# Scrape the detail page of every listing id collected above.
data_ = scrapper_(dataframe=data_tmp, column_id='id')
# Uncomment to persist the raw scrape to disk:
# data_.to_csv(r'../data/raw/dataframe_raw.csv')

Getting index 0.
Getting index 1.
Getting index 2.
Getting index 3.
Getting index 4.
Getting index 5.
Index 893567381 not found.
Getting index 7.
Getting index 8.
Getting index 9.
Getting index 10.
Getting index 11.
Getting index 12.
Getting index 13.
Getting index 14.
Getting index 15.
Getting index 16.
Getting index 17.
Getting index 18.
Getting index 19.
Getting index 20.
Getting index 21.
Getting index 22.
