##### Import libs

In [1]:
import pandas as pd
from decouple import config
import numpy as np
import ast
from geopy.geocoders import Nominatim
import requests
# Load the project settings (environment variables) through decouple's config().
ROOT_PATH = config('ROOT_PATH')
DATA_RAW_PATH = config('DATA_RAW_PATH')
DATA_PROCESSED_PATH = config('DATA_PROCESSED_PATH')
FOURSQUARE_API_KEY = config('FOURSQUARE_API_KEY')
URL_FOURSQUARE_PLACES = config('URL_FOURSQUARE_PLACES')

# Let pandas display up to 100 columns.
pd.set_option('display.max_columns', 100)

# HTTP headers used for the Foursquare API requests.
headers = dict(
    Accept="application/json",
    Authorization=FOURSQUARE_API_KEY,
)

##### Methods

In [2]:
def get_geolocation (address:str)-> tuple[float,float] :
    '''
        Geocode an address with Nominatim and return its coordinates.

        Parameters : 
            address : Free-text address to geocode.
        Ex :
            get_geolocation('Rua José Gáspar')
        Returns :
            (latitude, longitude) of the location returned by the geocoder, e.g.
            (-23.6624262, -46.6441204)
        Raises :
            ValueError : If the geocoder finds no location for the address.
    '''
    geolocator = Nominatim(user_agent='foursquare_agent')
    location = geolocator.geocode(address)
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `None.latitude`.
    if location is None:
        raise ValueError(f"No geolocation found for address: {address!r}")
    return location.latitude, location.longitude

In [3]:
def mount_filtered_dataframe (dataframe:pd.DataFrame, city_to_search:str,list_neighborhood:list)-> pd.DataFrame :
    '''
        Select the rows of a dataframe that belong to a city and to one of the
        given neighborhoods.

        Rows come out grouped by neighborhood, in the order the neighborhoods
        appear in `list_neighborhood`, with a fresh 0..n-1 index.

        Parameters : 
            dataframe : Dataframe to filter; must have 'city' and 'neighborhood' columns.
            city_to_search : Value the 'city' column must equal.
            list_neighborhood : Neighborhood names to keep.
        Ex :
            neighborhoods = [
                'Bela Vista','Bom Retiro','Cambuci','Consolação',
                'Sé','Higienópolis','Liberdade','Santa Cecília'
                ]
            mount_filtered_dataframe(data_tmp, 'São Paulo', neighborhoods)
        Returns :
            Dataframe with the filtered rows. When `list_neighborhood` is empty
            the result has no rows but keeps the columns of `dataframe`.
    '''
    # The city mask does not depend on the neighborhood, so compute it once.
    in_city = dataframe['city'] == city_to_search
    parts = [
        dataframe.loc[in_city & (dataframe['neighborhood'] == neighborhood)]
        for neighborhood in list_neighborhood
    ]
    # pd.concat() rejects an empty list; return an empty slice with the same columns.
    if not parts:
        return dataframe.iloc[0:0].reset_index(drop=True)
    # Single concat instead of growing the frame inside the loop.
    return pd.concat(parts, axis=0).reset_index(drop=True)

In [4]:
def get_around_places (latitude:float, longitude:float,radius:int,*categories,timeout:float=30)->dict :
    '''
        Search the Foursquare places endpoint for venues around a point.

        A single request is sent to URL_FOURSQUARE_PLACES with a limit of 50
        results, using the module-level `headers` for authorization.

        Parameters : 
            latitude : Latitude of the property from quinto andar.
            longitude : Longitude of the property from quinto andar.
            radius : Radius to search venues inside. (in meters)
            categories : Category codes to filter by. Pass 0 as the first
                code (or no code at all) to search without a category filter.
            *Category codes are listed at https://developer.foursquare.com/docs/categories
            timeout : Seconds to wait for the HTTP response (keyword-only).
        Ex : 
            get_around_places(-23.663303,-46.642494, 500, 10000)
        Returns:
            The decoded JSON response as a dict, e.g.
            {'results': [{'fsq_id': '52e56c63498e73cc2b4a8a49',
            'categories': [{'id': 10039,
                'name': 'Music Venue',
                'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/arts_entertainment/musicvenue_',
                'suffix': '.png'}}],
            'chains': [],
            'distance': 993,
            'geocodes': {'main': {'latitude': -23.668759, 'longitude': -46.634906},
                'roof': {'latitude': -23.668759, 'longitude': -46.634906}},
            'link': '/v3/places/52e56c63498e73cc2b4a8a49',
            'location': {'address': 'Rua Bela Vista',
                'country': 'BR',
                'cross_street': '',
                'formatted_address': 'Rua Bela Vista',
                'postcode': ''},
            'name': 'Escola de Samba Vai-Vai',
            'related_places': {},
            'timezone': 'America/Sao_Paulo'} ...
    '''
    # requests URL-encodes the values, so the comma becomes %2C as before.
    params = {
        'll': f"{latitude},{longitude}",
        'radius': radius,
        'limit': 50,
    }
    # No categories, or 0 as the first one, means "do not filter by category".
    # The previous if/if/else chain fell through to the else branch in the
    # 0 case and sent a second request with categories=0.
    if categories and categories[0] != 0:
        params['categories'] = ','.join(str(category) for category in categories)

    response = requests.get(
        URL_FOURSQUARE_PLACES, headers=headers, params=params, timeout=timeout
    )
    return response.json()

In [5]:
def get_foursquare_data (
    dataframe:pd.DataFrame,column_latitude:str,column_longitude:str,
    radius:int,category)->pd.DataFrame :
    '''
        Build a dataframe of Foursquare venues found around every property
        of `dataframe`, calling get_around_places once per row.

        Parameters : 

            dataframe : A dataframe that contains geolocation of properties.
            column_latitude : Column that contains data of latitude of properties.
            column_longitude : Column that contains data of longitude of properties.
            radius : A radius to search near venues.
            category : A code of category to search.
        Ex :
            dataframe = {'id': 0    892790963
                        Name: id, dtype: int64,
                        'lat': 0   -23.555161
                        Name: lat, dtype: float64,
                        'lon': 0   -46.641136
                        Name: lon, dtype: float64 ...}
            get_foursquare_data(data_[:5],'lat','lon',500,10000)
        Returns :
            One row per venue. The first column, 'idx_', is the position
            (0-based) of the property row in `dataframe` that produced the
            venue; the remaining columns are the fields of the API results,
            e.g. 'fsq_id', 'categories', 'chains', 'distance', 'geocodes',
            'link', 'location', 'name', 'related_places', 'timezone'.
            The index is not reset: it restarts at 0 for every property.
            An empty `dataframe` yields an empty dataframe.
        Raises :
            KeyError : If a response has no 'results' key.
    '''
    frames = []
    coordinates = zip(dataframe[column_latitude], dataframe[column_longitude])
    # Iterate by position rather than looking up labels 0..n-1, so frames
    # whose index is not the default RangeIndex (e.g. filtered frames) work too.
    for position, (latitude, longitude) in enumerate(coordinates) :
        places = get_around_places(latitude,longitude,radius,category)['results']
        data_tmp = pd.DataFrame.from_dict(places)
        data_tmp.insert(0,'idx_',position)
        frames.append(data_tmp)

    # pd.concat() rejects an empty list.
    if not frames :
        return pd.DataFrame()
    # Single concat instead of growing the frame inside the loop.
    return pd.concat(frames,axis=0)

##### Load data

In [6]:
# Load the merged dataset and the raw property dataset from the raw-data folder.
data_tmp = pd.read_csv(f"{ROOT_PATH}{DATA_RAW_PATH}dataframe_merged.csv")
data_raw = pd.read_csv(f"{ROOT_PATH}{DATA_RAW_PATH}dataframe_raw_properties.csv")

  data_raw = pd.read_csv(ROOT_PATH+DATA_RAW_PATH+'dataframe_raw_properties.csv')


##### Processing data

In [7]:
# Parse the stringified address dicts. Values that are not strings are left
# untouched, so re-running this cell does not fail in literal_eval on
# already-parsed dicts.
data_raw['address'] = data_raw['address'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
# Pull the street and the neighborhood out into their own columns
# (a missing 'neighborhood' key becomes NaN).
data_raw['street'] = data_raw['address'].apply(lambda address: address['street'])
data_raw['neighborhood'] = data_raw['address'].apply(lambda address: address.get('neighborhood',np.nan))

##### Organizing data

In [8]:
# Move the 'lat' column to position 2, then display the frame.
data_tmp.insert(loc=2, column='lat', value=data_tmp.pop('lat'))
data_tmp

Unnamed: 0,type,id,lat,lon,BANHEIRA_DE_HIDROMASSAGEM,BOX,VARANDA,PISCINA_PRIVATIVA,ARMARIOS_EMBUTIDOS_NO_QUARTO,ARMARIOS_NOS_BANHEIROS,ARMARIOS_NA_COZINHA,AR_CONDICIONADO,CHUVEIRO_A_GAS,QUARTO_DE_SERVICO,BANHEIRO_DE_SERVICO,VARANDA_GOURMET,APARTAMENTO_COBERTURA,QUARTO_EXTRA_REVERSIVEL,FOGAO_INCLUSO,GELADEIRA_INCLUSO,BANHEIRO_ADAPTADO,CLOSET,COZINHA_AMERICANA,MESAS_E_CADEIRAS_DE_ESCRITORIO,JARDIM,QUARTOS_E_CORREDORES_COM_PORTAS_AMPLAS,QUINTAL,SOMENTE_UMA_CASA_NO_TERRENO,AREA_DE_SERVICO
0,house,892986759,-23.663303,-46.642494,NAO,NAO,NAO_INFORMADO,NAO,NAO,NAO,SIM,NAO,NAO,NAO,SIM,NAO,NAO,NAO,NAO,NAO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,SIM
1,house,893513303,-23.671671,-46.691886,NAO,SIM,NAO,NAO,NAO,SIM,SIM,NAO,NAO,SIM,SIM,NAO,NAO,NAO,SIM,NAO,NAO,NAO,NAO,NAO,NAO,NAO,SIM,SIM,SIM
2,house,893567381,-23.462255,-46.555363,NAO,NAO,SIM,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,SIM
3,house,893426340,-23.554803,-46.596351,SIM,SIM,NAO,NAO,NAO,SIM,SIM,NAO,NAO,NAO,SIM,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,SIM,NAO,NAO,NAO,NAO,NAO
4,house,893475416,-23.675248,-46.609885,NAO,NAO,SIM,NAO,NAO,SIM,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,SIM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9974,house,893703984,-23.540389,-46.652610,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,SIM
9975,house,893697876,-23.494918,-46.740010,NAO,SIM,NAO,SIM,NAO,NAO,SIM,NAO,NAO,SIM,SIM,SIM,NAO,SIM,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO
9976,house,893669474,-23.587723,-46.582399,NAO,NAO,SIM,NAO,SIM,SIM,SIM,NAO,NAO,SIM,SIM,NAO,NAO,SIM,NAO,NAO,SIM,NAO,NAO,NAO,SIM,SIM,SIM,SIM,SIM
9977,house,893688435,-23.625045,-46.688631,NAO,SIM,NAO_INFORMADO,NAO,SIM,SIM,SIM,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO,NAO_INFORMADO


In [9]:
# Columns of data_raw that are joined onto data_tmp (matched on 'id').
mask = [
    'id', 'bedrooms', 'city', 'bathrooms',
    'iptu', 'area', 'rentPrice',
    'type', 'iptuType', 'totalCost',
    'parkingSpaces', 'floor', 'suites',
    'street', 'neighborhood',
]
# Left join; both frames have a 'type' column, so keep the one from data_raw
# ('type_y') under the plain name and drop the one from data_tmp ('type_x').
data_tmp = (
    data_tmp
    .merge(data_raw[mask], how='left', right_on='id', left_on='id')
    .drop(columns=['type_x'])
    .rename(columns={'type_y': 'type'})
)

##### Creating a dataset with the properties in the center of São Paulo

In [10]:
# Neighborhoods that make up the center of São Paulo.
neighborhoods = [
    'Bela Vista',
    'Bom Retiro',
    'Cambuci',
    'Consolação',
    'Sé',
    'Higienópolis',
    'Liberdade',
    'Santa Cecília',
]

In [11]:
# Keep only the São Paulo rows whose neighborhood is in the list above.
data_ = mount_filtered_dataframe(
    dataframe=data_tmp,
    city_to_search='São Paulo',
    list_neighborhood=neighborhoods,
)

In [12]:
# Query Foursquare around every filtered property
# (one API call per row; radius 500, category code 10000).
data_foursquare = get_foursquare_data(
    dataframe=data_,
    column_latitude='lat',
    column_longitude='lon',
    radius=500,
    category=10000,
)

In [13]:
# Export data
#data_.to_csv(ROOT_PATH+DATA_PROCESSED_PATH+'dataframe_quintoAndar_preprocessed.csv',index=False)
#data_foursquare.to_csv(ROOT_PATH+DATA_RAW_PATH+'dataframe_foursquare_center_SP.csv',index=False)

In [16]:
# Renumber the rows 0..n-1: get_foursquare_data concatenates one frame per
# property without resetting the index. Reassign instead of using inplace=True.
data_foursquare = data_foursquare.reset_index(drop=True)