In [1]:
import os
import sqlalchemy
import pandas as pd 

from boliga import BoligaSold

In [12]:
# Target database and table used by every cell below.
DB = 'boligDB'
TABLE = 'boliga_sold'

# The base server URI is read from the POSTGRES_URI environment variable
# (KeyError if it is unset); the database name becomes the last path segment.
POSTGRES_URI = '/'.join((os.environ['POSTGRES_URI'], DB))

In [7]:
def send_bolig(bolig, table, **kwargs):
    """Write a DataFrame of estates to a Postgres table.

    The caller's frame is left untouched: a copy is normalised (column names
    lower-cased, object-dtype columns cast to ``str``) and that copy is what
    gets written and returned.

    Parameters
    ----------
    bolig : pandas.DataFrame
        Estates to store.
    table : str
        Name of the destination table.
    **kwargs
        Extra keyword arguments forwarded to ``DataFrame.to_sql``.
        ``if_exists`` defaults to ``'append'`` when not supplied.

    Returns
    -------
    pandas.DataFrame or str
        The normalised frame that was written, or a short message string
        when ``bolig`` is empty (nothing is written in that case).

    Notes
    -----
    The engine is built from the module-level ``POSTGRES_URI`` and is always
    disposed, even when the write raises.
    """
    if bolig.empty:
        return f'No DataFrame to send to {table}'

    # Work on a copy so re-running the cell never sees a half-converted frame.
    estates = bolig.copy()

    # Mixed-case column names (e.g. roomSize) would have to be double-quoted
    # in every Postgres query, so store them lower-cased instead.
    estates.columns = estates.columns.str.lower()

    # Columns holding dicts cause issues on insert; stringify all object columns.
    object_columns = estates.select_dtypes('object').columns
    estates[object_columns] = estates[object_columns].astype(str)

    # Keep the historical append behaviour unless the caller overrides it.
    kwargs.setdefault('if_exists', 'append')

    engine = sqlalchemy.create_engine(POSTGRES_URI)
    try:
        estates.to_sql(table, engine, **kwargs)
        print(f'There were {len(estates)} estates send to {DB}.{table}')
    finally:
        # Release pooled connections whether or not the write succeeded.
        engine.dispose()

    return estates

In [4]:
# Sold estates from boliga.dk
api_name = 'boliga.dk'
print(f'\n[+] Using {api_name} to demostrate advance web scraping ideas for sold estates\n')

# Scraper object pointed at the sold-estates search endpoint.
boliga_sold = BoligaSold(url='https://api.boliga.dk/api/v2/sold/search/results')

# Paging setup: which pages to request, how many results per page, and how
# many workers are handed to get_pages.
start_page, end_page = 1, 50
page_size = 200
workers = 5

print(f'[+] Start {workers} threads for {page_size} pagesize per call: start at page {start_page} and at page {end_page} \n')

# Last expression of the cell, so whatever get_pages returns is displayed.
boliga_sold.get_pages(
    start_page=start_page,
    end_page=end_page,
    pagesize=page_size,
    workers=workers,
    verbose=False,
)


[+] Using boliga.dk to demostrate advance web scraping ideas for sold estates

[+] Start 5 threads for 200 pagesize per call: start at page 1 and at page 50 

[+] Gathering data from page 1.     Found   200 estates   Time 17-10-2020 18:37:09
[+] Gathering data from page 2.     Found   400 estates   Time 17-10-2020 18:37:09
[+] Gathering data from page 13.     Found   600 estates   Time 17-10-2020 18:37:09
[+] Gathering data from page 23.     Found   800 estates   Time 17-10-2020 18:37:09
[+] Gathering data from page 33.     Found  1000 estates   Time 17-10-2020 18:37:09
[+] Gathering data from page 43.     Found  1200 estates   Time 17-10-2020 18:37:09
[+] Gathering data from page 14.     Found  1400 estates   Time 17-10-2020 18:37:09
[+] Gathering data from page 34.     Found  1600 estates   Time 17-10-2020 18:37:09
[+] Gathering data from page 24.     Found  1800 estates   Time 17-10-2020 18:37:09
[+] Gathering data from page 3.     Found  2000 estates   Time 17-10-2020 18:37:09
[+]

BoligaSold(API='https://api.boliga.dk/api/v2/sold/search/results')

In [7]:
# Grab the scraped estates and report the row count followed by the column dtypes.
bolig = boliga_sold.DataFrame
print(f'\n{len(bolig)} estates found.\nData types are?', bolig.dtypes, sep='\n')


10400 estates found.
Data types are?
estateId              int64
address              object
zipCode               int64
price                 int64
soldDate             object
propertyType          int64
saleType             object
sqmPrice            float64
rooms               float64
size                  int64
buildYear             int64
change              float64
guid                 object
latitude            float64
longitude           float64
municipalityCode      int64
estateCode            int64
city                 object
groupKey             object
canGetVR               bool
dtype: object


In [10]:
# Store the scraped estates; the return value is bound to `_` so the cell shows no output.
_ = send_bolig(bolig, TABLE)

There were 10400 estates send to boligDB.boliga_sold


In [14]:
# Display the table name that the surrounding cells write to and read from.
TABLE

'boliga_sold'

In [4]:
# Read a few columns back from the table to check what was stored.
engine = sqlalchemy.create_engine(POSTGRES_URI)
try:
    example = pd.read_sql(f'SELECT estateId, price, soldDate FROM {TABLE}', engine)
finally:
    # Release pooled connections even if the query fails.
    engine.dispose()

In [5]:
# Preview the first five rows that were read back.
example.head(5)

Unnamed: 0,estateid,price,solddate
0,1698054,4380000,2020-10-07T22:00:00.000Z
1,1683616,2695000,2020-10-07T22:00:00.000Z
2,1694355,4480000,2020-10-07T22:00:00.000Z
3,0,1650000,2020-10-07T22:00:00.000Z
4,0,1600000,2020-10-07T22:00:00.000Z


In [27]:
# Empty the table. DESTRUCTIVE: every row in TABLE is removed.
# AUTOCOMMIT makes the TRUNCATE take effect without an explicit commit.
engine = sqlalchemy.create_engine(POSTGRES_URI, isolation_level='AUTOCOMMIT')
try:
    with engine.connect() as connection:
        # Raw SQL strings are deprecated in SQLAlchemy 1.4 and rejected in 2.x;
        # text() is accepted by all of them.
        connection.execute(sqlalchemy.text(f'TRUNCATE TABLE {TABLE}'))
finally:
    # Dispose the engine here too, matching the other database cells.
    engine.dispose()

In [None]:
# Center in Grafana Geop
# Copenhagen lat, lon == 55.676098, 12.568337