# this script will download the entire database then save them to a sqlite database (except for price column)
* method: scrape data from index to index
    * for example we scrape from 020000000000 to 030000000000
* method: decode json and save to sqlite database
* method: scrape each property's price
* method: scrape each property's pdf_link

In [10]:
from __future__ import absolute_import, division, print_function, \
    with_statement
import requests
from peewee import * 
import json
import numpy
import logging
from multiprocessing.dummy import Pool as ThreadPool
import threading

In [1]:
objects_done = 0
objects_total = 0

In [2]:
def parse_json_save_to_sqlite(json_string):
    try:
        json_obj = json.loads(json_string)
    except Exception as e:
        logging.error(e)
        logging.error("there is some problem with json loading")
    
    total = len(json_obj['features'])
    print("------------- get total: %s properties ----------------" % str(total))
    for index, each_building in enumerate(json_obj['features']):
        building_info = each_building['properties']
        
        building = PropertyModel()
        if len(str(building_info['wobj_obj_id'])) > 0:
            try:
                building.identificatie = building_info['wobj_obj_id']
            except Exception as e:
                print("this building has no identification code. ")
                building.identificatie = "none"
        else:
            building.identificatie = "none"
            print("this building has no identification code. ")
            
        if len(str(building_info['wobj_huisnummer'])) > 0:
            try:
                building.house_number = building_info['wobj_huisnummer']
            except Exception as e:
                building.house_number = "none"
                print("this building has no house number. ")
        else:
            building.house_number = "none"
            print("this building has no house number. ")
        
        if len(str(building_info['wobj_huisletter'])) > 0:
            try:
                building.house_number_ext = building_info['wobj_huisletter']
            except Exception as e:
                building.house_number_ext = "none"
                print("this building has no house number extension")
        else:
            building.house_number_ext = "none"
            print("this building has no house number extension")
            
        if len(str(building_info['wobj_postcode'])) > 0:
            try:
                building.postcode = building_info['wobj_postcode']
            except Exception as e:
                building.postcode = "none"
                print('this building has no postcode. ')
        else:
            building.postcode = "none"
            print('this building has no postcode. ')
            
        if len(str(building_info['wobj_woonplaats'])) > 0:
            try:
                building.plaatsnaam = building_info['wobj_woonplaats']
            except Exception as e:
                building.plaatsnaam = "none"
                print("this building has no plaatsnaam. ")
        else:
            building.plaatsnaam = "none"
            print("this building has no plaatsnaam. ")
        
        if len(str(building_info['wobj_straat'])) > 0:
            try:
                building.street = building_info['wobj_straat']
            except Exception as e:
                building.street = "none"
                print("this building has no street name. ")
        else:
            building.street = "none"
            print("this building has no street name. ")
            
        if len(str(building_info['wobj_bag_bouwjaar'])) > 0:
            try:
                building.bouwjaar = building_info['wobj_bag_bouwjaar']
            except Exception as e:
                building.bouwjaar = "none"
                print("this building has no bouwjaar. ")
        else:
            building.bouwjaar = "none"
            print("this building has no bouwjaar. ")
            
        if len(str(building_info['wobj_bag_gebruiksdoel'])) > 0:
            try:
                building.gebruiksdoel = building_info['wobj_bag_gebruiksdoel']
            except Exception as e:
                building.gebruiksdoel = "none"
                print("this building has no gebruiksdoel. ")
        else:
            building.gebruiksdoel = "none"
            print("this building has no gebruiksdoel. ")
            
        if len(str(building_info['wobj_oppervlakte'])) > 0:
            try:
                building.oppervlakte = building_info['wobj_oppervlakte']
            except Exception as e:
                building.oppervlakte = "none"
                print("this building has no oppervlakte. ")
        else:
            building.oppervlakte = "none"
            print("this building has no oppervlakte. ")
        
        
        building.price_2015, building.price_2016 = \
            parse_each_property_price(scrape_each_property_price(building.identificatie))
        
        try:
            building.save()
        except:
            pass
        
#         print("Saving property to database now, process: {:2.2f}% ...".format((index+1)*100/(total)), end="\r")

In [3]:
class BaseModel(Model):
    class Meta:
        database = SqliteDatabase("netherland_properties.db")
class PropertyModel(BaseModel):
    identificatie = CharField(unique=True)
    house_number = CharField(null=True)
    house_number_ext = CharField(null=True)
    postcode = CharField(null=True)
    plaatsnaam = CharField(null=True)
    street = CharField(null=True)
    
    price_2015 = CharField(null=True)
    price_2016 = CharField(null=True)
    
    bouwjaar = CharField(null=True)
    gebruiksdoel = CharField(null=True)
    oppervlakte = CharField(null=True)

NameError: name 'Model' is not defined

In [18]:
def init_database():
    db = SqliteDatabase("netherland_properties.db")
    db.connect()
    try:
        db.drop_tables([PropertyModel])
    except:
        pass
    db.create_tables([PropertyModel])
    db.close()

In [19]:
init_database()

In [4]:
def scrape_obj_from_id_to_id(f=None, t=None):
    if f is None:
        raise Exception("From id is None, check the script please. ")
    if t is None:
        raise Exception("To id is None, check the script please. ")
        
    # format from_id and to_id
    from_id = '%012d'%f
    to_id = '%012d'%t
    
    # initiate a request object
    s = requests.Session()
    s.get("https://www.wozwaardeloket.nl/index.jsp?a=1&accept=true&")
    
    xml_obj = \
    """
    <wfs:GetFeature
        xmlns:wfs="http://www.opengis.net/wfs" service="WFS" version="1.1.0" xsi:schemaLocation="http://www.opengis.net/wfs http://schemas.opengis.net/wfs/1.1.0/wfs.xsd"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
        <wfs:Query typeName="wozloket:woz_woz_object" srsName="EPSG:28992"
            xmlns:WozViewer="http://WozViewer.geonovum.nl"
            xmlns:ogc="http://www.opengis.net/ogc">
            <ogc:Filter
                xmlns:ogc="http://www.opengis.net/ogc">
                <ogc:And>
                    <ogc:PropertyIsGreaterThan matchCase="true">
                        <ogc:PropertyName>wobj_obj_id</ogc:PropertyName>
                        <ogc:Literal>%s</ogc:Literal>
                    </ogc:PropertyIsGreaterThan>
                    <ogc:PropertyIsLessThan matchCase="true">
                        <ogc:PropertyName>wobj_obj_id</ogc:PropertyName>
                        <ogc:Literal>%s</ogc:Literal>
                    </ogc:PropertyIsLessThan>
                </ogc:And>
            </ogc:Filter>
        </wfs:Query>
    </wfs:GetFeature>
    """
    xml_obj = xml_obj%(str(from_id), str(to_id))
    
    response = None
    try:
        response = s.post(url="https://www.wozwaardeloket.nl/woz-proxy/wozloket", data=xml_obj)
        print("scraping woz obj from id %s to id %s . "%(str(from_id), str(to_id)), end="\n")
    except Exception as e:
        logging.error(e)
        logging.error("request has met problem. ")
        logging.error("from_id=%s"%str(from_id))
        logging.error("to_id=%s"%str(to_id))
        
    return response.text

In [5]:
def scrape_range_and_save(args):
    global objects_total, objects_done
    json_string = scrape_obj_from_id_to_id(args[0], args[1])
    print("----------------- Databased dumped, start parsing :) -------------------", end="\n")
    parse_json_save_to_sqlite(json_string=json_string)
    objects_done += 1
    print("----------------- Process: {:2.2f}%------------------".format((objects_done)*100/(objects_total)))

In [6]:
def stage1_scrape_all_obj():
    global objects_total, objects_done
    # the 5000000000 is the step in range, if the script met error, we will need to change this.
    range_list = numpy.arange(15000000000, 110000000000, 5000000)
    arg_list = [[range_list[index], range_list[index+1]]for index in range(len(range_list)-1)]
    objects_total = len(arg_list)
    
    pool = ThreadPool(16)
    pool.map(scrape_range_and_save, arg_list)
    pool.close()
    pool.join()
    objects_total = 0
    objects_done = 0

In [7]:
def scrape_each_property_price(property_id):
    s = requests.Session()
    s.get("https://www.wozwaardeloket.nl/index.jsp?a=1&accept=true&")
    
    xml_obj = \
    """
    <wfs:GetFeature
        xmlns:wfs="http://www.opengis.net/wfs" service="WFS" version="1.1.0" xsi:schemaLocation="http://www.opengis.net/wfs http://schemas.opengis.net/wfs/1.1.0/wfs.xsd"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
        <wfs:Query typeName="wozloket:woz_woz_object" srsName="EPSG:28992"
            xmlns:WozViewer="http://WozViewer.geonovum.nl"
            xmlns:ogc="http://www.opengis.net/ogc">
            <ogc:Filter
                xmlns:ogc="http://www.opengis.net/ogc">
                <ogc:PropertyIsEqualTo matchCase="true">
                    <ogc:PropertyName>wobj_obj_id</ogc:PropertyName>
                    <ogc:Literal>%s</ogc:Literal>
                </ogc:PropertyIsEqualTo>
            </ogc:Filter>
        </wfs:Query>
    </wfs:GetFeature>
    """
    
    xml_obj = xml_obj%str(property_id)
    
    response = None
    try:
        response = s.post(url="https://www.wozwaardeloket.nl/woz-proxy/wozloket", data=xml_obj)
        print("--------scraping price of property id %s-------"%str(property_id), end="\n")
    except Exception as e:
        logging.error(e)
        logging.error("request has met problem. ")
        logging.error("property_id: %s"%str(property_id))
        
    return response.text

In [8]:
def parse_each_property_price(json_string):
    try:
        json_obj = json.loads(json_string)['features'][0]['properties']
    except Exception as e:
        logging.error(e)
        logging.error("there is some problem with json loading")
    
    price15, price16 = None, None
    try:
        price15 = int(json_obj['wobj_wrd_woz_waarde'])/1000
    except Exception as e:
        price15 = "none"
        
    try:
        price16 = int(json_obj['wobj_huidige_woz_waarde'])/1000
    except Exception as e:
        price16 = "none"
    
    return "{:.3f}".format(price15), "{:.3f}".format(price16)

In [11]:
stage1_scrape_all_obj()

ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:there is some problem with json loading
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:

scraping woz obj from id 032820000000 to id 032825000000 . 
scraping woz obj from id 020940000000 to id 020945000000 . 
----------------- Databased dumped, start parsing :) -------------------
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 034305000000 to id 034310000000 . 
scraping woz obj from id 025395000000 to id 025400000000 . 
----------------- Databased dumped, start parsing :) -------------------
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 035790000000 to id 035795000000 . 
scraping woz obj from id 026880000000 to id 026885000000 . 
scraping woz obj from id 016485000000 to id 016490000000 . 
----------------- Databased dumped, start parsing :) -------------------
----------------- Databased dumped, start parsing :) -------------------
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 019455000000 to id 019460000000 . 
---------

ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:there is some problem with json loading


scraping woz obj from id 058065000000 to id 058070000000 . 
scraping woz obj from id 040245000000 to id 040250000000 . 
----------------- Databased dumped, start parsing :) -------------------
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 043215000000 to id 043220000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 041730000000 to id 041735000000 . 
scraping woz obj from id 038760000000 to id 038765000000 . 
----------------- Databased dumped, start parsing :) -------------------
----------------- Databased dumped, start parsing :) -------------------


ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:there is some problem with json loading
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:

scraping woz obj from id 050640000000 to id 050645000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 052125000000 to id 052130000000 . 
scraping woz obj from id 066975000000 to id 066980000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 044700000000 to id 044705000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 065490000000 to id 065495000000 . 
----------------- Databased dumped, start parsing :) -------------------
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 047670000000 to id 047675000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 056580000000 to id 056585000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 061035000000 to id 061040000000 . 
scraping 

ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading


scraping woz obj from id 080340000000 to id 080345000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 049155000000 to id 049160000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 046185000000 to id 046190000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 072915000000 to id 072920000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 069945000000 to id 069950000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 078855000000 to id 078860000000 . 
----------------- Databased dumped, start parsing :) -------------------


ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading


scraping woz obj from id 077370000000 to id 077375000000 . 
----------------- Databased dumped, start parsing :) -------------------


ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:there is some problem with json loading
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading


scraping woz obj from id 055095000000 to id 055100000000 . 
scraping woz obj from id 071430000000 to id 071435000000 . 
scraping woz obj from id 064005000000 to id 064010000000 . 
----------------- Databased dumped, start parsing :) -------------------
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 075885000000 to id 075890000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 068460000000 to id 068465000000 . 
----------------- Databased dumped, start parsing :) -------------------
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 086280000000 to id 086285000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 062520000000 to id 062525000000 . 
scraping woz obj from id 081825000000 to id 081830000000 . 
----------------- Databased dumped, start parsing :) -------------------
---------

ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)


scraping woz obj from id 087765000000 to id 087770000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 074400000000 to id 074405000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 093705000000 to id 093710000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 092220000000 to id 092225000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 098160000000 to id 098165000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 101130000000 to id 101135000000 . 
----------------- Databased dumped, start parsing :) -------------------


ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading


scraping woz obj from id 099645000000 to id 099650000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 102615000000 to id 102620000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 083310000000 to id 083315000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 095190000000 to id 095195000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 104100000000 to id 104105000000 . 
----------------- Databased dumped, start parsing :) -------------------


ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading


scraping woz obj from id 084795000000 to id 084800000000 . 
----------------- Databased dumped, start parsing :) -------------------


ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading
ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading


scraping woz obj from id 108555000000 to id 108560000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 105585000000 to id 105590000000 . 
----------------- Databased dumped, start parsing :) -------------------
scraping woz obj from id 107070000000 to id 107075000000 . 
----------------- Databased dumped, start parsing :) -------------------


ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading


scraping woz obj from id 096675000000 to id 096680000000 . 
----------------- Databased dumped, start parsing :) -------------------


ERROR:root:Expecting value: line 1 column 1 (char 0)
ERROR:root:there is some problem with json loading


scraping woz obj from id 089250000000 to id 089255000000 . 
----------------- Databased dumped, start parsing :) -------------------


UnboundLocalError: local variable 'json_obj' referenced before assignment