In [2]:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import pandas as pd
import requests
import time
import os
from multiprocessing import Pool
import random

class Property:
    def __init__(self, address, postcode, buurt, woonplaats, huisnummer, count):
        self.address = address
        self.postcode = postcode
        self.buurt = buurt
        self.woonplaats = woonplaats
        self.huisnummer = huisnummer
        self.count = count
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})



    def get_address_ids(self, complete_address):
        """Retrieve address IDs with a batch request if possible."""
        
        url = "https://api.pdok.nl/bzk/locatieserver/search/v3_1/suggest"
        params = {"q": complete_address}


        try:
            response = self.session.get(url, params=params, timeout=5)
            #response = request.get(url, params=params, timeout=5)
            response.raise_for_status()
            data = response.json()
            return [doc.get("id", "") for doc in data.get("response", {}).get("docs", [])]
        
        except (requests.RequestException, ValueError) as ex:
            print(f"Error retrieving address ID: {ex}")
            return None

    def get_correct_addressID(self, complete_address):
        """Obtain correct address IDs, leveraging more threads and reducing retries."""
        
        _ids = self.get_address_ids(complete_address)
        if not _ids:
            return None
        
        url = "https://api.pdok.nl/bzk/locatieserver/search/v3_1/lookup"
        address_data = {'nummeraanduiding_ids': [], 'adresseerbaarobject_ids': []}

        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(self.fetch_address_data, url, _id) for _id in _ids]
            for future in as_completed(futures):
                data = future.result()

                # Meassurement for getting only the corresponding id. Rijnlaan 1-BSA, 3522BA Utrecht
                if data.get("weergavenaam") == f'{self.address.split(" ")[0]} {self.huisnummer}, {self.postcode} {self.woonplaats}':
                    #print(f'Weergavenaam: {data.get("weergavenaam")}')
                    
                    nummeraanduiding_id = data.get("nummeraanduiding_id")
                    adresseerbaarobject_id = data.get("adresseerbaarobject_id")
                    
                    if nummeraanduiding_id and adresseerbaarobject_id:
                        address_data["nummeraanduiding_ids"].append(nummeraanduiding_id)
                        address_data["adresseerbaarobject_ids"].append(adresseerbaarobject_id)
                        
        return address_data

    def fetch_address_data(self, url, _id):
        """Helper to fetch data for each address ID."""
        params = {"id": _id}
        try:
            response = self.session.get(url, params=params, timeout=5)
            #response = requests.get(url, params=params, timeout=5)
            response.raise_for_status()
            return response.json().get("response", {}).get("docs", [{}])[0]
        except requests.RequestException as e:
            print(f"Request failed for {self.address} with error: {e}")
            return {}

    def get_woz_values(self, _ids):
        headers = {
            'sec-ch-ua-platform': '"Linux"',
            'Referer': '',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'sec-ch-ua': '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
            'sec-ch-ua-mobile': '?0',
        }
        
        base_url = "https://api.kadaster.nl/lvwoz/wozwaardeloket-api/v1/wozwaarde/nummeraanduiding/"
        woz_values = {year: None for year in range(2014, 2024)}
        grondoppervlakte = [None]  # Use list to hold reference for mutability
        max_retries = 4
        
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(self.fetch_woz_value, base_url, _id, woz_values, grondoppervlakte) for _id in _ids]
            for future in as_completed(futures):
                future.result()
        
        return woz_values, grondoppervlakte[0]



    def fetch_woz_value(self, base_url, _id, woz_values, grondoppervlakte):
        attempt = 0
        while attempt < 4:
            try:
                response = self.session.get(f"{base_url}{_id}", timeout=5)
                #response = requests.get(url, timeout=5)
                #response = requests.get(f"{base_url}{_id}", timeout=5)
                if response.status_code == 429:
                    # wait_time = min(60, 2 ** attempt + random.uniform(0, 1))
                    # print(f"Rate limited. Waiting for {wait_time:.2f} seconds...")
                    # time.sleep(wait_time)
                    # attempt += 1
                    # continue

                    wait_time = min(60, (2 ** attempt) + random.uniform(0, 2 ** attempt))
                    print(f"Rate limited. Waiting for {wait_time:.2f} seconds...")
                    time.sleep(wait_time)
                    attempt += 1
                    continue
                
                response.raise_for_status()
                data = response.json()

                # Check wich if statements are nodig
                if not data:
                    print("Data is empty:", data)
                    
                if 'wozWaarden' not in data or not data['wozWaarden']:
                    print(f"No WOZ values found for ID {_id}, skipping.")
                    return False

                for item in data['wozWaarden']:
                    year = int(item['peildatum'][:4])
                    if 2014 <= year <= 2023:
                        woz_values[year] = item['vastgesteldeWaarde']
                       
                # Update grondoppervlakte if not yet set
                if grondoppervlakte[0] is None:
                    grondoppervlakte[0] = data['wozObject'].get('grondoppervlakte', None)
                
                return True
                
            except requests.exceptions.RequestException as e:
                print(f"Request failed for {_id} with error: {e} woz error for adress {self.address}")
                attempt += 1
                if attempt >= 4:
                    print("Maximum retries reached. Skipping this ID.")
                    return False
    

    def get_other_features(self, _ids):
        """Fetch additional features such as bouwjaar and oppervlakte."""
        features = {}
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(self.fetch_other_feature, _id) for _id in _ids]
            for future in as_completed(futures):
                result = future.result()
                if result:
                    features.update(result)
        return features
        

    def fetch_other_feature(self, _id: str):
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'User-Agent': 'Mozilla/5.0',
        }
        params = {'service': 'WFS'}
        features = {}

        data = (
            f'<wfs:GetFeature xmlns:wfs="http://www.opengis.net/wfs" '
            f'service="WFS" version="1.1.0" xsi:schemaLocation="http://www.opengis.net/wfs '
            f'http://schemas.opengis.net/wfs/1.0.0/WFS-transaction.xsd" '
            f'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" outputFormat="application/json">'
            f'<wfs:Query typeName="verblijfsobject" xmlns:null="http://bag.geonovum.nl">'
            f'<ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">'
            f'<ogc:PropertyIsEqualTo>'
            f'<ogc:PropertyName>identificatie</ogc:PropertyName>'
            f'<ogc:Literal>0{int(_id)}</ogc:Literal>'
            f'</ogc:PropertyIsEqualTo>'
            f'</ogc:Filter>'
            f'</wfs:Query>'
            f'</wfs:GetFeature>'
        )

        # try:
        #     response = requests.post(
        #         'https://service.pdok.nl/lv/bag/wfs/v2_0',
        #         params=params,
        #         headers=headers,
        #         data=data,
        #         timeout=5
        #     )
        try:
            response = self.session.post(
                'https://service.pdok.nl/lv/bag/wfs/v2_0',
                params=params,
                headers=headers,
                data=data,
                timeout=5
            )
            
            response.raise_for_status()

            if response.headers.get("Content-Type") == "application/json":
                data = response.json()
                
                if 'features' in data and data['features']:
                    feature_data = data['features'][0]['properties']
                    return {
                        'gebruiksdoel': feature_data.get('gebruiksdoel'),
                        'bouwjaar': feature_data.get('bouwjaar'),
                        'oppervlakte': feature_data.get('oppervlakte')
                    }
                
        except requests.exceptions.RequestException as ex:
            print(f"Request failed for ID {self.address} with error: {ex}, peoepepdepde")
            return None
            
        return {}

    def to_data(self):
        complete_address = f'{self.address} {self.postcode} {self.woonplaats}'
        
        print(f'Address being processed: {complete_address}, Property count: {self.count}')
        
        ids = self.get_correct_addressID(complete_address)
        
        if not ids:
            modified_address = f'{self.address.split(" ")[0]} {self.huisnummer}, {self.postcode} {self.woonplaats}'
            ids = self.get_correct_addressID(modified_address)

            if not ids:
                return None  # Empty DataFrame if no IDs

        #print(ids)

        # Other features
        other_features = self.get_other_features(ids["adresseerbaarobject_ids"])

        # Check if "gebruiksdoel" is "woonfunctie"; if not, skip this property
        if other_features.get("gebruiksdoel") != "woonfunctie":
            print(f"Skipping property at {self.address} as 'gebruiksdoel' is not 'woonfunctie'")
            return None  # Return an empty DataFrame to indicate skipping

        # WOZ values from 2014 to 2023
        woz_values, grondoppervlakte = self.get_woz_values(ids["nummeraanduiding_ids"])

        # When there aren't any woz values for the particular property, then skip
        if sum(1 for year in woz_values if woz_values.get(year) is None) == 10:   # Adjust for min number of woz_values
            print(f'{self.address}: does not have any woz values')
            return None

        else: 
            data = {
                "address": self.address,
                "buurt": self.buurt,
                "bouwjaar": other_features.get("bouwjaar"),
                "grondoppervlakte": grondoppervlakte,
                "oppervlakte": other_features.get("oppervlakte"),
                **{f"woz_{year}": woz_values.get(year) for year in range(2014, 2024)},
            }
            #print(data)

    
        return data

In [3]:
def initialize_csv(file_path: str):
    # Define the columns as per the specified labels
    columns = [
        "address", "buurt", "bouwjaar", "grondoppervlakte", "oppervlakte",
        "woz_2014", "woz_2015", "woz_2016", "woz_2017", "woz_2018",
        "woz_2019", "woz_2020", "woz_2021", "woz_2022", "woz_2023"
    ]

    # Create an empty DataFrame with these columns
    df = pd.DataFrame(columns=columns)

    # Save to CSV
    df.to_csv(file_path, index=False)
    print(f"Initialized CSV at: {file_path}")

file_path = '/home/wouter/Documents/Scriptie/csv/property_data.csv'
initialize_csv(file_path)

Initialized CSV at: /home/wouter/Documents/Scriptie/csv/property_data.csv


In [4]:
# Initialize CSV if not already initialized
file_path = '/home/wouter/Documents/Scriptie/csv/property_data.csv'
initialize_csv(file_path)

Initialized CSV at: /home/wouter/Documents/Scriptie/csv/property_data.csv


In [5]:
import time

def save_property_to_csv(dataframe):
    start_time = time.time()  # Use time.time() for measuring elapsed time
    dict_data = []
    processed_count = 0
    
    # Iterate through each row in the filtered DataFrame
    for _, row in dataframe.iterrows():
        processed_count += 1
        address = f"{row['straat']} {str(row['huisnummer']).zfill(5)}"
        
        # Create a Property instance for each row
        property_instance = Property(address, row['postcode'], row['buurt'], row['woonplaats'], row['huisnummer'], processed_count)
        property_data = property_instance.to_data()

        if property_data:  # Only add valid data
            dict_data.append(property_data)
            
    if dict_data:
        pd.DataFrame(dict_data).to_csv(file_path, mode='a', header=not os.path.exists(file_path), index=False)
    
    print(f"Time taken: {time.time() - start_time} seconds")  # Print elapsed time
    return processed_count

## Test with property class

In [6]:
import pandas as pd

file = open('utrecht_addresses.csv')
file = 'utrecht_addresses.csv'
utrecht_df = pd.read_csv(file)
display(utrecht_df.head())

Unnamed: 0,postcode,huisnummer,straat,buurt,wijk,woonplaats,gemeente,provincie,latitude,longitude
0,3511KZ,1,3e Buurkerksteeg,"Lange Elisabethstraat, Mariaplaats en omgeving",Wijk 06 Binnenstad,Utrecht,Utrecht,Utrecht,52.090449,5.118523
1,3511JH,1,Achter Clarenburg,"Lange Elisabethstraat, Mariaplaats en omgeving",Wijk 06 Binnenstad,Utrecht,Utrecht,Utrecht,52.090584,5.116154
2,3511MR,1,Albert Verweystraat,Hooch Boulandt,Wijk 06 Binnenstad,Utrecht,Utrecht,Utrecht,52.082053,5.120199
3,3511LM,1,Alendorpstraat,"Lange Elisabethstraat, Mariaplaats en omgeving",Wijk 06 Binnenstad,Utrecht,Utrecht,Utrecht,52.089463,5.118824
4,3511LM,1-BS,Alendorpstraat,"Lange Elisabethstraat, Mariaplaats en omgeving",Wijk 06 Binnenstad,Utrecht,Utrecht,Utrecht,52.089475,5.118816


In [7]:
#print(file_path)
    
# Example: Filter and save properties from the Utrecht DataFrame
#amsterdamsestraatweg_df = utrecht_df[utrecht_df['straat'] == 'IJzerdraadpad']

#hoograven_df = utrecht_df[(utrecht_df['buurt'] == 'Oud Hoograven-Noord') | (utrecht_df['buurt'] == 'Nieuw Hoograven-Noord') | (utrecht_df['buurt'] == 'Oud Hoograven-Zuid') | (utrecht_df['buurt'] == 'Nieuw Hoograven-Zuid')]

lunetten_df = utrecht_df[(utrecht_df['buurt'] == 'Lunetten-Noord') | (utrecht_df['buurt'] == 'Lunetten-Zuid')]
#lang_df = utrecht_df[(utrecht_df['buurt'] == 'Lange Elisabethstraat, Mariaplaats en omgeving') | (utrecht_df['buurt'] == 'Hooch Boulandt') | (utrecht_df['buurt'] == 'Springweg en omgeving Geertebuurt')]

print(len(lunetten_df))


# display(hoograven_df.head())
#display(rivierenwijk_df.head())
#sliced = amsterdamsestraatweg_df[60:77]


#print(len(hoograven_df) + len(lang_df))


#save_property_to_csv(amsterdamsestraatweg_df)

5700


In [38]:
save_property_to_csv(hoograven_df)

Address being processed: Aquamarijnlaan 00001 3523EK Utrecht, Property count: 1
Address being processed: Broeder Alarmstraat 00001 3523TV Utrecht, Property count: 2
Skipping property at Broeder Alarmstraat 00001 as 'gebruiksdoel' is not 'woonfunctie'
Address being processed: Burgemeester Smitsstraat 00001 3523KL Utrecht, Property count: 3
Skipping property at Burgemeester Smitsstraat 00001 as 'gebruiksdoel' is not 'woonfunctie'
Address being processed: Constant Erzeijstraat 00001 3523VS Utrecht, Property count: 4
Skipping property at Constant Erzeijstraat 00001 as 'gebruiksdoel' is not 'woonfunctie'
Address being processed: Diamantweg 00001 3523CM Utrecht, Property count: 5
Address being processed: Heemstedelaan 00001 3523KE Utrecht, Property count: 6
Address being processed: Huis ten Boschlaan 00001 3523HM Utrecht, Property count: 7
Skipping property at Huis ten Boschlaan 00001 as 'gebruiksdoel' is not 'woonfunctie'
Address being processed: Huis ten Boschlaan 001-1 3523HM Utrecht, Pro

5735

In [41]:
save_property_to_csv(lang_df)

Address being processed: 3e Buurkerksteeg 00001 3511KZ Utrecht, Property count: 1
Skipping property at 3e Buurkerksteeg 00001 as 'gebruiksdoel' is not 'woonfunctie'
Address being processed: Achter Clarenburg 00001 3511JH Utrecht, Property count: 2
Skipping property at Achter Clarenburg 00001 as 'gebruiksdoel' is not 'woonfunctie'
Address being processed: Albert Verweystraat 00001 3511MR Utrecht, Property count: 3
Skipping property at Albert Verweystraat 00001 as 'gebruiksdoel' is not 'woonfunctie'
Address being processed: Alendorpstraat 00001 3511LM Utrecht, Property count: 4
Address being processed: Alendorpstraat 01-BS 3511LM Utrecht, Property count: 5
Address being processed: Andreashof 00001 3511VZ Utrecht, Property count: 6
Address being processed: Arthur van Schendelstraat 00001 3511MA Utrecht, Property count: 7
Skipping property at Arthur van Schendelstraat 00001 as 'gebruiksdoel' is not 'woonfunctie'
Address being processed: Bakkerstraat 00001 3511JV Utrecht, Property count: 8


3946

In [14]:
print(len(rivierenwijk_df))

4613


In [58]:
riv_price = open('csv/rivierenwijkprices.csv')
riv_price_df = pd.read_csv(riv_price)
display(riv_price_df.head())

print(len(riv_price_df))

Unnamed: 0,address,bouwjaar,grondoppervlakte,oppervlakte,woz_2014,woz_2015,woz_2016,woz_2017,woz_2018,woz_2019,woz_2020,woz_2021,woz_2022,woz_2023
0,Balijelaan 00002,1919,0.0,40,143000.0,140000.0,153000.0,168000.0,167000.0,180000.0,193000.0,216000.0,261000.0,258000.0
1,Balijelaan 0002B,1919,0.0,34,125000.0,124000.0,165000.0,179000.0,168000.0,179000.0,193000.0,216000.0,257000.0,267000.0
2,Balijelaan 0002C,1919,0.0,49,169000.0,167000.0,220000.0,239000.0,234000.0,251000.0,269000.0,301000.0,337000.0,331000.0
3,Balijelaan 0002D,1919,0.0,49,169000.0,167000.0,225000.0,244000.0,234000.0,251000.0,270000.0,302000.0,364000.0,394000.0
4,Balijelaan 0002E,1919,0.0,34,122000.0,121000.0,160000.0,160000.0,163000.0,174000.0,187000.0,209000.0,254000.0,263000.0


3984


In [57]:
hoog_price = open('csv/hoogravenprices.csv')
hoog_price_df = pd.read_csv(hoog_price)
display(hoog_price_df.head())

print(len(hoog_price_df))

Unnamed: 0,address,bouwjaar,grondoppervlakte,oppervlakte,woz_2014,woz_2015,woz_2016,woz_2017,woz_2018,woz_2019,woz_2020,woz_2021,woz_2022,woz_2023
0,Aquamarijnlaan 00001,1926,79.0,96,219000.0,219000.0,290000.0,290000.0,323000.0,335000.0,374000.0,399000.0,476000.0,465000.0
1,Diamantweg 00001,2012,0.0,91,207000.0,205000.0,248000.0,299000.0,312000.0,375000.0,393000.0,418000.0,448000.0,497000.0
2,Heemstedelaan 00001,2008,53.0,127,264000.0,265000.0,282000.0,339000.0,379000.0,384000.0,414000.0,496000.0,569000.0,530000.0
3,Noordeindestraat 00001,1936,131.0,101,263000.0,264000.0,361000.0,367000.0,409000.0,426000.0,481000.0,494000.0,607000.0,709000.0
4,Oranje-Nassaulaan 00001,1936,104.0,133,301000.0,302000.0,326000.0,385000.0,428000.0,450000.0,498000.0,528000.0,662000.0,751000.0


3823


In [21]:
import pandas as pd

# Define the file path
file_path = 'datacbs/kwb-2024.xls'

# Read the Excel file
df_24 = pd.read_excel(file_path)

# Print the first few rows to verify the data
print(df_24[df_24['gm_naam'] == 'Utrecht'])


     gwb_code_10 gwb_code_8                      regio  gm_naam      recs  \
4782      GM0344       0344                    Utrecht  Utrecht  Gemeente   
4783    WK034401     034401               Wijk 01 West  Utrecht      Wijk   
4784  BU03440111   03440111     Welgelegen, Den Hommel  Utrecht     Buurt   
4785  BU03440112   03440112                  Oog in Al  Utrecht     Buurt   
4786  BU03440113   03440113            Halve Maan-Zuid  Utrecht     Buurt   
...          ...        ...                        ...      ...       ...   
4899  BU03441024   03441024                 Veldhuizen  Utrecht     Buurt   
4900  BU03441031   03441031             De Meern-Noord  Utrecht     Buurt   
4901  BU03441032   03441032              De Meern-Zuid  Utrecht     Buurt   
4902  BU03441033   03441033  Bedrijvengebied Oudenrijn  Utrecht     Buurt   
4903  BU03441041   03441041                 Rijnenburg  Utrecht     Buurt   

        gwb_code ind_wbi   a_inw   a_man  a_vrouw  ...  g_afs_kv  g_afs_sc 

In [26]:
lijst_buurt = df_24[(df_24['gm_naam'] == 'Utrecht') & (df_24['recs'] == 'Buurt')].drop_duplicates()

lijst_buurt_24 = lijst_buurt['regio'].unique()
u_buurt = utrecht_df['buurt'].unique()

print(len(u_buurt))

print(len(lijst_buurt_24))
print(lijst_buurt_24)



101
111
['Welgelegen, Den Hommel' 'Oog in Al' 'Halve Maan-Zuid' 'Halve Maan-Noord'
 'Lombok-Oost' 'Leidseweg en omgeving' 'Lombok-West'
 'Laan van Nieuw-Guinea, Spinozaweg e.o.'
 'Nieuw Engeland, Th. a. Kempisplantsoen en omgeving'
 'Schepenbuurt, Cartesiusweg e.o.' 'Bedrijventerrein Lageweide'
 'Pijlsweerd-Zuid' 'Pijlsweerd-Noord' 'Nijenoord, Hoogstraat en omgeving'
 'Ondiep' '2e Daalsebuurt en omgeving'
 'Egelantierstraat, Mariëndaalstraat e.o.' 'Julianapark en omgeving'
 'Elinkwijk en omgeving' 'Prins Bernhardplein en omgeving' 'Geuzenwijk'
 'Schaakbuurt en omgeving' 'Queeckhovenplein en omgeving' 'Zuilen-Noord'
 'Taag- en Rubicondreef en omgeving' 'Wolga- en Donaudreef en omgeving'
 'Zamenhofdreef en omgeving' 'Neckardreef en omgeving' 'Vechtzoom-zuid'
 'Vechtzoom-noord, Klopvaart' 'Bedrijventerrein en omgeving'
 'Zambesidreef en omgeving' 'Tigrisdreef en omgeving'
 'Poldergebied Overvecht' 'Vogelenbuurt' 'Lauwerecht' 'Staatsliedenbuurt'
 'Tuinwijk-West' 'Tuinwijk-Oost' 'Tuindorp e