In [2]:
import glob
import json
import os
import random
import shutil
import time
import urllib
import urllib.request
from zipfile import ZipFile

import numpy as np
import pandas as pd
import requests
# from tqdm.auto import tqdm, trange
from tqdm import tqdm, trange
# tqdm.pandas()

from credentials import bosa_mapping_url


In [3]:
def download_if_nexist(url, filename):
    """
    If the (local) file <filename> does not exist, download it from <url>.

    The payload is streamed to "<filename>.part" and only renamed to
    <filename> once the transfer completed, so an interrupted download never
    leaves a truncated file that would be mistaken for a complete one on the
    next run.

    Parameters
    ----------
    url: str
       url to fetch
    filename: str
       local file to save

    Returns
    -------

    None

    Raises
    ------
    urllib.error.URLError (an OSError)
       if <url> cannot be fetched; <filename> is not created in that case
    """
    if os.path.isfile(filename):
        return

    partial_filename = f"{filename}.part"
    try:
        with urllib.request.urlopen(url) as response, open(partial_filename, "wb") as f:
            # Copy chunk by chunk instead of loading the whole payload in memory
            shutil.copyfileobj(response, f)
        # Atomic on the same filesystem: <filename> appears only when complete
        os.replace(partial_filename, filename)
    finally:
        # Still present only if the download or the rename failed
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

In [4]:
# Root folder for every file read or written by this notebook;
# its "full" subfolder receives the complete (unsampled) source files.
datadir = "data/geocoding/"
for folder in (datadir, f"{datadir}/full"):
    os.makedirs(folder, exist_ok=True)


In [5]:
# Number of addresses drawn for each dataset sample (also used in the output file names)
sample_size=1000

# Seed passed to every random draw below so that samples are reproducible
seed=314

In [6]:
# All datasets this notebook knows how to sample
datasets = ["kbo", "rep", "best", "rrn", "resto"]
# Override: only the datasets listed here are processed by the cells below.
# NOTE: this second assignment replaces the full list above.
datasets = ["rrn"]

# Best

In [None]:
if "best" in datasets:
    # Local targets of the three regional archives
    best_vlg_fn = f"{datadir}/full/openaddress-bevlg.zip"
    best_wal_fn = f"{datadir}/full/openaddress-bewal.zip"
    best_bru_fn = f"{datadir}/full/openaddress-bebru.zip"

    # Each archive is fetched only when its local copy is missing
    for best_url, best_fn in [
        ("https://opendata.bosa.be/download/best/openaddress-bevlg.zip", best_vlg_fn),
        ("https://opendata.bosa.be/download/best/openaddress-bewal.zip", best_wal_fn),
        ("https://opendata.bosa.be/download/best/openaddress-bebru.zip", best_bru_fn),
    ]:
        download_if_nexist(best_url, best_fn)

In [None]:
if "best" in datasets:
    # Only the address components needed for sampling are loaded, all as strings
    best_columns = ["municipality_name_de", "municipality_name_fr", "municipality_name_nl",
                    "streetname_de", "streetname_fr", "streetname_nl",
                    "postcode", "house_number", "region_code"]

    best_frames = []
    for best_zip in [best_vlg_fn, best_wal_fn, best_bru_fn]:
        best_frames.append(pd.read_csv(best_zip, usecols=best_columns, dtype=str))

    # One table with the three regions stacked
    best_full = pd.concat(best_frames)

In [None]:
if "best" in datasets:
    in_vlg = best_full.region_code == "BE-VLG"
    in_wal = best_full.region_code == "BE-WAL"
    in_bru = best_full.region_code == "BE-BRU"

    # Same rule for street and municipality names: pick the language by region,
    # falling back on the other languages when the preferred one is missing.
    #   BE-VLG: nl, fr, de   /   BE-WAL: fr, de, nl   /   BE-BRU: fr, nl, de
    # Rows with any other region code get None.
    for target, prefix in [("street", "streetname"), ("city", "municipality_name")]:
        name_nl = best_full[f"{prefix}_nl"]
        name_fr = best_full[f"{prefix}_fr"]
        name_de = best_full[f"{prefix}_de"]

        best_full[target] = np.where(in_vlg, name_nl.fillna(name_fr).fillna(name_de),
                            np.where(in_wal, name_fr.fillna(name_de).fillna(name_nl),
                            np.where(in_bru, name_fr.fillna(name_nl).fillna(name_de), None)))

In [None]:
if "best" in datasets:
    # Distinct addresses only, then a reproducible random draw of sample_size rows
    best_sample = (
        best_full[["street", "house_number", "postcode", "city"]]
        .rename(columns={"house_number": "housenumber"})
        .drop_duplicates()
        .sample(sample_size, random_state=seed)
        .reset_index(drop=True)
    )
    best_sample.to_csv(f"{datadir}/best_{sample_size}.csv.gz", index=False)

# KBO


In [None]:
# Local path of the KBO open-data archive; it is opened by both the "kbo" and "rep" sections.
kbo_fn = f"{datadir}/full/kbo_full.zip"
# The archive is not fetched by this notebook: it has to
# be downloaded from https://kbopub.economie.fgov.be/kbo-open-data/ and saved at the path above.

# Automatic download, left disabled:
#download_if_nexist("https://kbopub.economie.fgov.be/kbo-open-data/affiliation/xml/files/KboOpenData_0111_2023_05_Full.zip", kbo_fn)


In [None]:
if "kbo" in datasets:
    kbo_address_columns = ["CountryFR", "Zipcode",
                           "MunicipalityNL", "MunicipalityFR",
                           "StreetNL", "StreetFR", "HouseNumber"]

    # address.csv is read straight from the archive, every column as string
    with ZipFile(kbo_fn) as kbo_zip, kbo_zip.open("address.csv") as address_csv:
        kbo_full = pd.read_csv(address_csv, usecols=kbo_address_columns, dtype=str)
    

In [None]:
if "kbo" in datasets:
    # Keep only the rows without a country (presumably the Belgian addresses - TODO confirm)
    has_no_country = kbo_full.CountryFR.isna()
    kbo_full = kbo_full.loc[has_no_country].copy()

    # One row per distinct (zipcode, street, house number), then a reproducible draw
    kbo_unique = kbo_full.drop_duplicates(subset=["Zipcode","StreetNL", "StreetFR", "HouseNumber"])
    kbo_sample = kbo_unique.sample(sample_size, random_state=seed)

In [None]:
# kbo_sample

In [None]:
if "kbo" in datasets:
    kbo_zip_digit = kbo_sample.Zipcode.str[0]
    kbo_zip_prefix = kbo_sample.Zipcode.str[0:2]

    # Language guessed from the zipcode; the first matching rule wins:
    #   starts with 2, 3, 8 or 9 -> "VL"
    #   starts with 4, 5, 6 or 7 -> "FR"
    #   first two characters in "10".."14" -> "FR"
    #   anything else -> "VL"
    kbo_sample["lg"] = np.select(
        [
            kbo_zip_digit.isin(["2", "3", "8", "9"]),
            kbo_zip_digit.isin(["4", "5", "6", "7"]),
            kbo_zip_prefix.between("10", "14"),
        ],
        ["VL", "FR", "FR"],
        default="VL",
    )
    
    

In [None]:
if "kbo" in datasets:
    kbo_prefers_fr = kbo_sample.lg == "FR"

    # Take the name in the guessed language, falling back on the other language when missing
    for target, fr_column, nl_column in [("street", "StreetFR", "StreetNL"),
                                         ("city", "MunicipalityFR", "MunicipalityNL")]:
        fr_values = kbo_sample[fr_column]
        nl_values = kbo_sample[nl_column]
        kbo_sample[target] = np.where(kbo_prefers_fr,
                                      fr_values.fillna(nl_values),
                                      nl_values.fillna(fr_values))


In [None]:
if "kbo" in datasets:
    # Align column names with the other samples and keep only the four address fields
    kbo_renamed = kbo_sample.rename(columns={"Zipcode": "postcode", "HouseNumber": "housenumber"})
    kbo_sample = kbo_renamed[["street", "housenumber", "postcode", "city"]]

    kbo_sample.to_csv(f"{datadir}/kbo_{sample_size}.csv.gz", index=False)

In [None]:
# kbo_sample

# RRN

In [None]:
if "rrn" in datasets:
    # Local copy of the RRN archive; fetched (once) from the base URL imported from credentials
    best_RN_mapping_fn = f"{datadir}/full/3_RRN_2023Q1.zip"
    download_if_nexist(f"{bosa_mapping_url}/3_RRN_2023Q1.zip", best_RN_mapping_fn)

In [None]:
if "rrn" in datasets:
    # Collect every line of the three result files; each line is parsed as JSON below
    recs = []
    with ZipFile(best_RN_mapping_fn, 'r') as zipObj:
        for f in ["STEP131_RR_B_Result.txt", "STEP131_RR_F_Result.txt", "STEP131_RR_W_Result.txt"]:
            print(f)
            # Open the member in a `with` block so its handle is closed once read
            with zipObj.open(f) as member:
                for row in tqdm(member):
                    recs.append(row)

    print(f"Got {len(recs)} records, take a sample")
    # Draw 20% more lines than needed: duplicates are removed before the final sample
    random.seed(seed)
    recs = random.sample(recs, int(sample_size*1.2))

    rrn_sample = pd.DataFrame([json.loads(r) for r in recs])

In [None]:
if "rrn" in datasets:
    # Remove duplicated addresses from the oversampled records, then keep exactly sample_size rows
    rrn_sample = (
        rrn_sample
        .drop_duplicates(subset=["Snl", "Sfr", "Sde", "P", "hs"])
        .sample(sample_size, random_state=seed)
    )

In [None]:
if "rrn" in datasets:
    url_all_cities = "https://services.socialsecurity.be/REST/referenceData/geography/v1/countries/150/cities?pageSize=0"
    refdata_cities_fn = f"{datadir}/full/referencedata_cities.json"

    download_if_nexist(url_all_cities, refdata_cities_fn)

    with open(refdata_cities_fn, encoding='utf-8') as f:
        refdata_cities = pd.DataFrame(json.load(f)["items"])

    # cityName holds one name per language code; pick it according to officialLanguage:
    # "N" -> nl, "DF" -> de, anything else -> fr
    city_names = refdata_cities["cityName"]
    city_name_nl = city_names.apply(lambda names: names["nl"])
    city_name_de = city_names.apply(lambda names: names["de"])
    city_name_fr = city_names.apply(lambda names: names["fr"])

    refdata_cities["city"] = np.where(refdata_cities.officialLanguage == "N", city_name_nl,
                             np.where(refdata_cities.officialLanguage == "DF", city_name_de,
                                      city_name_fr))

    # Keep the (NIS code, name) lookup only, with the code as a string
    # (cast through nullable Int64 first)
    refdata_cities = refdata_cities[["cityNisCode", "city"]].dropna().drop_duplicates()
    refdata_cities["cityNisCode"] = refdata_cities.cityNisCode.astype(pd.Int64Dtype()).astype(str)


In [None]:
# rrn_sample[rrn_sample.street ==""].replace("", pd.NA).Sfr.fillna(rrn_sample.Sde)

In [None]:
if "rrn" in datasets:
    # Attach the city name: the lookup's NIS code column is renamed to rrn_sample's "idM_SRC"
    # and the merge uses the columns both frames share.
    # NOTE(review): the default (inner) merge drops records without a matching city - confirm this is intended
    rrn_sample = rrn_sample.merge(refdata_cities.rename(columns={"cityNisCode":"idM_SRC" }))

    # Empty strings stand for a missing street name. Convert them to NA in ALL three
    # language columns, otherwise an empty value in the middle of the chain is taken
    # as a valid name and the later fallbacks are never reached.
    street_names = rrn_sample[["Snl", "Sfr", "Sde"]].replace("", pd.NA)

    # R == "F": Dutch name first (presumably Flanders - TODO confirm); otherwise French first
    rrn_sample["street"] = np.where(rrn_sample.R == "F",
                                    street_names.Snl.fillna(street_names.Sfr).fillna(street_names.Sde),
                                    street_names.Sfr.fillna(street_names.Sde).fillna(street_names.Snl))

    rrn_sample = rrn_sample.rename(columns={
                        "hs":      "housenumber",
                        "POri":    "postcode", 
                        })


In [None]:
if "rrn" in datasets:
    # Write the four address fields only, as a gzip-compressed CSV named after the sample size
    rrn_sample[["street", "housenumber", "postcode", "city"]].to_csv(f"{datadir}/rrn_{sample_size}.csv.gz", index=False)

# Repertoire

In [None]:
# Get a sample of enterprise number from KBO open data

# 
# datasets=["rep"]

In [None]:
if "rep" in datasets:
    cbe_columns = ["EnterpriseNumber", "TypeOfEnterprise", "JuridicalForm"]

    # enterprise.csv is read straight from the KBO archive, every column as string
    with ZipFile(kbo_fn) as kbo_zip, kbo_zip.open("enterprise.csv") as enterprise_csv:
        cbe_list_full = pd.read_csv(enterprise_csv, usecols=cbe_columns, dtype=str)

In [None]:
# sample_size=100

In [None]:
if "rep" in datasets:
    # Candidate CBE/KBO numbers: enterprise type "2" with juridical form "015" or "014".
    # Five times sample_size candidates are drawn since not every lookup yields a new address.
    is_type_2 = cbe_list_full.TypeOfEnterprise == "2"
    has_selected_form = cbe_list_full.JuridicalForm.isin(["015", "014"])

    cbe_list_presample = cbe_list_full[is_type_2 & has_selected_form].sample(5*sample_size, random_state=seed)
    

In [None]:
import time
def call_repertorium(cbe_number, timeout=30):
    """
    Query the employer search web service for one enterprise number.

    Parameters
    ----------
    cbe_number: str or int
       enterprise number; dots are removed before it is sent
    timeout: float
       seconds to wait for the server before giving up, so that one stalled
       request cannot block the whole sampling loop

    Returns
    -------

    The decoded JSON body of the response

    Raises
    ------
    requests.exceptions.RequestException
       on network failure or timeout
    ValueError
       if the response body is not valid JSON
    """
    url = "https://services.socialsecurity.be/REST/employer/identification/v6/employers/search"

    r = requests.get(url,
                     params={"enterpriseNumber": str(cbe_number).replace(".", "")},
                     timeout=timeout)

    # The HTTP status is deliberately not checked: callers inspect the decoded body instead
    return r.json()

def get_repertorium_address(cbe_number):
    """
    Return the "address" entry found under "identity" in the repertorium
    answer for <cbe_number>, or None when either level is absent.

    Sleeps half a second after each call to the web service.
    """
    answer = call_repertorium(cbe_number)

    # Pause between successive calls
    time.sleep(0.5)

    if "identity" not in answer:
        return None

    identity = answer["identity"]
    if "address" not in identity:
        return None

    return identity["address"]
# get_repertorium_address(864279008)

In [None]:
# Call the repertorium web service for each pre-sampled CBE number, until sample_size
# distinct addresses have been collected (or the pre-sample is exhausted)
if "rep" in datasets:
    address_keys = ['streetName','houseNumber', 'postCode', 'municipalityName']
    rec_with_addr = []

    with tqdm(total=sample_size) as pbar:
        for enterprise_number in cbe_list_presample.reset_index(drop=True).EnterpriseNumber:
            addr = get_repertorium_address(enterprise_number)
            if addr is not None:
                # Keep the four address fields only, so that duplicates compare equal
                addr = {key: addr[key] for key in address_keys}
                if addr in rec_with_addr:
                    print("skipping duplicated address")
                else:
                    rec_with_addr.append(addr)
                    pbar.update()
            if len(rec_with_addr) >= sample_size:
                break

    rep_sample = pd.DataFrame(rec_with_addr)

In [None]:
# rep_sample = pd.DataFrame(rec_with_addr)
# rep_sample only exists when the "rep" dataset is selected: guard the display so that
# "Restart & Run All" does not stop here with a NameError for other selections.
rep_sample if "rep" in datasets else None

In [None]:
def get_lg_pref(item, lg_order):
    """
    Return item[lg] for the first language code of <lg_order> that is a key
    of <item>, or None when none of them is present.

    Parameters
    ----------
    item: dict
       values keyed by language code
    lg_order: list of str
       language codes, by decreasing preference
    """
    return next((item[lg] for lg in lg_order if lg in item), None)

if "rep" in datasets:
    rep_sample = rep_sample.rename(columns={"postCode": "postcode", "houseNumber": "housenumber"})

    # Language guessed from the postcode, first matching rule wins:
    # 2/3/8/9 -> "VL", 4/5/6/7 -> "FR", "10".."14" -> "FR", anything else -> "VL"
    rep_zip_digit = rep_sample.postcode.str[0]
    rep_zip_prefix = rep_sample.postcode.str[0:2]
    rep_sample["lg"] = np.where(rep_zip_digit.isin(["2", "3", "8", "9"]), "VL",
                       np.where(rep_zip_digit.isin(["4", "5", "6", "7"]), "FR",
                       np.where(rep_zip_prefix.between("10", "14"), "FR", "VL")))

    # streetName / municipalityName hold one value per language code:
    # take the guessed language first, then the other languages
    fr_first = ["fr", "nl", "de"]
    nl_first = ["nl", "fr", "de"]
    rep_prefers_fr = rep_sample.lg == "FR"

    for target, source in [("street", "streetName"), ("city", "municipalityName")]:
        names_fr_first = rep_sample[source].apply(lambda names: get_lg_pref(names, fr_first))
        names_nl_first = rep_sample[source].apply(lambda names: get_lg_pref(names, nl_first))
        rep_sample[target] = np.where(rep_prefers_fr, names_fr_first, names_nl_first)



In [None]:
if "rep" in datasets:
    # Write the four address fields only, as a gzip-compressed CSV named after the sample size
    rep_sample[["street", "housenumber", "postcode", "city"]].to_csv(f"{datadir}/rep_{sample_size}.csv.gz", index=False)

# Resto

In [None]:
if "resto" in datasets:
    resto_datadir = f"{datadir}/full/resto"

    # Stack every compressed CSV found in the resto folder
    resto_files = glob.glob(f"{resto_datadir}/*.csv.gz")
    resto_full = pd.concat([pd.read_csv(resto_file) for resto_file in resto_files])

In [None]:
if "resto" in datasets:
    # Duplicates are detected on every column except "name", before the reproducible draw
    resto_unique = resto_full.drop("name", axis=1).drop_duplicates()
    resto_drawn = resto_unique.sample(sample_size, random_state=seed)
    resto_sample = resto_drawn[["street", "housenumber", "postcode", "city"]]

    resto_sample.to_csv(f"{datadir}/resto_{sample_size}.csv.gz", index=False)

In [1]:
# resto_sample

# NGI 

In [17]:
import geopandas as gpd

In [38]:
# Load ign_geocoder.csv (semicolon-separated, latin1-encoded) from the data folder.
# NOTE(review): presumably the saved output of the NGI/IGN geocoder - confirm where this file comes from
ngi_full = pd.read_csv(f"{datadir}/ign_geocoder.csv", sep=';', encoding='latin1')

In [39]:
#.input_request.nunique()
# Keep one row per (request, street match, geometry) combination.
# "ouput" (sic) is the spelling used in the CSV header and must be kept as is.
ngi_full = ngi_full.drop_duplicates(subset=["input_request", "ouput_street(ref,cont_id,street,score)", "geometry"])

In [67]:
import shapely
# Sanity check of the reprojection on a single point: build it in EPSG:3812
# and convert it to EPSG:4326 to see the resulting longitude/latitude.
p = shapely.geometry.Point( 647305.1605065995, 670243.7006171012, 0)
gpd.GeoSeries([p]).set_crs("epsg:3812").to_crs("epsg:4326")

0    POINT Z (4.33050 50.84260 0.00000)
dtype: geometry

In [68]:
# Turn the table into a GeoDataFrame with a parsed, reprojected geometry column
ngi_full = gpd.GeoDataFrame(ngi_full)
# "geometry" holds WKT text; missing values are replaced by "" before parsing,
# and the coordinates are declared to be in EPSG:3812
ngi_full["geom"] = gpd.GeoSeries.from_wkt(ngi_full.geometry.fillna(""), crs="epsg:3812")
# Use the parsed column (not the raw text one) as the active geometry
ngi_full = ngi_full.set_geometry("geom")
# Reproject to EPSG:4326
ngi_full = ngi_full.to_crs("epsg:4326")

In [69]:
# Show the rows whose street match contains "BXL" (missing matches are treated as empty text)
ngi_full[ngi_full["ouput_street(ref,cont_id,street,score)"].fillna("").str.contains("BXL")]#.plot()

Unnamed: 0,input_request,"ouput_city(ref,cont_id,city,score)","ouput_street(ref,cont_id,street,score)",search_time(sec),geometry,geom
1343,rue Ropsy Chaudron 7 1070 Anderlecht,,"[('BXLSTR---1370---2', 1, 'RUEROPSYCHAUDRON', ...",0.028455,POINT Z (647305.1605065995 670243.7006171012 0...,POINT Z (4.33050 50.84260 0.00000)
1348,rue Eloy 114 1070 Anderlecht,,"[('BXLSTR---2152---2', 1, 'RUEELOY', 1.0)]",0.040785,POINT Z (646894.220324351 669500.8136294986 0),POINT Z (4.32467 50.83592 0.00000)
1352,rue Georges Moreau 107 1070 Anderlecht,,"[('BXLSTR---4578---2', 1, 'RUEGEORGESMOREAU', ...",0.030206,POINT Z (646857.098786811 669348.3704329199 0),POINT Z (4.32414 50.83454 0.00000)
1354,rue de Douvres 80 1070 Anderlecht,,"[('BXLSTR---265---2', 1, 'RUEDEDOUVRES', 1.0)]",0.055286,POINT Z (646070.3675027142 669663.9241506008 0...,POINT Z (4.31297 50.83738 0.00000)
1360,rue Van Soust 78 1070 Anderlecht,,"[('BXLSTR---3772---2', 1, 'RUEVANSOUST', 1.0)]",0.023925,POINT Z (645894.6817438435 670731.1856306997 0),POINT Z (4.31046 50.84697 0.00000)
...,...,...,...,...,...,...
2057,rue Jef Devos - Jef Devostraat 55 1190 Forest,,"[('BXLSTR---2582---2', 1, 'RUEJEFDEVOS', 0.55)]",0.051182,POINT Z (647190.9663381542 668012.0429690753 0...,POINT Z (4.32889 50.82253 0.00000)
2060,avenue Reine Marie Henriette - Koninging Maria...,,"[('BXLSTR---4793---2', 1, 'AVENUEREINEMARIEHEN...",0.018543,POINT Z (647529.4125021889 667929.1129648751 0...,POINT Z (4.33369 50.82179 0.00000)
2061,avenue Reine Marie Henriette - Koninging Maria...,,"[('BXLSTR---4793---2', 1, 'AVENUEREINEMARIEHEN...",0.019352,POINT Z (647529.4125021889 667929.1129648751 0...,POINT Z (4.33369 50.82179 0.00000)
2062,rue de Fierlant - de Fierlantstraat 35 1190 Fo...,,"[('BXLSTR---3966---2', 1, 'DEFIERLANTSTRAAT', ...",0.021289,POINT Z (647345.5062939598 668505.537412102 0)...,POINT Z (4.33108 50.82697 0.00000)


In [42]:
# Raw WKT text of the row labelled 8 (in the recorded output the same POINT appears three times)
ngi_full.loc[8].geometry

'POINT Z (652875.8414014473 712078.2757578157 0) POINT Z (652875.8414014473 712078.2757578157 0) POINT Z (652875.8414014473 712078.2757578157 0) '

In [44]:
# Parsed geometry of the same row (the recorded output shows a single POINT)
print(ngi_full.loc[8].geom)

POINT Z (652875.8414014473 712078.2757578157 0)


In [35]:
# Rows that have a city match (the recorded output contains no row)
ngi_full[ngi_full["ouput_city(ref,cont_id,city,score)"].notnull()]

Unnamed: 0,input_request,"ouput_city(ref,cont_id,city,score)","ouput_street(ref,cont_id,street,score)",search_time(sec),geometry


In [28]:
# Street match of the first row, to see how the value is formatted (a list of tuples stored as text)
ngi_full["ouput_street(ref,cont_id,street,score)"].iloc[0]

"[('VLDSTR---287---2015-05-28T18:40:37.400', 1, 'BEGIJNENVEST', 1.0)]"

In [71]:
from geopy.location import Location

In [None]:
# Argument layout used to build the Location objects in the next cell:
#     Location(placename, (latitude, longitude), feature)
# NOTE(review): confirm the parameter names against the geopy documentation

In [81]:
# Build one geopy Location per row: the request text, a (y, x) pair taken from the
# reprojected geometry, and the full row as third argument.
# Rows whose geom is falsy get the placeholder coordinates (0, 0).
ngi_full["location"] = ngi_full.apply(lambda row: Location(row.input_request, (row.geom.y, row.geom.x) if row.geom else (0,0), row), axis=1)

In [83]:
# First row, to check the columns added above (geom, location)
ngi_full.iloc[0]

input_request                                                Begijnenvest 35 2000 Antwerpen
ouput_city(ref,cont_id,city,score)                                                      NaN
ouput_street(ref,cont_id,street,score)    [('VLDSTR---287---2015-05-28T18:40:37.400', 1,...
search_time(sec)                                                                   0.060004
geometry                                   POINT Z (652411.6826476178 711175.3977379295 0) 
geom                                         POINT Z (4.403343987223315 51.2105286322297 0)
location                                  (Begijnenvest 35 2000 Antwerpen, (51.210528632...
Name: 0, dtype: object