In [1]:
import pandas as pd
import urllib

import numpy as np

import json

from tqdm.autonotebook import tqdm

#%matplotlib inline

tqdm.pandas()

import dask.dataframe as dd

from dask.multiprocessing import get
from dask.diagnostics import ProgressBar

from datetime import datetime
import matplotlib.pyplot as plt

from IPython.display import display


  from tqdm.autonotebook import tqdm


In [2]:
import urllib3

In [3]:
http = urllib3.PoolManager()

# Functions

In [4]:
ws_hostname = "127.0.1.1"
ws_hostname = "172.27.0.64"


street_field  = "street"
housenbr_field = "housenumber"
postcode_field = "postcode"
city_field  =    "city"
country_field =  "country"
addr_key_field = "addrKey"

# ws_hostname = "192.168.1.3"

In [5]:

sample_size = None# 1000
def get_addresses(addresses_filename):
    addresses = pd.read_csv(addresses_filename,  
                            usecols = lambda x: x in [addr_key_field, "addr_key",
                                       country_field, 
                                       postcode_field, 
                                       city_field, 
                                       street_field, 
                                       housenbr_field,
                                       country_field], 
                           dtype={postcode_field: str, housenbr_field: str, country_field: str})
    
#     display(addresses)
    addresses = addresses.rename(columns={"addr_key": addr_key_field})
    if country_field in addresses: 
        addresses[country_field] =addresses[country_field].fillna("Belgique")
    else: 
        addresses[country_field] = "Belgique"
    #addresses = addresses.rename(columns={"index":addr_key_field})
    
    addresses = addresses[addresses[street_field].notnull() & addresses[city_field].notnull()] # & addresses[country_field].isnull() ]
    
    addresses[postcode_field] = addresses[postcode_field].astype(str)
    
    if sample_size: 
        addresses = addresses.sample(sample_size, random_state=0)
    
    return addresses#.drop(country_field, axis=1)

In [6]:
def call_ws(addr_data, check_result=True, structured_osm=False): #lg = "en,fr,nl"
    t = datetime.now()
    
    try: 
        r = http.request(
        'POST',
        f'http://{ws_hostname}:5000//REST/nominatimWrapper/v0.1/search',
        fields= { 
             "street": addr_data[street_field],
             "housenumber": addr_data[housenbr_field],
             "city": addr_data[city_field],
             "postcode": addr_data[postcode_field],
             "country": addr_data[country_field],
             "check_result" : "yes" if check_result else "no",
             "struct_osm" : "yes" if structured_osm else "no"

        })
    except Exception as e:
        print("Exception !")
        print(addr_data)
        print(e)
        raise e
        
    if r.status == 204:
        print("No result!")
        print(r.data)
        return
    elif r.status == 200:
        try:
            res = json.loads(r.data.decode('utf-8'))
            res["time"] = (datetime.now() - t).total_seconds()
        except ValueError as ve:

            print("Cannot decode result:")
            print(ve)
            print(r.data.decode('utf-8'))
            return r.data
        return res
    else: 
        print(f"Unknown return code: {r.status} ")
        print(r.data)



In [7]:
def call_ws_batch(addr_data, mode="geo", with_rejected=False, check_result=True, structured_osm=False): #lg = "en,fr,nl"
    file_data = addr_data.rename(columns = {
        street_field : "street",
        housenbr_field: "housenumber",
        postcode_field: "postcode",
        city_field: "city",
        country_field: "country",
        addr_key_field : "addr_key"
    }).to_csv(index=False)
    
    r = http.request(
    'POST',
    f'http://{ws_hostname}:5000/REST/nominatimWrapper/v0.1/batch',
    fields= { 
        'media': ('addresses.csv', file_data),
        'mode': mode,
        "withRejected": "yes" if with_rejected else "no",
        "checkResult" : "yes" if check_result else "no",
        "structOsm"   : "yes" if structured_osm else "no",
        #"extra_house_nbr": "no"
    })
    
    try:
        res = pd.DataFrame(json.loads(r.data.decode('utf-8')))
    except ValueError as ve:
        
        print("Cannot decode result:")
        print(ve)
        print(r.data.decode('utf-8'))
     
        return r.data
#     display(res)
    return res

In [8]:
def expand_json(addresses):
    addresses["status"]= addresses.json.apply(lambda d:  "NONE" if d is None else "error" if "error" in d else "match" if "match" in d else "no_result")
    addresses["time"]  = addresses.json.apply(lambda d: "NONE" if d is None else d["time"])

    addresses["timing"]  = addresses.json.apply(lambda d: "NONE" if d is None else d["timing"] if "timing" in d else {})
    
    for f in ["method", "placeRank"]:
        addresses[f]= addresses.json.apply(lambda d: d["match"][0][f] if d is not None and len(d)>0 and "match" in d else "none")
    
    for field in ["Street", "Number", "Postcode", "City"]:
        addresses["addrOut"+field]= addresses.json.apply(lambda d: d["match"][0]["addrOut"+field] if d is not None and len(d)>0 and "match" in d else "")
    return 

# Calls

## Single address calls

In [9]:
res=call_ws({street_field:   "Avenue Fonsny", \
         housenbr_field: "20",\
         city_field:     "Saint-Gilles",\
         postcode_field: "1060",\
         country_field:  "Belgique"}, check_result=False, structured_osm=False)
res

{'match': [{'method': 'fast',
   'displayName': '20, Avenue Fonsny - Fonsnylaan, Saint-Gilles - Sint-Gillis, Brussel-Hoofdstad - Bruxelles-Capitale, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, 1060, België / Belgique / Belgien',
   'placeId': 182128,
   'lat': '50.8358677',
   'lon': '4.3385087',
   'placeRank': 30,
   'addrOutStreet': 'Avenue Fonsny - Fonsnylaan',
   'addrOutCity': 'Saint-Gilles - Sint-Gillis',
   'addrOutNumber': '20',
   'addrOutCountry': 'België / Belgique / Belgien',
   'addrOutPostcode': '1060',
   'inHouseNbr': '20',
   'lpostHouseNbr': '20',
   'lpostUnit': '',
   'osmAddrIn': 'Avenue Fonsny, 20, 1060 Saint-Gilles, Belgique'}],
 'time': 0.032622}

## Batch calls (row by row)

In [10]:
addresses = pd.concat([
    #get_addresses("../GISAnalytics/data/geocoding/resto_1000_sample.csv.gz"),
    #get_addresses("../GISAnalytics/data/geocoding/best_1000_sample.csv.gz"),
    get_addresses("address.csv.gz")])
addresses = addresses.reset_index(drop=True)
addresses["addrKey"] = addresses.index.astype(str)
addresses

Unnamed: 0,addrKey,postcode,city,street,housenumber,country
0,0,2240,Zandhoven,Kriekenlaan,22,Belgique
1,1,8600,Diksmuide,Cardijnlaan(D),*,Belgique
2,2,9860,Oosterzele,Poststraat,19,Belgique
3,3,8300,Knokke-Heist,Sylvain Dupuisstraat,26,Belgique
4,4,3630,Maasmechelen,Oude Baan(M),338,Belgique
...,...,...,...,...,...,...
995,995,5060,Sambreville,Rue du Wainage(VEL),62,Belgique
996,996,7520,Tournai,Chaussée de Tournai(R-C),4,Belgique
997,997,7331,Saint-Ghislain,Rue Louis Caty(B),32,Belgique
998,998,7090,Braine-le-Comte,Drève des Bouleaux,5,Belgique


In [11]:
# addresses = addresses[addresses.addr_key.isin([ "2078829"])]#"1622",
# addresses


### Simple way

In [12]:
addresses_seq = addresses.copy()

t = datetime.now()
addresses_seq["json"] = addresses_seq.fillna("").progress_apply(call_ws, check_result=False, structured_osm=False, axis=1)
tot_time = (datetime.now() - t).total_seconds()
print(f"{tot_time:.2f} seconds, {addresses_seq.shape[0]/tot_time:.2f} it/s")
# KBO dataset:
# Normal mode: 128.78 seconds, 7.77 it/s
# Fastmode:     68.80 seconds, 14.54 it/s

#Resto dataset: 
# Normal mode: 145.73 seconds, 6.86 it/s
# Fast mode:    82.99 seconds, 12.05 it/s

# Best dataset:
# Normal mode: 108.53 seconds, 9.21 it/s
# Fast mode: 37.44 seconds, 26.71 it/s

  0%|          | 0/1000 [00:00<?, ?it/s]

58.59 seconds, 17.07 it/s


In [13]:
expand_json(addresses_seq)
addresses_seq

Unnamed: 0,addrKey,postcode,city,street,housenumber,country,json,status,time,timing,method,placeRank,addrOutStreet,addrOutNumber,addrOutPostcode,addrOutCity
0,0,2240,Zandhoven,Kriekenlaan,22,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.026674,{},fast,30,Kriekenlaan,22,2240,Zandhoven
1,1,8600,Diksmuide,Cardijnlaan(D),*,Belgique,"{'match': [{'placeId': 291737, 'lat': '51.0223...",match,0.194757,{},nonum,26,Cardijnlaan,,8600,Diksmuide
2,2,9860,Oosterzele,Poststraat,19,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.029309,{},fast,26,Poststraat,,9860,Issegem
3,3,8300,Knokke-Heist,Sylvain Dupuisstraat,26,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.021591,{},fast,30,Sylvain Dupuisstraat,24;26;28,8300,Knokke-Heist
4,4,3630,Maasmechelen,Oude Baan(M),338,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.073660,{},fast,26,Oude Baan,,3630,Maasmechelen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,5060,Sambreville,Rue du Wainage(VEL),62,Belgique,"{'match': [{'placeId': 2657266, 'lat': '50.462...",match,0.126125,{},regex[init],30,Rue du Wainage,62,5060,Sambreville
996,996,7520,Tournai,Chaussée de Tournai(R-C),4,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.069745,{},fast,26,Chaussée de Tournai,,7520,Ramegnies-Chin
997,997,7331,Saint-Ghislain,Rue Louis Caty(B),32,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.046952,{},fast,26,Rue Louis Caty,,7331,Saint-Ghislain
998,998,7090,Braine-le-Comte,Drève des Bouleaux,5,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.034075,{},fast,26,Drève des Bouleaux,,7090,Braine-le-Comte


In [14]:
addresses_seq.method.value_counts()

fast                             848
regex[init]                      103
libpostal+regex[lpost]+photon     26
nonum                             11
libpostal+regex[lpost]             9
nostreet                           2
photon                             1
Name: method, dtype: int64

In [15]:
# addresses_seq.iloc[0].json
addresses_seq

Unnamed: 0,addrKey,postcode,city,street,housenumber,country,json,status,time,timing,method,placeRank,addrOutStreet,addrOutNumber,addrOutPostcode,addrOutCity
0,0,2240,Zandhoven,Kriekenlaan,22,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.026674,{},fast,30,Kriekenlaan,22,2240,Zandhoven
1,1,8600,Diksmuide,Cardijnlaan(D),*,Belgique,"{'match': [{'placeId': 291737, 'lat': '51.0223...",match,0.194757,{},nonum,26,Cardijnlaan,,8600,Diksmuide
2,2,9860,Oosterzele,Poststraat,19,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.029309,{},fast,26,Poststraat,,9860,Issegem
3,3,8300,Knokke-Heist,Sylvain Dupuisstraat,26,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.021591,{},fast,30,Sylvain Dupuisstraat,24;26;28,8300,Knokke-Heist
4,4,3630,Maasmechelen,Oude Baan(M),338,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.073660,{},fast,26,Oude Baan,,3630,Maasmechelen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,5060,Sambreville,Rue du Wainage(VEL),62,Belgique,"{'match': [{'placeId': 2657266, 'lat': '50.462...",match,0.126125,{},regex[init],30,Rue du Wainage,62,5060,Sambreville
996,996,7520,Tournai,Chaussée de Tournai(R-C),4,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.069745,{},fast,26,Chaussée de Tournai,,7520,Ramegnies-Chin
997,997,7331,Saint-Ghislain,Rue Louis Caty(B),32,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.046952,{},fast,26,Rue Louis Caty,,7331,Saint-Ghislain
998,998,7090,Braine-le-Comte,Drève des Bouleaux,5,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.034075,{},fast,26,Drève des Bouleaux,,7090,Braine-le-Comte


In [16]:
addresses_seq.json.apply(lambda d: "NONE" if d is None else "error" if "error" in d else "stop")

0      stop
1      stop
2      stop
3      stop
4      stop
       ... 
995    stop
996    stop
997    stop
998    stop
999    stop
Name: json, Length: 1000, dtype: object

In [17]:
addresses_seq.method.value_counts()

fast                             848
regex[init]                      103
libpostal+regex[lpost]+photon     26
nonum                             11
libpostal+regex[lpost]             9
nostreet                           2
photon                             1
Name: method, dtype: int64

### Using Dask

In [18]:
addresses_dask = addresses.copy()

In [19]:
addresses_dask

Unnamed: 0,addrKey,postcode,city,street,housenumber,country
0,0,2240,Zandhoven,Kriekenlaan,22,Belgique
1,1,8600,Diksmuide,Cardijnlaan(D),*,Belgique
2,2,9860,Oosterzele,Poststraat,19,Belgique
3,3,8300,Knokke-Heist,Sylvain Dupuisstraat,26,Belgique
4,4,3630,Maasmechelen,Oude Baan(M),338,Belgique
...,...,...,...,...,...,...
995,995,5060,Sambreville,Rue du Wainage(VEL),62,Belgique
996,996,7520,Tournai,Chaussée de Tournai(R-C),4,Belgique
997,997,7331,Saint-Ghislain,Rue Louis Caty(B),32,Belgique
998,998,7090,Braine-le-Comte,Drève des Bouleaux,5,Belgique


In [20]:
t = datetime.now()
dd_addresses = dd.from_pandas(addresses_dask.fillna(""), npartitions=8)

dask_task = dd_addresses.apply(call_ws, check_result=False, meta=('x', 'str'), axis=1)

with ProgressBar(): 
    addresses_dask["json"] = dask_task.compute()
    
tot_time = (datetime.now() - t).total_seconds()
print(f"{tot_time:.2f} seconds, {addresses_dask.shape[0]/tot_time:.2f} it/s")
# KBO dataset:
# Normal mode: 24.52 seconds, 40.79 it/s
# Fastmode:    15.81 seconds, 63.27 it/s


# Resto dataset:
# Normal mode: 27.86 seconds, 35.89 it/s
# Fast mode:   18.44 seconds, 54.23 it/s

# Best dataset: 
# Normal mode: 16.11 seconds, 62.07 it/s
# Fast mode:    9.76 seconds, 102.42 it/s

[########################################] | 100% Completed | 30.9s
30.87 seconds, 32.39 it/s


In [21]:
# 1000, 1 worker: 4m18
# 4 workers, npart=4 : 1m20
# 8 workers, npart=4 : 1m20
# 8 workers, npart=8 : 44s

# with checker=False:
# 8 workers, npart=8 : 24s


In [22]:
expand_json(addresses_dask)
addresses_dask

Unnamed: 0,addrKey,postcode,city,street,housenumber,country,json,status,time,timing,method,placeRank,addrOutStreet,addrOutNumber,addrOutPostcode,addrOutCity
0,0,2240,Zandhoven,Kriekenlaan,22,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.024893,{},fast,30,Kriekenlaan,22,2240,Zandhoven
1,1,8600,Diksmuide,Cardijnlaan(D),*,Belgique,"{'match': [{'placeId': 291737, 'lat': '51.0223...",match,0.420936,{},nonum,26,Cardijnlaan,,8600,Diksmuide
2,2,9860,Oosterzele,Poststraat,19,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.127839,{},fast,26,Poststraat,,9860,Issegem
3,3,8300,Knokke-Heist,Sylvain Dupuisstraat,26,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.160368,{},fast,30,Sylvain Dupuisstraat,24;26;28,8300,Knokke-Heist
4,4,3630,Maasmechelen,Oude Baan(M),338,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.476253,{},fast,26,Oude Baan,,3630,Maasmechelen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,5060,Sambreville,Rue du Wainage(VEL),62,Belgique,"{'match': [{'placeId': 2657266, 'lat': '50.462...",match,0.306060,{},regex[init],30,Rue du Wainage,62,5060,Sambreville
996,996,7520,Tournai,Chaussée de Tournai(R-C),4,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.181937,{},fast,26,Chaussée de Tournai,,7520,Ramegnies-Chin
997,997,7331,Saint-Ghislain,Rue Louis Caty(B),32,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.064097,{},fast,26,Rue Louis Caty,,7331,Saint-Ghislain
998,998,7090,Braine-le-Comte,Drève des Bouleaux,5,Belgique,"{'match': [{'method': 'fast', 'displayName': '...",match,0.097742,{},fast,26,Drève des Bouleaux,,7090,Braine-le-Comte


In [23]:
addresses_dask.method.value_counts()#.json.loc[550]

fast                             848
regex[init]                      103
libpostal+regex[lpost]+photon     26
nonum                             11
libpostal+regex[lpost]             9
nostreet                           2
photon                             1
Name: method, dtype: int64

In [25]:
mg = addresses_seq[["addrKey", "city", "postcode","street", "housenumber", "addrOutStreet", "addrOutNumber", "addrOutPostcode", "addrOutCity"]].merge(
    addresses_dask[["addrKey", "city", "postcode","street", "housenumber", "addrOutStreet", "addrOutNumber", "addrOutPostcode", "addrOutCity"]], how="outer", indicator=True)
if mg.shape[0] == addresses.shape[0]:
    print("Same result in seq and dask run!")
else: 
    print("!!! Not the same result in seq and dask run!")
    

Same result in seq and dask run!


In [26]:
mg[mg._merge != "both"].sort_values("addrKey")


Unnamed: 0,addrKey,city,postcode,street,housenumber,addrOutStreet,addrOutNumber,addrOutPostcode,addrOutCity,_merge


## Batch calls (batch WS)

### Single block

In [27]:
t = datetime.now()

addresses_batch = call_ws_batch(addresses[[addr_key_field, 
                                           street_field, housenbr_field, postcode_field, city_field, country_field]], 
                                mode="long", 
                                check_result=False, 
                                structured_osm=False,
                               with_rejected=True)

tot_time = (datetime.now() - t).total_seconds()
print(f"{tot_time:.2f} seconds, {addresses.shape[0]/tot_time:.2f} it/s")
# KBO dataset: 33.94 seconds, 29.46 it/s
# Best:        24.99 seconds, 40.01 it/s
# Resto:       38.33 seconds, 26.09 it/s

addresses_batch

30.90 seconds, 32.37 it/s


Unnamed: 0,addrKey,street,housenumber,postcode,city,country,osmAddrIn,placeId,lat,lon,...,addrOutNumber,addrOutCountry,addrOutPostcode,addrOutOther,retryOn_26,method,inHouseNbr,lpostHouseNbr,lpostUnit,reject
0,0,Kriekenlaan,22,2240,Zandhoven,Belgique,"Kriekenlaan, 22, 2240 Zandhoven, Belgique",1032308.0,51.211615,4.649820,...,22,België / Belgique / Belgien,2240,Vlaanderen,,orig,22,22,,[]
1,1,Cardijnlaan(D),*,8600,Diksmuide,Belgique,"Cardijnlaan(D), 8600 Diksmuide, Belgique",291737.0,51.022340,2.860652,...,,België / Belgique / Belgien,8600,Vlaanderen,,nonum,*,,,[]
2,2,Poststraat,19,9860,Oosterzele,Belgique,"Poststraat, 19, 9860 Oosterzele, Belgique",436186.0,50.924678,3.788056,...,,België / Belgique / Belgien,9860,Vlaanderen - Oosterzele,,orig,19,19,,"[{'osmAddrIn': 'Poststraat, 19, 9860 Oosterzel..."
3,3,Sylvain Dupuisstraat,26,8300,Knokke-Heist,Belgique,"Sylvain Dupuisstraat, 26, 8300 Knokke-Heist, B...",3675084.0,51.351333,3.285724,...,24;26;28,België / Belgique / Belgien,8300,Knokke-Heist - Vlaanderen - Albertstrand,,orig,26,26,,[]
4,4,Oude Baan(M),338,3630,Maasmechelen,Belgique,"Oude Baan(M), 338, 3630 Maasmechelen, Belgique",379663.0,50.977155,5.690444,...,,België / Belgique / Belgien,3630,Vlaanderen - Cité van Mechelen aan de Maas,,orig,338,,m 338,"[{'osmAddrIn': 'Oude Baan(M), 338, 3630 Maasme..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,Rue du Wainage(VEL),62,5060,Sambreville,Belgique,"Rue du Wainage, 62, 5060 Sambreville, Belgique",2657266.0,50.462361,4.596458,...,62,België / Belgique / Belgien,5060,Wallonie,,regex[init],62,62,,[]
996,996,Chaussée de Tournai(R-C),4,7520,Tournai,Belgique,"Chaussée de Tournai(R-C), 4, 7520 Tournai, Bel...",1983533.0,50.639445,3.349058,...,,België / Belgique / Belgien,7520,Wallonie,,orig,4,4,,"[{'osmAddrIn': 'Chaussée de Tournai(R-C), 4, 7..."
997,997,Rue Louis Caty(B),32,7331,Saint-Ghislain,Belgique,"Rue Louis Caty(B), 32, 7331 Saint-Ghislain, Be...",432987.0,50.479737,3.838976,...,,België / Belgique / Belgien,7331,Wallonie,,orig,32,32,,"[{'osmAddrIn': 'Rue Louis Caty(B), 32, 7331 Sa..."
998,998,Drève des Bouleaux,5,7090,Braine-le-Comte,Belgique,"Drève des Bouleaux, 5, 7090 Braine-le-Comte, B...",379561.0,50.606398,4.159495,...,,België / Belgique / Belgien,7090,Wallonie,,orig,5,5,,[]


In [28]:
addresses_batch.sort_values("addrKey", key=lambda x: x.astype(int))

Unnamed: 0,addrKey,street,housenumber,postcode,city,country,osmAddrIn,placeId,lat,lon,...,addrOutNumber,addrOutCountry,addrOutPostcode,addrOutOther,retryOn_26,method,inHouseNbr,lpostHouseNbr,lpostUnit,reject
0,0,Kriekenlaan,22,2240,Zandhoven,Belgique,"Kriekenlaan, 22, 2240 Zandhoven, Belgique",1032308.0,51.211615,4.649820,...,22,België / Belgique / Belgien,2240,Vlaanderen,,orig,22,22,,[]
1,1,Cardijnlaan(D),*,8600,Diksmuide,Belgique,"Cardijnlaan(D), 8600 Diksmuide, Belgique",291737.0,51.022340,2.860652,...,,België / Belgique / Belgien,8600,Vlaanderen,,nonum,*,,,[]
2,2,Poststraat,19,9860,Oosterzele,Belgique,"Poststraat, 19, 9860 Oosterzele, Belgique",436186.0,50.924678,3.788056,...,,België / Belgique / Belgien,9860,Vlaanderen - Oosterzele,,orig,19,19,,"[{'osmAddrIn': 'Poststraat, 19, 9860 Oosterzel..."
3,3,Sylvain Dupuisstraat,26,8300,Knokke-Heist,Belgique,"Sylvain Dupuisstraat, 26, 8300 Knokke-Heist, B...",3675084.0,51.351333,3.285724,...,24;26;28,België / Belgique / Belgien,8300,Knokke-Heist - Vlaanderen - Albertstrand,,orig,26,26,,[]
4,4,Oude Baan(M),338,3630,Maasmechelen,Belgique,"Oude Baan(M), 338, 3630 Maasmechelen, Belgique",379663.0,50.977155,5.690444,...,,België / Belgique / Belgien,3630,Vlaanderen - Cité van Mechelen aan de Maas,,orig,338,,m 338,"[{'osmAddrIn': 'Oude Baan(M), 338, 3630 Maasme..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,Rue du Wainage(VEL),62,5060,Sambreville,Belgique,"Rue du Wainage, 62, 5060 Sambreville, Belgique",2657266.0,50.462361,4.596458,...,62,België / Belgique / Belgien,5060,Wallonie,,regex[init],62,62,,[]
996,996,Chaussée de Tournai(R-C),4,7520,Tournai,Belgique,"Chaussée de Tournai(R-C), 4, 7520 Tournai, Bel...",1983533.0,50.639445,3.349058,...,,België / Belgique / Belgien,7520,Wallonie,,orig,4,4,,"[{'osmAddrIn': 'Chaussée de Tournai(R-C), 4, 7..."
997,997,Rue Louis Caty(B),32,7331,Saint-Ghislain,Belgique,"Rue Louis Caty(B), 32, 7331 Saint-Ghislain, Be...",432987.0,50.479737,3.838976,...,,België / Belgique / Belgien,7331,Wallonie,,orig,32,32,,"[{'osmAddrIn': 'Rue Louis Caty(B), 32, 7331 Sa..."
998,998,Drève des Bouleaux,5,7090,Braine-le-Comte,Belgique,"Drève des Bouleaux, 5, 7090 Braine-le-Comte, B...",379561.0,50.606398,4.159495,...,,België / Belgique / Belgien,7090,Wallonie,,orig,5,5,,[]


In [29]:
# addresses_batch[addresses_batch.method.str.contains("error")]

In [30]:
rejected_addresses = addresses_batch[addresses_batch.reject.apply(lambda lst: len(lst)>0)].reject.apply(pd.Series).unstack().dropna().apply(pd.Series)
rejected_addresses

Unnamed: 0,Unnamed: 1,osmAddrIn,addrKey,osmOrder,placeId,lat,lon,displayName,namedetails,placeRank,category,...,addrHamlet,addrCityDistrict,addrNeighbourhood,addrCity,addrHouseNumber,addrOutNumber,addrPlace,addrQuarter,addrSuburb,addrIsolatedDwelling
0,2,"Poststraat, 19, 9860 Oosterzele, Belgique",2,1.0,2443024.0,50.9252415,3.7860435,"Poststraat, Issegem, Balegem, Oosterzele, Gent...",Poststraat,26.0,highway,...,,,,,,,,,,
0,4,"Oude Baan(M), 338, 3630 Maasmechelen, Belgique",4,1.0,2877525.0,50.9634253,5.6761328,"Oude Baan, Proosterbos, Mechelen-aan-de-Maas, ...",Oude Baan,26.0,highway,...,Proosterbos,,,,,,,,,
0,6,"Krijgslaan, 74, 9000 Gent, Belgique",6,1.0,313241.0,51.0264173,3.7116547,"Krijgslaan, Stationsbuurt Zuid, Gent, Oost-Vla...",N60 - Krijgslaan,26.0,highway,...,,Gent,Stationsbuurt Zuid,Gent,,,,,,
0,14,"Quai de Willebroeck, 37, 1000 Bruxelles, Belgique",14,1.0,184730.0,50.863851,4.3529301,"37, Quai de Willebroeck - Willebroekkaai, Tour...",,30.0,place,...,,Bruxelles - Brussel,Tour et Taxis - Thurn en Taxis,Ville de Bruxelles - Stad Brussel,37,37,,,,
0,19,"Industriepark ""De Bruwaan"", 5, 9700 Oudenaarde...",19,1.0,775486.0,50.8600607,3.5903363,"Industriepark De Bruwaan, Bevere, Oudenaarde, ...",Industriepark De Bruwaan,26.0,highway,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,4,"Oude Baan(M), 338, 3630 Maasmechelen, Belgique",4,8.0,268694.0,50.9885513,5.6946054,"Oude Baan, Eisden-Tuinwijk, Eisden, Maasmechel...",Oude Baan,26.0,highway,...,,,Eisden-Tuinwijk,,,,,,,
7,618,"Diestsesteenweg, 162, 3010 Leuven, Belgique",618,8.0,512948.0,50.8845673,4.7154495,"Diestsesteenweg, Blauwput, Kessel-Lo, Leuven, ...",N2 - Diestsesteenweg,26.0,highway,...,,,Blauwput,Leuven,,,,,,
7,970,"Route de Mons, 168, 7131 Binche, Belgique",970,8.0,1219823.0,50.4305869,4.0576122,"Chaussée du Roi Baudouin, Bray, Binche, La Lou...",N90 - Chaussée du Roi Baudouin - Route de Mons,26.0,highway,...,,,,,,,,,,
8,4,"Oude Baan(M), 338, 3630 Maasmechelen, Belgique",4,9.0,1572670.0,50.9857118,5.6932553,"Oude Baan, Eisden, Maasmechelen, Tongeren, Lim...",Oude Baan,26.0,highway,...,,,,,,,,,,


In [31]:
addresses_batch

Unnamed: 0,addrKey,street,housenumber,postcode,city,country,osmAddrIn,placeId,lat,lon,...,addrOutNumber,addrOutCountry,addrOutPostcode,addrOutOther,retryOn_26,method,inHouseNbr,lpostHouseNbr,lpostUnit,reject
0,0,Kriekenlaan,22,2240,Zandhoven,Belgique,"Kriekenlaan, 22, 2240 Zandhoven, Belgique",1032308.0,51.211615,4.649820,...,22,België / Belgique / Belgien,2240,Vlaanderen,,orig,22,22,,[]
1,1,Cardijnlaan(D),*,8600,Diksmuide,Belgique,"Cardijnlaan(D), 8600 Diksmuide, Belgique",291737.0,51.022340,2.860652,...,,België / Belgique / Belgien,8600,Vlaanderen,,nonum,*,,,[]
2,2,Poststraat,19,9860,Oosterzele,Belgique,"Poststraat, 19, 9860 Oosterzele, Belgique",436186.0,50.924678,3.788056,...,,België / Belgique / Belgien,9860,Vlaanderen - Oosterzele,,orig,19,19,,"[{'osmAddrIn': 'Poststraat, 19, 9860 Oosterzel..."
3,3,Sylvain Dupuisstraat,26,8300,Knokke-Heist,Belgique,"Sylvain Dupuisstraat, 26, 8300 Knokke-Heist, B...",3675084.0,51.351333,3.285724,...,24;26;28,België / Belgique / Belgien,8300,Knokke-Heist - Vlaanderen - Albertstrand,,orig,26,26,,[]
4,4,Oude Baan(M),338,3630,Maasmechelen,Belgique,"Oude Baan(M), 338, 3630 Maasmechelen, Belgique",379663.0,50.977155,5.690444,...,,België / Belgique / Belgien,3630,Vlaanderen - Cité van Mechelen aan de Maas,,orig,338,,m 338,"[{'osmAddrIn': 'Oude Baan(M), 338, 3630 Maasme..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,Rue du Wainage(VEL),62,5060,Sambreville,Belgique,"Rue du Wainage, 62, 5060 Sambreville, Belgique",2657266.0,50.462361,4.596458,...,62,België / Belgique / Belgien,5060,Wallonie,,regex[init],62,62,,[]
996,996,Chaussée de Tournai(R-C),4,7520,Tournai,Belgique,"Chaussée de Tournai(R-C), 4, 7520 Tournai, Bel...",1983533.0,50.639445,3.349058,...,,België / Belgique / Belgien,7520,Wallonie,,orig,4,4,,"[{'osmAddrIn': 'Chaussée de Tournai(R-C), 4, 7..."
997,997,Rue Louis Caty(B),32,7331,Saint-Ghislain,Belgique,"Rue Louis Caty(B), 32, 7331 Saint-Ghislain, Be...",432987.0,50.479737,3.838976,...,,België / Belgique / Belgien,7331,Wallonie,,orig,32,32,,"[{'osmAddrIn': 'Rue Louis Caty(B), 32, 7331 Sa..."
998,998,Drève des Bouleaux,5,7090,Braine-le-Comte,Belgique,"Drève des Bouleaux, 5, 7090 Braine-le-Comte, B...",379561.0,50.606398,4.159495,...,,België / Belgique / Belgien,7090,Wallonie,,orig,5,5,,[]


In [32]:
mg = addresses_seq[[ "city", "postcode","street", "housenumber", "method", "addrOutStreet", "addrOutNumber", "addrOutPostcode", "addrOutCity", "addrKey"]].fillna("").replace("fast", "orig").merge(
     addresses_batch[["city", "postcode","street", "housenumber", "method", "addrOutStreet", "addrOutNumber", "addrOutPostcode", "addrOutCity", "addrKey"]].fillna(""), how="outer", indicator=True)
if mg[mg._merge == "both"].shape[0] == addresses.shape[0]:
    print("Same result in seq and dask run!")
else: 
    print("!!! Not the same result in seq and dask run!")
    

Same result in seq and dask run!


In [33]:
addresses_batch

Unnamed: 0,addrKey,street,housenumber,postcode,city,country,osmAddrIn,placeId,lat,lon,...,addrOutNumber,addrOutCountry,addrOutPostcode,addrOutOther,retryOn_26,method,inHouseNbr,lpostHouseNbr,lpostUnit,reject
0,0,Kriekenlaan,22,2240,Zandhoven,Belgique,"Kriekenlaan, 22, 2240 Zandhoven, Belgique",1032308.0,51.211615,4.649820,...,22,België / Belgique / Belgien,2240,Vlaanderen,,orig,22,22,,[]
1,1,Cardijnlaan(D),*,8600,Diksmuide,Belgique,"Cardijnlaan(D), 8600 Diksmuide, Belgique",291737.0,51.022340,2.860652,...,,België / Belgique / Belgien,8600,Vlaanderen,,nonum,*,,,[]
2,2,Poststraat,19,9860,Oosterzele,Belgique,"Poststraat, 19, 9860 Oosterzele, Belgique",436186.0,50.924678,3.788056,...,,België / Belgique / Belgien,9860,Vlaanderen - Oosterzele,,orig,19,19,,"[{'osmAddrIn': 'Poststraat, 19, 9860 Oosterzel..."
3,3,Sylvain Dupuisstraat,26,8300,Knokke-Heist,Belgique,"Sylvain Dupuisstraat, 26, 8300 Knokke-Heist, B...",3675084.0,51.351333,3.285724,...,24;26;28,België / Belgique / Belgien,8300,Knokke-Heist - Vlaanderen - Albertstrand,,orig,26,26,,[]
4,4,Oude Baan(M),338,3630,Maasmechelen,Belgique,"Oude Baan(M), 338, 3630 Maasmechelen, Belgique",379663.0,50.977155,5.690444,...,,België / Belgique / Belgien,3630,Vlaanderen - Cité van Mechelen aan de Maas,,orig,338,,m 338,"[{'osmAddrIn': 'Oude Baan(M), 338, 3630 Maasme..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,Rue du Wainage(VEL),62,5060,Sambreville,Belgique,"Rue du Wainage, 62, 5060 Sambreville, Belgique",2657266.0,50.462361,4.596458,...,62,België / Belgique / Belgien,5060,Wallonie,,regex[init],62,62,,[]
996,996,Chaussée de Tournai(R-C),4,7520,Tournai,Belgique,"Chaussée de Tournai(R-C), 4, 7520 Tournai, Bel...",1983533.0,50.639445,3.349058,...,,België / Belgique / Belgien,7520,Wallonie,,orig,4,4,,"[{'osmAddrIn': 'Chaussée de Tournai(R-C), 4, 7..."
997,997,Rue Louis Caty(B),32,7331,Saint-Ghislain,Belgique,"Rue Louis Caty(B), 32, 7331 Saint-Ghislain, Be...",432987.0,50.479737,3.838976,...,,België / Belgique / Belgien,7331,Wallonie,,orig,32,32,,"[{'osmAddrIn': 'Rue Louis Caty(B), 32, 7331 Sa..."
998,998,Drève des Bouleaux,5,7090,Braine-le-Comte,Belgique,"Drève des Bouleaux, 5, 7090 Braine-le-Comte, B...",379561.0,50.606398,4.159495,...,,België / Belgique / Belgien,7090,Wallonie,,orig,5,5,,[]


In [34]:
mg[mg._merge != "both"]

Unnamed: 0,city,postcode,street,housenumber,method,addrOutStreet,addrOutNumber,addrOutPostcode,addrOutCity,addrKey,_merge


In [35]:
# Geocode + address
call_ws_batch(addresses[[addr_key_field, street_field, housenbr_field, postcode_field, city_field, country_field]], mode="geo", check_result=False) 

Unnamed: 0,addrKey,lat,lon,placeRank,method
0,0,51.211615,4.649820,30.0,orig
1,2,50.924678,3.788056,26.0,orig
2,3,51.351333,3.285724,30.0,orig
3,4,50.977155,5.690444,26.0,orig
4,5,50.860955,4.661771,30.0,orig
...,...,...,...,...,...
995,945,50.392366,3.869560,26.0,libpostal+regex[lpost]+photon
996,965,50.638134,5.676558,26.0,libpostal+regex[lpost]+photon
997,569,50.280760,5.349380,26.0,photon
998,116,50.511908,4.367504,21.0,nostreet


In [36]:
# Geocode + address
call_ws_batch(addresses[[addr_key_field, street_field, housenbr_field, postcode_field, city_field, country_field]], mode="short", check_result=False) 

Unnamed: 0,addrKey,lat,lon,placeRank,method,placeId,addrOutStreet,addrOutNumber,inHouseNbr,lpostHouseNbr,lpostUnit,addrOutPostcode,addrOutCity,addrOutCountry
0,0,51.211615,4.649820,30.0,orig,1032308.0,Kriekenlaan,22,22,22,,2240,Zandhoven,België / Belgique / Belgien
1,1,51.022340,2.860652,26.0,nonum,291737.0,Cardijnlaan,,*,,,8600,Diksmuide,België / Belgique / Belgien
2,2,50.924678,3.788056,26.0,orig,436186.0,Poststraat,,19,19,,9860,Issegem,België / Belgique / Belgien
3,3,51.351333,3.285724,30.0,orig,3675084.0,Sylvain Dupuisstraat,24;26;28,26,26,,8300,Knokke-Heist,België / Belgique / Belgien
4,4,50.977155,5.690444,26.0,orig,379663.0,Oude Baan,,338,,m 338,3630,Maasmechelen,België / Belgique / Belgien
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,50.462361,4.596458,30.0,regex[init],2657266.0,Rue du Wainage,62,62,62,,5060,Sambreville,België / Belgique / Belgien
996,996,50.639445,3.349058,26.0,orig,1983533.0,Chaussée de Tournai,,4,4,,7520,Ramegnies-Chin,België / Belgique / Belgien
997,997,50.479737,3.838976,26.0,orig,432987.0,Rue Louis Caty,,32,32,,7331,Saint-Ghislain,België / Belgique / Belgien
998,998,50.606398,4.159495,26.0,orig,379561.0,Drève des Bouleaux,,5,5,,7090,Braine-le-Comte,België / Belgique / Belgien


In [38]:
# Geocode + address, with rejected addresses
call_ws_batch(addresses, mode="long", with_rejected=True)

Unnamed: 0,addrKey,postcode,city,street,housenumber,country,osmAddrIn,placeId,lat,lon,...,addrOutNumber,addrOutCountry,addrOutPostcode,addrOutOther,retryOn_26,method,inHouseNbr,lpostHouseNbr,lpostUnit,reject
0,0,2240,Zandhoven,Kriekenlaan,22,Belgique,"Kriekenlaan, 22, 2240 Zandhoven, Belgique",1032308.0,51.211615,4.649820,...,22,België / Belgique / Belgien,2240,Vlaanderen,,orig,22,22,,[]
1,1,8600,Diksmuide,Cardijnlaan(D),*,Belgique,"Cardijnlaan(D), 8600 Diksmuide, Belgique",291737.0,51.022340,2.860652,...,,België / Belgique / Belgien,8600,Vlaanderen,,nonum,*,,,[]
2,2,9860,Oosterzele,Poststraat,19,Belgique,"Poststraat, 19, 9860 Oosterzele, Belgique",436186.0,50.924678,3.788056,...,,België / Belgique / Belgien,9860,Vlaanderen - Oosterzele,,orig,19,19,,"[{'index': 2.0, 'osmAddrIn': 'Poststraat, 19, ..."
3,3,8300,Knokke-Heist,Sylvain Dupuisstraat,26,Belgique,"Sylvain Dupuisstraat, 26, 8300 Knokke-Heist, B...",3675084.0,51.351333,3.285724,...,24;26;28,België / Belgique / Belgien,8300,Knokke-Heist - Vlaanderen - Albertstrand,,orig,26,26,,[]
4,4,3630,Maasmechelen,Oude Baan(M),338,Belgique,"Oude Baan(M), 338, 3630 Maasmechelen, Belgique",379663.0,50.977155,5.690444,...,,België / Belgique / Belgien,3630,Vlaanderen - Cité van Mechelen aan de Maas,,orig,338,,m 338,"[{'index': 5.0, 'osmAddrIn': 'Oude Baan(M), 33..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,5060,Sambreville,Rue du Wainage(VEL),62,Belgique,"Rue du Wainage, 62, 5060 Sambreville, Belgique",2657266.0,50.462361,4.596458,...,62,België / Belgique / Belgien,5060,Wallonie,,regex[init],62,62,,[]
996,996,7520,Tournai,Chaussée de Tournai(R-C),4,Belgique,"Chaussée de Tournai(R-C), 4, 7520 Tournai, Bel...",3456639.0,50.561071,3.424308,...,4,België / Belgique / Belgien,7641,Wallonie - Antoing,,orig,4,4,,"[{'index': 1147.0, 'osmAddrIn': 'Chaussée de T..."
997,997,7331,Saint-Ghislain,Rue Louis Caty(B),32,Belgique,"Rue Louis Caty(B), 32, 7331 Saint-Ghislain, Be...",432987.0,50.479737,3.838976,...,,België / Belgique / Belgien,7331,Wallonie,,orig,32,32,,"[{'index': 1151.0, 'osmAddrIn': 'Rue Louis Cat..."
998,998,7090,Braine-le-Comte,Drève des Bouleaux,5,Belgique,"Drève des Bouleaux, 5, 7090 Braine-le-Comte, B...",379561.0,50.606398,4.159495,...,,België / Belgique / Belgien,7090,Wallonie,,orig,5,5,,[]


### Batch blocs

In [39]:
# addresses = addresses.sample(10000, replace=True)
# addresses = addresses.reset_index(drop=True)
# addresses["addr_key"]= addresses.index.astype(str)

In [40]:
t = datetime.now()

nb_threads=8

chunks = np.array_split(addresses, nb_threads) # addresses.shape[0]//100)

print(f"{len(chunks)} chunks on {nb_threads} threads")

import multiprocess as mp

p = mp.Pool(nb_threads)

def f(chunk):
    return call_ws_batch(chunk, mode="long", 
                        check_result=False, 
                        structured_osm=False)

with p:
     res= list(tqdm(p.imap(f, chunks), total=len(chunks)))
    
addresses_batch2 = pd.concat(res).reset_index(drop=True)

tot_time = (datetime.now() - t).total_seconds()
print(f"{tot_time:.2f} seconds, {addresses.shape[0]/tot_time:.2f} it/s")
# KBO:    9.28 seconds, 107.72 it/s
# Best:   6.88 seconds, 145.43 it/s
# Resto: 11.79 seconds,  84.85 it/s

8 chunks on 8 threads


  0%|          | 0/8 [00:00<?, ?it/s]

20.40 seconds, 49.02 it/s


In [41]:
# addresses_batch2

In [43]:
mg = addresses_seq[[  "city", "postcode","street", "housenumber", "method", "addrOutStreet", "addrOutNumber", "addrOutPostcode", "addrOutCity", "addrKey"]].fillna("").replace("fast", "orig").merge(
    addresses_batch2[["city", "postcode","street", "housenumber", "method", "addrOutStreet", "addrOutNumber", "addrOutPostcode", "addrOutCity", "addrKey"]].fillna(""), how="outer", indicator=True)
if mg[mg._merge == "both"].shape[0] == addresses.shape[0]:
    print("Same result in seq and dask run!")
else: 
    print("!!! Not the same result in seq and dask run!")
    

Same result in seq and dask run!


In [46]:
mg[mg._merge != "both"].sort_values("addrKey")

Unnamed: 0,city,postcode,street,housenumber,method,addrOutStreet,addrOutNumber,addrOutPostcode,addrOutCity,addrKey,_merge


## Comparing options

In [48]:
addresses = get_addresses("address.csv.gz")
addresses = addresses[addresses[country_field] == "Belgique"]
# addresses = addresses.sample(10000).copy()

In [50]:
results = {}
it_per_seconds=pd.DataFrame()

for check_label in ["check", "nocheck"]:
    for struct_label in ["struct", "unstruct" ]:
        print(check_label, struct_label)
        start=datetime.now()
        
        results[(check_label, struct_label)] = call_ws_batch(addresses, 
                                                                    mode="short", 
                                                                    check_result   =  check_label == "check", 
                                                                    structured_osm =  struct_label == "struct")
        
        it_per_seconds.loc[check_label, struct_label] = addresses.shape[0] / (datetime.now()-start).total_seconds()
print("Iterations per seconds:")
it_per_seconds

check struct
check unstruct
nocheck struct
nocheck unstruct
Iterations per seconds:


Unnamed: 0,struct,unstruct
check,33.944924,27.938267
nocheck,40.64409,34.041258


In [52]:
print("Match rate")
pd.DataFrame({k1: {k2: results[(k1,k2)].shape[0]/addresses.shape[0] for k2 in ["struct", "unstruct"]} 
                  for k1 in  ["check","nocheck"]})

Match rate


Unnamed: 0,check,nocheck
struct,1.0,1.0
unstruct,1.0,1.0


In [53]:
print("Match rate (without nostreet)")
pd.DataFrame({k1: {k2: results[(k1,k2)].query("method!='nostreet'").shape[0]/addresses.shape[0] for k2 in ["struct", "unstruct"]} 
                  for k1 in  ["check","nocheck"]})

Match rate (without nostreet)


Unnamed: 0,check,nocheck
struct,0.994,0.996
unstruct,0.995,0.998


In [55]:
print("Unmatched addresses")
for k1 in results:
    print(k1)
    nomatch=addresses[~addresses[addr_key_field].isin(results[k1]["addrKey"])]
    display(nomatch)
    print(nomatch[country_field].value_counts())

Unmatched addresses
('check', 'struct')


Unnamed: 0,addrKey,postcode,city,street,housenumber,country
0,2762939,2240,Zandhoven,Kriekenlaan,22,Belgique
1,2253494,8600,Diksmuide,Cardijnlaan(D),*,Belgique
2,1764140,9860,Oosterzele,Poststraat,19,Belgique
3,87070,8300,Knokke-Heist,Sylvain Dupuisstraat,26,Belgique
4,2261339,3630,Maasmechelen,Oude Baan(M),338,Belgique
...,...,...,...,...,...,...
995,2720764,5060,Sambreville,Rue du Wainage(VEL),62,Belgique
996,386196,7520,Tournai,Chaussée de Tournai(R-C),4,Belgique
997,1474962,7331,Saint-Ghislain,Rue Louis Caty(B),32,Belgique
998,2424889,7090,Braine-le-Comte,Drève des Bouleaux,5,Belgique


Belgique    1000
Name: country, dtype: int64
('check', 'unstruct')


Unnamed: 0,addrKey,postcode,city,street,housenumber,country
0,2762939,2240,Zandhoven,Kriekenlaan,22,Belgique
1,2253494,8600,Diksmuide,Cardijnlaan(D),*,Belgique
2,1764140,9860,Oosterzele,Poststraat,19,Belgique
3,87070,8300,Knokke-Heist,Sylvain Dupuisstraat,26,Belgique
4,2261339,3630,Maasmechelen,Oude Baan(M),338,Belgique
...,...,...,...,...,...,...
995,2720764,5060,Sambreville,Rue du Wainage(VEL),62,Belgique
996,386196,7520,Tournai,Chaussée de Tournai(R-C),4,Belgique
997,1474962,7331,Saint-Ghislain,Rue Louis Caty(B),32,Belgique
998,2424889,7090,Braine-le-Comte,Drève des Bouleaux,5,Belgique


Belgique    1000
Name: country, dtype: int64
('nocheck', 'struct')


Unnamed: 0,addrKey,postcode,city,street,housenumber,country
0,2762939,2240,Zandhoven,Kriekenlaan,22,Belgique
1,2253494,8600,Diksmuide,Cardijnlaan(D),*,Belgique
2,1764140,9860,Oosterzele,Poststraat,19,Belgique
3,87070,8300,Knokke-Heist,Sylvain Dupuisstraat,26,Belgique
4,2261339,3630,Maasmechelen,Oude Baan(M),338,Belgique
...,...,...,...,...,...,...
995,2720764,5060,Sambreville,Rue du Wainage(VEL),62,Belgique
996,386196,7520,Tournai,Chaussée de Tournai(R-C),4,Belgique
997,1474962,7331,Saint-Ghislain,Rue Louis Caty(B),32,Belgique
998,2424889,7090,Braine-le-Comte,Drève des Bouleaux,5,Belgique


Belgique    1000
Name: country, dtype: int64
('nocheck', 'unstruct')


Unnamed: 0,addrKey,postcode,city,street,housenumber,country
0,2762939,2240,Zandhoven,Kriekenlaan,22,Belgique
1,2253494,8600,Diksmuide,Cardijnlaan(D),*,Belgique
2,1764140,9860,Oosterzele,Poststraat,19,Belgique
3,87070,8300,Knokke-Heist,Sylvain Dupuisstraat,26,Belgique
4,2261339,3630,Maasmechelen,Oude Baan(M),338,Belgique
...,...,...,...,...,...,...
995,2720764,5060,Sambreville,Rue du Wainage(VEL),62,Belgique
996,386196,7520,Tournai,Chaussée de Tournai(R-C),4,Belgique
997,1474962,7331,Saint-Ghislain,Rue Louis Caty(B),32,Belgique
998,2424889,7090,Braine-le-Comte,Drève des Bouleaux,5,Belgique


Belgique    1000
Name: country, dtype: int64


In [56]:
vc_values = pd.DataFrame(columns=results.keys(), index=results.keys())

for k1 in results:
    vc_values.loc[k1, k1] = results[k1].shape[0]
    for k2 in results:
        if k1>k2:
            r1=results[k1]
            r2=results[k2]
            mg = r1[["addrKey", "placeId"]].merge(r2[["addrKey", "placeId"]], on="addrKey", how="outer", indicator=True)
 
            vc = mg._merge.value_counts()

            mismatches = mg[mg.placeId_x != mg.placeId_y][["addrKey"]]
            mismatches = mismatches.merge(addresses.rename({addr_key_field:"addrKey"}, axis=1))
            mismatches = mismatches.merge(r1[["addrKey", "addrOutStreet", "addrOutNumber", "extraHouseNbr", "addrOutPostcode", "addrOutCity"]], on="addr_key")
            mismatches = mismatches.merge(r2[["addrKey", "addrOutStreet", "addrOutNumber", "extraHouseNbr", "addrOutPostcode", "addrOutCity"]], on="addr_key")
            mismatches.columns = pd.MultiIndex.from_arrays([["Input"]*6 + [f"x:{k1}"]*5 + [f"y:{k2}"]*5, mismatches.columns])

            mismatch_values = mismatches[(mismatches[f"x:{k1}"].rename(lambda x: x.replace("_x", ""), axis=1).fillna("") != 
                                          mismatches[f"y:{k2}"].rename(lambda x: x.replace("_y", ""), axis=1).fillna("")).any(axis=1)]
            
            mismatch_values_no_nmbr = mismatches[(mismatches[f"x:{k1}"].rename(lambda x: x.replace("_x", ""), axis=1).drop("addrOutNumber", axis=1).fillna("") != 
                                                  mismatches[f"y:{k2}"].rename(lambda x: x.replace("_y", ""), axis=1).drop("addrOutNumber", axis=1).fillna("")).any(axis=1)]
            
            
            vc_label = f"{vc['both']} ({mismatches.shape[0]} - {mismatch_values.shape[0]} - {mismatch_values_no_nmbr.shape[0]}) / {vc['left_only']} / {vc['right_only']}"
            vc_values.loc[k1, k2]=vc_label

                
            print(f"{k1} vs {k2}")
            print(vc_label)
            print("-----------------------------")
            
            print(f"Only in {k1}")
            display(r1[r1.addr_key.isin(mg[mg._merge=="left_only"].addr_key)].merge(addresses.rename({addr_key_field:"addr_key"}, axis=1)))
            
            print(f"Only in {k2}")
            display(r2[r2.addr_key.isin(mg[mg._merge=="right_only"].addr_key)].merge(addresses.rename({addr_key_field:"addr_key"}, axis=1)))
            
            print("Mismatch on place_id")
            display(mismatches)
            
            print("Mismatch on values")
            
            display(mismatch_values)
            
            print("Mismatch on values (no nbr)")
            display(mismatch_values_no_nmbr)
            
            print("#######################")
            
# display(vc_values.fillna(""))

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [None]:
print("Common in both (disagree on place_id - disagree on values - disagree on values, ignoring number) / results only for row / results only for columns")
vc_values.fillna("")

# tests

In [None]:
osm_host ="172.27.0.64:8080"
def get_osm(addr, accept_language = ""): #lg = "en,fr,nl"
    params = urllib.parse.urlencode({"q": addr,
                                    "format":"jsonv2",
                                    "accept-language":accept_language,
                                    "addressdetails":"1",
                                    "namedetails" : "1",
                                    "limit": "50"
                                    })
    
    url = "http://%s/search.php?%s"%(osm_host, params)
#     print(f"Call to OSM: {url}")
    try: 
        with urllib.request.urlopen(url) as response:
            res = response.read()
            res = json.loads(res)
#             return res
            return [ {field: item[field] for field in ["place_id", "lat", "lon", "display_name", "address", "namedetails", "place_rank", "category", "type"]} for item in res] 
    except Exception as e:
        raise Exception (f"Cannot get OSM results ({osm_host}): {e}") 

In [None]:
%timeit get_osm("Av. Fonsny 20, 1060 Bruxelles")

In [None]:
%timeit call_ws_test({street_field:   "Av. Fonsny", \
         housenbr_field: "20",\
         city_field:     "Saint-Gilles",\
         postcode_field: "1060",\
         country_field:  "Belgium"}, check_result=False, structured_osm=False)
# res