In [1]:
import pandas as pd
import urllib

import numpy as np

import json

from tqdm.autonotebook import tqdm

#%matplotlib inline

tqdm.pandas()

import dask.dataframe as dd

from dask.multiprocessing import get
from dask.diagnostics import ProgressBar

from datetime import datetime
import matplotlib.pyplot as plt

from IPython.display import display


  from tqdm.autonotebook import tqdm


In [2]:
import urllib3

In [3]:
http = urllib3.PoolManager()

# Functions

In [4]:
# Host of the NominatimWrapper REST service; the request functions append port 5000.
# Alternative hosts (uncomment exactly one assignment to switch):
# ws_hostname = "127.0.1.1"
# ws_hostname = "192.168.1.3"
ws_hostname = "172.27.0.64"


# Names of the address fields. They are used both as DataFrame/CSV column
# names and as request parameter names of the web service.
street_field = "streetName"
housenbr_field = "houseNumber"
postcode_field = "postCode"
city_field = "city"
country_field = "country"
addr_key_field = "addrKey"

In [5]:

sample_size = None  # e.g. 1000 to work on a random subset; None keeps every row


def get_addresses(addresses_filename):
    """Load an address CSV file and normalise it for the web-service calls.

    Only the address columns (key, street, house number, post code, city,
    country) are read; any other column in the file is ignored. A legacy
    ``addr_key`` column is renamed to ``addr_key_field``.

    Parameters
    ----------
    addresses_filename : str or path-like
        CSV file readable by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows having both a street and a city. The country column is always
        present, with missing values replaced by "Belgique". Post code, house
        number and country are read as strings; missing post codes stay
        missing rather than becoming the literal string "nan". When the
        module-level ``sample_size`` is set, a reproducible random sample of
        that many rows is returned.
    """
    wanted_columns = {addr_key_field, "addr_key",
                      country_field,
                      postcode_field,
                      city_field,
                      street_field,
                      housenbr_field}

    addresses = pd.read_csv(addresses_filename,
                            usecols=lambda col: col in wanted_columns,
                            dtype={postcode_field: str, housenbr_field: str, country_field: str})

    addresses = addresses.rename(columns={"addr_key": addr_key_field})

    if country_field in addresses:
        addresses[country_field] = addresses[country_field].fillna("Belgique")
    else:
        addresses[country_field] = "Belgique"

    # .copy() makes the filtered frame independent of the one that was read,
    # so later column assignments by callers do not hit chained-assignment issues.
    has_street_and_city = addresses[street_field].notnull() & addresses[city_field].notnull()
    addresses = addresses[has_street_and_city].copy()

    # The post code is already read as str (see dtype above); no astype(str)
    # here, because it would turn missing values into the text "nan".

    if sample_size:
        addresses = addresses.sample(sample_size, random_state=0)

    return addresses

In [6]:
def call_ws(addr_data, check_result=True, structured_osm=False, with_rejected=False): #lg = "en,fr,nl"
    """Geocode a single address through the web service ``search`` endpoint.

    Parameters
    ----------
    addr_data : mapping (dict or pandas.Series)
        Must provide the street, house number, city, post code and country
        fields; the address key field is optional.
    check_result, structured_osm, with_rejected : bool
        Sent to the service as "yes"/"no" in the ``checkResult``,
        ``structOsm`` and ``withRejected`` parameters.

    Returns
    -------
    dict
        The decoded JSON answer with an extra ``"time"`` entry (seconds
        elapsed since the start of the call) when the status is 200.
    bytes
        The raw response body when a 200 answer is not valid JSON.
    None
        When the status is 204 (no result) or any other status code; details
        are printed.

    Raises
    ------
    Exception
        Any exception raised by the HTTP request is printed, then re-raised.
    """
    t = datetime.now()

    try:
        r = http.request(
            'GET',
            # Single slash after the port, like the batch endpoint URL.
            f'http://{ws_hostname}:5000/REST/nominatimWrapper/v0.1/search',
            fields={
                street_field: addr_data[street_field],
                housenbr_field: addr_data[housenbr_field],
                city_field: addr_data[city_field],
                postcode_field: addr_data[postcode_field],
                country_field: addr_data[country_field],
                # NOTE(review): when the key is absent, None is handed to the
                # HTTP client — confirm this is what the service expects.
                addr_key_field: addr_data[addr_key_field] if addr_key_field in addr_data else None,
                "checkResult": "yes" if check_result else "no",
                "structOsm": "yes" if structured_osm else "no",
                "withRejected": "yes" if with_rejected else "no"
            })
    except Exception as e:
        print("Exception !")
        print(addr_data)
        print(e)
        raise  # bare raise keeps the original traceback

    if r.status == 204:
        print("No result!")
        print(addr_data)
        print(r.data)
        return None

    if r.status == 200:
        try:
            res = json.loads(r.data.decode('utf-8'))
            res["time"] = (datetime.now() - t).total_seconds()
        except ValueError as ve:
            print("Cannot decode result:")
            print(ve)
            print(r.data.decode('utf-8'))
            return r.data
        return res

    print(f"Unknown return code: {r.status} ")
    print(r.data)
    return None



In [7]:
def split_columns(addresses_batch):
    """Expand a frame of record-like cells into a frame with two-level columns.

    Every non-null cell of every column is turned into a ``pandas.Series``
    (with ``dtype=str``); the entries of that Series become sub-columns of the
    original column, so the result has ``(original column, sub-field)`` column
    tuples. Null cells produce missing values.

    Parameters
    ----------
    addresses_batch : pandas.DataFrame
        Frame whose cells are presumably dict-like records (as decoded from
        the web-service JSON) — TODO confirm for other callers.

    Returns
    -------
    pandas.DataFrame
        Frame sharing the input index, with ``MultiIndex`` columns. When the
        input has no column at all, an empty frame with the input index is
        returned (instead of failing on the empty column list).
    """
    def _expand(cell):
        # Null cells become an empty record so that the row is kept.
        return pd.Series(cell, dtype=str) if pd.notnull(cell) else pd.Series(dtype=str)

    # The leading empty frame keeps the input index even without any column.
    pieces = [pd.DataFrame(index=addresses_batch.index, dtype=str)]

    for col in addresses_batch:
        expanded = addresses_batch[col].apply(_expand)
        pieces.append(pd.concat({col: expanded}, names=['L0', 'L1'], axis=1))

    # One concat for all groups rather than re-copying the output at every step.
    output = pd.concat(pieces, axis=1)

    # MultiIndex.from_tuples cannot infer the number of levels from an empty
    # input, so only rebuild the columns when there is something to rebuild.
    if len(output.columns) > 0:
        output.columns = pd.MultiIndex.from_tuples(output.columns)
    return output

In [8]:
def format_ws_res(addresses):
    """Turn the per-row web-service answers into a two-level-column frame.

    Reads the ``json`` column of ``addresses``. Every answer that has a
    "match" entry must contain exactly one match (asserted); that match is
    expanded into one column per top-level key, then passed through
    ``split_columns``. Rows whose answer is None, or has no "match" entry,
    yield missing values.
    """
    def _has_match(js):
        return js is not None and "match" in js

    answers = addresses["json"]

    single_match = answers.apply(lambda js: len(js["match"]) == 1 if _has_match(js) else True)
    assert single_match.all()

    first_matches = answers.apply(
        lambda js: pd.Series(js["match"][0]) if _has_match(js) else pd.Series(dtype=object)
    )
    return split_columns(first_matches)

In [9]:
def call_ws_batch(addr_data, mode="geo", with_rejected=False, check_result=True, structured_osm=False): #lg = "en,fr,nl"
    """Geocode a whole frame of addresses through the ``batch`` endpoint.

    The frame is serialised to CSV (without its index) and posted as the
    ``media`` file together with the ``mode`` and the three yes/no options.

    Returns
    -------
    (match, rejected) : tuple of pandas.DataFrame
        When the decoded answer has a "match" entry. ``rejected`` is an empty
        frame if the answer has no (or an empty) "rejected" entry.
    pandas.DataFrame
        The matches alone, when the answer has no "match" entry.
    bytes
        The raw response body if decoding/expanding it raises ``ValueError``
        (the error and the body are printed).
    """
    def _yes_no(flag):
        return "yes" if flag else "no"

    payload = {
        'media': ('addresses.csv', addr_data.to_csv(index=False)),
        'mode': mode,
        "withRejected": _yes_no(with_rejected),
        "checkResult": _yes_no(check_result),
        "structOsm": _yes_no(structured_osm),
    }

    r = http.request(
        'POST',
        f'http://{ws_hostname}:5000/REST/nominatimWrapper/v0.1/batch',
        fields=payload)

    try:
        res_json = json.loads(r.data.decode('utf-8'))

        # Answer without a "match" entry: the whole payload is the match list.
        if "match" not in res_json:
            return split_columns(pd.DataFrame(res_json))

        match = split_columns(pd.DataFrame(res_json["match"]))
        if "rejected" in res_json and len(res_json["rejected"]) > 0:
            rejected = split_columns(pd.DataFrame(res_json["rejected"]))
        else:
            rejected = pd.DataFrame()
        return match, rejected

    except ValueError as ve:
        print("Cannot decode result:")
        print(ve)
        print(r.data.decode('utf-8'))
        return r.data
    


In [10]:
# def expand_batch_address(addresses):
#     for field in [street_field, housenbr_field, postcode_field, city_field]:
#         addresses["addrOut"+field]= addresses.address.apply(lambda d: d[field] if d is not None and field in d else "")
#     return 

# Calls

## Single address calls

In [11]:
# Single-address lookup, passing an explicit address key.
res = call_ws(
    {
        addr_key_field: 5,
        street_field: "Avenue Fonsny",
        housenbr_field: "20",
        city_field: "Saint-Gilles",
        postcode_field: "1060",
        country_field: "Belgique",
    },
    check_result=False,
    structured_osm=False,
    with_rejected=True,
)
res

{'match': [{'work': {'method': 'fast',
    'transformedAddress': 'Avenue Fonsny, 20, 1060 Saint-Gilles, Belgique'},
   'nominatim': {'displayName': '20, Avenue Fonsny - Fonsnylaan, Saint-Gilles - Sint-Gillis, Brussel-Hoofdstad - Bruxelles-Capitale, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, 1060, België / Belgique / Belgien',
    'placeId': 182128,
    'lat': '50.8358677',
    'lon': '4.3385087',
    'placeRank': 30,
    'houseNumber': '20',
    'road': 'Avenue Fonsny - Fonsnylaan',
    'town': 'Saint-Gilles - Sint-Gillis',
    'county': 'Brussel-Hoofdstad - Bruxelles-Capitale',
    'region': 'Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest',
    'postcode': '1060',
    'country': 'België / Belgique / Belgien',
    'countryCode': 'be'},
   'output': {'streetName': 'Avenue Fonsny - Fonsnylaan',
    'city': 'Saint-Gilles - Sint-Gillis',
    'houseNumber': '20',
    'country': 'België / Belgique / Belgien',
    'postCode': '1060',
    'inHouseNbr': '',
  

In [12]:
res.keys()

dict_keys(['match', 'rejected', 'time'])

In [13]:
# Same lookup, this time without any address key in the input.
res = call_ws(
    {
        street_field: "Avenue Fonsny",
        housenbr_field: "20",
        city_field: "Saint-Gilles",
        postcode_field: "1060",
        country_field: "Belgique",
    },
    check_result=False,
    structured_osm=False,
    with_rejected=True,
)
res

{'match': [{'work': {'method': 'fast',
    'transformedAddress': 'Avenue Fonsny, 20, 1060 Saint-Gilles, Belgique'},
   'nominatim': {'displayName': '20, Avenue Fonsny - Fonsnylaan, Saint-Gilles - Sint-Gillis, Brussel-Hoofdstad - Bruxelles-Capitale, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, 1060, België / Belgique / Belgien',
    'placeId': 182128,
    'lat': '50.8358677',
    'lon': '4.3385087',
    'placeRank': 30,
    'houseNumber': '20',
    'road': 'Avenue Fonsny - Fonsnylaan',
    'town': 'Saint-Gilles - Sint-Gillis',
    'county': 'Brussel-Hoofdstad - Bruxelles-Capitale',
    'region': 'Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest',
    'postcode': '1060',
    'country': 'België / Belgique / Belgien',
    'countryCode': 'be'},
   'output': {'streetName': 'Avenue Fonsny - Fonsnylaan',
    'city': 'Saint-Gilles - Sint-Gillis',
    'houseNumber': '20',
    'country': 'België / Belgique / Belgien',
    'postCode': '1060',
    'inHouseNbr': '',
  

## Batch calls (row by row)

In [14]:
# Load the test addresses. Only the "restobe" file is active; the other
# sources are kept commented out so they can be switched on when needed.
addresses = pd.concat([
   get_addresses("data/address_restobe.csv.gz"),
   # get_addresses("data/address_best.csv.gz"),
    # get_addresses("data/address_cbe.csv.gz")
])
# Renumber the rows from 0 and use that position (as a string) as the address
# key, overwriting any key read from the file.
addresses = addresses.reset_index(drop=True)
addresses[addr_key_field] = addresses.index.astype(str)
addresses

Unnamed: 0,addrKey,streetName,houseNumber,city,postCode,country
0,0,Chaussée de Namur,198,LEUZE,5310,Belgique
1,1,Rue de la Halle,11,NAMUR,5000,Belgique
2,2,Avenue Jacques Sermon,38,GANSHOREN,1083,Belgique
3,3,rue du Centre,200,WAIMES,4950,Belgique
4,4,Kleine Waterstraat,9,HULSHOUT,2235,Belgique
...,...,...,...,...,...,...
995,995,Rue Americaine,90,IXELLES,1050,Belgique
996,996,Kloosterstraat,60,EKEREN,2180,Belgique
997,997,Markt,13,BALEN,2490,Belgique
998,998,Lange Lobroekstraat,65,ANVERS,2060,Belgique


In [15]:
# addresses = addresses[addresses.addr_key.isin([ "2078829"])]#"1622",
# addresses.dtypes


### Simple way

In [16]:
# addresses = addresses[addresses.streetName.str.lower().str.contains("steenweg op antwerpen")]

# # addresses = addresses.loc[[100,112,118,142,155,179,198]]
# # addresses = addresses.loc[[ 94,  100,  198, ]]
# addresses = addresses.loc[0:100]
# addresses

In [17]:
# Sequential run: one call_ws request per row, on a copy so that `addresses`
# itself does not get the "json" column.
addresses_seq = addresses.copy()

t = datetime.now()
# Missing values are replaced by "" before each row is passed to call_ws;
# the answer of every call is stored in the "json" column.
addresses_seq["json"] = addresses_seq.fillna("").progress_apply(call_ws, check_result=False, structured_osm=False, axis=1)
tot_time = (datetime.now() - t).total_seconds()
print(f"{tot_time:.2f} seconds, {addresses_seq.shape[0]/tot_time:.2f} it/s")
# Timings recorded on earlier runs:
# KBO dataset:
# Normal mode: 128.78 seconds, 7.77 it/s
# Fastmode:     68.80 seconds, 14.54 it/s

#Resto dataset: 
# Normal mode: 145.73 seconds, 6.86 it/s
# Fast mode:    82.99 seconds, 12.05 it/s

# Best dataset:
# Normal mode: 108.53 seconds, 9.21 it/s
# Fast mode: 37.44 seconds, 26.71 it/s

addresses_seq

  0%|          | 0/1000 [00:00<?, ?it/s]

No result!
addrKey                      784
streetName     route de Balmoral
houseNumber                   35
city                SART-LEZ-SPA
postCode                    4845
country                 Belgique
Name: 784, dtype: object
b''
71.96 seconds, 13.90 it/s


Unnamed: 0,addrKey,streetName,houseNumber,city,postCode,country,json
0,0,Chaussée de Namur,198,LEUZE,5310,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
1,1,Rue de la Halle,11,NAMUR,5000,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
2,2,Avenue Jacques Sermon,38,GANSHOREN,1083,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
3,3,rue du Centre,200,WAIMES,4950,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
4,4,Kleine Waterstraat,9,HULSHOUT,2235,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
...,...,...,...,...,...,...,...
995,995,Rue Americaine,90,IXELLES,1050,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
996,996,Kloosterstraat,60,EKEREN,2180,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
997,997,Markt,13,BALEN,2490,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
998,998,Lange Lobroekstraat,65,ANVERS,2060,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."


In [18]:
addresses_seq.json.iloc[784]

In [19]:
addresses_seq_out = format_ws_res(addresses_seq)
addresses_seq_out

Unnamed: 0_level_0,work,work,work,work,nominatim,nominatim,nominatim,nominatim,nominatim,nominatim,...,output,output,output,output,input,input,input,input,input,input
Unnamed: 0_level_1,method,transformedAddress,osmOrder,cleansedHouseNbr,displayName,placeId,lat,lon,placeRank,houseNumber,...,inHouseNbr,lpostHouseNbr,lpostUnit,other,streetName,houseNumber,postCode,city,country,addrKey
0,fast,"Chaussée de Namur, 198, 5310 LEUZE, Belgique",,,"198, Chaussée de Namur, Leuze, Éghezée, Namur,...",2580480,50.5556305,4.907401297756147,30,198,...,,198,,,Chaussée de Namur,198,5310,LEUZE,Belgique,0
1,fast,"Rue de la Halle, 11, 5000 NAMUR, Belgique",,,"Rue de la Halle, Namur, Wallonie, 5000, België...",392307,50.4627801,4.8655064,26,,...,,11,,,Rue de la Halle,11,5000,NAMUR,Belgique,1
2,fast,"Avenue Jacques Sermon, 38, 1083 GANSHOREN, Bel...",,,"38, Avenue Jacques Sermon - Jacques Sermonlaan...",1094419,50.868676699999995,4.321060219298245,30,38,...,,38,,,Avenue Jacques Sermon,38,1083,GANSHOREN,Belgique,2
3,fast,"rue du Centre, 200, 4950 WAIMES, Belgique",,,"Rue du Centre, Waimes, Verviers, Liège, Wallon...",342040,50.4161219,6.1119552,26,,...,,200,,,rue du Centre,200,4950,WAIMES,Belgique,3
4,fast,"Kleine Waterstraat, 9, 2235 HULSHOUT, Belgique",,,"Tennisclub Lybo, 9, Kleine Waterstraat, Houtve...",171744,51.0399911,4.8089699,30,9,...,,9,,,Kleine Waterstraat,9,2235,HULSHOUT,Belgique,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,fast,"Rue Americaine, 90, 1050 IXELLES, Belgique",,,"Rue Américaine - Amerikaanse straat, Bascule, ...",263851,50.8214139,4.3649261,26,,...,,90,,,Rue Americaine,90,1050,IXELLES,Belgique,995
996,fast,"Kloosterstraat, 60, 2180 EKEREN, Belgique",,,"60, Kloosterstraat, Schoonbroek, Ekeren, Antwe...",1697847,51.281449050000006,4.417645878242929,30,60,...,,60,,,Kloosterstraat,60,2180,EKEREN,Belgique,996
997,fast,"Markt, 13, 2490 BALEN, Belgique",,,"13, Markt, Balen, Turnhout, Antwerpen, Vlaande...",649721,51.17051085,5.168191792076369,30,13,...,,13,,,Markt,13,2490,BALEN,Belgique,997
998,fast,"Lange Lobroekstraat, 65, 2060 ANVERS, Belgique",,,"65, Lange Lobroekstraat, Stuivenberg, Antwerpe...",2166425,51.22927815,4.4322881059636465,30,65,...,,65,,,Lange Lobroekstraat,65,2060,ANVERS,Belgique,998


In [20]:
addresses_seq.json.iloc[0]

{'match': [{'work': {'method': 'fast',
    'transformedAddress': 'Chaussée de Namur, 198, 5310 LEUZE, Belgique'},
   'nominatim': {'displayName': '198, Chaussée de Namur, Leuze, Éghezée, Namur, Wallonie, 5310, België / Belgique / Belgien',
    'placeId': 2580480,
    'lat': '50.5556305',
    'lon': '4.907401297756147',
    'placeRank': 30,
    'houseNumber': '198',
    'road': 'Chaussée de Namur',
    'village': 'Leuze',
    'municipality': 'Éghezée',
    'county': 'Namur',
    'state': 'Namur',
    'region': 'Wallonie',
    'postcode': '5310',
    'country': 'België / Belgique / Belgien',
    'countryCode': 'be'},
   'output': {'streetName': 'Chaussée de Namur',
    'city': 'Leuze',
    'houseNumber': '198',
    'country': 'België / Belgique / Belgien',
    'postCode': '5310',
    'inHouseNbr': '',
    'lpostHouseNbr': '198',
    'lpostUnit': ''},
   'input': {'streetName': 'Chaussée de Namur',
    'houseNumber': '198',
    'postCode': '5310',
    'city': 'LEUZE',
    'country': 'Belg

In [22]:
# addresses_seq_out[addresses_seq_out[("work", 0)].isnull()]

In [24]:
# addresses_seq_out[addresses_seq.json.apply(lambda js: "osmOrder" in js["match"][0]['work'] if js else None).fillna(False)]

In [25]:
addresses_seq

Unnamed: 0,addrKey,streetName,houseNumber,city,postCode,country,json
0,0,Chaussée de Namur,198,LEUZE,5310,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
1,1,Rue de la Halle,11,NAMUR,5000,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
2,2,Avenue Jacques Sermon,38,GANSHOREN,1083,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
3,3,rue du Centre,200,WAIMES,4950,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
4,4,Kleine Waterstraat,9,HULSHOUT,2235,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
...,...,...,...,...,...,...,...
995,995,Rue Americaine,90,IXELLES,1050,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
996,996,Kloosterstraat,60,EKEREN,2180,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
997,997,Markt,13,BALEN,2490,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."
998,998,Lange Lobroekstraat,65,ANVERS,2060,Belgique,"{'match': [{'work': {'method': 'fast', 'transf..."


### Using Dask

In [26]:
addresses_dask = addresses.copy()

In [27]:
addresses_dask

Unnamed: 0,addrKey,streetName,houseNumber,city,postCode,country
0,0,Chaussée de Namur,198,LEUZE,5310,Belgique
1,1,Rue de la Halle,11,NAMUR,5000,Belgique
2,2,Avenue Jacques Sermon,38,GANSHOREN,1083,Belgique
3,3,rue du Centre,200,WAIMES,4950,Belgique
4,4,Kleine Waterstraat,9,HULSHOUT,2235,Belgique
...,...,...,...,...,...,...
995,995,Rue Americaine,90,IXELLES,1050,Belgique
996,996,Kloosterstraat,60,EKEREN,2180,Belgique
997,997,Markt,13,BALEN,2490,Belgique
998,998,Lange Lobroekstraat,65,ANVERS,2060,Belgique


In [28]:
# Parallel run: the same per-row call_ws requests, spread over 8 Dask partitions.
t = datetime.now()
dd_addresses = dd.from_pandas(addresses_dask.fillna(""), npartitions=8)

# call_ws returns a dict (or None), so the output is declared with object
# dtype; declaring it as 'str' describes it to Dask as a text column.
dask_task = dd_addresses.apply(call_ws, check_result=False, meta=('json', object), axis=1)

with ProgressBar(): 
    addresses_dask["json"] = dask_task.compute()
    
tot_time = (datetime.now() - t).total_seconds()
print(f"{tot_time:.2f} seconds, {addresses_dask.shape[0]/tot_time:.2f} it/s")
# Timings recorded on earlier runs:
# KBO dataset:
# Normal mode: 24.52 seconds, 40.79 it/s
# Fastmode:    15.81 seconds, 63.27 it/s


# Resto dataset:
# Normal mode: 27.86 seconds, 35.89 it/s
# Fast mode:   18.44 seconds, 54.23 it/s

# Best dataset: 
# Normal mode: 16.11 seconds, 62.07 it/s
# Fast mode:    9.76 seconds, 102.42 it/s

[                                        ] | 0% Completed |  6.6sNo result!
addrKey                      784
streetName     route de Balmoral
houseNumber                   35
city                SART-LEZ-SPA
postCode                    4845
country                 Belgique
Name: 784, dtype: object
b''
[########################################] | 100% Completed | 14.7s
14.69 seconds, 68.08 it/s


In [29]:
# 1000, 1 worker: 4m18
# 4 workers, npart=4 : 1m20
# 8 workers, npart=4 : 1m20
# 8 workers, npart=8 : 44s

# with checker=False:
# 8 workers, npart=8 : 24s


In [30]:
# expand_json(addresses_dask)

addresses_dask_out = format_ws_res(addresses_dask)
addresses_dask_out

Unnamed: 0_level_0,work,work,work,work,nominatim,nominatim,nominatim,nominatim,nominatim,nominatim,...,output,output,output,output,input,input,input,input,input,input
Unnamed: 0_level_1,method,transformedAddress,osmOrder,cleansedHouseNbr,displayName,placeId,lat,lon,placeRank,houseNumber,...,inHouseNbr,lpostHouseNbr,lpostUnit,other,streetName,houseNumber,postCode,city,country,addrKey
0,fast,"Chaussée de Namur, 198, 5310 LEUZE, Belgique",,,"198, Chaussée de Namur, Leuze, Éghezée, Namur,...",2580480,50.5556305,4.907401297756147,30,198,...,,198,,,Chaussée de Namur,198,5310,LEUZE,Belgique,0
1,fast,"Rue de la Halle, 11, 5000 NAMUR, Belgique",,,"Rue de la Halle, Namur, Wallonie, 5000, België...",392307,50.4627801,4.8655064,26,,...,,11,,,Rue de la Halle,11,5000,NAMUR,Belgique,1
2,fast,"Avenue Jacques Sermon, 38, 1083 GANSHOREN, Bel...",,,"38, Avenue Jacques Sermon - Jacques Sermonlaan...",1094419,50.868676699999995,4.321060219298245,30,38,...,,38,,,Avenue Jacques Sermon,38,1083,GANSHOREN,Belgique,2
3,fast,"rue du Centre, 200, 4950 WAIMES, Belgique",,,"Rue du Centre, Waimes, Verviers, Liège, Wallon...",342040,50.4161219,6.1119552,26,,...,,200,,,rue du Centre,200,4950,WAIMES,Belgique,3
4,fast,"Kleine Waterstraat, 9, 2235 HULSHOUT, Belgique",,,"Tennisclub Lybo, 9, Kleine Waterstraat, Houtve...",171744,51.0399911,4.8089699,30,9,...,,9,,,Kleine Waterstraat,9,2235,HULSHOUT,Belgique,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,fast,"Rue Americaine, 90, 1050 IXELLES, Belgique",,,"Rue Américaine - Amerikaanse straat, Bascule, ...",263851,50.8214139,4.3649261,26,,...,,90,,,Rue Americaine,90,1050,IXELLES,Belgique,995
996,fast,"Kloosterstraat, 60, 2180 EKEREN, Belgique",,,"60, Kloosterstraat, Schoonbroek, Ekeren, Antwe...",1697847,51.281449050000006,4.417645878242929,30,60,...,,60,,,Kloosterstraat,60,2180,EKEREN,Belgique,996
997,fast,"Markt, 13, 2490 BALEN, Belgique",,,"13, Markt, Balen, Turnhout, Antwerpen, Vlaande...",649721,51.17051085,5.168191792076369,30,13,...,,13,,,Markt,13,2490,BALEN,Belgique,997
998,fast,"Lange Lobroekstraat, 65, 2060 ANVERS, Belgique",,,"65, Lange Lobroekstraat, Stuivenberg, Antwerpe...",2166425,51.22927815,4.4322881059636465,30,65,...,,65,,,Lange Lobroekstraat,65,2060,ANVERS,Belgique,998


In [31]:
# addresses_dask.method.value_counts()#.json.loc[550]
addr_key_field

'addrKey'

In [32]:
# Compare the sequential and the Dask results on the key, the method and the
# output address fields. With an outer merge, any row that differs shows up
# twice, so equal row counts mean identical results.
fields = [
    ("input", addr_key_field),
    ("work", "method"),
    ("output", city_field),
    ("output", postcode_field),
    ("output", street_field),
    ("output", housenbr_field),
]
mg = pd.merge(addresses_seq_out[fields], addresses_dask_out[fields], how="outer", indicator=True)
print("Same result in seq and dask run!"
      if mg.shape[0] == addresses.shape[0]
      else "!!! Not the same result in seq and dask run!")
    

Same result in seq and dask run!


  mg = addresses_seq_out[fields].merge(


In [33]:
mg

Unnamed: 0_level_0,input,work,output,output,output,output,_merge
Unnamed: 0_level_1,addrKey,method,city,postCode,streetName,houseNumber,Unnamed: 7_level_1
0,0,fast,Leuze,5310,Chaussée de Namur,198,both
1,1,fast,Namur,5000,Rue de la Halle,,both
2,2,fast,Ganshoren,1083,Avenue Jacques Sermon - Jacques Sermonlaan,38,both
3,3,fast,Waimes,4950,Rue du Centre,,both
4,4,fast,Hulshout,2235,Kleine Waterstraat,9,both
...,...,...,...,...,...,...,...
995,995,fast,Ixelles - Elsene,1050,Rue Américaine - Amerikaanse straat,,both
996,996,fast,Ekeren,2180,Kloosterstraat,60,both
997,997,fast,Balen,2490,Markt,13,both
998,998,fast,Antwerpen,2060,Lange Lobroekstraat,65,both


## Batch calls (batch WS)

### Single block

In [None]:
# addresses=addresses[addresses.addrKey.isin(["1075", "1331", "2616"])]
# addresses=addresses[addresses.addrKey.isin(["2206", "2556", "2569", "2715"])]
# addresses=addresses.iloc[0:100]
addresses

In [None]:
# Send all the addresses in a single batch request ("long" mode) and time it.
t = datetime.now()

# with_rejected=True, so a (match, rejected) pair is expected back.
addresses_batch, rejected = call_ws_batch(addresses[[addr_key_field, 
                                           street_field, housenbr_field, postcode_field, city_field, country_field]], 
                                mode="long", 
                                check_result=False, 
                                structured_osm=False,
                                with_rejected=True)

tot_time = (datetime.now() - t).total_seconds()
print(f"{tot_time:.2f} seconds, {addresses.shape[0]/tot_time:.2f} it/s")
# Timings recorded on earlier runs:
# KBO dataset: 33.94 seconds, 29.46 it/s
# Best:        24.99 seconds, 40.01 it/s
# Resto:       38.33 seconds, 26.09 it/s

addresses_batch

In [None]:
rejected

In [None]:
# Compare the sequential results with the single-block batch results.
# Rows without any result in the sequential run (missing key) are dropped,
# and the sequential method label "fast" is mapped to "orig" before merging.
fields = [("input", addr_key_field), ("work", "method"), ("output", city_field), ("output", postcode_field),("output", street_field), ("output", housenbr_field)]
mg = addresses_seq_out[fields].dropna(subset=[("input", addr_key_field)]).replace("fast", "orig").merge(
    addresses_batch[fields], how="outer", indicator=True)

# Every batch row must be found, identical, in the sequential results.
if mg[mg._merge == "both"].shape[0] == addresses_batch.shape[0]:
    print("Same result in seq and batch run!")
else: 
    print("!!! Not the same result in seq and batch run!")
    

In [None]:
mg

In [None]:
# Batch call in "geo" mode, asking for the rejected addresses as well.
match, rej = call_ws_batch(addresses[[addr_key_field, street_field, housenbr_field, postcode_field, city_field, country_field]], mode="geo", check_result=False, with_rejected=True) 
match

In [None]:
# Batch call in "short" mode, without asking for rejected addresses.
# NOTE(review): call_ws_batch returns a single DataFrame (not a pair) when the
# answer has no "match" entry; confirm that the two-name unpacking below is
# valid for this mode.
match, rej = call_ws_batch(addresses[[addr_key_field, street_field, housenbr_field, postcode_field, city_field, country_field]], mode="short", check_result=False) 
match


In [None]:
# addresses_batch.columns

In [None]:
# Batch call in "long" mode on the full frame (all columns are sent),
# with rejected addresses.
match, rej = call_ws_batch(addresses, mode="long", with_rejected=True)
match

### Batch blocks

In [None]:
# addresses = addresses.sample(10000, replace=True)
# addresses = addresses.reset_index(drop=True)
# addresses["addr_key"]= addresses.index.astype(str)

In [None]:
# Split the addresses into chunks and send one batch request per chunk, in parallel.
t = datetime.now()

nb_threads=8

# Split by row position: calling np.array_split directly on a DataFrame goes
# through DataFrame.swapaxes, which recent pandas versions deprecate.
chunks = [addresses.iloc[positions]
          for positions in np.array_split(np.arange(addresses.shape[0]), nb_threads)]

print(f"{len(chunks)} chunks on {nb_threads} threads")

import multiprocess as mp

p = mp.Pool(nb_threads)

def f(chunk):
    """Send one chunk of addresses to the batch endpoint ("long" mode)."""
    # NOTE(review): pd.concat below needs a DataFrame per chunk, i.e. it relies
    # on call_ws_batch returning the matches alone — confirm for this mode.
    return call_ws_batch(chunk, mode="long", 
                        check_result=False, 
                        structured_osm=False)

with p:
     res= list(tqdm(p.imap(f, chunks), total=len(chunks)))
    
addresses_batch2 = pd.concat(res).reset_index(drop=True)

tot_time = (datetime.now() - t).total_seconds()
print(f"{tot_time:.2f} seconds, {addresses.shape[0]/tot_time:.2f} it/s")
# Timings recorded on earlier runs:
# KBO:    9.28 seconds, 107.72 it/s
# Best:   6.88 seconds, 145.43 it/s
# Resto: 11.79 seconds,  84.85 it/s

In [None]:
# addresses_batch2
# expand_batch_address(addresses_batch2)

In [None]:
# Compare the sequential results with the chunked batch results.
# Rows without any result in the sequential run (missing key) are dropped,
# and the sequential method label "fast" is mapped to "orig" before merging.
fields = [("input", addr_key_field), ("work", "method"), ("output", city_field), ("output", postcode_field),("output", street_field), ("output", housenbr_field)]
mg = addresses_seq_out[fields].dropna(subset=[("input", addr_key_field)]).replace("fast", "orig").merge(
    addresses_batch2[fields], how="outer", indicator=True)

# Every batch row must be found, identical, in the sequential results.
if mg[mg._merge == "both"].shape[0] == addresses_batch2.shape[0]:
    print("Same result in seq and batch run!")
else: 
    print("!!! Not the same result in seq and batch run!")
    

## Comparing options

In [None]:
# addresses = get_addresses("address.csv.gz")
# addresses = addresses[addresses[country_field] == "Belgique"]
# # addresses = addresses.sample(10000).copy()

In [None]:
# Run the batch service once per (check, struct) option combination, keeping
# the result of each run and its throughput (addresses per second).
results = {}
it_per_seconds = pd.DataFrame()

for check_label, struct_label in [(c, s) for c in ("check", "nocheck") for s in ("struct", "unstruct")]:
    print(check_label, struct_label)
    start = datetime.now()

    results[(check_label, struct_label)] = call_ws_batch(
        addresses,
        mode="short",
        check_result=(check_label == "check"),
        structured_osm=(struct_label == "struct"),
    )

    it_per_seconds.loc[check_label, struct_label] = addresses.shape[0] / (datetime.now() - start).total_seconds()

print("Iterations per seconds:")
it_per_seconds

In [None]:
# Share of input addresses having a result row, per option combination
# (columns: check / nocheck, rows: struct / unstruct).
print("Match rate")
pd.DataFrame({k1: {k2: results[(k1,k2)].shape[0]/addresses.shape[0] for k2 in ["struct", "unstruct"]} 
                  for k1 in  ["check","nocheck"]})

In [None]:
# Same match rate, but ignoring the result rows whose ("work", "method")
# value is 'nostreet'.
print("Match rate (without nostreet)")
pd.DataFrame({k1: {k2: results[(k1,k2)][results[(k1,k2)][('work', 'method')]!='nostreet'].shape[0]/addresses.shape[0] for k2 in ["struct", "unstruct"]} 
                  for k1 in  ["check","nocheck"]})

In [None]:
# For every option combination, show the input addresses whose key is absent
# from the results, and count them per country.
print("Unmatched addresses")
for k1 in results:
    print(k1)
    nomatch=addresses[~addresses[addr_key_field].isin(results[k1][("input", "addrKey")])]
    display(nomatch)
    print(nomatch[country_field].value_counts())

In [None]:
# Pairwise comparison of the option combinations stored in `results`.
# The diagonal of vc_values holds the number of result rows of each
# combination; each compared pair gets a summary label (see vc_label).
#
# NOTE(review): this cell indexes the result frames with flat column names
# ("addrKey", "placeId", "method", "addrOut...", "lpostHouseNbr"), whereas
# split_columns builds two-level columns and the "Unmatched addresses" cell
# uses ("input", "addrKey"). It was presumably written for an earlier, flat
# result format — confirm and adapt before relying on it.
vc_values = pd.DataFrame(columns=results.keys(), index=results.keys())

for k1 in results:
    vc_values.loc[k1, k1] = results[k1].shape[0]
    for k2 in results:
        # Tuple ordering makes sure each unordered pair is handled only once.
        if k1>k2:
            r1=results[k1]
            r2=results[k2]
            # Outer merge on the key: `_merge` tells in which result(s) each key appears.
            mg = r1[["addrKey", "placeId"]].merge(r2[["addrKey", "placeId"]], on="addrKey", how="outer", indicator=True)
 
            vc = mg._merge.value_counts()

            # Keys for which the two runs disagree on the place id, joined with
            # the input address and with the output of each run.
            mismatches = mg[mg.placeId_x != mg.placeId_y][["addrKey"]]
            mismatches = mismatches.merge(addresses)
            mismatches = mismatches.merge(r1[[addr_key_field, "placeId", "method", f"addrOut{street_field}", f"addrOut{housenbr_field}", f"addrOut{postcode_field}", f"addrOut{city_field}", "lpostHouseNbr"]], on=addr_key_field)
            mismatches = mismatches.merge(r2[[addr_key_field, "placeId", "method", f"addrOut{street_field}", f"addrOut{housenbr_field}", f"addrOut{postcode_field}", f"addrOut{city_field}", "lpostHouseNbr"]], on=addr_key_field)
            # Group the columns: 6 input columns, then 7 columns per run.
            mismatches.columns = pd.MultiIndex.from_arrays([["Input"]*6 + [f"x:{k1}"]*7 + [f"y:{k2}"]*7, mismatches.columns])

            # Rows where at least one output value differs between the two runs.
            mismatch_values = mismatches[(mismatches[f"x:{k1}"].rename(lambda x: x.replace("_x", ""), axis=1).fillna("") != 
                                          mismatches[f"y:{k2}"].rename(lambda x: x.replace("_y", ""), axis=1).fillna("")).any(axis=1)]
            
            # Same comparison, ignoring the output house number.
            mismatch_values_no_nmbr = mismatches[(mismatches[f"x:{k1}"].rename(lambda x: x.replace("_x", ""), axis=1).drop(f"addrOut{housenbr_field}", axis=1).fillna("") != 
                                                  mismatches[f"y:{k2}"].rename(lambda x: x.replace("_y", ""), axis=1).drop(f"addrOut{housenbr_field}", axis=1).fillna("")).any(axis=1)]
            
            
            # Label: both (place id mismatches - value mismatches - value
            # mismatches ignoring number) / only in k1 / only in k2.
            vc_label = f"{vc['both']} ({mismatches.shape[0]} - {mismatch_values.shape[0]} - {mismatch_values_no_nmbr.shape[0]}) / {vc['left_only']} / {vc['right_only']}"
            vc_values.loc[k1, k2]=vc_label

                
            print(f"{k1} vs {k2}")
            print(vc_label)
            print("-----------------------------")
            
            print(f"Only in {k1}")
            display(r1[r1[addr_key_field].isin(mg[mg._merge=="left_only"][addr_key_field])].merge(addresses))
            
            print(f"Only in {k2}")
            display(r2[r2[addr_key_field].isin(mg[mg._merge=="right_only"][addr_key_field])].merge(addresses))
            
            print("Mismatch on place_id")
            display(mismatches)
            
            print("Mismatch on values")
            
            display(mismatch_values)
            
            print("Mismatch on values (no nbr)")
            display(mismatch_values_no_nmbr)
            
            print("#######################")
            
# display(vc_values.fillna(""))

In [None]:
# Spot check: show what two option combinations returned for one address key.
# NOTE(review): the queries use a flat "addrKey" column; the result frames
# built by split_columns have two-level columns — confirm this still works.
# r1=results[('check', 'unstruct')].merge(addresses)
# r2=results[('check', 'struct')].merge(addresses)
r1=results[('nocheck', 'struct')].merge(addresses)
r2=results[('check', 'struct')].merge(addresses)
k = 2969
display(r1.query(f"addrKey=='{k}'"))
display(r2.query(f"addrKey=='{k}'"))

In [None]:
# addresses.iloc[0:60]#.dtypes

In [None]:
print("Common in both (disagree on place_id - disagree on values - disagree on values, ignoring number) / results only for row / results only for columns")
vc_values.fillna("")

# tests

In [None]:
osm_host ="172.27.0.64:8080"
def get_osm(addr, accept_language = ""): #lg = "en,fr,nl"
    """Query the OSM search service at ``osm_host`` for a free-text address.

    Parameters
    ----------
    addr : str
        Address, sent as the ``q`` parameter.
    accept_language : str
        Sent as the ``accept-language`` parameter.

    Returns
    -------
    The decoded JSON answer of ``search.php`` (at most 50 results are
    requested, with address and name details).

    Raises
    ------
    Exception
        If the request or the JSON decoding fails; the original error is
        attached as the cause.
    """
    # Import the submodules explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` and `urllib.request` are loaded.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    params = urlencode({"q": addr,
                        "format":"jsonv2",
                        "accept-language":accept_language,
                        "addressdetails":"1",
                        "namedetails" : "1",
                        "limit": "50"
                        })

    url = "http://%s/search.php?%s"%(osm_host, params)
    try:
        with urlopen(url) as response:
            return json.loads(response.read())
    except Exception as e:
        raise Exception (f"Cannot get OSM results ({osm_host}): {e}") from e

In [None]:
get_osm(" Strokapelstraat, 14, 2235 Hulshout, Belgique")

In [None]:
# Scratch: give `addr` two-level column names.
# NOTE(review): `addr` is only defined in a later cell (addr= addresses.iloc[0:10]),
# so this cell fails on a fresh top-to-bottom run. Seven names are assigned;
# confirm that the frame really has seven columns at this point.
addr.columns = pd.MultiIndex.from_tuples([("A", "addrKey"), ("A", "streetName"), ("B", "houseNumber"), ("B", "city"), ("B", "postCode"), ("C", "country"), ("test", "test")])
addr

In [None]:
# addresses.iloc[0:10].T.to_dict()#groupby(["A", "B", "C"]).unstack().to_dict(orient="index")#.groupby(level=0).apply(lambda df: df.xs(df.name).to_dict()).to_dict()
addr.columns.get_level_values(0).unique()

In [None]:
addr= addresses.iloc[0:10]
# [{l0: {rec for rec in addr[l0].to_dict(orient="records")} for l0 in addr.columns.get_level_values(0).unique()}]#
addr

In [None]:
# Scratch: one dict per first-level column group. The inner comprehension
# reuses the same key for every record, so only the last record of each
# group is kept. `l0` is not used by the expression below.
l0='A'
[{lev0:rec for rec in addr[lev0].to_dict(orient="records")} for lev0 in addr.columns.get_level_values(0).unique()]

In [None]:
# [rec for rec in addr.to_dict(orient="dict")]
addr#.to_dict(orient="index")
# addr = addr.copy()

In [None]:
addr#.loc[8, ('B', 'houseNumber')] = None

In [None]:
# Scratch: turn each row of the two-level-column frame into a nested dict
# {first level: {second level: value}}. Note that this rebinds `res`.
res = [{ k1: {k2: rec[(k1, k2)]  for k2 in addr[k1].columns.get_level_values(0)} for k1 in addr.columns.get_level_values(0).unique() }  for rec in addr.to_dict(orient="records")]

In [None]:
[{k1: {k2: rec[k1][k2] for k2 in rec[k1] if not pd.isnull(rec[k1][k2]) and rec[k1][k2] != ""} for k1 in rec} for rec in res]

In [None]:
addresses[("input",)]

In [None]:
addresses.columns

In [None]:
addresses[("output", "test")]=1

In [None]:
addresses