In [1]:
import pandas as pd
import urllib

import numpy as np

import json

from tqdm.autonotebook import tqdm

#%matplotlib inline

tqdm.pandas()

import dask.dataframe as dd

from dask.multiprocessing import get
from dask.diagnostics import ProgressBar

from datetime import datetime
import matplotlib.pyplot as plt

from IPython.display import display


  


In [2]:
import urllib3

In [3]:
http = urllib3.PoolManager()

In [4]:
from config_batch import * 

# Functions

In [5]:
ws_hostname = "127.0.1.1"
# ws_hostname = "192.168.1.3"


In [47]:
def call_ws(addr_data, check_result=True): #lg = "en,fr,nl"
    t = datetime.now()
    
    params = urllib.parse.urlencode({"street": addr_data[street_field],
                                     "housenumber": addr_data[housenbr_field],
                                     "city": addr_data[city_field],
                                     "postcode": addr_data[postcode_field],
                                     "country": addr_data[country_field],
                                     "check_result" : "yes" if check_result else "no"
                                    })
    url = f"http://{ws_hostname}:5000/search/?{params}"
    
    try:
        with urllib.request.urlopen(url) as response:
            res = response.read()
            res = json.loads(res)
#             print(res)
            res["time"] = datetime.now() - t
            return res
    except Exception as e:
        return str(e)
    

In [49]:
def call_ws_batch(addr_data, mode="geo", with_reject=False, check_result=True): #lg = "en,fr,nl"
#     print(addr_data)
#     print(addr_data.shape)
#     print()
    file_data = addr_data.rename(columns = {
        street_field : "street",
        housenbr_field: "housenumber",
        postcode_field: "postcode",
        city_field: "city",
        country_field: "country",
        addr_key_field : "addr_key"
    }).to_csv(index=False)
    
    r = http.request(
    'POST',
    f'http://{ws_hostname}:5000/batch',
    fields= { 
        'media': ('addresses.csv', file_data),
        'mode': mode,
        "with_rejected" : "yes" if with_reject else "no",
        "check_result" : "yes" if check_result else "no"
    })
    
    try:
        res = pd.DataFrame(json.loads(r.data.decode('utf-8')))
    except ValueError:
        print("Cannot decode result:")
        print(json.loads(r.data.decode('utf-8')))
        return 
#     display(res)
    return res

In [8]:
def expand_json(addresses):
    addresses["status"]= addresses.json.apply(lambda d: "error" if "error" in d else "match" if "match" in d else "rejected")
    addresses["time"]  = addresses.json.apply(lambda d: d["time"])

    addresses["timing"]  = addresses.json.apply(lambda d: d["timing"] if "timing" in d else {})

    addresses["method"]= addresses.json.apply(lambda d: d["match"][0]["method"] if len(d)>0 and "match" in d else "none")
    
    for field in ["street", "number", "postcode", "city"]:
        addresses[field]= addresses.json.apply(lambda d: d["match"][0]["addr_out_"+field] if len(d)>0 and "match" in d else "")
    return 

# Calls

## Single address calls

In [45]:
call_ws({street_field:   "Av. Fonsny", 
         housenbr_field: "20",
         city_field:     "Saint-Gilles",
         postcode_field: "1060",
         country_field:  "Belgium"}, check_result=True)

http://127.0.1.1:5000/search/?street=Av.+Fonsny&housenumber=20&city=Saint-Gilles&postcode=1060&country=Belgium&check_result=yes


{'match': [{'SIM_city': 0.46153846153846156,
   'SIM_house_nbr': 1.0,
   'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'SIM_zip': 0.1,
   'addr_out_city': 'Saint-Gilles - Sint-Gillis',
   'addr_out_country': 'België / Belgique / Belgien',
   'addr_out_number': '20',
   'addr_out_postcode': '',
   'addr_out_street': 'Avenue Fonsny - Fonsnylaan',
   'display_name': 'DAE (Smals), 20, Avenue Fonsny - Fonsnylaan, Saint-Gilles - Sint-Gillis, Brussel-Hoofdstad - Bruxelles-Capitale, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, België / Belgique / Belgien',
   'extra_house_nbr': '20',
   'lat': '50.8358216',
   'lon': '4.3386884',
   'method': 'orig',
   'osm_addr_in': 'Av. Fonsny, 20, 1060 Saint-Gilles, Belgium',
   'place_id': 343087,
   'place_rank': 30}],
 'rejected': [{'SIM_city': 0.46153846153846156,
   'SIM_house_nbr': 1.0,
   'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'SIM_zip': 0.1,
   'addr_out_city': 'Saint-Gilles - Sint-Gillis

## Batch calls (row by row)

In [10]:
addresses = get_addresses("address.csv.gz")
addresses = addresses.sample(100).copy()

### Simple way

In [51]:
addresses["json"] = addresses.progress_apply(call_ws, check_result=False, axis=1)

  0%|          | 0/100 [00:00<?, ?it/s]

### Using Dask

In [17]:
dd_addresses = dd.from_pandas(addresses, npartitions=4)

dask_task = dd_addresses.apply(call_ws, meta=('x', 'str'), axis=1)

with ProgressBar(): 
    addresses["json"] = dask_task.compute()

[########################################] | 100% Completed | 59.1s


In [26]:
expand_json(addresses)

In [27]:
addresses

Unnamed: 0,EntityNumber,CountryFR,Zipcode,MunicipalityFR,StreetFR,HouseNumber,json,status,time,timing,method,street,number,postcode,city
1321156,0836.380.520,Belgique,7911,Frasnes-lez-Anvaing,Rue Goderneau(MB),14,"{'match': [{'SIM_street_which': '', 'addr_out_...",match,0 days 00:00:00.960761,{},regex[init],Rue Goderneau,,7906,Ath
2660061,2.231.595.460,Belgique,9981,Sint-Laureins,Vlamingstraat(STM),92,"{'match': [{'SIM_street_which': '', 'addr_out_...",match,0 days 00:00:01.143796,{},regex[init],Vlamingstraat,,9981,Sint-Laureins
164680,0431.356.723,Belgique,1050,Ixelles,Rue de l'Amazone,21,"{'match': [{'SIM_street_which': '', 'addr_out_...",match,0 days 00:00:00.724923,{},orig,Rue de l'Amazone - Amazonestraat,21,1050,Ixelles - Elsene
1295595,0832.380.556,Belgique,4000,Liège,Rue Henri-Maus,29,"{'match': [{'SIM_street_which': '', 'addr_out_...",match,0 days 00:00:00.584219,{},orig,Rue Henri Maus,,4000,Liège
2293307,2.168.200.715,Belgique,8300,Knokke-Heist,Zeedijk-Knokke,641,"{'match': [{'SIM_street_which': '', 'addr_out_...",match,0 days 00:00:00.548851,{},orig,Zeedijk Knokke,,8300,Knokke-Heist
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2751112,2.244.614.642,Belgique,9240,Zele,Smeiersberg,5,"{'match': [{'SIM_street_which': '', 'addr_out_...",match,0 days 00:00:00.544249,{},orig,Smeiersberg,,9240,Zele
2810314,2.252.606.947,Belgique,1080,Molenbeek-Saint-Jean,Rue Henri De Saegher,27,"{'match': [{'SIM_street_which': '', 'addr_out_...",match,0 days 00:00:00.968186,{},orig,Rue Henri De Saegher - Henri De Saegherstraat,27,1080,Molenbeek-Saint-Jean - Sint-Jans-Molenbeek
732484,0650.498.331,Belgique,4700,Eupen,Stockbergerweg,5,"{'match': [{'SIM_street_which': '', 'addr_out_...",match,0 days 00:00:00.987093,{},orig,Stockbergerweg,,4700,Eupen
1799145,2.037.557.947,Belgique,1180,Uccle,Parvis Saint-Pierre,119,"{'match': [{'SIM_street_which': '', 'addr_out_...",match,0 days 00:00:00.517703,{},orig,Parvis Saint-Pierre - Sint-Pietersvoorplein,,1180,Uccle - Ukkel


## Batch calls (batch WS)

### Single block

In [53]:
# Only geocoding
call_ws_batch(addresses, mode="geo", check_result=True)

Unnamed: 0,addr_key,lat,lon,place_rank,method
0,0308.251.647,50.467047,4.207648,26.0,orig
1,0408.704.352,50.894373,4.379952,30.0,orig
2,0413.358.372,51.220346,4.413114,30.0,orig
3,0431.356.723,50.824780,4.359237,30.0,orig
4,0439.768.306,50.190684,4.535742,26.0,orig
...,...,...,...,...,...
93,0453.040.676,50.270696,5.521461,26.0,nonum
94,0535.902.531,50.292862,5.031069,30.0,libpostal+regex[lpost]+photon
95,2.202.472.003,50.878578,4.409547,26.0,libpostal+regex[lpost]+photon
96,2.237.979.050,50.590356,5.747109,26.0,libpostal+regex[lpost]+photon


In [62]:
# Geocode + address
call_ws_batch(addresses, mode="short") 

Unnamed: 0,addr_key,lat,lon,place_rank,method,addr_out_street,addr_out_number,extra_house_nbr,addr_out_postcode,addr_out_city,addr_out_country
0,2.164.397.226,50.712641,4.530106,26.0,orig,Rue Robert Boisacq,,9 A,1330,Rixensart,België / Belgique / Belgien
1,2.279.599.374,50.859702,2.828440,26.0,orig,Stijn Streuvelswijk,,18,8908,Ieper,België / Belgique / Belgien
2,0669.833.203,51.209175,4.470072,30.0,orig,Boterlaarbaan,239,239,2100,Deurne,België / Belgique / Belgien
3,2.243.108.073,50.658359,5.483832,26.0,orig,Rue de l'Aéropostale,,8,4460,Grâce-Hollogne,België / Belgique / Belgien
4,2.045.477.107,51.100407,3.167726,26.0,orig,Koningin Astridstraat,,109,8210,Zedelgem,België / Belgique / Belgien
...,...,...,...,...,...,...,...,...,...,...,...
92,0857.952.627,50.834163,3.231044,26.0,regex[init],Heulsestraat,,3,8501,Kortrijk,België / Belgique / Belgien
93,2.249.583.121,51.194730,4.407469,30.0,orig,Desguinlei,88-90,90,2018,Antwerpen,België / Belgique / Belgien
94,0834.315.707,51.204519,4.398727,30.0,orig,Diercxsensstraat,39,39,2018,Antwerpen,België / Belgique / Belgien
95,0857.635.002,50.843955,4.264415,26.0,orig,Ninoofsesteenweg,,227,1700,Dilbeek,België / Belgique / Belgien


In [63]:
# Geocode + address, with rejected addresses
call_ws_batch(addresses, mode="long", with_reject=True) 

Unnamed: 0,addr_key,country,postcode,city,street,housenumber,osm_addr_in,place_id,lat,lon,...,addr_out_street,addr_out_city,addr_out_number,addr_out_country,addr_out_postcode,addr_out_other,retry_on_26,method,extra_house_nbr,rejected
0,2.164.397.226,Belgique,1330,Rixensart,Rue Robert Boisacq,9 A,"Rue Robert Boisacq, 9 A, 1330 Rixensart, Belgique",564275.0,50.712641,4.530106,...,Rue Robert Boisacq,Rixensart,,België / Belgique / Belgien,1330,Fond du Patch,,orig,9 A,
1,2.279.599.374,Belgique,8908,Ieper,Streuvelswijk,18,"Streuvelswijk, 18, 8908 Ieper, Belgique",785053.0,50.859702,2.828440,...,Stijn Streuvelswijk,Ieper,,België / Belgique / Belgien,8908,,,orig,18,
2,0669.833.203,Belgique,2100,Antwerpen,Boterlaarbaan,239,"Boterlaarbaan, 239, 2100 Antwerpen, Belgique",2010239.0,51.209175,4.470072,...,Boterlaarbaan,Deurne,239,België / Belgique / Belgien,2100,Eksterlaar,,orig,239,
3,2.243.108.073,Belgique,4460,Grâce-Hollogne,Rue de l'Aéropostale,8,"Rue de l'Aéropostale, 8, 4460 Grâce-Hollogne, ...",1445516.0,50.658359,5.483832,...,Rue de l'Aéropostale,Grâce-Hollogne,,België / Belgique / Belgien,4460,Liège logistics,,orig,8,
4,2.045.477.107,Belgique,8210,Zedelgem,Koningin Astridstraat,109,"Koningin Astridstraat, 109, 8210 Zedelgem, Bel...",1763661.0,51.100407,3.167726,...,Koningin Astridstraat,Zedelgem,,België / Belgique / Belgien,8210,,,orig,109,"[{'index': 5.0, 'osm_addr_in': 'Koningin Astri..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0857.952.627,Belgique,8501,Kortrijk,Heulsestraat(Bis),3,"Heulsestraat, 3, 8501 Kortrijk, Belgique",2738948.0,50.834163,3.231044,...,Heulsestraat,Kortrijk,,België / Belgique / Belgien,8501,,,regex[init],3,
93,2.249.583.121,Belgique,2018,Antwerpen,Desguinlei,90,"Desguinlei, 90, 2018 Antwerpen, Belgique",2323301.0,51.194730,4.407469,...,Desguinlei,Antwerpen,88-90,België / Belgique / Belgien,2018,Markgrave,,orig,90,
94,0834.315.707,Belgique,2018,Antwerpen,Diercxsensstraat,39,"Diercxsensstraat, 39, 2018 Antwerpen, Belgique",2338350.0,51.204519,4.398727,...,Diercxsensstraat,Antwerpen,39,België / Belgique / Belgien,2018,Brederode,,orig,39,
95,0857.635.002,Belgique,1700,Dilbeek,Ninoofsesteenweg,227,"Ninoofsesteenweg, 227, 1700 Dilbeek, Belgique",943444.0,50.843955,4.264415,...,Ninoofsesteenweg,Dilbeek,,België / Belgique / Belgien,1700,,,orig,227,"[{'index': 120.0, 'osm_addr_in': 'Ninoofsestee..."


### Batch blocs

In [22]:
chunk_size = 10
chunks = np.array_split(addresses, addresses.shape[0]//chunk_size)

res= [call_ws_batch(chunk, mode="long") for chunk in tqdm(chunks)]

## TODO : find a better way with dask? It seems that map_partitions does not support function returning dataframes. 
#50: 4:04
#100 : 2:30
#250 : 2:04
#1000 : 1:37

  0%|          | 0/10 [00:00<?, ?it/s]

In [23]:
df_res = pd.concat(res, sort=False)
df_res

Unnamed: 0,addr_key,country,postcode,city,street,housenumber,json,osm_addr_in,place_id,lat,...,SIM_street_which,addr_out_street,addr_out_city,addr_out_number,addr_out_country,addr_out_postcode,addr_out_other,method,retry_on_26,extra_house_nbr
0,0836.380.520,Belgique,7911,Frasnes-lez-Anvaing,Rue Goderneau(MB),14,"{'match': [{'SIM_street_which': '', 'addr_out_...","Rue Goderneau, 14, 7911 Frasnes-lez-Anvaing, B...",695630.0,50.645420,...,,Rue Goderneau,Ath,,België / Belgique / Belgien,7906,,regex[init],,14
1,2.231.595.460,Belgique,9981,Sint-Laureins,Vlamingstraat(STM),92,"{'match': [{'SIM_street_which': '', 'addr_out_...","Vlamingstraat, 92, 9981 Sint-Laureins, Belgique",2339251.0,51.250677,...,,Vlamingstraat,Sint-Laureins,,België / Belgique / Belgien,9981,,regex[init],,92
2,0431.356.723,Belgique,1050,Ixelles,Rue de l'Amazone,21,"{'match': [{'SIM_street_which': '', 'addr_out_...","Rue de l'Amazone, 21, 1050 Ixelles, Belgique",203672.0,50.824780,...,,Rue de l'Amazone - Amazonestraat,Ixelles - Elsene,21,België / Belgique / Belgien,1050,,orig,,21
3,0832.380.556,Belgique,4000,Liège,Rue Henri-Maus,29,"{'match': [{'SIM_street_which': '', 'addr_out_...","Rue Henri-Maus, 29, 4000 Liège, Belgique",1602456.0,50.630432,...,,Rue Henri Maus,Liège,,België / Belgique / Belgien,4000,Glain,orig,,29
4,2.168.200.715,Belgique,8300,Knokke-Heist,Zeedijk-Knokke,641,"{'match': [{'SIM_street_which': '', 'addr_out_...","Zeedijk-Knokke, 641, 8300 Knokke-Heist, Belgique",739194.0,51.352155,...,,Zeedijk Knokke,Knokke-Heist,,België / Belgique / Belgien,8300,Albertstrand,orig,,641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,2.244.614.642,Belgique,9240,Zele,Smeiersberg,5,"{'match': [{'SIM_street_which': '', 'addr_out_...","Smeiersberg, 5, 9240 Zele, Belgique",795193.0,51.058158,...,,Smeiersberg,Zele,,België / Belgique / Belgien,9240,,orig,,5
6,2.252.606.947,Belgique,1080,Molenbeek-Saint-Jean,Rue Henri De Saegher,27,"{'match': [{'SIM_street_which': '', 'addr_out_...","Rue Henri De Saegher, 27, 1080 Molenbeek-Saint...",441880.0,50.852214,...,,Rue Henri De Saegher - Henri De Saegherstraat,Molenbeek-Saint-Jean - Sint-Jans-Molenbeek,27,België / Belgique / Belgien,1080,Kaya market,orig,,27
7,0650.498.331,Belgique,4700,Eupen,Stockbergerweg,5,"{'match': [{'SIM_street_which': '', 'addr_out_...","Stockbergerweg, 5, 4700 Eupen, Belgique",2413057.0,50.626646,...,,Stockbergerweg,Eupen,,België / Belgique / Belgien,4700,Unterstadt,orig,,5
8,2.037.557.947,Belgique,1180,Uccle,Parvis Saint-Pierre,119,"{'match': [{'SIM_street_which': '', 'addr_out_...","Parvis Saint-Pierre, 119, 1180 Uccle, Belgique",485970.0,50.803207,...,,Parvis Saint-Pierre - Sint-Pietersvoorplein,Uccle - Ukkel,,België / Belgique / Belgien,1180,,orig,,119


In [24]:
df_res.method.value_counts()

orig                             73
regex[init]                      19
libpostal+regex[lpost]+photon     3
nonum                             2
nostreet                          2
Name: method, dtype: int64

In [25]:
df_res


Unnamed: 0,addr_key,country,postcode,city,street,housenumber,json,osm_addr_in,place_id,lat,...,SIM_street_which,addr_out_street,addr_out_city,addr_out_number,addr_out_country,addr_out_postcode,addr_out_other,method,retry_on_26,extra_house_nbr
0,0836.380.520,Belgique,7911,Frasnes-lez-Anvaing,Rue Goderneau(MB),14,"{'match': [{'SIM_street_which': '', 'addr_out_...","Rue Goderneau, 14, 7911 Frasnes-lez-Anvaing, B...",695630.0,50.645420,...,,Rue Goderneau,Ath,,België / Belgique / Belgien,7906,,regex[init],,14
1,2.231.595.460,Belgique,9981,Sint-Laureins,Vlamingstraat(STM),92,"{'match': [{'SIM_street_which': '', 'addr_out_...","Vlamingstraat, 92, 9981 Sint-Laureins, Belgique",2339251.0,51.250677,...,,Vlamingstraat,Sint-Laureins,,België / Belgique / Belgien,9981,,regex[init],,92
2,0431.356.723,Belgique,1050,Ixelles,Rue de l'Amazone,21,"{'match': [{'SIM_street_which': '', 'addr_out_...","Rue de l'Amazone, 21, 1050 Ixelles, Belgique",203672.0,50.824780,...,,Rue de l'Amazone - Amazonestraat,Ixelles - Elsene,21,België / Belgique / Belgien,1050,,orig,,21
3,0832.380.556,Belgique,4000,Liège,Rue Henri-Maus,29,"{'match': [{'SIM_street_which': '', 'addr_out_...","Rue Henri-Maus, 29, 4000 Liège, Belgique",1602456.0,50.630432,...,,Rue Henri Maus,Liège,,België / Belgique / Belgien,4000,Glain,orig,,29
4,2.168.200.715,Belgique,8300,Knokke-Heist,Zeedijk-Knokke,641,"{'match': [{'SIM_street_which': '', 'addr_out_...","Zeedijk-Knokke, 641, 8300 Knokke-Heist, Belgique",739194.0,51.352155,...,,Zeedijk Knokke,Knokke-Heist,,België / Belgique / Belgien,8300,Albertstrand,orig,,641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,2.244.614.642,Belgique,9240,Zele,Smeiersberg,5,"{'match': [{'SIM_street_which': '', 'addr_out_...","Smeiersberg, 5, 9240 Zele, Belgique",795193.0,51.058158,...,,Smeiersberg,Zele,,België / Belgique / Belgien,9240,,orig,,5
6,2.252.606.947,Belgique,1080,Molenbeek-Saint-Jean,Rue Henri De Saegher,27,"{'match': [{'SIM_street_which': '', 'addr_out_...","Rue Henri De Saegher, 27, 1080 Molenbeek-Saint...",441880.0,50.852214,...,,Rue Henri De Saegher - Henri De Saegherstraat,Molenbeek-Saint-Jean - Sint-Jans-Molenbeek,27,België / Belgique / Belgien,1080,Kaya market,orig,,27
7,0650.498.331,Belgique,4700,Eupen,Stockbergerweg,5,"{'match': [{'SIM_street_which': '', 'addr_out_...","Stockbergerweg, 5, 4700 Eupen, Belgique",2413057.0,50.626646,...,,Stockbergerweg,Eupen,,België / Belgique / Belgien,4700,Unterstadt,orig,,5
8,2.037.557.947,Belgique,1180,Uccle,Parvis Saint-Pierre,119,"{'match': [{'SIM_street_which': '', 'addr_out_...","Parvis Saint-Pierre, 119, 1180 Uccle, Belgique",485970.0,50.803207,...,,Parvis Saint-Pierre - Sint-Pietersvoorplein,Uccle - Ukkel,,België / Belgique / Belgien,1180,,orig,,119
