In [2]:
import pandas as pd
import urllib

import numpy as np

import json

from tqdm.autonotebook import tqdm

#%matplotlib inline

tqdm.pandas()

import dask.dataframe as dd

from dask.multiprocessing import get
from dask.diagnostics import ProgressBar

from datetime import datetime
import matplotlib.pyplot as plt

from IPython.display import display


In [3]:
import urllib3

In [4]:
http = urllib3.PoolManager()

In [5]:
from config_batch import * 

# Functions

In [6]:
ws_hostname = "127.0.1.1"
# ws_hostname = "192.168.1.3"


In [7]:
def call_ws(addr_data): #lg = "en,fr,nl"
    t = datetime.now()
    
    params = urllib.parse.urlencode({"street": addr_data[street_field],
                                     "housenumber": addr_data[housenbr_field],
                                     "city": addr_data[city_field],
                                     "postcode": addr_data[postcode_field],
                                     "country": addr_data[country_field],
                                    })
    url = f"http://{ws_hostname}:5000/search/?{params}"
    
    
    try:
        with urllib.request.urlopen(url) as response:
            res = response.read()
            res = json.loads(res)
#             print(res)
            res["time"] = datetime.now() - t
            return res
    except Exception as e:
        return str(e)
    

In [30]:
def call_ws_batch(addr_data, mode="geo"): #lg = "en,fr,nl"
#     print(addr_data)
#     print(addr_data.shape)
#     print()
    file_data = addr_data.rename(columns = {
        street_field : "street",
        housenbr_field: "housenumber",
        postcode_field: "postcode",
        city_field: "city",
        country_field: "country",
        addr_key_field : "addr_key"}).to_csv(index=False)
    
    r = http.request(
    'POST',
    f'http://{ws_hostname}:5000/batch',
    fields= { 
        'media': ('addresses.csv', file_data),
        'mode': mode
    })
    
#     print(r.data.decode('utf-8'))
    res = pd.DataFrame(json.loads(r.data.decode('utf-8')))
#     display(res)
    return res

In [9]:
def expand_json(addresses):
    addresses["status"]= addresses.json.apply(lambda d: "error" if "error" in d else "match" if "match" in d else "rejected")
    addresses["time"]  = addresses.json.apply(lambda d: d["time"])

    addresses["timing"]  = addresses.json.apply(lambda d: d["timing"] if "timing" in d else {})

    addresses["method"]= addresses.json.apply(lambda d: d["match"][0]["method"] if len(d)>0 and "match" in d else "none")
    
    for field in ["street", "number", "postcode", "city"]:
        addresses[field]= addresses.json.apply(lambda d: d["match"][0]["addr_out_"+field] if len(d)>0 and "match" in d else "")
    return 

# Calls

## Single address calls

In [12]:
call_ws({street_field: "Av. Fonsny", 
          housenbr_field: "20",
          city_field: "Saint-Gilles",
          postcode_field:  "1060",
          country_field: "Belgium"})

{'match': [{'SIM_city': 0.46153846153846156,
   'SIM_house_nbr': 1.0,
   'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'SIM_zip': 0.1,
   'addr_out_city': 'Saint-Gilles - Sint-Gillis',
   'addr_out_country': 'België / Belgique / Belgien',
   'addr_out_number': '20',
   'addr_out_postcode': '',
   'addr_out_street': 'Avenue Fonsny - Fonsnylaan',
   'display_name': 'DAE (Smals), 20, Avenue Fonsny - Fonsnylaan, Saint-Gilles - Sint-Gillis, Brussel-Hoofdstad - Bruxelles-Capitale, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, België / Belgique / Belgien',
   'extra_house_nbr': '20',
   'lat': '50.8358216',
   'lon': '4.3386884',
   'method': 'orig',
   'place_id': 343087,
   'place_rank': 30}],
 'rejected': [{'SIM_city': 0.46153846153846156,
   'SIM_house_nbr': 1.0,
   'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'SIM_zip': 0.1,
   'addr_out_city': 'Saint-Gilles - Sint-Gillis',
   'addr_out_country': 'België / Belgique / Belgien',
   'add

## Batch calls (row by row)

In [32]:
addresses = get_addresses("address.csv.gz")
addresses = addresses.sample(1000).copy()

### Simple way

In [29]:
addresses["json"] = addresses.progress_apply(call_ws, axis=1)

100%|██████████| 100/100 [02:02<00:00,  1.22s/it]


### Using Dask

In [17]:
dd_addresses = dd.from_pandas(addresses, npartitions=4)

dask_task = dd_addresses.apply(call_ws, meta=('x', 'str'), axis=1)

with ProgressBar(): 
    addresses["json"] = dask_task.compute()

[########################################] | 100% Completed |  2min  4.4s


In [30]:
expand_json(addresses)

## Batch calls (batch WS)

### Single block

In [15]:
# Only geocoding
call_ws_batch(addresses)


Unnamed: 0,addr_key,lat,lon,place_rank,method
0,0409.104.725,50.823888,4.403813,30.0,orig
1,0415.637.575,50.409534,4.449836,30.0,orig
2,0427.050.715,51.179632,4.448321,26.0,orig
3,0427.944.303,50.838614,4.350232,26.0,orig
4,0429.163.434,51.195724,3.827420,26.0,orig
...,...,...,...,...,...
93,0829.931.109,50.813106,4.388813,30.0,libpostal+regex[lpost]+photon
94,2.145.995.435,50.942822,4.439392,26.0,libpostal+regex[lpost]+photon
95,0808.372.165,50.609802,5.289813,21.0,nostreet
96,0847.389.228,50.636694,6.040302,21.0,nostreet


In [33]:
# Geocode + address
call_ws_batch(addresses, mode="long") 


Unnamed: 0,addr_key,country,postcode,city,street,housenumber,osm_addr_in,place_id,lat,lon,...,addr_out_street,addr_out_city,addr_out_number,addr_out_country,addr_out_postcode,addr_out_other,retry_on_26,method,extra_house_nbr,rejected
0,0841.345.633,Belgique,1090,Jette,Rue Henri Huybreghts,19,"Rue Henri Huybreghts, 19, 1090 Jette, Belgique",640252.0,50.878116,4.320586,...,Rue Henri Huybreghts - Henri Huybreghtsstraat,Jette,19,België / Belgique / Belgien,1090,Dieleghem - Dielegem,,orig,19,
1,0716.650.351,Belgique,5101,Namur,Clos de la Verveine(EP),-,"Clos de la Verveine, -, 5101 Namur, Belgique",535094.0,50.450489,4.901398,...,Clos de la Verveine,Erpent,,België / Belgique / Belgien,5101,,,regex[init],-,
2,2.215.644.306,Belgique,8600,Diksmuide,Woumenweg,267 A,"Woumenweg, 267 A, 8600 Diksmuide, Belgique",677733.0,51.001508,2.869376,...,Woumenweg,Diksmuide,,België / Belgique / Belgien,8600,,,orig,267 A,"[{'index': 2, 'osm_addr_in': 'Woumenweg, 267 A..."
3,0657.897.352,Belgique,6887,Herbeumont,"Le Routeux, St-Médard",35,"Le Routeux, St-Médard, 35, 6887 Herbeumont, Be...",1535253.0,49.815940,5.316442,...,Le Routeux,Herbeumont,,België / Belgique / Belgien,6887,,,orig,35,
4,0841.901.107,Belgique,1180,Uccle,Avenue Winston Churchill,234 A,"Avenue Winston Churchill, 234 A, 1180 Uccle, B...",192117.0,50.812105,4.366041,...,Avenue Winston Churchill - Winston Churchilllaan,Uccle - Ukkel,234,België / Belgique / Belgien,1180,,,orig,234 A,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,0862.649.704,Belgique,1420,Braine-l'Alleud,Rue de l'Hôpital,3,"Rue de l'Hôpital, 3, 1420 Braine-l'Alleud, Bel...",2198404.0,50.682124,4.369754,...,Rue de l'Hôpital,Braine-l'Alleud,3,België / Belgique / Belgien,1420,,,orig,3,
966,2.187.526.083,Belgique,2970,Schilde,De Pont,19,"De Pont, 19, 2970 Schilde, Belgique",1988809.0,51.227225,4.581196,...,De Pont,Schilde,19,België / Belgique / Belgien,2970,,,orig,19,
967,2.177.070.869,Belgique,4880,Aubel,Rue du Vieux Tilleul,33,"Rue du Vieux Tilleul, 33, 4880 Aubel, Belgique",619354.0,50.712055,5.852381,...,Rue du Vieux Tilleul,Aubel,,België / Belgique / Belgien,4880,,,orig,33,
968,2.228.079.607,Belgique,1348,Ottignies-Louvain-la-Neuve,Rue du Bosquet,8,"Rue du Bosquet, 8, 1348 Ottignies-Louvain-la-N...",508646.0,50.659814,4.630603,...,Rue du Bosquet,Mont-Saint-Guibert,,België / Belgique / Belgien,1348,Parc Einstein,,orig,8,


In [34]:
# Geocode + address, with rejected addresses
call_ws_batch(addresses, mode="long,reject") 


orig                                   794
regex[init]                            106
libpostal+regex[lpost]+photon           35
nostreet                                14
nonum                                   14
libpostal+regex[lpost]                   3
libpostal+regex[lpost]+photon+nonum      2
libpostal+regex[lpost]+nonum             1
Name: method, dtype: int64

### Batch blocs

In [29]:
chunk_size = 10
chunks = np.array_split(addresses, addresses.shape[0]//chunk_size)

res= [call_ws_batch(chunk, mode="long") for chunk in tqdm(chunks)]

## TODO : find a better way with dask? It seems that map_partitions does not support function returning dataframes. 
#50: 4:04
#100 : 2:30
#250 : 2:04
#1000 : 1:37

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [30]:
df_res = pd.concat(res, sort=False)
df_res

Unnamed: 0,addr_key,country,postcode,city,street,housenumber,osm_addr_in,place_id,lat,lon,...,SIM_street,addr_out_street,addr_out_city,addr_out_number,addr_out_country,addr_out_postcode,addr_out_other,method,extra_house_nbr,retry_on_26
0,2.229.599.339,Belgique,2480,Dessel,Brasel,43,"Brasel, 43, 2480 Dessel, Belgique",118759816.0,51.240794,5.088575,...,1.0,Brasel,Dessel,43,België - Belgique - Belgien,2480,,orig,43,
1,0434.761.522,Belgique,8000,Brugge,Karel de Stoutelaan,172,"Karel de Stoutelaan, 172, 8000 Brugge, Belgique",129380407.0,51.211968,3.214481,...,1.0,Karel de Stoutelaan,Brugge,,België - Belgique - Belgien,8000,Brugge-Centrum,orig,172,
2,0807.528.067,Belgique,3660,Oudsbergen,Nijverheidslaan,1574,"Nijverheidslaan, 1574, 3660 Oudsbergen, Belgique",127213546.0,51.039261,5.560252,...,1.0,Nijverheidslaan,Oudsbergen,,België - Belgique - Belgien,3660,Opglabbeek-Noord,orig,1574,
3,0872.572.903,Belgique,8560,Wevelgem,Tramstraat,7,"Tramstraat, 7, 8560 Wevelgem, Belgique",296975696.0,50.814277,3.213540,...,1.0,Tramstraat,Wevelgem,7,België - Belgique - Belgien,8560,,orig,7,
4,2.053.560.076,Belgique,1370,Jodoigne,Rue du Sart(Mél.),14,"Rue du Sart, 14, 1370 Jodoigne, Belgique",96378468.0,50.739847,4.800007,...,1.0,Rue du Sart,Jodoigne,,België - Belgique - Belgien,1370,Wallonie,regex[init],14,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,2.200.445.493,Belgique,9810,Nazareth,Camiel Fremaultstraat,50,"Camiel Fremaultstraat, 50, 9810 Nazareth, Belg...",114786918.0,50.957943,3.592878,...,1.0,Camiel Fremaultstraat,Nazareth,,België - Belgique - Belgien,9810,,orig,50,
6,2.203.230.878,Belgique,1200,Woluwe-Saint-Lambert,Avenue Albert Jonnart,27,"Avenue Albert Jonnart, 27, 1200 Woluwe-Saint-L...",139300758.0,50.843257,4.404007,...,1.0,Avenue Albert Jonnart - Albert Jonnartlaan,Etterbeek,27,België - Belgique - Belgien,1040,,orig,27,
7,0627.866.647,Belgique,2000,Antwerpen,Lange Gasthuisstraat,35-37,"Lange Gasthuisstraat, 35-37, 2000 Antwerpen, B...",195981861.0,51.214239,4.405437,...,1.0,Lange Gasthuisstraat,Antwerpen,35-37,België - Belgique - Belgien,2000,Wilde Zee,orig,35-37,
8,0760.284.614,Belgique,9320,Aalst,Achtermaal,18,"Achtermaal, 18, 9320 Aalst, Belgique",117553684.0,50.933103,4.002864,...,1.0,Achtermaal,Aalst,,België - Belgique - Belgien,9320,,orig,18,


In [31]:
df_res.method.value_counts()

orig                             79
regex[init]                      10
nostreet                          6
nonum                             4
libpostal+regex[lpost]+photon     1
Name: method, dtype: int64