In [2]:
import pandas as pd
import urllib

import numpy as np

import json

from tqdm.autonotebook import tqdm

#%matplotlib inline

tqdm.pandas(tqdm)

import dask.dataframe as dd

from dask.multiprocessing import get
from dask.diagnostics import ProgressBar

from datetime import datetime
import matplotlib.pyplot as plt

from IPython.display import display




In [3]:
import urllib3

In [4]:
# Shared urllib3 connection pool; call_ws_batch uses it to issue its POST requests.
http = urllib3.PoolManager()

In [5]:
from config_batch import * 

# Functions

In [6]:
# Host of the geocoding web service; call_ws and call_ws_batch build their URLs from it (port 5000).
ws_hostname = "127.0.0.1"

In [7]:
def call_ws(addr_data): #lg = "en,fr,nl"
    """Query the single-address ``/search/`` endpoint for one address.

    Parameters
    ----------
    addr_data : mapping (dict or DataFrame row)
        Must provide the keys named by ``street_field``, ``housenbr_field``,
        ``city_field``, ``postcode_field`` and ``country_field``.

    Returns
    -------
    dict
        On success, the decoded JSON response with an extra ``"time"`` key
        holding the elapsed wall-clock time (``datetime.timedelta``).
        On any failure, ``{"error": <message>, "time": <elapsed>}`` so that
        every result has the same shape and can be post-processed uniformly
        (``expand_json`` reads ``"error"`` and ``"time"`` from each result).
    """
    # `import urllib` alone does not guarantee that the submodules are loaded;
    # import them explicitly so the function does not depend on side effects
    # of other imports.
    import urllib.parse
    import urllib.request

    t = datetime.now()

    params = urllib.parse.urlencode({"street": addr_data[street_field],
                                     "housenumber": addr_data[housenbr_field],
                                     "city": addr_data[city_field],
                                     "postcode": addr_data[postcode_field],
                                     "country": addr_data[country_field],
                                    })
    url = f"http://{ws_hostname}:5000/search/?{params}"

    try:
        with urllib.request.urlopen(url) as response:
            res = json.loads(response.read())
            res["time"] = datetime.now() - t
            return res
    except Exception as e:
        # Best effort: one failing address must not abort a whole batch run.
        # Report the failure as a dict rather than a bare string.
        return {"error": str(e), "time": datetime.now() - t}
    

In [8]:
def call_ws_batch(addr_data, mode="geo"): #lg = "en,fr,nl"
    """Send a block of addresses to the ``/batch`` endpoint in a single request.

    The columns of ``addr_data`` named by the configured ``*_field`` variables
    are renamed to ``street``, ``housenumber``, ``postcode``, ``city``,
    ``country`` and ``addr_key``; the frame is then serialised to CSV (without
    the index) and uploaded as the ``media`` form field, together with ``mode``.

    Parameters
    ----------
    addr_data : pandas.DataFrame
        Addresses to send.
    mode : str, default "geo"
        Forwarded as-is to the web service.

    Returns
    -------
    pandas.DataFrame
        Frame built from the JSON body of the response.
    """
    column_map = {
        street_field: "street",
        housenbr_field: "housenumber",
        postcode_field: "postcode",
        city_field: "city",
        country_field: "country",
        addr_key_field: "addr_key",
    }
    csv_payload = addr_data.rename(columns=column_map).to_csv(index=False)

    form_fields = {
        'media': ('addresses.csv', csv_payload),
        'mode': mode,
    }
    response = http.request('POST', f'http://{ws_hostname}:5000/batch', fields=form_fields)

    decoded = json.loads(response.data.decode('utf-8'))
    return pd.DataFrame(decoded)

In [9]:
def expand_json(addresses):
    """Unpack the per-row web-service results stored in ``addresses["json"]``.

    Adds (or overwrites) the following columns on ``addresses`` in place:

    * ``status``  - "error", "match" or "rejected"
    * ``time``    - value of the result's "time" key (None when absent)
    * ``timing``  - value of the result's "timing" key ({} when absent)
    * ``method``  - "method" of the first match, or "none"
    * ``street``, ``number``, ``postcode``, ``city`` - the ``addr_out_*``
      value of the first match, or "" when there is no match

    Entries that are not dicts (e.g. a plain error-message string) are
    reported with status "error" and empty/default values instead of raising.

    Parameters
    ----------
    addresses : pandas.DataFrame
        Must contain a "json" column.

    Returns
    -------
    None
        The frame is modified in place.
    """
    def _status(d):
        # Anything that is not a dict cannot be a valid result.
        if not isinstance(d, dict) or "error" in d:
            return "error"
        return "match" if "match" in d else "rejected"

    def _first_match(d):
        # First entry of the "match" list, or None when there is no usable match
        # (non-dict result, missing key, or empty list).
        if isinstance(d, dict):
            matches = d.get("match")
            if matches:
                return matches[0]
        return None

    # Use item access: attribute access would break if pandas ever grew a
    # DataFrame attribute called `json`.
    results = addresses["json"]
    first_matches = results.apply(_first_match)

    addresses["status"] = results.apply(_status)
    addresses["time"] = results.apply(lambda d: d.get("time") if isinstance(d, dict) else None)
    addresses["timing"] = results.apply(lambda d: d.get("timing", {}) if isinstance(d, dict) else {})

    addresses["method"] = first_matches.apply(lambda m: m["method"] if m is not None else "none")

    for field in ["street", "number", "postcode", "city"]:
        key = "addr_out_" + field
        # `key` is passed as a default argument so each lambda keeps its own value.
        addresses[field] = first_matches.apply(lambda m, key=key: m[key] if m is not None else "")
    return

# Calls

## Single address calls

In [10]:
# Smoke test: geocode one hand-written address through the single-address endpoint.
call_ws({street_field: "Av. Fonsny", 
          housenbr_field: "20",
          city_field: "Saint-Gilles",
          postcode_field:  "1060",
          country_field: "Belgium"})

{'match': [{'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'addr_out_city': 'Saint-Gilles - Sint-Gillis',
   'addr_out_country': 'België / Belgique / Belgien',
   'addr_out_number': '20',
   'addr_out_postcode': '',
   'addr_out_street': 'Avenue Fonsny - Fonsnylaan',
   'display_name': 'DAE (Smals), 20, Avenue Fonsny - Fonsnylaan, Saint-Gilles - Sint-Gillis, Brussel-Hoofdstad - Bruxelles-Capitale, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, België / Belgique / Belgien',
   'extra_house_nbr': '20',
   'lat': '50.8358216',
   'lon': '4.3386884',
   'method': 'orig',
   'place_id': 343087,
   'place_rank': 30}],
 'rejected': [{'SIM_city': 0.46153846153846156,
   'SIM_house_nbr': 1.0,
   'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'SIM_zip': 0.1,
   'addr_out_city': 'Saint-Gilles - Sint-Gillis',
   'addr_out_country': 'België / Belgique / Belgien',
   'addr_out_number': '20',
   'addr_out_postcode': '',
   'addr_out_street': 'Avenue F

## Batch calls (row by row)

In [11]:
# Load the full address file (get_addresses is not defined in this notebook;
# presumably it comes from config_batch - verify).
addresses = get_addresses("address.csv.gz")
# Work on a random subset. A fixed random_state makes the notebook reproducible
# across re-runs, and the min() guard avoids a ValueError when the file holds
# fewer than 1000 rows.
addresses = addresses.sample(min(1000, len(addresses)), random_state=42).copy()

### Simple way

In [29]:
# One web-service call per row, sequentially, with a tqdm progress bar; results are stored in the "json" column.
addresses["json"] = addresses.progress_apply(call_ws, axis=1)

100%|██████████| 100/100 [02:02<00:00,  1.22s/it]


### Using Dask

In [17]:
# Wrap the pandas frame in a Dask DataFrame with 4 partitions.
dd_addresses = dd.from_pandas(addresses, npartitions=4)

# Row-wise apply of call_ws; the calls are only executed by compute() below.
# NOTE(review): meta declares a 'str' result named 'x', but call_ws returns a dict
# on success - confirm whether an object dtype should be declared instead.
dask_task = dd_addresses.apply(call_ws, meta=('x', 'str'), axis=1)

# Run the task with a progress bar and store the results in the "json" column.
with ProgressBar(): 
    addresses["json"] = dask_task.compute()

[########################################] | 100% Completed |  2min  4.4s


In [30]:
# Derive status/time/timing/method and the output address columns from the "json" column (modifies `addresses` in place).
expand_json(addresses)

## Batch calls (batch WS)

### Single block

In [42]:
# Only geocoding: relies on call_ws_batch's default mode ("geo").
call_ws_batch(addresses)


Unnamed: 0,addr_key,lat,lon,place_rank
0,0413.915.628,50.841404,4.354718,30.0
1,0414.808.820,50.839255,3.166077,26.0
2,0415.311.636,51.165199,4.646993,30.0
3,0415.861.368,51.174983,4.151651,30.0
4,0417.004.384,51.204335,4.394418,26.0
...,...,...,...,...
94,2.225.889.286,50.494427,5.223551,26.0
95,0450.543.224,50.069063,4.509224,21.0
96,0598.893.143,50.747180,3.224409,21.0
97,0828.156.207,50.645138,5.573420,16.0


In [47]:
# Geocode + address: mode="long" is forwarded to the /batch endpoint.
call_ws_batch(addresses, mode="long") 


Unnamed: 0,0
error,Cannot connect to Photon (photon:2322): <urlo...


### Batch blocks

In [16]:
chunk_size = 100
# Slice the frame into consecutive blocks of at most `chunk_size` rows.
# Plain iloc slicing (instead of np.array_split with shape[0]//chunk_size sections)
# still works when there are fewer rows than chunk_size, never produces a block
# larger than chunk_size, and avoids calling np.array_split on a DataFrame.
chunks = [addresses.iloc[start:start + chunk_size]
          for start in range(0, addresses.shape[0], chunk_size)]

# One batch request per block; `res` is a list of result DataFrames.
res = [call_ws_batch(chunk, mode="long") for chunk in tqdm(chunks)]

## TODO: find a better way with Dask? It seems that map_partitions does not support functions returning DataFrames (to be verified).
# Timings recorded for earlier runs (presumably chunk_size: elapsed m:ss):
#50: 4:04
#100 : 2:30
#250 : 2:04
#1000 : 1:37

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [17]:
# Stack the per-chunk result frames into one; sort=False is passed so concat does not sort the columns.
# NOTE(review): no ignore_index is passed, so index labels presumably repeat across chunks - confirm this is acceptable.
df_res = pd.concat(res, sort=False)
df_res

Unnamed: 0,addr_key,country,postcode,city,street,housenumber,osm_addr_in,place_id,lat,lon,...,SIM_street,addr_out_street,addr_out_city,addr_out_number,addr_out_country,addr_out_postcode,addr_out_other,retry_on_26,method,extra_house_nbr
0,0833.866.636,Belgique,4040,Herstal,Rue Hoyoux,65,"Rue Hoyoux, 65, 4040 Herstal, Belgique",2145634.0,50.662060,5.627758,...,1.0,Rue Hoyoux,Herstal,65,België / Belgique / Belgien,4040,,,orig,65
1,0808.308.324,Belgique,1160,Auderghem,Rue Guillaume Dekelver,61,"Rue Guillaume Dekelver, 61, 1160 Auderghem, Be...",195029.0,50.817403,4.411137,...,1.0,Rue Guillaume Dekelver - Guillaume Dekelverstraat,Auderghem - Oudergem,61,België / Belgique / Belgien,1160,,,orig,61
2,0459.145.738,Belgique,9830,Sint-Martens-Latem,Kortrijksesteenweg,222,"Kortrijksesteenweg, 222, 9830 Sint-Martens-Lat...",1497018.0,51.013682,3.646452,...,1.0,Kortrijksesteenweg,Sint-Martens-Latem,,België / Belgique / Belgien,9830,Hooglatem,,orig,222
3,2.228.995.563,Belgique,8200,Brugge,Diksmuidse Heerweg,225,"Diksmuidse Heerweg, 225, 8200 Brugge, Belgique",2440583.0,51.191882,3.192547,...,1.0,Diksmuidse Heerweg,Sint-Andries,225,België / Belgique / Belgien,8200,,,orig,225
4,2.268.872.956,Belgique,1060,Saint-Gilles,Rue de Roumanie,27,"Rue de Roumanie, 27, 1060 Saint-Gilles, Belgique",756257.0,50.827887,4.350485,...,1.0,Rue de Roumanie - Roemeniëstraat,Saint-Gilles - Sint-Gillis,27,België / Belgique / Belgien,1060,,,orig,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,2.039.686.504,Belgique,2018,Antwerpen,Van den Nestlei,29,"Van den Nestlei, 29, 2018 Antwerpen, Belgique",2348312.0,51.210351,4.424799,...,1.0,Van den Nestlei,Antwerpen,29,België / Belgique / Belgien,2018,Zurenborg,,orig,29
95,0410.237.051,Belgique,2170,Antwerpen,Papaverstraat,12,"Papaverstraat, 12, 2170 Antwerpen, Belgique",1914858.0,51.254214,4.440189,...,1.0,Papaverstraat,Merksem,12,België / Belgique / Belgien,2170,,,orig,12
96,0711.685.733,Belgique,2140,Antwerpen,Guldensporenstraat,6,"Guldensporenstraat, 6, 2140 Antwerpen, Belgique",2381298.0,51.222955,4.441456,...,1.0,Guldensporenstraat,Borgerhout,6,België / Belgique / Belgien,2140,Borgerhout Intra Muros,,orig,6
97,0547.928.551,Belgique,1435,Mont-Saint-Guibert,Rue Haute,26,"Rue Haute, 26, 1435 Mont-Saint-Guibert, Belgique",2709445.0,50.651370,4.663342,...,1.0,Rue Haute,Mont-Saint-Guibert,,België / Belgique / Belgien,1435,,,orig,26


In [18]:
# Count how many results each value of the "method" column accounts for.
df_res.method.value_counts()

orig                                   818
regex[init]                             99
libpostal+regex[lpost]+photon           21
nostreet                                19
nonum                                   17
photon                                   2
libpostal+regex[lpost]+photon+nonum      1
Name: method, dtype: int64

Unnamed: 0,addr_key,country,postcode,city,street,housenumber,osm_addr_in,place_id,lat,lon,...,SIM_street,addr_out_street,addr_out_city,addr_out_number,addr_out_country,addr_out_postcode,addr_out_other,retry_on_26,method,extra_house_nbr
37,2.158.729.753,Belgique,7141,Morlanwelz,Place Gonzales Decamps(CAR),16A,"Place Gonzales Decamp, 16a, 7141 Morlanwelz, B...",679121.0,50.442764,4.255813,...,1.0,Place Gonzales Decamp,Morlanwelz,,België / Belgique / Belgien,7141,Le Champ Là-Haut,,libpostal+regex[lpost]+photon,16A
46,0879.813.556,Belgique,1930,Zaventem,Van Dijcklaan,46,"Antoon Van Dijcklaan, 46, 1930 Zaventem, Belgi...",2516230.0,50.887885,4.473784,...,1.0,Antoon Van Dijcklaan,Zaventem,46.0,België / Belgique / Belgien,1930,,,libpostal+regex[lpost]+photon,46
72,2.264.283.767,Belgique,7540,Tournai,Rue Trieu(MEL),36,"Rue Trieu Ewil, 36, 7520 Tournai, België - Bel...",1690632.0,50.659548,3.274557,...,1.0,Rue Trieu Ewil,Tournai,,België / Belgique / Belgien,7520,Trieu Ewille,,libpostal+regex[lpost]+photon,36
8,2.254.536.455,Belgique,5020,Namur,Rue Lieut.Col.Maniette(TX),30,"Rue Lieutenant Colonel Maniette, 30, 5020 Namu...",536917.0,50.479735,4.742064,...,1.0,Rue Lieutenant Colonel Maniette,Temploux,,België / Belgique / Belgien,5020,,,photon,30
27,0640.893.846,Belgique,3850,Nieuwerkerken (Limb.),Driesstraat,50,"Driesstraat, 50, 3850 Nieuwerkerken, België - ...",1454307.0,50.860267,5.204004,...,1.0,Driesstraat,Nieuwerkerken,,België / Belgique / Belgien,3850,,,libpostal+regex[lpost]+photon,50
36,2.076.045.963,Belgique,7110,La Louvière,Rue Grand Peuplier(H-A),11,"Rue du Grand Peuplier, 11, 7100 La Louvière, B...",2699067.0,50.494102,4.129554,...,1.0,Rue du Grand Peuplier,La Louvière,,België / Belgique / Belgien,7100,Pont Balasse,,libpostal+regex[lpost]+photon,11
49,2.231.195.186,Belgique,7032,Mons,Rue Gontran Bachy,58,"Rue Gontrand Bachy, 58, 7032 Mons, België - Be...",2784414.0,50.429375,3.995954,...,1.0,Rue Gontrand Bachy,Mons,,België / Belgique / Belgien,7032,,,libpostal+regex[lpost]+photon,58
38,0643.951.524,Belgique,2620,Hemiksem,K. de Backerstraat,34-36,"Karel De Backerstraat, 34-36, 2620 Hemiksem, B...",900638.0,51.145405,4.341883,...,1.0,Karel De Backerstraat,Hemiksem,,België / Belgique / Belgien,2620,,,libpostal+regex[lpost]+photon,34-36
50,0806.871.734,Belgique,6840,Neufchâteau,"Rue Franklin Roosevelt,NEU",24,"Rue Franklin Roosevelt, 24, 6840 Neufchâteau, ...",2205899.0,49.840481,5.435005,...,1.0,Rue Franklin Roosevelt,Neufchâteau,,België / Belgique / Belgien,6840,Semel,,libpostal+regex[lpost]+photon,24
52,2.260.795.430,Belgique,5032,Gembloux,"Rue de l' Etoile, Bos.",44,"Rue de l'Étoile, 44, 5032 Gembloux, België - B...",522596.0,50.518553,4.716231,...,1.0,Rue de l'Étoile,Gembloux,,België / Belgique / Belgien,5032,,,libpostal+regex[lpost]+photon,44
