In [6]:
import json
import urllib
import urllib.parse
import urllib.request
from datetime import datetime

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar
from dask.multiprocessing import get
from IPython.display import display
from tqdm.autonotebook import tqdm

#%matplotlib inline

# Register `progress_apply` on pandas objects (used by the row-by-row calls).
tqdm.pandas()


  


In [7]:
import urllib3

In [8]:
# Shared urllib3 connection pool; call_ws_batch uses it to POST to the batch endpoint.
http = urllib3.PoolManager()

In [9]:
from config_batch import * 

# Functions

In [10]:
# Host of the geocoding web service. call_ws and call_ws_batch build their
# URLs as http://{ws_hostname}:5000/...
ws_hostname = "127.0.1.1"
# ws_hostname = "192.168.1.3"


In [11]:
def call_ws(addr_data): #lg = "en,fr,nl"
    """Geocode one address through the web service's ``/search/`` endpoint.

    Parameters
    ----------
    addr_data : mapping (e.g. dict or DataFrame row)
        Must provide the keys named by ``street_field``, ``housenbr_field``,
        ``city_field``, ``postcode_field`` and ``country_field``.  A missing
        key raises ``KeyError`` (the lookup happens before the request).

    Returns
    -------
    dict
        The decoded JSON answer, with an extra ``"time"`` entry holding the
        elapsed wall-clock time of the call (a ``datetime.timedelta``).
        If the request or the decoding fails, ``{"error": <message>,
        "time": <elapsed>}`` is returned instead of a bare string, so every
        result has the dict shape that ``expand_json`` reads.
    """
    started = datetime.now()

    params = urllib.parse.urlencode({"street": addr_data[street_field],
                                     "housenumber": addr_data[housenbr_field],
                                     "city": addr_data[city_field],
                                     "postcode": addr_data[postcode_field],
                                     "country": addr_data[country_field],
                                    })
    url = f"http://{ws_hostname}:5000/search/?{params}"

    try:
        with urllib.request.urlopen(url) as response:
            res = json.loads(response.read())
        res["time"] = datetime.now() - started
        return res
    except Exception as e:
        # Best effort: one failing address must not abort a whole apply().
        # The failure is reported in-band under the "error" key.
        return {"error": str(e), "time": datetime.now() - started}
    

In [51]:
def call_ws_batch(addr_data, mode="geo", with_reject=False): #lg = "en,fr,nl"
    """Geocode a block of addresses with one POST to the ``/batch`` endpoint.

    The columns named by ``street_field``, ``housenbr_field``,
    ``postcode_field``, ``city_field``, ``country_field`` and
    ``addr_key_field`` are renamed to ``street``, ``housenumber``,
    ``postcode``, ``city``, ``country`` and ``addr_key``; the frame is then
    uploaded as a CSV file (without the index).

    Parameters
    ----------
    addr_data : pandas.DataFrame
        Addresses to geocode.
    mode : str, default "geo"
        Sent as the ``mode`` form field ("geo", "short" and "long" are the
        values used in this notebook).
    with_reject : bool, default False
        Sent as ``with_rejected`` = "yes" / "no".

    Returns
    -------
    pandas.DataFrame or None
        The decoded answer, or ``None`` (after printing the raw answer) when
        it cannot be turned into a DataFrame.
    """
    file_data = addr_data.rename(columns = {
        street_field : "street",
        housenbr_field: "housenumber",
        postcode_field: "postcode",
        city_field: "city",
        country_field: "country",
        addr_key_field : "addr_key"}).to_csv(index=False)

    r = http.request(
    'POST',
    f'http://{ws_hostname}:5000/batch',
    fields= {
        'media': ('addresses.csv', file_data),
        'mode': mode,
        "with_rejected" : "yes" if with_reject else "no"
    })

    try:
        res = pd.DataFrame(json.loads(r.data.decode('utf-8')))
    except ValueError:
        # ValueError covers undecodable bytes, invalid JSON (JSONDecodeError)
        # and JSON that pandas cannot shape into a frame.  Print the raw text:
        # parsing it a second time here would just raise the same error again.
        print("Cannot decode result:")
        print(r.data.decode('utf-8', errors='replace'))
        return None
    return res

In [13]:
def expand_json(addresses):
    """Expand the raw answers stored in ``addresses["json"]`` into columns.

    The frame is modified in place; these columns are added or overwritten:

    * ``status``  -- "error" if the answer has an "error" key or is not a
      dict at all, else "match" if it has a "match" key, else "rejected";
    * ``time``    -- the answer's "time" entry (``None`` when absent);
    * ``timing``  -- the answer's "timing" entry (``{}`` when absent);
    * ``method``  -- "method" of the first match ("none" without a match);
    * ``street``, ``number``, ``postcode``, ``city`` -- the ``addr_out_*``
      value of the first match ("" without a match).

    Non-dict answers and empty "match" lists are tolerated instead of
    raising.

    Parameters
    ----------
    addresses : pandas.DataFrame
        Must have a ``json`` column holding one answer per row.

    Returns
    -------
    None
    """
    answers = addresses["json"]

    def _status(d):
        if not isinstance(d, dict) or "error" in d:
            return "error"
        return "match" if "match" in d else "rejected"

    def _first_match(d):
        # First element of d["match"], or None when there is nothing usable.
        if isinstance(d, dict) and d.get("match"):
            return d["match"][0]
        return None

    def _from_match(d, key, default):
        match = _first_match(d)
        return default if match is None else match[key]

    addresses["status"] = answers.apply(_status)
    addresses["time"] = answers.apply(
        lambda d: d.get("time") if isinstance(d, dict) else None)
    addresses["timing"] = answers.apply(
        lambda d: d.get("timing", {}) if isinstance(d, dict) else {})

    addresses["method"] = answers.apply(lambda d: _from_match(d, "method", "none"))

    for field in ["street", "number", "postcode", "city"]:
        # `key` is bound as a default argument so each lambda keeps its own field.
        addresses[field] = answers.apply(
            lambda d, key="addr_out_" + field: _from_match(d, key, ""))
    return

# Calls

## Single address calls

In [55]:
# Geocode one hand-written address. The keys are the field-name variables
# (presumably provided by the `config_batch` star import -- they are not
# defined in this notebook).
call_ws({street_field:   "Av. Fonsny", 
         housenbr_field: "20",
         city_field:     "Saint-Gilles",
         postcode_field: "1060",
         country_field:  "Belgium"})

{'match': [{'SIM_city': 0.46153846153846156,
   'SIM_house_nbr': 1.0,
   'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'SIM_zip': 0.1,
   'addr_out_city': 'Saint-Gilles - Sint-Gillis',
   'addr_out_country': 'België / Belgique / Belgien',
   'addr_out_number': '20',
   'addr_out_postcode': '',
   'addr_out_street': 'Avenue Fonsny - Fonsnylaan',
   'display_name': 'DAE (Smals), 20, Avenue Fonsny - Fonsnylaan, Saint-Gilles - Sint-Gillis, Brussel-Hoofdstad - Bruxelles-Capitale, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, België / Belgique / Belgien',
   'extra_house_nbr': '20',
   'lat': '50.8358216',
   'lon': '4.3386884',
   'method': 'orig',
   'osm_addr_in': 'Av. Fonsny, 20, 1060 Saint-Gilles, Belgium',
   'place_id': 343087,
   'place_rank': 30}],
 'rejected': [{'SIM_city': 0.46153846153846156,
   'SIM_house_nbr': 1.0,
   'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'SIM_zip': 0.1,
   'addr_out_city': 'Saint-Gilles - Sint-Gillis

## Batch calls (row by row)

In [15]:
# Load the address file and keep a random sample of (at most) 100 rows.
# The seed is fixed so that re-running the notebook geocodes the same
# addresses; the size is capped so a short file does not make sample() raise.
addresses = get_addresses("address.csv.gz")
addresses = addresses.sample(min(100, len(addresses)), random_state=42).copy()

### Simple way

In [5]:
# Row-by-row geocoding: one call_ws request per address, with a tqdm progress bar.
addresses["json"] = addresses.progress_apply(call_ws, axis=1)

NameError: name 'addresses' is not defined

### Using Dask

In [17]:
# Same row-by-row calls as the simple way, spread over 4 dask partitions.
dd_addresses = dd.from_pandas(addresses, npartitions=4)

# call_ws returns one decoded JSON object per row, so the resulting Series
# holds Python objects: declare it as 'object' rather than 'str'.
dask_task = dd_addresses.apply(call_ws, meta=('x', 'object'), axis=1)

with ProgressBar(): 
    addresses["json"] = dask_task.compute()

[########################################] | 100% Completed |  2min  4.4s


In [30]:
# Add the status/time/timing/method and street/number/postcode/city columns
# to `addresses` in place (expand_json returns nothing).
expand_json(addresses)

## Batch calls (batch WS)

### Single block

In [61]:
# Only geocoding: mode="geo" (the result shown below has the columns
# addr_key, lat, lon, place_rank and method).
call_ws_batch(addresses, mode="geo")

Unnamed: 0,addr_key,lat,lon,place_rank,method
0,0418.954.777,51.218356,5.276947,30.0,orig
1,0419.967.933,50.844528,4.386608,30.0,orig
2,0423.438.454,50.848947,4.350826,30.0,orig
3,0429.933.296,50.816354,4.370248,30.0,orig
4,0436.687.070,50.436915,4.011502,26.0,orig
...,...,...,...,...,...
92,0410.380.868,50.825008,4.341527,30.0,libpostal+regex[lpost]+photon
93,2.000.260.655,50.774965,4.474145,30.0,libpostal+regex[lpost]+photon
94,2.233.692.145,50.631635,5.832775,30.0,libpostal+regex[lpost]+photon
95,2.267.052.524,50.795416,4.653741,26.0,libpostal+regex[lpost]+photon


In [62]:
# Geocode + address: with mode="short" the result shown below also has the
# addr_out_* and extra_house_nbr columns.
call_ws_batch(addresses, mode="short") 

Unnamed: 0,addr_key,lat,lon,place_rank,method,addr_out_street,addr_out_number,extra_house_nbr,addr_out_postcode,addr_out_city,addr_out_country
0,2.164.397.226,50.712641,4.530106,26.0,orig,Rue Robert Boisacq,,9 A,1330,Rixensart,België / Belgique / Belgien
1,2.279.599.374,50.859702,2.828440,26.0,orig,Stijn Streuvelswijk,,18,8908,Ieper,België / Belgique / Belgien
2,0669.833.203,51.209175,4.470072,30.0,orig,Boterlaarbaan,239,239,2100,Deurne,België / Belgique / Belgien
3,2.243.108.073,50.658359,5.483832,26.0,orig,Rue de l'Aéropostale,,8,4460,Grâce-Hollogne,België / Belgique / Belgien
4,2.045.477.107,51.100407,3.167726,26.0,orig,Koningin Astridstraat,,109,8210,Zedelgem,België / Belgique / Belgien
...,...,...,...,...,...,...,...,...,...,...,...
92,0857.952.627,50.834163,3.231044,26.0,regex[init],Heulsestraat,,3,8501,Kortrijk,België / Belgique / Belgien
93,2.249.583.121,51.194730,4.407469,30.0,orig,Desguinlei,88-90,90,2018,Antwerpen,België / Belgique / Belgien
94,0834.315.707,51.204519,4.398727,30.0,orig,Diercxsensstraat,39,39,2018,Antwerpen,België / Belgique / Belgien
95,0857.635.002,50.843955,4.264415,26.0,orig,Ninoofsesteenweg,,227,1700,Dilbeek,België / Belgique / Belgien


In [63]:
# Geocode + address, with rejected addresses: with mode="long" the result shown
# below also echoes the input columns; with_reject=True is sent to the service
# as with_rejected="yes" and the result then has a `rejected` column.
call_ws_batch(addresses, mode="long", with_reject=True) 

Unnamed: 0,addr_key,country,postcode,city,street,housenumber,osm_addr_in,place_id,lat,lon,...,addr_out_street,addr_out_city,addr_out_number,addr_out_country,addr_out_postcode,addr_out_other,retry_on_26,method,extra_house_nbr,rejected
0,2.164.397.226,Belgique,1330,Rixensart,Rue Robert Boisacq,9 A,"Rue Robert Boisacq, 9 A, 1330 Rixensart, Belgique",564275.0,50.712641,4.530106,...,Rue Robert Boisacq,Rixensart,,België / Belgique / Belgien,1330,Fond du Patch,,orig,9 A,
1,2.279.599.374,Belgique,8908,Ieper,Streuvelswijk,18,"Streuvelswijk, 18, 8908 Ieper, Belgique",785053.0,50.859702,2.828440,...,Stijn Streuvelswijk,Ieper,,België / Belgique / Belgien,8908,,,orig,18,
2,0669.833.203,Belgique,2100,Antwerpen,Boterlaarbaan,239,"Boterlaarbaan, 239, 2100 Antwerpen, Belgique",2010239.0,51.209175,4.470072,...,Boterlaarbaan,Deurne,239,België / Belgique / Belgien,2100,Eksterlaar,,orig,239,
3,2.243.108.073,Belgique,4460,Grâce-Hollogne,Rue de l'Aéropostale,8,"Rue de l'Aéropostale, 8, 4460 Grâce-Hollogne, ...",1445516.0,50.658359,5.483832,...,Rue de l'Aéropostale,Grâce-Hollogne,,België / Belgique / Belgien,4460,Liège logistics,,orig,8,
4,2.045.477.107,Belgique,8210,Zedelgem,Koningin Astridstraat,109,"Koningin Astridstraat, 109, 8210 Zedelgem, Bel...",1763661.0,51.100407,3.167726,...,Koningin Astridstraat,Zedelgem,,België / Belgique / Belgien,8210,,,orig,109,"[{'index': 5.0, 'osm_addr_in': 'Koningin Astri..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0857.952.627,Belgique,8501,Kortrijk,Heulsestraat(Bis),3,"Heulsestraat, 3, 8501 Kortrijk, Belgique",2738948.0,50.834163,3.231044,...,Heulsestraat,Kortrijk,,België / Belgique / Belgien,8501,,,regex[init],3,
93,2.249.583.121,Belgique,2018,Antwerpen,Desguinlei,90,"Desguinlei, 90, 2018 Antwerpen, Belgique",2323301.0,51.194730,4.407469,...,Desguinlei,Antwerpen,88-90,België / Belgique / Belgien,2018,Markgrave,,orig,90,
94,0834.315.707,Belgique,2018,Antwerpen,Diercxsensstraat,39,"Diercxsensstraat, 39, 2018 Antwerpen, Belgique",2338350.0,51.204519,4.398727,...,Diercxsensstraat,Antwerpen,39,België / Belgique / Belgien,2018,Brederode,,orig,39,
95,0857.635.002,Belgique,1700,Dilbeek,Ninoofsesteenweg,227,"Ninoofsesteenweg, 227, 1700 Dilbeek, Belgique",943444.0,50.843955,4.264415,...,Ninoofsesteenweg,Dilbeek,,België / Belgique / Belgien,1700,,,orig,227,"[{'index': 120.0, 'osm_addr_in': 'Ninoofsestee..."


### Batch blocks

In [29]:
chunk_size = 10
# Consecutive blocks of at most `chunk_size` rows (the last one may be shorter).
# Plain slicing also works when there are fewer than `chunk_size` addresses,
# where np.array_split(addresses, len(addresses) // chunk_size) would be asked
# for 0 sections and raise.
chunks = [addresses.iloc[start:start + chunk_size]
          for start in range(0, addresses.shape[0], chunk_size)]

res = [call_ws_batch(chunk, mode="long") for chunk in tqdm(chunks)]

## TODO: find a better way with dask? It seems that map_partitions does not support functions returning dataframes.
# Timings noted by the author (presumably chunk size : elapsed time -- TODO confirm):
#50: 4:04
#100 : 2:30
#250 : 2:04
#1000 : 1:37

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [30]:
# Stack the per-chunk results into one frame. Each chunk keeps its own row
# labels (no ignore_index), so index values repeat across chunks.
df_res = pd.concat(res, sort=False)
df_res

Unnamed: 0,addr_key,country,postcode,city,street,housenumber,osm_addr_in,place_id,lat,lon,...,SIM_street,addr_out_street,addr_out_city,addr_out_number,addr_out_country,addr_out_postcode,addr_out_other,method,extra_house_nbr,retry_on_26
0,2.229.599.339,Belgique,2480,Dessel,Brasel,43,"Brasel, 43, 2480 Dessel, Belgique",118759816.0,51.240794,5.088575,...,1.0,Brasel,Dessel,43,België - Belgique - Belgien,2480,,orig,43,
1,0434.761.522,Belgique,8000,Brugge,Karel de Stoutelaan,172,"Karel de Stoutelaan, 172, 8000 Brugge, Belgique",129380407.0,51.211968,3.214481,...,1.0,Karel de Stoutelaan,Brugge,,België - Belgique - Belgien,8000,Brugge-Centrum,orig,172,
2,0807.528.067,Belgique,3660,Oudsbergen,Nijverheidslaan,1574,"Nijverheidslaan, 1574, 3660 Oudsbergen, Belgique",127213546.0,51.039261,5.560252,...,1.0,Nijverheidslaan,Oudsbergen,,België - Belgique - Belgien,3660,Opglabbeek-Noord,orig,1574,
3,0872.572.903,Belgique,8560,Wevelgem,Tramstraat,7,"Tramstraat, 7, 8560 Wevelgem, Belgique",296975696.0,50.814277,3.213540,...,1.0,Tramstraat,Wevelgem,7,België - Belgique - Belgien,8560,,orig,7,
4,2.053.560.076,Belgique,1370,Jodoigne,Rue du Sart(Mél.),14,"Rue du Sart, 14, 1370 Jodoigne, Belgique",96378468.0,50.739847,4.800007,...,1.0,Rue du Sart,Jodoigne,,België - Belgique - Belgien,1370,Wallonie,regex[init],14,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,2.200.445.493,Belgique,9810,Nazareth,Camiel Fremaultstraat,50,"Camiel Fremaultstraat, 50, 9810 Nazareth, Belg...",114786918.0,50.957943,3.592878,...,1.0,Camiel Fremaultstraat,Nazareth,,België - Belgique - Belgien,9810,,orig,50,
6,2.203.230.878,Belgique,1200,Woluwe-Saint-Lambert,Avenue Albert Jonnart,27,"Avenue Albert Jonnart, 27, 1200 Woluwe-Saint-L...",139300758.0,50.843257,4.404007,...,1.0,Avenue Albert Jonnart - Albert Jonnartlaan,Etterbeek,27,België - Belgique - Belgien,1040,,orig,27,
7,0627.866.647,Belgique,2000,Antwerpen,Lange Gasthuisstraat,35-37,"Lange Gasthuisstraat, 35-37, 2000 Antwerpen, B...",195981861.0,51.214239,4.405437,...,1.0,Lange Gasthuisstraat,Antwerpen,35-37,België - Belgique - Belgien,2000,Wilde Zee,orig,35-37,
8,0760.284.614,Belgique,9320,Aalst,Achtermaal,18,"Achtermaal, 18, 9320 Aalst, Belgique",117553684.0,50.933103,4.002864,...,1.0,Achtermaal,Aalst,,België - Belgique - Belgien,9320,,orig,18,


In [31]:
# Number of result rows per value of the `method` column.
df_res.method.value_counts()

orig                             79
regex[init]                      10
nostreet                          6
nonum                             4
libpostal+regex[lpost]+photon     1
Name: method, dtype: int64

In [56]:
# Display the concatenated batch results again (needs the cell that builds
# `df_res` to have been run in the current kernel session).
df_res


NameError: name 'df_res' is not defined