In [77]:
import pandas as pd
import urllib

import numpy as np

import json

from tqdm.autonotebook import tqdm

#%matplotlib inline

tqdm.pandas(tqdm)

import dask.dataframe as dd

from dask.multiprocessing import get
from dask.diagnostics import ProgressBar

from datetime import datetime
import matplotlib.pyplot as plt

from IPython.display import display


In [2]:
import urllib3

In [3]:
http = urllib3.PoolManager()

In [4]:
from config_batch import * 

# Functions

In [5]:
ws_hostname = "127.0.0.1"

In [6]:
def call_ws(addr_data): #lg = "en,fr,nl"
    t = datetime.now()
    
    params = urllib.parse.urlencode({"street": addr_data[street_field],
                                     "housenumber": addr_data[housenbr_field],
                                     "city": addr_data[city_field],
                                     "postcode": addr_data[postcode_field],
                                     "country": addr_data[country_field],
                                    })
    url = f"http://{ws_hostname}:5000/search/?{params}"
    
    
    try:
        with urllib.request.urlopen(url) as response:
            res = response.read()
            res = json.loads(res)
#             print(res)
            res["time"] = datetime.now() - t
            return res
    except Exception as e:
        return str(e)
    

In [29]:
def call_ws_batch(addr_data, mode="geo"): #lg = "en,fr,nl"
#     print(addr_data)
#     print(addr_data.shape)
#     print()
    file_data = addr_data.rename(columns = {
        street_field : "street",
        housenbr_field: "housenumber",
        postcode_field: "postcode",
        city_field: "city",
        country_field: "country",
        addr_key_field : "addr_key"}).to_csv(index=False)
    
    r = http.request(
    'POST',
    f'http://{ws_hostname}:5000/batch',
    fields= { 
        'media': ('addresses.csv', file_data),
        'mode': mode
    })
    
#     print(r.data.decode('utf-8'))
    res = pd.DataFrame(json.loads(r.data.decode('utf-8')))
#     display(res)
    return res

In [8]:
def expand_json(addresses):
    addresses["status"]= addresses.json.apply(lambda d: "error" if "error" in d else "match" if "match" in d else "rejected")
    addresses["time"]  = addresses.json.apply(lambda d: d["time"])

    addresses["timing"]  = addresses.json.apply(lambda d: d["timing"] if "timing" in d else {})

    addresses["method"]= addresses.json.apply(lambda d: d["match"][0]["method"] if len(d)>0 and "match" in d else "none")
    
    for field in ["street", "number", "postcode", "city"]:
        addresses[field]= addresses.json.apply(lambda d: d["match"][0]["addr_out_"+field] if len(d)>0 and "match" in d else "")
    return 

# Calls

## Single address calls

In [83]:
call_ws({street_field: "Av. Fonsny", 
          housenbr_field: "20",
          city_field: "Saint-Gilles",
          postcode_field:  "1060",
          country_field: "Belgium"})

{'match': [{'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'addr_out_city': 'Saint-Gilles - Sint-Gillis',
   'addr_out_country': 'België / Belgique / Belgien',
   'addr_out_number': '20',
   'addr_out_postcode': '',
   'addr_out_street': 'Avenue Fonsny - Fonsnylaan',
   'display_name': 'DAE (Smals), 20, Avenue Fonsny - Fonsnylaan, Saint-Gilles - Sint-Gillis, Brussel-Hoofdstad - Bruxelles-Capitale, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, België / Belgique / Belgien',
   'extra_house_nbr': '20',
   'lat': '50.8358216',
   'lon': '4.3386884',
   'method': 'orig',
   'place_id': 343087,
   'place_rank': 30}],
 'rejected': [{'SIM_city': 0.46153846153846156,
   'SIM_house_nbr': 1.0,
   'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'SIM_zip': 0.1,
   'addr_out_city': 'Saint-Gilles - Sint-Gillis',
   'addr_out_country': 'België / Belgique / Belgien',
   'addr_out_number': '20',
   'addr_out_postcode': '',
   'addr_out_street': 'Avenue F

## Batch calls (row by row)

In [72]:
addresses = get_addresses("address.csv.gz")
addresses = addresses.sample(1000).copy()

### Simple way

In [29]:
addresses["json"] = addresses.progress_apply(call_ws, axis=1)

100%|██████████| 100/100 [02:02<00:00,  1.22s/it]


### Using Dask

In [17]:
dd_addresses = dd.from_pandas(addresses, npartitions=4)

dask_task = dd_addresses.apply(call_ws, meta=('x', 'str'), axis=1)

with ProgressBar(): 
    addresses["json"] = dask_task.compute()

[########################################] | 100% Completed |  2min  4.4s


In [30]:
expand_json(addresses)

## Batch calls (batch WS)

### Single block

In [42]:
# Only geocoding
call_ws_batch(addresses)


Unnamed: 0,addr_key,lat,lon,place_rank
0,0413.915.628,50.841404,4.354718,30.0
1,0414.808.820,50.839255,3.166077,26.0
2,0415.311.636,51.165199,4.646993,30.0
3,0415.861.368,51.174983,4.151651,30.0
4,0417.004.384,51.204335,4.394418,26.0
...,...,...,...,...
94,2.225.889.286,50.494427,5.223551,26.0
95,0450.543.224,50.069063,4.509224,21.0
96,0598.893.143,50.747180,3.224409,21.0
97,0828.156.207,50.645138,5.573420,16.0


In [47]:
# Geocode + address
call_ws_batch(addresses, mode="long") 


Unnamed: 0,0
error,Cannot connect to Photon (photon:2322): <urlo...


### Batch blocs

In [106]:
chunk_size = 250
chunks = np.array_split(addresses, addresses.shape[0]//chunk_size)

res= [call_ws_batch(chunk, mode="long") for chunk in tqdm(chunks)]

## TODO : find a better way with dask? It seems that map_partitions does not support function returning dataframes. 
#50: 4:04
#100 : 2:30
#250 : 2:04
#1000 : 1:37

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [107]:
df_res = pd.concat(res, sort=False)
df_res

Unnamed: 0,addr_key,country,postcode,city,street,housenumber,method
0,2.063.881.173,Belgique,7890,Ellezelles,Rue Arbre St Pierre,49,error on orig
1,2.190.391.246,Belgique,3520,Zonhoven,Kretenveldweg,11,error on orig
2,2.030.062.421,Belgique,2440,Geel,Kapelstraat,8,error on orig
3,0867.107.447,Belgique,2321,Hoogstraten,Krekelstraat,8,error on orig
4,2.137.061.339,Belgique,8820,Torhout,Rijksweg,33,error on orig
...,...,...,...,...,...,...,...
245,0821.220.311,Belgique,6060,Charleroi,Rue des Sept Actions,39,error on orig
246,0475.425.902,Belgique,8421,De Haan,Lepelemstraat,4,error on orig
247,2.100.490.260,Belgique,2030,Antwerpen,Noorderlaan,79,error on orig
248,2.166.515.685,Belgique,2890,Puurs-Sint-Amands,Lippelodorp(LIP),60,error on orig


In [108]:
df_res.method.value_counts()

error on orig    1000
Name: method, dtype: int64

In [None]:
1000 : 
orig                                   832
regex[init]                            102
libpostal+regex[lpost]+photon           35
nonum                                    9
libpostal+regex[lpost]+photon+nonum      3
photon                                   3
libpostal+regex[lpost]                   2
nostreet                                 1