In [1]:
import pandas as pd
import urllib

import numpy as np

import json

import tqdm

#%matplotlib inline

# Register `progress_apply` on pandas objects (used further down for the
# row-by-row calls). `tqdm.pandas` is a classmethod that only takes optional
# keyword arguments in recent tqdm releases, so the module itself must not be
# passed in as a positional argument.
tqdm.tqdm.pandas()

import dask.dataframe as dd

from dask.multiprocessing import get
from dask.diagnostics import ProgressBar

from datetime import datetime
import matplotlib.pyplot as plt

from IPython.display import display


In [2]:
import urllib3

In [3]:
# Shared urllib3 connection pool; call_ws_batch sends its POST requests through it.
http = urllib3.PoolManager()

In [5]:
from config_batch import * 

# Functions

In [8]:
# Host of the geocoding web service. call_ws and call_ws_batch both build
# their URLs from it and target port 5000.
ws_hostname = "127.0.0.1"

In [9]:
def call_ws(addr_data):
    """Geocode one address through the web service ``/search/`` endpoint.

    Parameters
    ----------
    addr_data : mapping (e.g. a dict or a DataFrame row)
        Must be indexable by ``street_field``, ``housenbr_field``,
        ``city_field``, ``postcode_field`` and ``country_field``.

    Returns
    -------
    dict
        The decoded JSON answer, with an extra ``"time"`` entry holding the
        elapsed wall-clock time of the call as a ``timedelta``.
        If the request or the decoding fails, ``{"error": <message>,
        "time": <elapsed>}`` is returned instead, so that every result has the
        same dict shape and can be consumed by ``expand_json``.
    """
    # Import the submodules explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` and `urllib.request` are loaded.
    import urllib.parse
    import urllib.request

    started = datetime.now()

    params = urllib.parse.urlencode({"street": addr_data[street_field],
                                     "housenumber": addr_data[housenbr_field],
                                     "city": addr_data[city_field],
                                     "postcode": addr_data[postcode_field],
                                     "country": addr_data[country_field],
                                    })
    url = f"http://{ws_hostname}:5000/search/?{params}"

    try:
        with urllib.request.urlopen(url) as response:
            res = json.loads(response.read())
        res["time"] = datetime.now() - started
        return res
    except Exception as e:
        # Best effort: one failing address must not abort a whole batch of
        # calls, so the failure is reported in-band as an "error" record.
        return {"error": str(e), "time": datetime.now() - started}
    

In [10]:
def call_ws_batch(addr_data, mode="geo"):
    """Send a whole DataFrame of addresses to the ``/batch`` endpoint.

    The configured input columns are renamed to the names used in the
    uploaded CSV ("street", "housenumber", "postcode", "city", "country",
    "addr_key"), the frame is serialised to CSV without its index, and the
    CSV is posted together with ``mode`` as form fields.

    Parameters
    ----------
    addr_data : pandas.DataFrame
        Addresses to process.
    mode : str, default "geo"
        Forwarded unchanged to the web service.

    Returns
    -------
    pandas.DataFrame
        Built from the JSON body of the response.
    """
    column_names = {
        street_field : "street",
        housenbr_field: "housenumber",
        postcode_field: "postcode",
        city_field: "city",
        country_field: "country",
        addr_key_field : "addr_key",
    }
    csv_payload = addr_data.rename(columns=column_names).to_csv(index=False)

    form_fields = {
        'media': ('addresses.csv', csv_payload),
        'mode': mode,
    }
    response = http.request('POST',
                            f'http://{ws_hostname}:5000/batch',
                            fields=form_fields)

    decoded = json.loads(response.data.decode('utf-8'))
    return pd.DataFrame(decoded)

In [26]:
def expand_json(addresses):
    """Unpack the raw web-service answers stored in ``addresses["json"]``.

    The frame is modified in place and nothing is returned. Added columns:

    * ``status``  -- "error", "match" or "rejected"
    * ``time``    -- the ``"time"`` entry of the answer (None when absent)
    * ``timing``  -- the ``"timing"`` entry of the answer ({} when absent)
    * ``method``  -- ``"method"`` of the first match ("none" without a match)
    * ``street``, ``number``, ``postcode``, ``city`` -- the ``addr_out_*``
      values of the first match ("" without a match)

    Answers that are not dicts (for instance a plain error message string),
    answers without a ``"time"`` entry and answers whose ``"match"`` list is
    empty are handled instead of raising.
    """
    def _entry(answer, key, default):
        # Non-dict answers carry no entries at all.
        return answer.get(key, default) if isinstance(answer, dict) else default

    def _status(answer):
        if not isinstance(answer, dict) or "error" in answer:
            return "error"
        return "match" if "match" in answer else "rejected"

    def _from_first_match(answer, key, default):
        # An absent or empty "match" list yields the default.
        matches = _entry(answer, "match", None)
        return matches[0][key] if matches else default

    answers = addresses["json"]

    addresses["status"] = answers.apply(_status)
    addresses["time"] = answers.apply(lambda a: _entry(a, "time", None))
    addresses["timing"] = answers.apply(lambda a: _entry(a, "timing", {}))
    addresses["method"] = answers.apply(lambda a: _from_first_match(a, "method", "none"))

    for field in ["street", "number", "postcode", "city"]:
        addresses[field] = answers.apply(
            lambda a: _from_first_match(a, "addr_out_" + field, ""))
    return

# Calls

## Single address calls

In [12]:
# Smoke test: geocode one hand-written address. The dict is keyed by the
# configured input column names, exactly like a row of the address frame.
call_ws({street_field: "Av. Fonsny", 
          housenbr_field: "20",
          city_field: "Saint-Gilles",
          postcode_field:  "1060",
          country_field: "Belgium"})

{'match': [{'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'addr_out_city': 'Saint-Gilles - Sint-Gillis',
   'addr_out_country': 'België / Belgique / Belgien',
   'addr_out_number': '20',
   'addr_out_postcode': '',
   'addr_out_street': 'Avenue Fonsny - Fonsnylaan',
   'display_name': 'DAE (Smals), 20, Avenue Fonsny - Fonsnylaan, Saint-Gilles - Sint-Gillis, Brussel-Hoofdstad - Bruxelles-Capitale, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, België / Belgique / Belgien',
   'extra_house_nbr': '20',
   'lat': '50.8358216',
   'lon': '4.3386884',
   'method': 'orig',
   'place_id': 343087,
   'place_rank': 30}],
 'rejected': [{'SIM_city': 0.46153846153846156,
   'SIM_house_nbr': 1.0,
   'SIM_street': 1.0,
   'SIM_street_which': 'addr_out_street',
   'SIM_zip': 0.1,
   'addr_out_city': 'Saint-Gilles - Sint-Gillis',
   'addr_out_country': 'België / Belgique / Belgien',
   'addr_out_number': '20',
   'addr_out_postcode': '',
   'addr_out_street': 'Avenue F

## Batch calls (row by row)

In [6]:
# Load the address file, then keep at most 100 randomly chosen rows.
# `random_state` pins the selection so that the outputs below are reproducible
# after a kernel restart; `min(...)` keeps the cell working on files that hold
# fewer than 100 rows (a plain `sample(100)` raises in that case).
# NOTE(review): `get_addresses` is not defined in this notebook -- presumably
# it comes from `config_batch`; confirm.
addresses = get_addresses("address.csv.gz")
addresses = addresses.sample(min(100, len(addresses)), random_state=42).copy()

### Simple way

In [29]:
# One web-service call per row (axis=1 hands each row to call_ws);
# `progress_apply` is the tqdm-instrumented variant of `DataFrame.apply`.
addresses["json"] = addresses.progress_apply(call_ws, axis=1)

100%|██████████| 100/100 [02:02<00:00,  1.22s/it]


### Using Dask

In [17]:
# Same row-by-row calls as above, spread over 4 Dask partitions.
dd_addresses = dd.from_pandas(addresses, npartitions=4)

# Building the task is lazy; no request is sent until `.compute()` below.
# NOTE(review): `meta` declares a string result named 'x', whereas call_ws
# returns a dict on success -- confirm whether an object dtype is intended.
dask_task = dd_addresses.apply(call_ws, meta=('x', 'str'), axis=1)

with ProgressBar(): 
    addresses["json"] = dask_task.compute()

[########################################] | 100% Completed |  2min  4.4s


In [30]:
# Unpack the "json" column into status / time / timing / method and address
# columns; `addresses` is modified in place and the call returns nothing.
expand_json(addresses)

## Batch calls (batch WS)

### Single block

In [18]:
# Only geocoding: the default mode="geo" is used and the whole frame is sent
# in a single request.
call_ws_batch(addresses)

Unnamed: 0,addr_key,lat,lon,place_rank
0,0400.973.848,51.026373,4.983889,30
1,0422.478.847,51.247551,4.457435,30
2,0424.894.543,50.867108,3.442699,26
3,0436.293.924,51.203257,3.208497,30
4,0441.670.494,50.831226,4.335429,30
...,...,...,...,...
80,2.276.229.417,50.927358,5.358747,26
81,2.278.033.815,51.155505,3.981056,30
82,2.279.759.524,51.057200,4.663296,30
83,2.279.978.862,50.747870,4.350929,26


In [19]:
# Geocode + address: mode="short" is requested, again with the whole frame in
# a single request.
call_ws_batch(addresses, mode="short") 

Unnamed: 0,addr_key,lat,lon,place_rank,addr_out_street,addr_out_number,extra_house_nbr,addr_out_postcode,addr_out_city,addr_out_country
0,0844.841.789,51.062821,3.689577,30,Perzikstraat,49,49,9000,Gent,België / Belgique / Belgien
1,2.279.978.862,50.747870,4.350929,26,Stationsstraat - Rue de la Station,,148,1640,Sint-Genesius-Rode,België / Belgique / Belgien
2,0455.446.969,50.585042,4.355348,26,Rue de l'Industrie,,30,1400,Nivelles,België / Belgique / Belgien
3,2.241.495.992,50.854845,4.336434,30,Rue de l'École - Schoolstraat,6,6,1080,Molenbeek-Saint-Jean - Sint-Jans-Molenbeek,België / Belgique / Belgien
4,0890.360.426,50.841763,4.377404,26,Rue De Pascale - De Pascalestraat,,4-6,1000,BXL,België / Belgique / Belgien
...,...,...,...,...,...,...,...,...,...,...
80,2.245.941.562,50.568372,4.171773,26,Rue de la Marlière,,6,7190,Écaussinnes,België / Belgique / Belgien
81,2.251.400.781,50.490712,5.103314,26,Rue du Pré des Dames,,61,5300,Andenne,België / Belgique / Belgien
82,0422.478.847,51.247551,4.457435,30,Oudebareellei,79-81,79,2170,Schoten,België / Belgique / Belgien
83,0693.673.229,50.647252,4.419385,26,Chaussée de Bruxelles,,27,1472,Genappe,België / Belgique / Belgien


### Batch blocks

In [17]:
chunk_size = 50
# Cut the frame into consecutive blocks of at most `chunk_size` rows.
# Slicing replaces `np.array_split(addresses, n_rows // chunk_size)`, which
# asked for 0 sections (ValueError) whenever the frame held fewer than
# `chunk_size` rows, and produced blocks larger than `chunk_size` whenever the
# row count was not an exact multiple of it.
chunks = [addresses.iloc[start:start + chunk_size]
          for start in range(0, addresses.shape[0], chunk_size)]


# One batch request per block; `res` is a list with one result frame per block.
res = [call_ws_batch(chunk, mode="short") for chunk in tqdm.tqdm(chunks)]

## TODO : find a better way with dask? It seems that map_partitions does not support function returning dataframes. 

100%|██████████| 2/2 [00:07<00:00,  3.63s/it]
