# Simulation and combination

To simulate re-assigning employees to a closer office, we calculate, for each employee, the routing distance and duration between their address and every office in the same province and in the neighbouring provinces.

The output of the notebook is a dataframe with all the calculated distances, so that the final selection can be left to the business user, or some offices can simply be filtered out.


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf
import pyspark.sql.functions as F
from pyspark.sql.window import Window

import requests
import json
import pandas as pd

# Root location of the curated datasets read and written by this notebook.
harbour_path = 'path/to/curated_data'

# Spark session running with a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("application-name")
    .getOrCreate()
)

# Number of partitions Spark SQL uses when it shuffles data.
spark.conf.set("spark.sql.shuffle.partitions", 16)

In [2]:
# define udf
# calculate routing distance and duration via the public OSRM HTTP API
def get_distances(origin_lat, origin_lon, destination_lat, destination_lon, by='car', timeout=30):
    """Look up the route between two points on the public OSRM server.

    Parameters
    ----------
    origin_lat, origin_lon : latitude / longitude of the starting point.
    destination_lat, destination_lon : latitude / longitude of the end point.
    by : profile segment placed in the request URL (default ``'car'``).
    timeout : seconds handed to ``requests.get`` so that a stalled
        connection cannot block the caller forever.

    Returns
    -------
    list
        ``[distance, duration]`` of the first returned route, as floats in
        meters and seconds. ``[None, None]`` when the request fails or the
        response does not contain a usable route.
    """
    # The URL takes coordinates as "lon,lat" pairs, origin first.
    url = "http://router.project-osrm.org/route/v1/{}/{},{};{},{}".format(
        by, origin_lon, origin_lat, destination_lon, destination_lat
    )

    try:
        r = requests.get(url, timeout=timeout)
        response = json.loads(r.content)

        route = response['routes'][0]
        distance = float(route.get('distance'))  # in meters
        duration = float(route.get('duration'))  # in seconds
    except (
        requests.exceptions.RequestException,  # network failure or timeout
        ValueError,      # body is not JSON, or a value is not numeric
        KeyError,        # no 'routes' entry in the response
        IndexError,      # 'routes' is empty
        TypeError,       # a field is missing (None) or the payload has an unexpected shape
        AttributeError,  # the route entry is not a mapping
    ):
        # Best effort: a failed lookup yields nulls instead of aborting the batch.
        # Only lookup-related errors are caught, so interrupts and programming
        # errors still surface.
        distance = None
        duration = None

    return [distance, duration]

# register as pandas udf
@pandas_udf('distance double, duration double')
def get_distances_udf(origin_lat: pd.Series, origin_lon: pd.Series, destination_lat: pd.Series, destination_lon: pd.Series) -> pd.DataFrame:
    """Pandas UDF that routes every origin/destination pair of a batch.

    Each row of the four input series is passed to ``get_distances``.

    Returns a DataFrame with one row per input row and two float columns:
    ``distance`` (meters) and ``duration`` (seconds). A failed lookup gives
    NaN in both columns.

    The struct fields are declared ``double`` because ``get_distances``
    produces floats; an integer type would not keep fractional values.
    """
    routes = [
        get_distances(o_lat, o_lon, d_lat, d_lon)
        for o_lat, o_lon, d_lat, d_lon in zip(
            origin_lat, origin_lon, destination_lat, destination_lon
        )
    ]
    # Explicit column names and dtype keep the frame aligned with the declared
    # schema, including when the batch is empty.
    return pd.DataFrame(routes, columns=['distance', 'duration'], dtype='float64')

# Import data

In [4]:
# Dataset locations are built with an explicit '/' separator, so they are
# valid whether or not harbour_path ends with a slash.

# import clustered employee data
anag_dip_geo_clus = spark.read.parquet(harbour_path.rstrip('/') + '/anag_dip_geo_clus')

# import offices (sedi) data
anag_sedi = spark.read.parquet(harbour_path.rstrip('/') + '/anag_sedi')

In [9]:
# create combinations
# for each province:
# - build the list of neighbouring provinces to look in;
# - build the list of offices belonging to the province and to its neighbours;

# lookup_prov_sedi: prov_act, prov_ass, id_sede_ass, geo_ass
# prov
# .withColumn('prov_act', array())
# .join(anag_sedi, prov_ass == prov)

In [10]:
# For every province (prov_act), the provinces whose offices are considered
# (prov_ass): the province itself followed by its neighbouring provinces.
comb_prov = spark.createDataFrame(
    list({
        'BO': ['BO', 'MO', 'FE', 'RA'],
        'FE': ['FE', 'MO', 'BO', 'RA'],
        'FC': ['FC', 'RN', 'RA'],
        'RA': ['RA', 'FC', 'BO', 'FE'],
        'RN': ['RN', 'FC'],
        'MO': ['MO', 'BO', 'FE', 'RE'],
        'RE': ['RE', 'MO', 'PR'],
        'PR': ['PR', 'RE', 'PC'],
        'PC': ['PC', 'PR'],
    }.items()),
    schema=['prov_act', 'prov_ass'],
)

In [11]:
# Employees flagged as 'In regione', keeping only the id, the coordinates
# and the province, renamed for the joins that follow.
sim_dip = (
    anag_dip_geo_clus
    .where(col('flg_regione') == 'In regione')
    .select(
        'id_dipendente',
        col('lat').alias('dip_lat'),
        col('lon').alias('dip_lon'),
        col('provincia').alias('prov_act'),
    )
)

sim_dip.count()

In [13]:
# Offices flagged as 'In regione' with cancellato == 0, keeping only the id,
# the coordinates and the province, renamed for the joins that follow.
in_region = col('flg_regione') == 'In regione'
not_cancelled = col('cancellato') == 0
sim_sedi = (
    anag_sedi
    .where(in_region & not_cancelled)
    .select(
        'id_sede',
        col('lat').alias('sede_lat'),
        col('lon').alias('sede_lon'),
        col('provincia').alias('prov_ass'),
    )
)

sim_sedi.count()

In [14]:
# Build every id_dipendente / id_sede pair where the office is in the
# employee's own province or in a neighbouring one.
comb_dip_sedi = (
    comb_prov
    .select('prov_act', F.explode('prov_ass').alias('prov_ass'))  # one row per (province, candidate province)
    .join(sim_dip, on='prov_act')    # attach the employees of each province
    .join(sim_sedi, on='prov_ass')   # attach the offices of each candidate province
    .select('id_dipendente', 'id_sede', 'dip_lat', 'dip_lon', 'sede_lat', 'sede_lon')
    .distinct()
)

comb_dip_sedi.count()

# Routing combination

In [16]:
# Rank the candidate offices of each employee from nearest to farthest.
# distanza is null when the routing lookup produced no result; those pairs
# are ordered last so that a failed lookup never takes rank 1.
# id_sede is a tie-break that makes the ranking reproducible.
comb_window = Window.partitionBy('id_dipendente').orderBy(
    F.col('distanza').asc_nulls_last(), F.col('id_sede')
)

comb_dip_sedi_geo = (
    comb_dip_sedi
    .withColumn('Route', get_distances_udf('dip_lat', 'dip_lon', 'sede_lat', 'sede_lon'))
    .withColumn('distanza', col('Route.distance') / 1000.0)  # meters -> kilometers
    .withColumn('durata', col('Route.duration') / 60.0)      # seconds -> minutes
    .select('id_dipendente', 'id_sede', 'distanza', 'durata')
    .withColumn('rank', F.row_number().over(comb_window))
)

# count action to trigger computation (long running computation)
# comb_dip_sedi_geo.count()

# Persist data

In [22]:
# write back data (long computation)
# The output location is built with an explicit '/' separator, so it is valid
# whether or not harbour_path ends with a slash.
comb_dip_sedi_geo.write.parquet(harbour_path.rstrip('/') + '/comb_dip_sedi_geo', mode='overwrite')