# Clustering employees

This notebook takes the employee registry enriched with routing information and performs a cluster analysis only on employees who live at least 15 km or 15 minutes away from their workplace (and are flagged as 'In regione'). This subset is the sample used for clustering.

Employees outside the sample are marked as 'Out of Sample'. The final label is a concatenation of the cluster label and the province, and it distinguishes between outliers (isolated employees) and clusters (agglomerates).

The output of the notebook is the enriched employee registry, with a label attached to each employee as returned by the DBSCAN algorithm.

Several runs with different parameters are performed in order to explore and select the best hyperparameter values.

In [2]:

from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

# Root folder of the curated data. Dataset names are appended to it with plain
# string concatenation (harbour_path + '<dataset>'), so it must end with a path
# separator; otherwise the root and the dataset name get glued together.
harbour_path = 'path/to/curated_data'
if not harbour_path.endswith('/'):
    harbour_path += '/'

# Spark session (local master) used by every cell below; reuses an existing
# session if one is already running.
spark = (
    SparkSession.builder
    .master("local")
    .appName("application-name")
    .getOrCreate()
)

In [3]:
# Load the geo-referenced employee data (with distances) from the curated area
anag_dip_geo = spark.read.format('parquet').load(harbour_path + 'anag_dip_geo')

In [4]:
# Sample selection thresholds, measured from the assigned office ("sede")
MAX_DISTANCE = 15  # km
MAX_DURATION = 15  # minutes

# An employee enters the sample when at least one threshold is reached
# (distance OR duration) and the row is flagged 'In regione'.
is_far_from_office = (F.col('distanza') >= MAX_DISTANCE) | (F.col('durata') >= MAX_DURATION)
is_in_region = F.col('flg_regione') == 'In regione'

# Only the id and the coordinates are needed for clustering; collect to pandas
train = (
    anag_dip_geo
    .filter(is_far_from_office & is_in_region)
    .select('id_dipendente', 'lat', 'lon')
    .toPandas()
)

print('Numero di lavoratori con residenza/domicilio a più di {} km o {}\' dalla sede assegnata: {}'.format(MAX_DISTANCE, MAX_DURATION, len(train)))

# (n_employees, 2) array with columns [lat, lon]
coords = train[['lat', 'lon']].to_numpy()

In [5]:
# Grid search over DBSCAN hyperparameters.
# One model is fitted per (km, min_samples) pair and stored in `model_registry`
# together with the number of clusters and outliers it produced.
kms_per_radian = 6371.0088  # km per radian on the Earth's surface (km -> radians)

# hyperparameter grid to explore
kms = [1, 1.25, 1.5, 1.75, 2, 2.25, 2.5]  # neighbourhood radius, in km
min_samples = [3, 4, 5]  # minimum number of members for a cluster

# The haversine metric works on [lat, lon] expressed in radians; the conversion
# does not depend on the hyperparameters, so it is done once outside the loop.
coords_rad = np.radians(coords)

model_registry = []

# Same iteration order as the original grid: km is the outer loop, min_samples the inner.
for km in kms:
    epsilon = km / kms_per_radian  # max distance between two employees in a cluster, in radians
    for n_min in min_samples:  # separate name so the `min_samples` grid is not overwritten
        model = DBSCAN(
            eps=epsilon,  # max distance between two points in a cluster
            min_samples=n_min,  # min people in a cluster
            algorithm='ball_tree',
            metric='haversine')
        model.fit(coords_rad)

        labels = model.labels_
        # DBSCAN marks noise points with the label -1: they are outliers, not a
        # cluster, so -1 must be excluded when counting clusters.
        outliers = int((labels == -1).sum())
        clusters = int((np.unique(labels) != -1).sum())

        model_registry.append({
            'model_name': 'model-' + 'km=' + str(km) + '-min_samples=' + str(n_min),
            'model': model,
            'km': km,
            'min_samples': n_min,
            'clusters': clusters,
            'outliers': outliers
        })

In [6]:
# Summary table of the grid search: one row per configuration, without the
# fitted estimator objects, ordered from the fewest to the most outliers.
model_registry_df = (
    pd.DataFrame(model_registry)
    .drop(columns=['model'])
    .sort_values(by='outliers', ascending=True)
    .reset_index()
)
display(model_registry_df)

In [7]:
# Hyperparameter selection: the chosen configuration is hard-coded by name
best_model_name = 'model-km=1.5-min_samples=3'
# (alternative: take the first row of the summary table, model_registry_df.model_name[0])

print('Best configuration of parameters is: {}'.format(best_model_name))

# Look up the fitted estimator registered under the chosen name
matching_entries = list(
    filter(lambda entry: entry.get('model_name') == best_model_name, model_registry)
)
best_model = matching_entries[0].get('model')

# Attach the DBSCAN label of every sampled employee as a string ('-1' is noise)
train['cluster'] = best_model.labels_.astype(str)
train['cluster'].value_counts()

In [8]:
# Attach the cluster assignment to the full registry and derive readable labels.

# Cluster label per sampled employee, moved back from pandas to Spark
cluster_by_employee = spark.createDataFrame(train).select('id_dipendente', 'cluster')

# After the left join, employees that were not in the sample have a null cluster;
# '-1' is the DBSCAN noise label; anything else is a real cluster.
label_type_col = (
    F.when(F.col('cluster').isNull(), 'Out of Sample')
    .when(F.col('cluster') == '-1', 'Outlier')
    .otherwise(F.lit('Cluster'))
)

# Clustered employees get "<type> <cluster> (<province>)",
# everybody else gets "<type> (<province>)".
province_suffix = [F.lit(' ('), F.col('provincia'), F.lit(')')]
label_col = (
    F.when(
        F.col('label_type') == 'Cluster',
        F.concat(F.col('label_type'), F.lit(' '), F.col('cluster'), *province_suffix),
    )
    .otherwise(F.concat(F.col('label_type'), *province_suffix))
)

anag_dip_geo_clus = (
    anag_dip_geo
    .join(cluster_by_employee, on='id_dipendente', how='left')
    .withColumn('label_type', label_type_col)
    .withColumn('label', label_col)
)

# anag_clus

In [9]:
# Number of rows in the enriched registry (this action runs the Spark job)
anag_dip_geo_clus.count()

In [10]:
# Persist the enriched registry as parquet, replacing any previous output
anag_dip_geo_clus.write.mode('overwrite').parquet(harbour_path + 'anag_dip_geo_clus')

In [10]:
# anag_clus = anag_dip_geo_clus\
#     .groupBy('label', 'label_type', 'provincia')\
#     .agg(
#         F.count('id_dipendente').alias('n_dipendenti'),
#         F.avg('lat').alias('avg_lat'),
#         F.avg('lon').alias('avg_lon'),
        
#         )

# anag_clus.count()

In [11]:
# anag_clus.write.parquet(harbour_path + 'v2/anag_clus', mode='overwrite')