# This notebook estimates the performance of the first DBSCAN clustering pass
* **fp**	False positive: number of clusters that have no RFID records
* **fn**	False negative: number of RFID records that do not belong to any cluster
* **tp**	True positive: number of RFID records that have a cluster id
* **RFID_coverage_rate**	tp/(fp+fn+tp)

In [9]:
import os
import numpy as np
import pandas as pd
import pymongo
from sklearn.cluster import DBSCAN
import plotly.express as px
import geopandas as gpd
from multiprocessing import Pool
from ClusteringPipeline import do_clustering
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Previously selected good RFID-activation candidates.
candidates = pd.read_csv('/Data/Intermediate/MapMatchingReports/GOOD_RFID_ACTIVATIONS.csv')

OCTOBER_RFID = '/Data/Source/RFID/RFID_october.csv'

# Source GPS files: FILES holds the bare file names, SOURCE_GPS_PATHS the
# matching full paths (same order, same length).
SOURCE_GPS_DIR = '/Data/Source/GoodGPSCandidates/'
# os.listdir returns entries in arbitrary order and also lists
# sub-directories, which pd.read_csv cannot open. Keep regular files only
# and sort them so every run processes the same files in the same order.
FILES = sorted(
    name for name in os.listdir(SOURCE_GPS_DIR)
    if os.path.isfile(os.path.join(SOURCE_GPS_DIR, name))
)
SOURCE_GPS_PATHS = [os.path.join(SOURCE_GPS_DIR, name) for name in FILES]

# Serialized ML models, resolved relative to the current working directory.
IS_MOVING = os.path.join(
    os.getcwd(), 'ClusteringPipeline/MachineLearningModels/logreg_1.sav')
SPEED_REHAB_MODEL = os.path.join(
    os.getcwd(), 'ClusteringPipeline/MachineLearningModels/reg_rehab_speed_1.sav')

# DBSCAN parameters, forwarded to do_clustering as min_samples / eps.
MIN_SAMPLES = 15
EPS = 130  # NOTE(review): unit depends on do_clustering's coordinates - confirm
MIN_POINT_AMT_RFID = 1

# Output folder for the clustered GPS files.
RFID_GPS_MERGE_DIR = '/Data/Outputs/GPS_Clusters/'

## 1. Set up clustering

In [10]:
def read_data(args):
    """Load one GPS CSV and store the cleaned frame under ``args['gps']``.

    Args:
        args: Pipeline argument dict; ``args['source_path']`` is the CSV to
            read. The file must provide ``time``, ``unixtime`` and
            ``truck_id`` columns.

    Returns:
        The same ``args`` dict, with ``args['gps']`` set to the loaded frame.
    """
    frame = pd.read_csv(args['source_path'], parse_dates=['time'])

    # When several rows share a unixtime, only the last one is kept.
    repeated = frame.unixtime.duplicated(keep='last')
    frame = frame.loc[~repeated, :].reset_index(drop=True).copy()

    # Truck ids use hyphens instead of spaces.
    frame['truck_id'] = frame.truck_id.str.replace(' ', '-')

    args['gps'] = frame
    return args


def clusterize(args):
    """Cluster ``args['gps']`` with ``do_clustering`` and store the result.

    Args:
        args: Pipeline argument dict holding the GPS frame under ``'gps'``
            plus the ``'min_samples'``, ``'eps'``,
            ``'is_moving_log_reg_path'``, ``'speed_rehab_model_path'`` and
            ``'fname'`` settings that are forwarded to ``do_clustering``.

    Returns:
        The same ``args`` dict, with ``args['gps']`` replaced by a copy of
        the ``'df'`` entry of the clustering result.
    """
    gps = args['gps']
    forwarded = (
        'min_samples',
        'eps',
        'is_moving_log_reg_path',
        'speed_rehab_model_path',
        'fname',
    )
    settings = {key: args[key] for key in forwarded}

    # Only the 'df' entry of the result is kept; the rest is dropped as soon
    # as the temporary result goes out of scope.
    args['gps'] = do_clustering(gps, **settings)['df'].copy()
    return args


def export_clustered_gps(args):
    """Write the clustered GPS frame to ``merge_dir/fname`` as CSV.

    Args:
        args: Pipeline argument dict; ``args['gps']`` is the frame to write,
            ``args['merge_dir']`` the output directory and ``args['fname']``
            the output file name.

    Returns:
        None. This is the terminal pipeline stage and hands nothing on.
    """
    out_dir = args['merge_dir']
    # Create the output directory on first use so to_csv does not fail on a
    # fresh machine; an empty merge_dir means the current directory.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # os.path.join gives the right path whether or not merge_dir ends with a
    # separator; plain concatenation would glue the names together.
    fpath = os.path.join(out_dir, args['fname'])
    args['gps'].to_csv(fpath, index=False)

def score(args):
    """Run the full per-file pipeline: read, cluster, export.

    Args:
        args: Pipeline argument dict for one GPS file. It must contain
            ``'source_path'`` plus every key the individual stages read.

    Returns:
        Whatever the last stage returns. ``export_clustered_gps`` returns
        None, so callers currently receive None for every file.
    """
    # basename also handles a bare file name, where rsplit('/', 1)[1] would
    # raise IndexError.
    fname = os.path.basename(args['source_path'])

    # Each stage receives the previous stage's return value.
    for stage in (read_data, clusterize, export_clustered_gps):
        args = stage(args)
    print(f"{fname}: is done!")
    return args

## Arguments preparation

In [11]:
N_ARGS = len(SOURCE_GPS_PATHS)

# One argument dict per source file. Only the path and the file name vary;
# the clustering settings and the output folder are the same for every file.
args = [
    {
        # read_data
        'source_path': src_path,
        # clusterize (DBSCAN)
        'min_samples': MIN_SAMPLES,
        'eps': EPS,
        'is_moving_log_reg_path': IS_MOVING,
        'speed_rehab_model_path': SPEED_REHAB_MODEL,
        'fname': fname,
        # export_clustered_gps
        'merge_dir': RFID_GPS_MERGE_DIR,
    }
    for src_path, fname in zip(SOURCE_GPS_PATHS, FILES)
]

In [12]:
%%time
# Fan the per-file pipeline out over 5 worker processes; each argument dict
# is passed to score as its single positional argument.
with Pool(5) as workers:
    outputs = workers.map(score, args)

XE-5598Z_2020-10-30.csv: is done!
XE-5620S_2020-10-22.csv: is done!
XE-5612R_2020-10-22.csv: is done!
XE-5598Z_2020-10-7.csv: is done!
XE-5612R_2020-10-30.csv: is done!
XE-5612R_2020-10-7.csv: is done!
XE-5630M_2020-10-22.csv: is done!
XE-5680T_2020-10-7.csv: is done!
XE-5620S_2020-10-30.csv: is done!
XE-5628X_2020-10-30.csv: is done!
XE-5598Z_2020-10-22.csv: is done!
XE-5630M_2020-10-30.csv: is done!
XE-5705G_2020-10-30.csv: is done!
CPU times: user 42.7 ms, sys: 29.4 ms, total: 72.1 ms
Wall time: 4.29 s
