# About
* **Author**: Adil Rashitov (adil.rashitov.98@gmail.com)
* **Created at**: 20.01.2022


In [1]:
# Imports / Configs / Global vars

# Import of native python tools
import os
import json
from functools import reduce

# Import of base ML stack libs
import numpy as np
import sklearn as sc

# Visualization libraries
# import plotly.express as px

# Logging configuration
import logging
logging.basicConfig(format='[ %(asctime)s ][ %(levelname)s ]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logger = logging.getLogger()
logger.setLevel(logging.INFO)


# IPython configs
# `display` / `HTML` come from the public `IPython.display` module: importing
# them from `IPython.core.display` is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell

# Stretch the notebook container to the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))
# Echo the value of every top-level expression of a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'

# Pandas configs
import pandas as pd
import geopandas as gpd
pd.options.display.max_rows = 350
pd.options.display.max_columns = 250

# Jupyter configs
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

# Configure project PATH
from pathlib import Path
import sys

# Parent directory of the kernel's working directory (presumably the project
# root — confirm against the repository layout). `Path.parent` is used instead
# of splitting on '/' so the lookup also works with Windows path separators.
PROJECT_PATH = str(Path.cwd().parent)

# Make modules under PROJECT_PATH importable; the guard keeps `sys.path`
# free of duplicates when the cell is re-run.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

  from IPython.core.display import display, HTML




# Data

In [2]:
# Grid-search results: show the row(s) that reach the highest `fbeta_score`
logging.info("The best performed model according `fbeta_score`:")
df = pd.read_csv("../data/grid_search_results.csv")
# Drop the "factory_" prefix from the model names
df["name"] = df["name"].str.replace("factory_", "")
top_fbeta_score = df["fbeta_score"].max()
df[df["fbeta_score"] == top_fbeta_score]

[ 03/06/2023 12:22:07 PM ][ INFO ]: The best performed model according `fbeta_score`:


Unnamed: 0,name,max_velocity_hard_limit,eps,min_samples,recall,precision,beta,fbeta_score,status,status_details,min_duration_sec
3439,stcm,6.0,20.0,,0.372881,0.367521,1.0,0.370182,success,,100.0


In [3]:
# 20 best grid-search runs by `fbeta_score`, printed with " | " between fields
top_runs = df.sort_values(by=["fbeta_score"], ascending=False).head(20)
top_runs_csv = top_runs.to_csv(index=False)
print(top_runs_csv.replace(",", " | "))

name | max_velocity_hard_limit | eps | min_samples | recall | precision | beta | fbeta_score | status | status_details | min_duration_sec
stcm | 6.0 | 20.0 |  | 0.3728813559322034 | 0.3675213675213675 | 1.0 | 0.3701819604774017 | success |  | 100.0
stcm | 9.0 | 20.0 |  | 0.3870056497175141 | 0.3535620052770448 | 1.0 | 0.3695286788313086 | success |  | 100.0
stcm | 7.0 | 20.0 |  | 0.3785310734463277 | 0.3598901098901099 | 1.0 | 0.3689753021544929 | success |  | 100.0
stcm | 8.0 | 20.0 |  | 0.384180790960452 | 0.3546666666666667 | 1.0 | 0.3688342407373423 | success |  | 100.0
stcm | 6.0 | 20.0 |  | 0.3728813559322034 | 0.361344537815126 | 1.0 | 0.3670223084384093 | success |  | 95.26315789473684
stcm | 6.0 | 20.0 |  | 0.3728813559322034 | 0.3603351955307262 | 1.0 | 0.3665009362691289 | success |  | 90.52631578947368
stcm | 9.0 | 20.0 |  | 0.3870056497175141 | 0.348051948051948 | 1.0 | 0.3664966410796458 | success |  | 95.26315789473684
stcm | 9.0 | 20.0 |  | 0.3870056497175141 | 0.347150

In [5]:
# Load the GPS records and the route plans from the intermediate data folder
gps_records = pd.read_parquet("../data/02_intermediate/gps_records.parquet")
route_plans = pd.read_parquet("../data/02_intermediate/route_plan.parquet")

# Calendar days (rendered as strings) present in each dataset
gps_days = pd.to_datetime(gps_records["datetime"]).dt.date.astype(str).unique()
plan_days = pd.to_datetime(route_plans["date"]).dt.date.astype(str).unique()

# Days that appear in both the plan and the GPS records
overlapping_dates = np.intersect1d(list(gps_days), list(plan_days))

def extract_overlapping_dates(
        X: pd.DataFrame,
        column: str,
        overlapping_dates: list[str],
        ) -> pd.DataFrame:
    """Keep only the rows of ``X`` whose date is listed in ``overlapping_dates``.

    Args:
        X: Frame to filter; it is not modified.
        column: Name of the column holding date / datetime values. It is
            parsed with ``pd.to_datetime`` and reduced to its calendar date.
        overlapping_dates: Dates to keep, compared against the string form
            of the calendar date (e.g. ``"2022-05-10"``).

    Returns:
        The matching rows with a fresh ``0..n-1`` index.
    """
    day_labels = pd.to_datetime(X[column]).dt.date.astype(str)
    keep_row = day_labels.isin(overlapping_dates)
    return X[keep_row].reset_index(drop=True)


# Restrict both datasets to the days they have in common (working on copies)
route_plans = extract_overlapping_dates(
    X=route_plans.copy(),
    column="date",
    overlapping_dates=overlapping_dates,
)
gps_records = extract_overlapping_dates(
    X=gps_records.copy(),
    column="datetime",
    overlapping_dates=overlapping_dates,
)


def extraction_overlapping_route_ids(route_plans, gps_records):
    """Keep only the routes that exist in both the route plans and the GPS records.

    A ``route_id`` column is derived for the plans as
    ``"<date> :: <plate_no>"`` and matched against the ``route_id`` column
    that ``gps_records`` already carries. Neither input frame is modified.

    Args:
        route_plans: Plan rows with ``date`` and ``plate_no`` columns; both
            must support string concatenation.
        gps_records: GPS rows with a ``route_id`` column.

    Returns:
        Tuple ``(gps_records, route_plans)`` restricted to the shared route
        ids; the returned plans include the derived ``route_id`` column.
    """
    # `assign` builds a new frame, so the caller's `route_plans` is left
    # untouched instead of silently gaining a `route_id` column.
    route_plans = route_plans.assign(
        route_id=route_plans["date"] + " :: " + route_plans["plate_no"],
    )
    routes = np.intersect1d(
        list(gps_records["route_id"].unique()),
        list(route_plans["route_id"].unique()),
    )
    shared_routes = pd.DataFrame({"route_id": routes})

    # Inner joins on an explicit key act as a filter on the shared route ids
    gps_records = pd.merge(gps_records, shared_routes, on="route_id", how="inner")
    route_plans = pd.merge(route_plans, shared_routes, on="route_id", how="inner")
    return gps_records, route_plans


gps_records, route_plans = extraction_overlapping_route_ids(route_plans, gps_records)

# Split the GPS records into one frame per vehicle; from here on
# `gps_records` is a list of DataFrames, each with a fresh index.
vehicle_plates = gps_records["plate_no"].unique()
gps_records = [
    gps_records[gps_records["plate_no"] == plate].reset_index(drop=True)
    for plate in vehicle_plates
]

# Main

In [6]:
# Definition of factory function
from gps_activity import ActivityExtractionSession
from gps_activity.extraction.factory.preprocessing import PreprocessingFactory as ActivityPreprocessingFactory
from gps_activity.extraction.factory.fragmentation import VelocityFragmentationFactory
from gps_activity.extraction.factory.clustering import FDBSCANFactory
from gps_activity.extraction.factory.clustering import STCMFactory


def factory_stcm(
        max_velocity_hard_limit: float,
        eps: float,
        min_duration_sec: float,
) -> ActivityExtractionSession:
    """Assemble an ``ActivityExtractionSession`` that clusters with STCM.

    The session is composed of a preprocessing pipeline, a velocity-based
    fragmentation pipeline and an STCM clustering pipeline, each created
    through its factory.

    Args:
        max_velocity_hard_limit: Forwarded to
            ``VelocityFragmentationFactory.factory_pipeline``.
        eps: Forwarded to ``STCMFactory.factory_pipeline``.
        min_duration_sec: Forwarded to ``STCMFactory.factory_pipeline``.

    Returns:
        The configured ``ActivityExtractionSession``.
    """
    vehicle_id_column = "plate_no"

    # Source columns of the raw GPS frame and the CRS conversion to apply
    preprocessing_step = ActivityPreprocessingFactory.factory_pipeline(
        source_lat_column="lat",
        source_lon_column="lon",
        source_datetime="datetime",
        source_vehicle_id=vehicle_id_column,
        source_crs="EPSG:4326",
        target_crs="EPSG:2326",
    )
    fragmentation_step = VelocityFragmentationFactory.factory_pipeline(
        max_velocity_hard_limit=max_velocity_hard_limit,
    )
    clustering_step = STCMFactory.factory_pipeline(
        source_vehicle_id_column=vehicle_id_column,
        eps=eps,
        min_duration_sec=min_duration_sec,
    )

    return ActivityExtractionSession(
        preprocessing=preprocessing_step,
        fragmentation=fragmentation_step,
        clustering=clustering_step,
    )

In [7]:
# Linker definition
from gps_activity.linker.factory import PreprocessingFactory as LinkerPreprocessingFactory
from gps_activity.models import DataFramePivotFields
from gps_activity.linker.factory import ClusterAggregationFactory
from gps_activity.linker.factory import ClusterAggregationFactory
from gps_activity.linker.factory import JoinValidatorFactory
from gps_activity.linker.factory import SpatialJoinerFactory
from gps_activity.linker.factory import CoverageStatisticsFactory
from gps_activity import ActivityLinkageSession


def factory_linker():
    """Assemble the ``ActivityLinkageSession`` that links GPS records to route plans.

    The session is built from six components, each created through its
    ``factory_pipeline`` constructor: GPS preprocessing, plan preprocessing,
    cluster aggregation, spatial join, join validation and coverage
    statistics.

    Returns:
        The configured ``ActivityLinkageSession``.
    """
    # Coordinate reference systems: inputs are declared as EPSG:4326 and
    # converted to EPSG:2326.
    crs_source = "EPSG:4326"
    crs_target = "EPSG:2326"

    # --- GPS side: column mapping and preprocessing -------------------------
    gps_fields = DataFramePivotFields(
        source_lat="lat",
        source_lon="lon",
        source_datetime="datetime",
        source_vehicle_id="plate_no",
    )
    gps_preprocessor = LinkerPreprocessingFactory.factory_pipeline(
        source_lat_column=gps_fields.source_lat,
        source_lon_column=gps_fields.source_lon,
        source_datetime=gps_fields.source_datetime,
        source_vehicle_id=gps_fields.source_vehicle_id,
        source_crs=crs_source,
        target_crs=crs_target,
        generate_primary_key_for="gps",
        source_composite_keys=[
            gps_fields.source_vehicle_id,
            gps_fields.source_datetime,
            gps_fields.source_lat,
            gps_fields.source_lon,
        ],
    )

    # --- Plan side: column mapping and preprocessing ------------------------
    plan_fields = DataFramePivotFields(
        source_lat="lat",
        source_lon="lng",
        source_datetime="date",
        source_vehicle_id="plate_no",
        plans_pk="service_point_id",
    )
    plan_preprocessor = LinkerPreprocessingFactory.factory_pipeline(
        source_lat_column=plan_fields.source_lat,
        source_lon_column=plan_fields.source_lon,
        source_datetime=plan_fields.source_datetime,
        source_vehicle_id=plan_fields.source_vehicle_id,
        source_crs=crs_source,
        target_crs=crs_target,
        generate_primary_key_for="plan",
        source_composite_keys=[plan_fields.plans_pk],
    )

    # --- Cluster aggregation (configured with the GPS column mapping) -------
    cluster_aggregator = ClusterAggregationFactory.factory_pipeline(
        source_lat_column=gps_fields.source_lat,
        source_lon_column=gps_fields.source_lon,
        source_datetime=gps_fields.source_datetime,
        source_vehicle_id=gps_fields.source_vehicle_id,
        source_crs=crs_source,
        target_crs=crs_target,
    )

    # --- Spatial join, its validation and coverage statistics ---------------
    joiner = SpatialJoinerFactory.factory_pipeline(how="inner", max_distance=80)
    join_validator = JoinValidatorFactory.factory_pipeline(
        max_days_distance=1,
        ensure_vehicle_overlap=False,
    )
    coverage_stats = CoverageStatisticsFactory.factory_pipeline()

    return ActivityLinkageSession(
        gps_preprocessor=gps_preprocessor,
        plan_preprocessor=plan_preprocessor,
        cluster_aggregator=cluster_aggregator,
        spatial_joiner=joiner,
        spatial_validator=join_validator,
        coverage_stats_extractor=coverage_stats,
    )

In [8]:
# Metrics
from gps_activity import ActivityMetricsSession

def factory_metrics():
    """Create an ``ActivityMetricsSession`` with its default configuration."""
    metrics_session = ActivityMetricsSession()
    return metrics_session

In [9]:
from multiprocess import Pool


def process_and_estimate(clustering_specs: dict, gps: list[pd.DataFrame], plan: pd.DataFrame):
    """Cluster GPS records and link the resulting clusters to the route plan.

    Args:
        clustering_specs: Mapping with the keys ``"factory_function"``
            (callable returning an object with a ``predict`` method) and
            ``"kwargs"`` (keyword arguments passed to that callable).
        gps: Iterable of GPS frames; each one is clustered independently on
            a copy, so the inputs are not modified.
        plan: Route plan frame passed to the linker unchanged.

    Returns:
        The object returned by the linker's ``transform`` call (the linkage
        data container). No performance metrics are computed here.

    Raises:
        KeyError: If ``clustering_specs`` lacks ``"factory_function"`` or
            ``"kwargs"``.
        ValueError: From ``pd.concat`` when ``gps`` is empty.
    """
    # 1. Build the clustering instance from its factory and keyword arguments
    factory_func = clustering_specs["factory_function"]
    clustering_instance = factory_func(**clustering_specs["kwargs"])

    # 2. Build the linker
    linker = factory_linker()

    # 3. Cluster every frame separately, then stack the results
    clustered_gps = pd.concat(
        [clustering_instance.predict(gps_part.copy()) for gps_part in gps]
    ).reset_index(drop=True)
    clustered_gps["cluster_id"] = clustered_gps["cluster_id"].astype(int)

    # 4. Link the clustered GPS records with the plan
    return linker.transform(
        gps=clustered_gps,
        plan=plan,
    )


In [10]:
# STCM specification using the parameter values of the best-scoring
# grid-search row shown above (6 / 20 / 100)
stcm_spec = {
    "id": 1,
    "factory_function": factory_stcm,
    "kwargs": dict(
        max_velocity_hard_limit=6,
        eps=20,
        min_duration_sec=100,
    ),
}

linked_data = process_and_estimate(
    clustering_specs=stcm_spec,
    gps=gps_records,
    plan=route_plans,
)


In [11]:
# Take the `full_gps_plan_join` table from the linkage result; the cells below
# narrow it down column- and row-wise before plotting.
gps_plan = linked_data.full_gps_plan_join

In [12]:
from keplergl import KeplerGl

In [13]:
from gps_activity.models import DataFramePivotFields

In [14]:
# Pivot-field container created with its default arguments.
# NOTE(review): `fields` is not referenced by any later visible cell — confirm
# whether it is still needed.
fields = DataFramePivotFields()

In [15]:
# Columns of the GPS <-> plan join kept for inspection, in display order
columns = [
    # GPS record columns
    "datetime",
    "lat_gps",
    "lon_gps",
    "plate_no",
    "route_id",
    "x",
    "y",
    "unixtime",
    "computed_velocity",
    "is_clustering_candidate",
    "cluster_id",
    "gps_primary_key",
    "date",
    # cluster columns
    "lon_cluster",
    "lat_cluster",
    "cluster_primary_key",
    # plan key and spatial-join distances
    "plans_primary_key",
    "sjoin_temporal_dist",
    "sjoin_spatial_dist",
    "sjoin_overall_dist",
    # plan columns
    "plate_no_plan",
    "date_plan",
    "lon_plan",
    "lat_plan",
]
gps_plan = gps_plan.loc[:, columns]

In [16]:
import pandas as pd
import pandera as pa


# Expected dtype of every column handed to the map. Columns marked
# `nullable=True` may contain missing values; `coerce=True` on the schema
# asks pandera to cast columns to the declared dtypes during validation.
schema = pa.DataFrameSchema(
    columns={
        # GPS record columns
        "datetime": pa.Column(str, coerce=True),
        "lat_gps": pa.Column(float, coerce=True),
        "lon_gps": pa.Column(float, coerce=True),
        "plate_no": pa.Column(str, coerce=True),
        "computed_velocity": pa.Column(float),
        "is_clustering_candidate": pa.Column(bool),
        "cluster_id": pa.Column(int),
        "gps_primary_key": pa.Column(str),
        "date": pa.Column(str),
        # cluster columns
        "lon_cluster": pa.Column(float, nullable=True),
        "lat_cluster": pa.Column(float, nullable=True),
        "cluster_primary_key": pa.Column(str, nullable=True),
        # plan key, join distances and plan coordinates
        "plans_primary_key": pa.Column(str, nullable=True),
        "sjoin_temporal_dist": pa.Column(float, nullable=True),
        "sjoin_spatial_dist": pa.Column(float, nullable=True),
        "sjoin_overall_dist": pa.Column(float, nullable=True),
        "lon_plan": pa.Column(float, nullable=True),
        "lat_plan": pa.Column(float, nullable=True),
    },
    coerce=True,
)

In [17]:
# Keep only the columns declared in the schema (in schema order) and validate
schema_columns = list(schema.columns)
gps_plan = schema.validate(gps_plan[schema_columns])

In [18]:
# Narrow the map data down to a single vehicle on a single day
is_target_vehicle = gps_plan["plate_no"] == "Vehicle #1"
is_target_day = gps_plan["date"] == "2022-05-10"
gps_plan = gps_plan[is_target_vehicle & is_target_day]

In [None]:
import json
from keplergl import KeplerGl


# Load the saved Kepler.gl map configuration. The encoding is set explicitly:
# the (commented-out) export cell writes this file as UTF-8 with
# `ensure_ascii=False`, so reading it with the platform default encoding
# could fail or garble non-ASCII characters.
with open('map_configuration.json', 'r', encoding='utf-8') as f:
    config = json.load(f)


# Render the filtered GPS <-> plan join on the map with that configuration
kepler = KeplerGl(data={"gps_plan": gps_plan}, height=900, config=config)
kepler

In [None]:
# import json


# with open('map_configuration.json', 'w', encoding='utf-8') as f:
#     json.dump(kepler.config, f, ensure_ascii=False, indent=4)