In [None]:
%load_ext autoreload
%autoreload

In [None]:
import pandas as pd
import geopandas as gpd
import shapely
import nomad.io.base as loader
import numpy as np
import nomad.stop_detection.hdbscan as HDBSCAN
import nomad.stop_detection.lachesis as LACHESIS
import nomad.stop_detection.ta_dbscan as TADBSCAN
import geopandas as gpd
import nomad.visit_attribution as va
import nomad.filters as filters
from nomad.contact_estimation import overlapping_visits, compute_visitation_errors, compute_precision_recall_f1
import matplotlib.pyplot as plt

In [None]:
# Column-name mapping handed to the nomad calls below; every role maps to a
# column of the same name.
traj_cols = {role: role for role in ('uid', 'x', 'y', 'timestamp')}

# Both tables are read with identical reader options.
parquet_opts = dict(format="parquet", traj_cols=traj_cols, parse_dates=True)

diaries_df = loader.from_file("../../nomad/data/diaries", **parquet_opts)
sparse_df = loader.from_file("../../nomad/data/sparse_traj/", **parquet_opts)

In [None]:
# Rescale the coordinates of both tables in place:
#     x -> (x - 4265699) / 15
#     y -> (y + 4392976) / 15
# NOTE(review): this was originally described as a reprojection "from gc_coords
# to web mercator" -- confirm the direction and the signs of the offsets.
# The cell overwrites its inputs, so running it a second time rescales again.
for frame in (sparse_df, diaries_df):
    frame.loc[:, 'x'] = (frame['x'] - 4265699) / 15
    frame.loc[:, 'y'] = (frame['y'] + 4392976) / 15

In [None]:
# Pick two users (the 1st and the 11th distinct uid found in the diaries) and
# pull out their sparse trajectories and diaries.
diary_uids = diaries_df['uid'].unique()
user1, user2 = diary_uids[0], diary_uids[10]

sparse1 = sparse_df[sparse_df['uid'].eq(user1)]
sparse2 = sparse_df[sparse_df['uid'].eq(user2)]

diary1 = diaries_df[diaries_df['uid'].eq(user1)]
diary2 = diaries_df[diaries_df['uid'].eq(user2)]

In [None]:
# q statistic of the sparse trajectories, computed by nomad.filters.q_stats
# using the uid and timestamp columns.
completeness_df = filters.q_stats(
    sparse_df,
    user_id=traj_cols['uid'],
    timestamp=traj_cols['timestamp'],
)

In [None]:
# 50th, 70th and 85th percentiles of the q statistic
completeness_df['q_stat'].quantile(q=[0.5, 0.7, 0.85])

In [None]:
# Distribution of the q statistic as a single box plot
fig, ax = plt.subplots()
completeness_df.boxplot(column="q_stat", ax=ax)
ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
# Lachesis settings for the single-user run
DUR_MIN, DT_MAX, DELTA_ROAM = 5, 60, 100

# Arguments shared by the stop-table call and the label call, so both are
# guaranteed to use the same trajectory and thresholds.
lachesis_kwargs = dict(traj=sparse1,
                       dur_min=DUR_MIN,
                       dt_max=DT_MAX,
                       delta_roam=DELTA_ROAM,
                       traj_cols=traj_cols)

stop_table_lachesis = LACHESIS.lachesis(**lachesis_kwargs,
                                        keep_col_names=True,
                                        complete_output=True)

labels_lachesis = LACHESIS._lachesis_labels(**lachesis_kwargs)
labels_lachesis.name = 'cluster'

# Attribute the detected stops to locations
pred_lachesis = va.point_in_polygon(
    traj=sparse1,
    labels=labels_lachesis,
    stop_table=stop_table_lachesis,
    traj_cols=traj_cols,
    is_datetime=False,
    is_long_lat=False,
)

# Number of distinct locations among the attributed stops
pred_lachesis.location.nunique()

# Not run: `va.majority_poi(...)` with the same arguments was drafted here as an
# alternative attribution step, together with `truth_df = diary1`.

In [None]:
# Arguments shared by the HDBSCAN stop-table call and the label call
hdbscan_kwargs = dict(traj=sparse1,
                      traj_cols=traj_cols,
                      time_thresh=60,
                      min_pts=2,
                      min_cluster_size=3)

stop_table_hdbscan = HDBSCAN.st_hdbscan(**hdbscan_kwargs)

labels_hdbscan = HDBSCAN.hdbscan_labels(**hdbscan_kwargs)
labels_hdbscan.name = 'cluster'

# Attribute the detected stops to locations
pred_hdbscan = va.point_in_polygon(
    traj=sparse1,
    labels=labels_hdbscan,
    stop_table=stop_table_hdbscan,
    traj_cols=traj_cols,
    is_datetime=False,
    is_long_lat=False,
)

# Number of distinct locations among the attributed stops
pred_hdbscan.location.nunique()

In [None]:
# Number of entries in the HDBSCAN attribution result for user 1
len(pred_hdbscan)

In [None]:
# Number of distinct locations recorded in user 1's diary
diary1['location'].nunique()

In [None]:
# Display the sparse trajectory table (all users)
sparse_df

In [None]:
# Score the HDBSCAN visits of user 1 against that user's diary.

# 1) Pair predicted visits with diary visits (locations are not required to match).
overlaps = overlapping_visits(left=pred_hdbscan, right=diary1, match_location=False)

# Inputs common to both scoring functions
scoring_inputs = dict(overlaps=overlaps, true_visits=diary1)

# 2) Visitation errors (missed, merged, split)
errors = compute_visitation_errors(**scoring_inputs)
print("Visitation Errors:", errors)

# 3) Precision, recall and F1
prf1 = compute_precision_recall_f1(pred_visits=pred_hdbscan, **scoring_inputs)
print("Precision / Recall / F1:", prf1)

In [None]:
# Display the HDBSCAN attribution result for user 1.
# `stop_table_hdbscan`, `labels_hdbscan` and `pred_hdbscan` were already computed
# from `sparse1` with time_thresh=60, min_pts=2, min_cluster_size=3 in the HDBSCAN
# cell earlier in the notebook; re-running both clustering passes here would only
# reproduce the same table, so the stored result is shown instead.
pred_hdbscan

In [None]:
# Benchmark the three stop-detection algorithms against the diaries, one row of
# metrics per (user, algorithm) pair, collected in `metrics_df`.
traj_cols = {'uid':'uid',
             'x':'x',
             'y':'y',
             'timestamp':'timestamp'}
stop_detection_algos = ['lachesis', 'ta-dbscan', 'hdbscan']

# Temporal-DBSCAN / HDBSCAN settings
TIME_THRESH=60
DIST_THRESH=5
MIN_PTS=2
MIN_CLUSTER_SIZE=3

# Lachesis settings
DUR_MIN=5
DT_MAX=60
DELTA_ROAM=3

# Rows are accumulated in a list and turned into a DataFrame once at the end;
# concatenating onto a DataFrame inside the loop copies it on every iteration.
metric_rows = []

for user in diaries_df.uid.unique():
    # `sparse_df` and `diaries_df` were already rescaled in place by the
    # reprojection cell near the top of the notebook, so the per-user slices are
    # used as they are. Rescaling them again would shift and shrink the
    # coordinates a second time. `.copy()` makes each slice an independent frame.
    sparse = sparse_df.loc[sparse_df['uid'] == user].copy()
    truth = diaries_df.loc[diaries_df['uid'] == user].copy()

    for algo in stop_detection_algos:
        # Each branch builds one kwargs dict and feeds it to both the stop-table
        # function and the label function, so the two can never disagree on
        # their parameters.
        if algo == 'lachesis':
            algo_kwargs = dict(traj=sparse,
                               dur_min=DUR_MIN,
                               dt_max=DT_MAX,
                               delta_roam=DELTA_ROAM,
                               traj_cols=traj_cols)
            stop_table = LACHESIS.lachesis(**algo_kwargs)
            labels = LACHESIS._lachesis_labels(**algo_kwargs)
        elif algo == 'ta-dbscan':
            algo_kwargs = dict(data=sparse,
                               time_thresh=TIME_THRESH,
                               dist_thresh=DIST_THRESH,
                               min_pts=MIN_PTS,
                               traj_cols=traj_cols)
            stop_table = TADBSCAN.temporal_dbscan(**algo_kwargs)
            labels = TADBSCAN._temporal_dbscan_labels(**algo_kwargs)
        elif algo == 'hdbscan':
            algo_kwargs = dict(traj=sparse,
                               traj_cols=traj_cols,
                               time_thresh=TIME_THRESH,
                               min_pts=MIN_PTS,
                               min_cluster_size=MIN_CLUSTER_SIZE)
            stop_table = HDBSCAN.st_hdbscan(**algo_kwargs)
            labels = HDBSCAN.hdbscan_labels(**algo_kwargs)
        else:
            # Fail loudly instead of silently treating a typo as 'hdbscan'.
            raise ValueError(f"Unknown stop detection algorithm: {algo!r}")

        labels.name = 'cluster'

        # Attribute the detected stops to locations
        pred = va.point_in_polygon(traj=sparse,
                                   labels=labels,
                                   stop_table=stop_table,
                                   traj_cols=traj_cols,
                                   is_datetime=False,
                                   is_long_lat=False)

        # Step 1: Compute overlaps
        overlaps = overlapping_visits(left=pred,
                                      right=truth,
                                      match_location=False)

        # Step 2: Compute visitation errors (missed, merged, split)
        errors = compute_visitation_errors(overlaps=overlaps,
                                           true_visits=truth)

        # Step 3: Compute precision, recall, and F1
        prf1 = compute_precision_recall_f1(overlaps=overlaps,
                                           pred_visits=pred,
                                           true_visits=truth)

        metric_rows.append({**errors, **prf1, 'user': user, 'algorithm': algo})

metrics_df = pd.DataFrame(metric_rows)

In [None]:
# One box plot per metric, grouped by stop-detection algorithm.
metrics = ["missed_fraction", "merged_fraction", "split_fraction", "precision", "recall", "f1"]

for metric in metrics:
    # Pass the axes explicitly: with `by=...` and no `ax`, pandas opens its own
    # figure, which would leave a separately created figure empty.
    fig, ax = plt.subplots()
    metrics_df.boxplot(column=metric, by='algorithm', ax=ax)
    ax.set_title(f'{metric} by Algorithm')
    fig.suptitle('')  # drop the automatic "Boxplot grouped by ..." super-title
    ax.set_xlabel('Algorithm')
    ax.set_ylabel(metric)
    ax.grid(True)
    fig.tight_layout()

plt.show()