# HDBSCAN Profiling Analysis

This notebook profiles the HDBSCAN stop-detection implementation (`nomad.stop_detection.hdbscan`) to identify and analyze its performance bottlenecks.

## Setup

In [1]:
import sys
import pandas as pd
import numpy as np
import time
import cProfile
import pstats
from io import StringIO
import matplotlib.pyplot as plt
from pathlib import Path

# Put the parent of the current working directory at the front of the import path
# (presumably so the local `nomad` package imported below resolves — this depends on
# the directory the notebook is launched from; TODO confirm).
sys.path.insert(0, str(Path.cwd().parent))

from nomad.stop_detection import hdbscan

## Test Data Generator

In [12]:
def generate_test_data(n_points=1000, seed=42):
    """
    Generate a synthetic single-user trajectory with GUARANTEED unique timestamps.

    Parameters
    ----------
    n_points : int
        Number of rows (pings) to generate.
    seed : int
        Seed for the random number generator. A private generator is used, so
        the process-wide NumPy random state is left untouched.

    Returns
    -------
    pd.DataFrame
        One row per ping with columns:
        - ``timestamp``: integer Unix seconds, strictly increasing from 2024-01-01
        - ``x``, ``y``: coordinates drawn uniformly from [-100, 100)
        - ``user_id``: constant 1
    """
    # A private legacy RandomState yields exactly the same stream as
    # np.random.seed(seed) followed by module-level draws, but without
    # reseeding the global generator as a hidden side effect.
    rng = np.random.RandomState(seed)

    # Random spatial coordinates (draw order x, y, intervals is part of the output)
    x = rng.uniform(-100, 100, n_points)
    y = rng.uniform(-100, 100, n_points)

    # Generate UNIQUE timestamps using cumulative sum: every gap is a strictly
    # positive integer, so the running total is strictly increasing.
    base_time = int(pd.Timestamp('2024-01-01').timestamp())
    intervals = rng.randint(60, 300, n_points)  # 60-299 s apart (upper bound exclusive)
    timestamps = base_time + np.cumsum(intervals)

    data = pd.DataFrame({
        'timestamp': timestamps,
        'x': x,
        'y': y,
        'user_id': 1
    })

    print(f"Generated {len(data)} points")

    return data

# Smoke-test the generator on a small sample and preview the first rows
test_data = generate_test_data(n_points=100)
test_data.head()

Generated 100 points


Unnamed: 0,timestamp,x,y,user_id
0,1704067411,-25.091976,-93.714163,1
1,1704067586,90.142861,27.282082,1
2,1704067720,46.398788,-37.128804,1
3,1704067892,19.731697,1.714138,1
4,1704068151,-68.796272,81.513295,1


## 1. cProfile Analysis

Identify which functions consume the most time.

In [13]:
# Generate test data
data_500 = generate_test_data(500, seed=99)

# Profile one end-to-end hdbscan_labels call with cProfile
profiler = cProfile.Profile()
profiler.enable()
try:
    labels = hdbscan.hdbscan_labels(
        data=data_500,
        time_thresh=30,
        min_pts=2,
        min_cluster_size=2,
        dur_min=5,
        traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}
    )
finally:
    # Always stop profiling, even if the call raises; otherwise the profiler
    # would stay enabled and keep recording (or conflict with) later cells.
    profiler.disable()

# Render the stats into an in-memory buffer: file paths stripped to base names,
# sorted by cumulative time, limited to the first 40 entries.
s = StringIO()
ps = pstats.Stats(profiler, stream=s)
ps.strip_dirs()
ps.sort_stats('cumulative')
ps.print_stats(40)

print("="*80)
print("cProfile Results - Top 40 Functions by Cumulative Time")
print("="*80)
print(s.getvalue())

Generated 500 points
cProfile Results - Top 40 Functions by Cumulative Time
         2959533 function calls (2906807 primitive calls) in 0.772 seconds

   Ordered by: cumulative time
   List reduced from 979 to 40 due to restriction <40>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.772    0.386 interactiveshell.py:3514(run_code)
        2    0.000    0.000    0.772    0.386 {built-in method builtins.exec}
        1    0.000    0.000    0.772    0.772 1367433339.py:8(<module>)
        1    0.002    0.002    0.771    0.771 hdbscan.py:624(hdbscan_labels)
        1    0.009    0.009    0.543    0.543 hdbscan.py:219(cluster_hierarchy)
      294    0.010    0.000    0.231    0.001 hdbscan.py:160(_build_border_map)
      456    0.007    0.000    0.144    0.000 hdbscan.py:339(_build_graph_pd)
     1004    0.003    0.000    0.116    0.000 indexing.py:883(__setitem__)
      293    0.000    0.000    0.099    0.000 multi.py:216(new_meth)

## 2. line_profiler Analysis

Line-by-line profiling of suspected O(n²) functions.

In [5]:
# Install line_profiler if needed
!pip install line_profiler -q

In [6]:
# Load the line_profiler IPython extension, which provides the %lprun magic
# used by the line-by-line profiling cells below.
%load_ext line_profiler

In [None]:
# 300-point dataset for line-level profiling; the later %lprun cells reuse data_300.
data_300 = generate_test_data(300, seed=88)

print("="*80)
print("Line-by-line profile of _find_temp_neighbors()")
print("="*80)

# -f names the function to instrument line by line; the statement after it is what
# gets executed under the profiler (a full hdbscan_labels run on data_300).
%lprun -f hdbscan._find_temp_neighbors hdbscan.hdbscan_labels(\
    data=data_300,\
    time_thresh=30,\
    min_pts=2,\
    min_cluster_size=2,\
    dur_min=5,\
    traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\
)

Generated 300 points
Line-by-line profile of _find_temp_neighbors()


Timer unit: 1e-09 s

Total time: 0.00294 s
File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py
Function: _find_temp_neighbors at line 11

Line #      Hits         Time  Per Hit   % Time  Line Contents
    11                                           def _find_temp_neighbors(times, time_thresh, use_datetime):
    12                                               """
    13                                               Find timestamp pairs that are within time threshold.
    14                                           
    15                                               Parameters
    16                                               ----------
    17                                               times : array of timestamps.
    18                                               time_thresh : time threshold for finding what timestamps are close in time.
    19                                               use_datetime : Whether to process timestamps as datetime objec

In [15]:
# Profile _compute_core_distance line by line during a full hdbscan_labels run.
# NOTE: depends on data_300 created in the _find_temp_neighbors profiling cell above,
# so that cell must be run first.
print("="*80)
print("Line-by-line profile of _compute_core_distance()")
print("="*80)

# -f names the function to instrument; the statement after it is executed under the profiler.
%lprun -f hdbscan._compute_core_distance hdbscan.hdbscan_labels(\
    data=data_300,\
    time_thresh=30,\
    min_pts=2,\
    min_cluster_size=2,\
    dur_min=5,\
    traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\
)

Line-by-line profile of _compute_core_distance()


Timer unit: 1e-09 s

Total time: 0.055209 s
File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py
Function: _compute_core_distance at line 53

Line #      Hits         Time  Per Hit   % Time  Line Contents
    53                                           def _compute_core_distance(data, time_pairs, times, use_lon_lat, traj_cols, min_pts = 2):
    54                                               """
    55                                               Calculate the core distance for each ping in data.
    56                                               It gives local density estimate: small core distance → high local density.
    57                                           
    58                                               Parameters
    59                                               ----------
    60                                               data : dataframe
    61                                           
    62                                         

In [16]:
# Profile _build_hdbscan_graphs line by line during a full hdbscan_labels run.
# NOTE: depends on data_300 created in the _find_temp_neighbors profiling cell above,
# so that cell must be run first.
print("="*80)
print("Line-by-line profile of _build_hdbscan_graphs()")
print("="*80)

# -f names the function to instrument; the statement after it is executed under the profiler.
%lprun -f hdbscan._build_hdbscan_graphs hdbscan.hdbscan_labels(\
    data=data_300,\
    time_thresh=30,\
    min_pts=2,\
    min_cluster_size=2,\
    dur_min=5,\
    traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\
)

Line-by-line profile of _build_hdbscan_graphs()


Timer unit: 1e-09 s

Total time: 0.031336 s
File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py
Function: _build_hdbscan_graphs at line 555

Line #      Hits         Time  Per Hit   % Time  Line Contents
   555                                           def _build_hdbscan_graphs(coords, ts_idx, neighbors, core_dist, use_lon_lat):
   556                                               """
   557                                               Computes all graphs required for the HDBSCAN algorithm in one pass.
   558                                           
   559                                               Returns
   560                                               -------
   561                                               edges_sorted : np.recarray
   562                                                   [from, to, weight] sorted descending by weight.
   563                                               d_graph : pd.Series
   564                                