# **Tutorial 4: Attributing visits and detecting homes**

In this notebook we will explore how to attribute stops to building geometries in two ways, and use the output to detect user homes and workplaces.

### Download the data
Download the file [IC2S2-2025.zip](https://drive.google.com/file/d/1wk3nrNsmAiBoTtWznHjjjPkmWAZfxk0P/view?usp=drive_link) and extract it to this folder to obtain the sample trajectory data used in this tutorial.

In [1]:
import geopandas as gpd
from shapely.geometry import Polygon, box, Point
import matplotlib.pyplot as plt

import nomad.io.base as loader
from nomad import filters
from nomad.stop_detection.viz import plot_pings

ModuleNotFoundError: No module named 'nomad'

In [None]:
# Building footprints, indexed by their building id.
poi_table = gpd.read_file('garden_city.gpkg').set_index('building_id')

# Fill colour used for each building type on the maps below.
color_map = {"home":"#86ceeb", "park":"#90ee90", "retail":"#d3d3d3", "work":"#c9a0dc"}

# Bounding rectangle of all buildings; used as the spatial filter when sampling.
city_bounds = box(*poi_table.total_bounds)

# Maps the canonical trajectory column roles to the column names in this dataset.
tc = {"user_id": "gc_identifier","timestamp": "unix_ts","x":"dev_x", "y":"dev_y"}

# Pings of a single user, restricted in space (`within`) and by column values (`filters`).
traj = loader.sample_from_file(
    'gc_data_long/',
    format='parquet',
    users=['confident_aryabhata'],
    within=city_bounds, # spatial filters
    filters=[("ha", "<", 30), ("date", "<", '2024-01-04')], # regular filters
    data_crs="EPSG:3857",
    traj_cols=tc,
)

# Map of the buildings (coloured by type) with the sampled pings drawn on top.
fig, ax = plt.subplots(figsize=(5,5))

ax.set_axis_off()
poi_table.plot(ax=ax, color=poi_table["type"].map(color_map), edgecolor="black")

plot_pings(traj, ax=ax, point_color='black',
           radius="ha", circle_alpha=0.06, circle_color="red", # for horizontal accuracy
           s=5, alpha=0.4, # for pings
           traj_cols=tc)

# Empty proxy artists: one square marker per building type, drawn only so the
# legend has an entry for each colour. Plotted on `ax` explicitly rather than on
# whatever pyplot considers the current axes.
for building_type, type_color in color_map.items():
    ax.plot([], [], marker="s", ls="", color=type_color, label=building_type)
ax.legend(loc="upper center", ncol=4, fontsize=10, framealpha=1)

# Set the title before tight_layout so the layout pass accounts for it.
ax.set_title("Homes and workplaces are in the \n inner two rings, can we detect them?")
fig.tight_layout()
plt.show()

In [None]:
import numpy as np
from nomad import filters

# Stop table for the same single user as above.
stops = loader.sample_from_file(
    "gc_data_stops/",
    format='parquet',
    users=['confident_aryabhata'], # <<<< single user
    user_id="gc_identifier",
)

# Zoned start time of every stop, built from the `start_timestamp` and
# `tz_offset` columns.
stop_start_local = filters.to_zoned_datetime(stops['start_timestamp'], stops['tz_offset'])
stops['datetime'] = stop_start_local

# Two-day window of stops (bounds are given with a -04:00 offset).
stops_sample = stops.query("'2024-01-05T00-0400' <= datetime <='2024-01-07T00-0400'")

## Attribute visits from centroid of stops

In [None]:
import nomad.visit_attribution.visit_attribution as visits

# One building id per stop, using the stop's (x, y) coordinates and the
# 'centroid' method of `point_in_polygon`.
centroid_location_ids = visits.point_in_polygon(
    data=stops,
    poi_table=poi_table,
    method='centroid',
    x='x',
    y='y',
    data_crs='EPSG:3857',
    max_distance=15,  # TRY max_distance = 0
)
stops["location_id"] = centroid_location_ids

# The 15 buildings with the most attributed stops.
stops["location_id"].value_counts().head(15)

In the presence of **noise**, the pings of a single stop may be scattered and even **split between several buildings**. `nomad.visit_attribution` implements the `majority` method, which is more robust in such cases: the location assigned to a stop is the **"majority vote" of the pings that make up the stop**. For this we need the raw data with cluster labels.

In [None]:
import nomad.stop_detection.lachesis as LACHESIS
from nomad.stop_detection.utils import summarize_stop

# Column mapping, now also naming the accuracy ("ha") and "date" columns.
tc = {"user_id": "gc_identifier", "timestamp": "unix_ts", "x": "dev_x", "y": "dev_y", "ha":"ha", "date":"date"}

# Raw pings of the single user, labelled with the stop cluster they belong to.
traj = loader.sample_from_file(
    "gc_data_long/",
    format='parquet',
    users=['confident_aryabhata'],
    traj_cols=tc,
)
traj["cluster"] = LACHESIS.lachesis_labels(
    traj,
    delta_roam=30,
    dt_max=240,
    complete_output=True,
    traj_cols=tc,
)

# Attribute the pings to buildings with the 'majority' method.
traj["location_id"] = visits.point_in_polygon(
    data=traj,
    poi_table=poi_table,
    method='majority',
    x='dev_x',
    y='dev_y',
    data_crs='EPSG:3857',
    max_distance=15,  # try switching to max_distance = 0
)


def _summarize_cluster(cluster_pings):
    """Collapse the pings of one cluster into a single stop row via `summarize_stop`."""
    return summarize_stop(
        cluster_pings,
        complete_output=True,
        keep_col_names=False,
        passthrough_cols=['location_id', 'gc_identifier'],
        traj_cols=tc,
    )


# Rows labelled -1 are left out (presumably pings not assigned to any stop -- confirm
# against the lachesis_labels documentation); every remaining cluster becomes one stop.
clustered_pings = traj[traj.cluster != -1]
stops_maj = clustered_pings.groupby('cluster', as_index=False).apply(
    _summarize_cluster, include_groups=False
)

stops_maj.location_id.value_counts().head(15)

## Detecting homes from a stop table with locations

In [None]:
# Reload the stop table; no `users=` filter is passed this time.
stops = loader.from_file("gc_data_stops/", format='parquet', user_id="gc_identifier")

# Zoned start time of each stop, built from `start_timestamp` and `tz_offset`.
stop_start_local = filters.to_zoned_datetime(stops['start_timestamp'], stops['tz_offset'])
stops['datetime'] = stop_start_local

# Centroid-based attribution of every stop to a building.
stops["location_id"] = visits.point_in_polygon(
    data=stops,
    poi_table=poi_table,
    method='centroid',
    x='x',
    y='y',
    data_crs='EPSG:3857',
    max_distance=10,
)

Let's plot the most visited locations, measured by the total time spent at each building (keep in mind that some stops are split!)

In [None]:
import matplotlib.pyplot as plt

# Total stop duration per building, integer-divided by 60 to match the "_hrs"
# column name (assumes `duration` is in minutes -- confirm against the stop table).
# Selecting the column before aggregating yields a Series, so the assignment below
# does not rely on pandas unwrapping a one-column DataFrame.
visit_time_hrs = stops.groupby("location_id")["duration"].sum() // 60

# Buildings with no attributed stops get 0 rather than NaN; geopandas does not draw
# geometries whose plotted column is missing, which would leave holes in the map.
poi_table['total_visit_time_hrs'] = visit_time_hrs.reindex(poi_table.index, fill_value=0)

fig, ax1 = plt.subplots(figsize=(5,5))
ax1.set_axis_off()

poi_table.plot(ax=ax1, column='total_visit_time_hrs', cmap='Reds', edgecolor='black', linewidth=0.75, legend=True, legend_kwds={'shrink': 0.75})
# Title the map axes explicitly; the colour bar adds a second axes to the figure.
ax1.set_title("Total visit time (h) by building")
plt.show()

### Homes ~ **most recurrent nighttime location** (use local datetimes, not Unix timestamps!)

In [None]:
# Replace the unix start/end columns with zoned datetimes built by
# `filters.to_zoned_datetime`. The guard keeps the cell re-runnable: once the unix
# columns have been dropped, running it again is a no-op instead of a KeyError.
unix_cols = ['start_timestamp', 'end_timestamp']
if set(unix_cols).issubset(stops.columns):
    stops['start_datetime'] = filters.to_zoned_datetime(stops['start_timestamp'], stops['tz_offset'])
    stops['end_datetime'] = filters.to_zoned_datetime(stops['end_timestamp'], stops['tz_offset'])
    # Rebind instead of dropping in place.
    stops = stops.drop(columns=unix_cols)

In [None]:
%%time
import nomad.visit_attribution.home_attribution as homes
from datetime import date

cand_homes = homes.compute_candidate_homes(stops,
                                           datetime="datetime",
                                           location_id="location_id",
                                           user_id="gc_identifier",
                                           dawn_hour=6,
                                           dusk_hour=19
                                           )
cand_homes

A simple query can now find candidate locations satisfying:
- `num_nights >= min_days`
- `num_weeks >= min_weeks`
- break ties using the total dwell time at night (`total_duration`)

In [None]:
# needed for rolling home computations
last_date = date(2024, 1, 21)

# Keep the candidates that meet the minimum number of days and weeks.
home_table = homes.select_home(
    cand_homes,
    user_id='gc_identifier',
    min_days=4,
    min_weeks=2,
    last_date=last_date,
)
home_table

In [None]:
# Denominator: number of distinct users in the stop table.
n_users = len(stops.gc_identifier.unique())

# Users with any detected home, and those whose home id starts with 'h'.
n_with_home = len(home_table)
n_home_type = (home_table.location_id.str[0] == 'h').sum()

print(f"{100*n_with_home/n_users:.2f}% of users have a detected home")
print(f"{100*n_home_type/n_users:.2f}% of users have a home of type 'home'")

## Work locations and OD matrix

In [None]:
# Candidate workplaces per user, from stops in the 8-18h window, weekends excluded.
cand_works = homes.compute_candidate_workplaces(
    stops,
    user_id="gc_identifier",
    location_id="location_id",
    datetime="datetime",
    work_start_hour=8,
    work_end_hour=18,
    include_weekends=False,
)

# Keep the candidates that meet the minimum number of days and weeks.
work_table = homes.select_workplace(
    cand_works,
    user_id='gc_identifier',
    min_days=3,
    min_weeks=2,
    last_date=last_date,
)

# Share of users with any detected workplace, and with one whose id starts with 'w'.
n_users = len(stops.gc_identifier.unique())
n_with_work = len(work_table)
n_work_type = (work_table.location_id.str[0] == 'w').sum()

print(f"{100*n_with_work/n_users:.2f}% of users have a detected workplace")
print(f"{100*n_work_type/n_users:.2f}% of users have a workplace of type 'work'.")

## Visualization of network from home to work using pydeck

In [None]:
import pandas as pd

# Home and workplace building of each user, as Series indexed by user id.
# `rename` returns a new Series instead of mutating the name in place.
origin = home_table.set_index('gc_identifier').location_id.rename("origin")
destination = work_table.set_index('gc_identifier').location_id.rename("destination")

# Align the two Series on user id (column-wise concat) and keep only the users
# that have both a detected home and a detected workplace.
od = pd.concat([origin, destination], axis=1).dropna()

# Number of users commuting between each (home, workplace) pair.
od = od.groupby(by=['origin', 'destination']).size().reset_index(name='count')

In [None]:
from nomad.visit_attribution.viz import plot_od_map

# a little background
# Bounding box of the buildings, grown by 15 units with mitred corners, minus the
# buildings themselves: the space between and around the footprints.
city_envelope = box(*poi_table.total_bounds).buffer(15, join_style='mitre')
outer_box = city_envelope.difference(poi_table.geometry.union_all())
outline = gpd.GeoDataFrame({'geometry':outer_box, 'color':'#616161'}, index=['outline'], crs="EPSG:3857")

# Buildings in a flat grey, plus the darker outline row appended at the end.
background = poi_table.copy()
background['color'] = '#adadad'
background = pd.concat([background, outline])

plot_od_map(
    od_df=od,
    region_gdf=poi_table,
    background_gdf=background,
    origin_col="origin",
    dest_col="destination",
    weight_col="count",
    edge_cmap="Reds", # try Reds, viridis, plasma
    edge_alpha=0.8,
    w_min=3, # try 1 or 3
)