In [1]:
%load_ext autoreload
%autoreload

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
from datetime import date, time, datetime, timedelta
from dateutil.parser import parse

In [3]:
import nomad.io.base as loader
import nomad.visit_attribution as visits
import nomad.stop_detection.lachesis as LACHESIS

In [4]:
def dawn_time(day_part, dawn_hour=6):
    """Return the minutes of a same-day interval that fall before dawn.

    Parameters
    ----------
    day_part : tuple
        ``(start, end)`` pair of objects exposing ``.hour`` and ``.minute``
        (e.g. ``datetime.time``), both on the same calendar day.
    dawn_hour : int, default 6
        Hour of day at which dawn ends.

    Returns
    -------
    Number of minutes of ``[start, end]`` lying before ``dawn_hour``.
    Seconds are ignored.
    """
    begin, finish = day_part
    cutoff = dawn_hour * 60
    # Cap both endpoints at the dawn cutoff; what is left between them is the
    # pre-dawn portion of the interval.
    capped_begin, capped_finish = (
        np.min([moment.hour * 60 + moment.minute, cutoff])
        for moment in (begin, finish)
    )
    return capped_finish - capped_begin

def dusk_time(day_part, dusk_hour=19):
    """Return the minutes of a same-day interval that fall after dusk.

    Parameters
    ----------
    day_part : tuple
        ``(start, end)`` pair of ``datetime.time`` values on the same calendar
        day.  ``time.max`` is treated as the end of the day (24:00), which is
        how ``slice_datetimes_interval_fast`` marks a part that runs to
        midnight.
    dusk_hour : int, default 19
        Hour of day at which dusk begins.

    Returns
    -------
    Number of minutes of ``[start, end]`` lying at or after ``dusk_hour``.
    Seconds are ignored.
    """
    s, e = day_part
    dusk_minute = dusk_hour * 60

    def _minute_of_day(t):
        # hour*60 + minute maps time.max (23:59:59.999999) to 1439, which
        # silently drops the last minute of every evening that runs to
        # midnight; count the sentinel as a full 24 * 60 instead.
        return 24 * 60 if t == time.max else t.hour * 60 + t.minute

    return (np.max([_minute_of_day(e) - dusk_minute, 0])
            - np.max([_minute_of_day(s) - dusk_minute, 0]))

def slice_datetimes_interval_fast(start, end):
    """Split ``[start, end]`` into whole days plus two partial-day pieces.

    Parameters
    ----------
    start, end : datetime-like
        Interval endpoints; must support ``.time()`` and be accepted by
        ``datetime.combine``.

    Returns
    -------
    (full_days, day_parts)
        ``full_days`` is the number of complete calendar days strictly between
        the start day and the end day.  ``day_parts`` is a list of two
        ``(time, time)`` tuples:

        * interval crosses midnight: start -> ``time.max`` on the first day
          and ``time.min`` -> end on the last day;
        * interval stays within one day: start -> end, followed by a
          zero-length ``(start, start)`` placeholder.
    """
    gap = datetime.combine(end, time.min) - datetime.combine(start, time.max)
    whole_days = gap.days

    # A negative day count means both endpoints share a calendar day.
    if whole_days < 0:
        first_clock = start.time()
        return 0, [(first_clock, end.time()), (first_clock, first_clock)]

    return whole_days, [(start.time(), time.max), (time.min, end.time())]

def duration_at_night_fast(start, end, dawn_hour=6, dusk_hour=19):
    """Return the minutes of ``[start, end]`` that overlap night-time.

    Night is the part of each day before ``dawn_hour`` plus the part from
    ``dusk_hour`` onward.

    Parameters
    ----------
    start, end : datetime-like
        Interval endpoints, as accepted by ``slice_datetimes_interval_fast``.
    dawn_hour : int, default 6
        Hour at which the night ends.
    dusk_hour : int, default 19
        Hour at which the night begins.

    Returns
    -------
    int
        Night-time minutes, at the minute resolution of ``dawn_time`` and
        ``dusk_time``.
    """
    full_days, (part1, part2) = slice_datetimes_interval_fast(start, end)
    total_dawn_time = dawn_time(part1, dawn_hour) + dawn_time(part2, dawn_hour)
    total_dusk_time = dusk_time(part1, dusk_hour) + dusk_time(part2, dusk_hour)
    # Every complete day in between contributes its whole morning and evening.
    night_minutes_per_full_day = (dawn_hour + (24 - dusk_hour)) * 60
    return int(total_dawn_time + total_dusk_time + full_days * night_minutes_per_full_day)

def clip_stays_date(traj, dates):
    """Clip stays to a date window and recompute their durations.

    Parameters
    ----------
    traj : pandas.DataFrame
        Must provide the columns ``'start_datetime'``, ``'duration'`` (read as
        minutes), ``'id'`` and ``'location'``.
    dates : sequence of str
        ``dates[0]`` and ``dates[1]`` are the lower and upper bounds of the
        window; each is parsed with ``dateutil.parser.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'id'``, ``'start'`` (clipped start), ``'duration'`` (clipped
        length in whole minutes), ``'duration_night'`` (night-time minutes of
        the clipped stay) and ``'location'``.
    """
    stay_start = pd.to_datetime(traj['start_datetime'])
    stay_end = stay_start + pd.to_timedelta(traj['duration'], unit='m')

    # The window bounds take the timezone of the start column so that they
    # compare cleanly against it.
    zone = stay_start.dt.tz
    window_lo = pd.Timestamp(parse(dates[0]), tz=zone)
    window_hi = pd.Timestamp(parse(dates[1]), tz=zone)

    # Push both endpoints inside the window.
    clipped_start = stay_start.clip(lower=window_lo, upper=window_hi)
    clipped_end = stay_end.clip(lower=window_lo, upper=window_hi)

    # Durations of what is left after clipping.
    elapsed_seconds = (clipped_end - clipped_start).dt.total_seconds()
    clipped_minutes = (elapsed_seconds // 60).astype(int)
    night_minutes = list(map(duration_at_night_fast, clipped_start, clipped_end))

    columns = {
        'id': traj['id'].values,
        'start': clipped_start,
        'duration': clipped_minutes,
        'duration_night': night_minutes,
        'location': traj['location'],
    }
    return pd.DataFrame(columns)

def _register_night(anchor, nights, weeks):
    """Add the date of ``anchor`` to ``nights`` and the Monday of its week to ``weeks``."""
    nights.add(anchor.date())
    weeks.add((anchor - timedelta(days=anchor.weekday())).date())


def count_nights(usr_polygon, min_dwell=10, dawn_hour=6, dusk_hour=19):
    """Count the distinct nights, and the distinct weeks holding them, in a set of stays.

    A night is identified by the calendar date on which its evening starts.
    Pre-dawn minutes therefore count towards the previous date's night.

    Parameters
    ----------
    usr_polygon : pandas.DataFrame
        Stays of one (id, location) group.  Must provide ``'start'``,
        ``'duration'`` (read as minutes), ``'id'`` and ``'location'``.
    min_dwell : int, default 10
        Minimum night-time minutes a partial night needs in order to count.
    dawn_hour : int, default 6
        Hour at which the night ends.
    dusk_hour : int, default 19
        Hour at which the night begins.

    Returns
    -------
    pandas.DataFrame
        A single row with ``'id'`` and ``'location'`` (taken from the first
        row of ``usr_polygon``), ``'night_count'`` and ``'week_count'``.
    """
    nights = set()
    weeks = set()

    for _, row in usr_polygon.iterrows():
        d = pd.to_datetime(row['start'])
        full_days, (part1, part2) = slice_datetimes_interval_fast(
            d, d + pd.to_timedelta(row['duration'], unit='m'))

        dawn1 = dawn_time(part1, dawn_hour)
        dusk1 = dusk_time(part1, dusk_hour)
        dawn2 = dawn_time(part2, dawn_hour)
        dusk2 = dusk_time(part2, dusk_hour)

        # Pre-dawn minutes on the first day belong to the night that began
        # the evening before.
        if dawn1 >= min_dwell:
            _register_night(d - timedelta(days=1), nights, weeks)

        if full_days == 0:
            # Evening of the first day plus, if the stay crosses midnight,
            # the following morning.
            if (dusk1 + dawn2) >= min_dwell:
                _register_night(d, nights, weeks)
        else:
            # With at least one complete day in between, every night from the
            # start date through the last complete day is counted
            # unconditionally.
            for t in range(full_days + 1):
                _register_night(d + timedelta(days=t), nights, weeks)

        # Evening minutes on the final day open a new night on that date.
        if dusk2 >= min_dwell:
            _register_night(d + timedelta(days=full_days + 1), nights, weeks)

    identifier = usr_polygon['id'].iloc[0]
    location = usr_polygon['location'].iloc[0]

    return pd.DataFrame([{
        'id': identifier,
        'location': location,
        'night_count': len(nights),
        'week_count': len(weeks)
    }])

In [5]:
# Column-name mapping handed to the loader; the time column here is 'timestamp'.
traj_cols = {'user_id':'uid',
             'x':'x',
             'y':'y',
             'timestamp':'timestamp'}

# Read the parquet trajectories and the POI polygons.
sparse_df = loader.from_file("./long-gc-data/", format="parquet", traj_cols=traj_cols,
                       parse_dates=True)
poi_table = gpd.read_file('garden_city.geojson')

# Shift and rescale x/y in place: subtract a fixed offset, then divide by 15.
# NOTE(review): this comment used to read "Reproject from gc_coords to web
# mercator", but the arithmetic (remove a ~4.3e6 offset, shrink by 15) and the
# small x/y values in the displayed output look like the opposite direction,
# i.e. web mercator -> garden-city units.  Confirm.
sparse_df.loc[:,'x'] = (sparse_df['x'] - 4265699)/15
sparse_df.loc[:,'y'] = (sparse_df['y'] + 4392976)/15

# Select data from 1 user (the first uid in the file); .copy() so later column
# assignments do not touch sparse_df.
user = sparse_df.uid.unique()[0]
user_sample = sparse_df.loc[sparse_df.uid == user].copy()

user_sample

Unnamed: 0,uid,timestamp,longitude,latitude,x,y,datetime,ha,date,tz_offset
0,allen,1704119640,38.321027,-36.667517,11.884196,7.485533,2024-01-01 16:34:00,11.25,2024-01-01,7200
1,allen,1704120120,38.320959,-36.667444,11.375352,8.156611,2024-01-01 16:42:00,11.25,2024-01-01,7200
160,allen,1704161760,38.320959,-36.667465,11.372794,7.969700,2024-01-02 04:16:00,11.25,2024-01-02,7200
161,allen,1704162600,38.321069,-36.667482,12.193677,7.811347,2024-01-02 04:30:00,11.25,2024-01-02,7200
162,allen,1704179040,38.320935,-36.667496,11.198534,7.681230,2024-01-02 09:04:00,11.25,2024-01-02,7200
...,...,...,...,...,...,...,...,...,...,...
6077,allen,1706091960,38.321040,-36.667467,11.977526,7.943842,2024-01-24 12:26:00,11.25,2024-01-24,7200
6078,allen,1706093400,38.320935,-36.667509,11.198295,7.555486,2024-01-24 12:50:00,11.25,2024-01-24,7200
6079,allen,1706094120,38.320977,-36.667514,11.507877,7.510655,2024-01-24 13:02:00,11.25,2024-01-24,7200
6080,allen,1706114760,38.320012,-36.666997,4.350183,12.294505,2024-01-24 18:46:00,11.25,2024-01-24,7200


In [None]:
# Stop-detection parameters passed to Lachesis below.
# NOTE(review): units are not visible in this notebook -- presumably minutes
# for DUR_MIN / DT_MAX and coordinate units for DELTA_ROAM; confirm against
# the nomad documentation.
DUR_MIN=5
DT_MAX=60
DELTA_ROAM=100

# Rebinds traj_cols from the loading cell: the time column is now 'datetime'
# instead of 'timestamp'.
traj_cols = {'user_id':'uid',
             'x':'x',
             'y':'y',
             'datetime':'datetime'}

# Stop table for the sampled user.
stop_table_lachesis = LACHESIS.lachesis(traj=user_sample,
                                        dur_min=DUR_MIN,
                                        dt_max=DT_MAX,
                                        delta_roam=DELTA_ROAM,
                                        traj_cols=traj_cols,
                                        keep_col_names=False,
                                        complete_output=True,
                                        datetime = 'datetime')

# Per-ping labels from the private helper, using the same parameters; written
# into user_sample in place as the 'cluster' column.
user_sample['cluster'] = LACHESIS._lachesis_labels(traj=user_sample,
                                            dur_min=DUR_MIN,
                                            dt_max=DT_MAX,
                                            delta_roam=DELTA_ROAM,
                                            traj_cols=traj_cols,
                                            datetime = 'datetime').values

# Attribute the labelled pings to POI polygons.
# NOTE(review): x/y were rescaled in the loading cell, so max_distance=2 and
# DELTA_ROAM=100 are expressed in those rescaled units.  The saved debugger
# output for this cell shows every 'location' as NaN -- check that poi_table
# and the pings share a coordinate system (data_crs=None here).
pred_lachesis = visits.point_in_polygon(
                 data=user_sample,
                 poi_table=poi_table,
                 traj_cols=traj_cols,
                 max_distance=2,
                 x='x',
                 y='y',
                 data_crs=None,
                 method='majority')

pred_lachesis




> [1;32mc:\users\pacob\documents\repositories\nomad-repo\nomad\visit_attribution.py[0m(68)[0;36mpoint_in_polygon[1;34m()[0m
[1;32m     66 [1;33m            [1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     67 [1;33m            [0mpdb[0m[1;33m.[0m[0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m---> 68 [1;33m            pings_df.groupby(cluster_label, as_index=False)['location'].agg(
[0m[1;32m     69 [1;33m                [1;32mlambda[0m [0mx[0m[1;33m:[0m [0mx[0m[1;33m.[0m[0mmode[0m[1;33m([0m[1;33m)[0m[1;33m.[0m[0miloc[0m[1;33m[[0m[1;36m0[0m[1;33m][0m [1;32mif[0m [1;32mnot[0m [0mx[0m[1;33m.[0m[0mmode[0m[1;33m([0m[1;33m)[0m[1;33m.[0m[0mempty[0m [1;32melse[0m [1;32mNone[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     70 [1;33m            [0mstop_table[0m[1;33m[[0m[1;34m'location'[0m[1;33m][0m [1;33m=[0m [0mpings_df[0m[1;33m.[0m[0mlocation[0m[1;33m.[0m[0mvalues[0

ipdb>  p data


        uid   timestamp  longitude   latitude          x          y  \
0     allen  1704119640  38.321027 -36.667517  11.884196   7.485533   
1     allen  1704120120  38.320959 -36.667444  11.375352   8.156611   
160   allen  1704161760  38.320959 -36.667465  11.372794   7.969700   
161   allen  1704162600  38.321069 -36.667482  12.193677   7.811347   
162   allen  1704179040  38.320935 -36.667496  11.198534   7.681230   
...     ...         ...        ...        ...        ...        ...   
6077  allen  1706091960  38.321040 -36.667467  11.977526   7.943842   
6078  allen  1706093400  38.320935 -36.667509  11.198295   7.555486   
6079  allen  1706094120  38.320977 -36.667514  11.507877   7.510655   
6080  allen  1706114760  38.320012 -36.666997   4.350183  12.294505   
6081  allen  1706114880  38.320372 -36.667317   7.021152   9.335032   

                datetime     ha        date  tz_offset  cluster  
0    2024-01-01 16:34:00  11.25  2024-01-01       7200        0  
1    2024-01-01

ipdb>  p pings_df


        uid   timestamp  longitude   latitude          x          y  \
0     allen  1704119640  38.321027 -36.667517  11.884196   7.485533   
1     allen  1704120120  38.320959 -36.667444  11.375352   8.156611   
160   allen  1704161760  38.320959 -36.667465  11.372794   7.969700   
161   allen  1704162600  38.321069 -36.667482  12.193677   7.811347   
163   allen  1704209520  38.321391 -36.666826  14.580977  13.880322   
...     ...         ...        ...        ...        ...        ...   
6075  allen  1706091480  38.320877 -36.667459  10.769694   8.020328   
6076  allen  1706091600  38.321020 -36.667476  11.829863   7.869353   
6077  allen  1706091960  38.321040 -36.667467  11.977526   7.943842   
6078  allen  1706093400  38.320935 -36.667509  11.198295   7.555486   
6079  allen  1706094120  38.320977 -36.667514  11.507877   7.510655   

                datetime     ha        date  tz_offset  cluster location  
0    2024-01-01 16:34:00  11.25  2024-01-01       7200        0      NaN

ipdb>  p pings_df.location.value_counts()


Series([], Name: count, dtype: int64)


ipdb>  p poi_map(                 data=pings_df,                 poi_table=poi_table,                 max_distance=max_distance,                 data_crs=data_crs,                 traj_cols=traj_cols,                 **kwargs                             )





0       NaN
1       NaN
160     NaN
161     NaN
163     NaN
       ... 
6075    NaN
6076    NaN
6077    NaN
6078    NaN
6079    NaN
Name: building_id, Length: 226, dtype: object


In [22]:
# Inspect the sampled pings, now carrying the 'cluster' column assigned above.
user_sample

Unnamed: 0,uid,timestamp,longitude,latitude,x,y,datetime,ha,date,tz_offset,cluster
0,allen,1704119640,38.321027,-36.667517,11.884196,7.485533,2024-01-01 16:34:00,11.25,2024-01-01,7200,0
1,allen,1704120120,38.320959,-36.667444,11.375352,8.156611,2024-01-01 16:42:00,11.25,2024-01-01,7200,0
160,allen,1704161760,38.320959,-36.667465,11.372794,7.969700,2024-01-02 04:16:00,11.25,2024-01-02,7200,1
161,allen,1704162600,38.321069,-36.667482,12.193677,7.811347,2024-01-02 04:30:00,11.25,2024-01-02,7200,1
162,allen,1704179040,38.320935,-36.667496,11.198534,7.681230,2024-01-02 09:04:00,11.25,2024-01-02,7200,-1
...,...,...,...,...,...,...,...,...,...,...,...
6077,allen,1706091960,38.321040,-36.667467,11.977526,7.943842,2024-01-24 12:26:00,11.25,2024-01-24,7200,37
6078,allen,1706093400,38.320935,-36.667509,11.198295,7.555486,2024-01-24 12:50:00,11.25,2024-01-24,7200,37
6079,allen,1706094120,38.320977,-36.667514,11.507877,7.510655,2024-01-24 13:02:00,11.25,2024-01-24,7200,37
6080,allen,1706114760,38.320012,-36.666997,4.350183,12.294505,2024-01-24 18:46:00,11.25,2024-01-24,7200,-1


In [None]:
# Recompute the per-ping labels with the same parameters as the cell above.
# NOTE(review): `cluster` is not used by any other cell visible in this
# notebook, and `user_sample` is not modified here -- this looks like leftover
# debugging; confirm before removing.
cluster = LACHESIS._lachesis_labels(traj=user_sample,
                                            dur_min=DUR_MIN,
                                            dt_max=DT_MAX,
                                            delta_roam=DELTA_ROAM,
                                            traj_cols=traj_cols,
                                            datetime = 'datetime')
user_sample

In [None]:
# Same attribution call with positional arguments only (no traj_cols,
# max_distance or method).
# NOTE(review): relies entirely on library defaults that are not visible here.
visits.point_in_polygon(user_sample, poi_table)

In [None]:
# Make sure the stop table's start column is datetime-typed
# (mutates stop_table_lachesis in place).
stop_table_lachesis['start_datetime'] = pd.to_datetime(stop_table_lachesis['start_datetime'])

# clip_stays_date and count_nights read an 'id' column; fall back to the
# sampled user's id when the stop table does not carry one.
# NOTE(review): clip_stays_date also reads a 'location' column, and nothing
# visible in this notebook adds one to stop_table_lachesis -- confirm the
# Lachesis output provides it.
if 'id' not in stop_table_lachesis.columns:
    stop_table_lachesis['id'] = user

# Date range: a window of `weeks` weeks starting at start_date, expressed as
# two ISO date strings.
start_date = "2024-01-02"
weeks = 2
end_date = (parse(start_date) + timedelta(weeks=weeks)).date().isoformat()
dates = (start_date, end_date)
df_clipped = clip_stays_date(stop_table_lachesis, dates)
# Keep stays that still have positive length after clipping and at least
# 15 night-time minutes.
df_clipped = df_clipped[(df_clipped['duration'] > 0) & (df_clipped['duration_night'] >= 15)]
# One summary row (night_count, week_count) per (id, location) pair.
df_clipped.groupby(['id', 'location'], group_keys=False).apply(count_nights).reset_index(drop=True)