In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-muted')
from matplotlib import cm
import geopandas as gpd

from pyproj import Transformer
from concurrent.futures import ProcessPoolExecutor
import concurrent.futures
import multiprocessing
from multiprocessing import Pool
from functools import partial
import numpy.random as npr
import matplotlib.dates as mdates
from itertools import product
import copy
import pickle
from tqdm import tqdm
import boto3
import io

import nomad.io.base as loader
import nomad.city_gen as cg
from nomad.city_gen import City, Building
import nomad.traj_gen as tg
from nomad.traj_gen import Agent, Population
import nomad.stop_detection.ta_dbscan as DBSCAN
import nomad.stop_detection.lachesis as Lachesis
from nomad.generation.sparsity import gen_params_target_q, gen_params_ranges

In [3]:
# garden city

city_geojson = gpd.read_file('garden_city.geojson')

city = cg.load('garden-city.pkl')

In [4]:
# # synthetic philly

# city_geojson = gpd.read_file('philly.geojson')

# s3 = boto3.client('s3', region_name="us-east-2")
# pickle_buffer = io.BytesIO()
# s3.download_fileobj("synthetic-philly", "philadelphia-city.pkl", pickle_buffer)
# #response = s3.get_object(Bucket = "synthetic-philly", Key = "philadelphia-city.pkl")
# pickle_buffer.seek(0)
# city = pickle.load(pickle_buffer)
# #city = response['Body'].read()

### Generate N agents

The following code maps our Garden City coordinates to a location in the Atlantic Ocean (Atlantis?).

In [5]:
def garden_city_to_lat_long(agent, sparse_traj=True, full_traj=False, diaries=True):
    def project_city_blocks_to_web_mercator(df):
        """Convert (x, y) from 15m block units to Web Mercator meters via affine shift and projection."""
        transformer = Transformer.from_crs("EPSG:3857", "EPSG:4326", always_xy=True)
        df['x'] = 15 * df['x'] - 4265699
        df['y'] = 15 * df['y'] + 4392976
        if 'ha' in df:
            df['ha'] = 15 * df['ha']
        df['longitude'], df['latitude'] = transformer.transform(df['x'].values, df['y'].values)
        df['date'] = df['datetime'].dt.date
        return df

    def finalize(df):
        front = ['identifier', 'timestamp', 'longitude', 'latitude', 'x', 'y', 'date']
        cols = [col for col in front if col in df] + [col for col in df.columns if col not in front]
        return df[cols].rename(columns={'identifier': 'uid', 'timestamp': 'timestamp'}).reset_index(drop=True)

    if sparse_traj:
        agent.sparse_traj = finalize(project_city_blocks_to_web_mercator(agent.sparse_traj))
    if full_traj:
        agent.trajectory = finalize(project_city_blocks_to_web_mercator(agent.trajectory))
        
    if diaries:
        diary = agent.diary.copy()
        xs = []
        ys = []
        for loc in diary["location"]:
            if loc is None:
                xs.append(None)
                ys.append(None)
            else:
                pt = agent.city.buildings[loc].geometry.centroid
                xs.append(pt.x)
                ys.append(pt.y)
        diary["x"] = xs
        diary["y"] = ys
        agent.diary = finalize(project_city_blocks_to_web_mercator(diary))

In [6]:
def philly_to_lat_long(agent, sparse_traj=True, full_traj=False, diaries=True):
    def project_point_to_web_mercator(x, y):
        """
        Project a fractional (x, y) block coord to Web Mercator using affine interpolation.
        philly_grid_map is the grid_map produced by RealCityGenerator in virtual_philly.ipynb
           import it into this notebook through pkl
        """
        i, j = int(math.floor(x)), int(math.floor(y))
        poly = philly_grid_map.get((i, j))

        if poly is None:
            raise ValueError(f"No polygon found at grid cell ({i}, {j})")

        # Bounds of the 1x1 block polygon in EPSG:3857
        minx, miny, maxx, maxy = poly.bounds

        dx = x - i
        dy = y - j

        X = minx + dx * (maxx - minx)
        Y = miny + dy * (maxy - miny)

        return X, Y

    def apply_projection_to_df(df):
        """Apply Web Mercator projection to a DataFrame with 'x' and 'y' columns."""
        def safe_project(row):
            try:
                return project_point_to_web_mercator(row['x'], row['y'])
            except Exception:
                return (None, None)

        projected = df.apply(safe_project, axis=1)
        df[['x', 'y']] = pd.DataFrame(projected.tolist(), index=df.index)

        transformer = Transformer.from_crs("EPSG:3857", "EPSG:4326", always_xy=True)
        if 'ha' in df:
            df['ha'] = 10 * df['ha']  # 10 because thats the sidelength of a block
        df['longitude'], df['latitude'] = transformer.transform(df['x'].values, df['y'].values)
        df['date'] = df['datetime'].dt.date
        return df

    def finalize(df):
        front = ['identifier', 'timestamp', 'longitude', 'latitude', 'x', 'y', 'date']
        cols = [col for col in front if col in df] + [col for col in df.columns if col not in front]
        return df[cols].rename(columns={'identifier': 'uid', 'timestamp': 'timestamp'}).reset_index(drop=True)

    if sparse_traj:
        agent.sparse_traj = finalize(apply_projection_to_df(agent.sparse_traj))
    if full_traj:
        agent.trajectory = finalize(apply_projection_to_df(agent.trajectory))

    if diaries:
        diary = agent.diary.copy()
        xs = []
        ys = []
        for loc in diary["location"]:
            if loc is None:
                xs.append(None)
                ys.append(None)
            else:
                pt = agent.city.buildings[loc].geometry.centroid
                xs.append(pt.x)
                ys.append(pt.y)
        diary["x"] = xs
        diary["y"] = ys
        agent.diary = finalize(apply_projection_to_df(diary))

## Simple trajectory generation

For simple trajectory generation tasks that don't require too much computation power and can be done on a personal laptop, the following code generates a trajectory for each agent and saves it to a csv.

In [7]:
def generate_trajectory_data(agent, seed_trajectory=None, seed_sparsity=None):
    beta_params = gen_params_target_q(q_range=(0.3, 0.9), seed=seed_sparsity)
    rng = npr.default_rng(seed_sparsity)
    ha_sample = rng.uniform(11.5/15, 1)

    agent.reset_trajectory()
    agent.generate_trajectory(
        datetime = "2024-01-01T08:00 -04:00",
        end_time = pd.Timestamp('2024-01-15T08:30 -04:00'),
        seed=seed_trajectory,
        dt=1)

    agent.sample_trajectory(
        **beta_params,
        seed=seed_sparsity,
        ha=ha_sample,
        replace_sparse_traj=True)

    philly_to_lat_long(agent, sparse_traj=True, full_traj=False)
    agent.reset_trajectory(trajectory = False, sparse = False, diary = False)
    return None

# Generate trajectories with progress bar
N = 200
population = Population(city)
population.generate_agents(N=N, seed=1, name_count=2)

for i, agent in enumerate(tqdm(population.roster.values(), desc="Generating trajectories")):
    generate_trajectory_data(agent, seed_trajectory=i, seed_sparsity=i)

Generating trajectories: 100%|████████████████| 200/200 [03:00<00:00,  1.11it/s]


In [8]:
agent.sparse_traj.to_csv("philly_sparse_traj.csv", index=False)
agent.trajectory.to_csv("philly_full_traj.csv", index=False)

In [22]:
def generate_agent_trajectory(agent_id, agent, seed):

    beta_params = gen_params_target_q(q_range=(0.4, 0.85), seed=seed)

    agent.generate_trajectory(
        datetime = "2024-01-01T08:00 -04:00",
        end_time = pd.Timestamp('2024-01-15T08:30:00 -04:00'),
        seed=1,
        dt=0.25)
    print('finished generating trajectory')
    agent.sample_trajectory(
        **beta_params,
        seed=seed,
        ha=13/15, # <<<<<<
        replace_sparse_traj=True)
    
    garden_city_to_lat_long(agent,
                            sparse_traj=True,
                            full_traj=False,
                           diaries = True)
    agent.reset_trajectory(trajectory = True, sparse = False, diary = False)
    
    return agent_id, copy.deepcopy(agent)

In [23]:
population = Population(city)
population.generate_agents(N=2, seed=2, name_count=2)

In [24]:
# %%time 
# # time for 2 weeks of data at dt = 0.25
# agent_1 = list(population.roster.values())[1]
# generate_agent_trajectory(agent_1, 10, 6)
# agent_1.sparse_traj.head()

In [25]:
def generate_trajectory_data(agent, seed_trajectory=None, seed_sparsity=None, use_datetime=None, use_lon_lat=True):
    beta_params = gen_params_ranges(seed=seed_sparsity)
    rng = npr.default_rng(seed_sparsity)
    ha_sample = rng.uniform(11.5/15, 16.5/15)

    agent.reset_trajectory()
    agent.generate_trajectory(
        datetime = "2024-01-01T08:00 -04:00",
        end_time = pd.Timestamp('2024-01-15T08:30:00 -04:00'),
        seed=seed_trajectory,
        dt=1)

    agent.sample_trajectory(
        **beta_params,
        seed=seed_sparsity,
        ha=ha_sample,
        replace_sparse_traj=True)

    garden_city_to_lat_long(agent, sparse_traj=True, full_traj=False, diaries = True)
    agent.reset_trajectory(trajectory = True, sparse = False, diary = False)
    return None

# Generate trajectories with progress bar
N = 200  
population = Population(city)
population.generate_agents(N=N, seed=1, name_count=2)

for i, agent in enumerate(tqdm(population.roster.values(), desc="Generating trajectories")):
    generate_trajectory_data(agent, seed_trajectory=i, seed_sparsity=i, use_datetime=None, use_lon_lat=True)
    #agent.sparse_traj.rename(columns={'uid': 'identifier', 'timestamp': 'unix_timestamp', 'latitude':'device_lat', 'longitude':'device_lon', 'datetime':'local_datetime'}, inplace=True)
    agent.sparse_traj.rename(columns={'uid': 'user_id'}, inplace=True)

Generating trajectories: 100%|████████████████| 200/200 [03:01<00:00,  1.10it/s]


In [26]:
agent_id, agent = [(agent_id, agent) for agent_id, agent in population.roster.items()][0]
agent_id

'admiring_allen'

In [27]:
# <<<<<<< REMOTE CELL DELETED >>>>>>>
# generate_agent_trajectory(agent_id, agent, 150)

In [28]:
# <<<<<<< REMOTE CELL DELETED >>>>>>>
# def generate_trajectory_data(agent_id, agent, seed):
#     agent.reset_trajectory()
    
#     agent.generate_trajectory(
#         local_timestamp="2024-01-01T08:00:00 -04:00",
#         end_time=pd.Timestamp("2024-01-15T8:30:00 -04:00"),
#         seed=105,
#         dt=1)

#     beta_duration = npr.uniform(25, 170)
#     beta_start = max(npr.uniform(25, 520), beta_duration)
#     beta_ping = min(npr.uniform(3, 15), beta_duration//2)

#     agent.sample_trajectory(
#         beta_start=beta_start,
#         beta_durations=beta_duration,
#         beta_ping=beta_ping,
#         seed=seed,
#         replace_sparse_traj=True)

#     garden_city_to_lat_long(agent, sparse_traj=True, full_traj=False)
#     return None

# # Generate trajectories with progress bar
# for agent_id, agent in tqdm(population.roster.items(), desc="Generating trajectories"):
#     generate_trajectory_data(agent_id, agent, seed=150)

In [29]:
# dataset no 1
traj_cols = {
    "user_id": "user_id",
    "timestamp": "timestamp",
    "latitude": "latitude",
    "longitude": "longitude",
    "datetime": "datetime"}
# Save only sparse trajectories and diaries
population.save_pop(
    sparse_path="output/sparse_data",
    diaries_path="output/diaries_data",
    partition_cols=None,
    traj_cols=traj_cols,
    fmt="parquet"
)



In [None]:
# dataset no 2
traj_cols = {
    "user_id": "user_id",
    "timestamp": "timestamp",
    "latitude": "latitude",
    "longitude": "longitude",
    "datetime": "datetime"}
# Save only sparse trajectories and diaries
population.save_pop(
    sparse_path="output/gc_data/",
    diaries_path=None,
    partition_cols=['date'],
    traj_cols=traj_cols,
    fmt="csv"
)

In [None]:
sparse_df = loader.from_file("output/gc_data/", format="csv", traj_cols=traj_cols,
                      parse_dates=True)

## Generate dataset 3 for tutorial

In [None]:
def generate_trajectory_data(agent, seed_trajectory=None, seed_sparsity=None, use_datetime=None, use_lon_lat=True):
    beta_params = gen_params_target_q(q_range=(0.2, 0.8), beta_dur_range=(25, 180), beta_ping_range=(1.5, 6), seed=seed_sparsity)
    rng = npr.default_rng(seed_sparsity)
    ha_sample = rng.uniform(11.5/15, 16.5/15)

    agent.reset_trajectory()
    agent.generate_trajectory(
        datetime = "2024-01-01T07:00 -04:00",
        end_time = pd.Timestamp('2024-01-21T09:00:00 -04:00'),
        seed=seed_trajectory,
        dt=0.15)

    agent.sample_trajectory(
        **beta_params,
        seed=seed_sparsity,
        ha=ha_sample,
        replace_sparse_traj=True)

    garden_city_to_lat_long(agent, sparse_traj=True, full_traj=False, use_datetime=use_datetime, use_lon_lat=use_lon_lat)
    agent.reset_trajectory(trajectory = True, sparse = False, diary = False)
    return None

# Generate trajectories with progress bar
N = 350  
population = Population(city)
population.generate_agents(N=N, seed=5, name_count=2)

for i, agent in enumerate(tqdm(population.roster.values(), desc="Generating trajectories")):
    if i == 0:
        continue
    generate_trajectory_data(agent, seed_trajectory=i, seed_sparsity=i, use_datetime=False, use_lon_lat=False)
    agent.sparse_traj.rename(columns={'uid': 'gc_identifier', 'timestamp': 'unix_ts', 'x':'dev_x', 'y':'dev_y'}, inplace=True)

In [None]:
# Make data for agent 0
start_time = pd.date_range(start='2024-01-01T07:00 -04:00', periods=4, freq='60min')
tz_offset = loader._offset_seconds_from_ts(start_time[0])
unix_timestamp = [int(t.timestamp()) for t in start_time]
duration = [60]*4  # in minutes
location = ['h-x13-y11'] * 1 + ['h-x13-y9'] * 1 + ['w-x18-y10'] * 1 + ['w-x18-y8'] * 1

destinations = pd.DataFrame(
    {"datetime":start_time,
     "timestamp":unix_timestamp,
     "duration":duration,
     "location":location}
     )
destinations = condense_destinations(destinations)

agent_0 = list(population.roster.values())[0]

rng = npr.default_rng(0)
ha_sample = rng.uniform(11.5/15, 16.5/15)

agent_0.reset_trajectory()
agent_0.generate_trajectory(destination_diary=destinations, seed=0, dt=0.15)

agent_0.sample_trajectory(
    beta_ping=2,
    beta_start=None,
    beta_durations=None,
    seed=0,
    ha=ha_sample,
    replace_sparse_traj=True)

garden_city_to_lat_long(agent_0, sparse_traj=True, full_traj=False, use_datetime=False, use_lon_lat=False)
agent_0.reset_trajectory(trajectory = True, sparse = False, diary = False)
agent_0.sparse_traj.rename(columns={'uid': 'gc_identifier', 'timestamp': 'unix_ts', 'x':'dev_x', 'y':'dev_y'}, inplace=True)

In [None]:
# dataset no 3
traj_cols = {
    "user_id": "gc_identifier",
    "timestamp": "unix_ts",
    "x": "dev_x",
    "y": "dev_y"}
# Save only sparse trajectories and diaries
population.save_pop(
    sparse_path="output/gc_data_long/",
    diaries_path=None,
    partition_cols=['date'],
    traj_cols=traj_cols,
    fmt="parquet"
)

In [None]:
sparse_df.head()

For larger trajectory generation tasks that require a lot of compute power, we can parallelize the trajectory generation using the following code. We generate ground-truth trajectories in agent-month "chunks", sparsify each chunk, then reset the ground-truth trajectory field to lessen the memory usage. 

In [None]:
# Using parallel processing (e.g., using a cluster)
%%time

def generate_agent_trajectory(agent_id, agent, seed):
    
    beta_duration = npr.uniform(15, 180)
    beta_start = max(npr.uniform(60, 1200), beta_duration*3)
    beta_ping = npr.uniform(1.5, 30)
    
    param = (beta_start, beta_duration, beta_ping)
    
    for month in range(1,13):
        days = calendar.monthrange(2024, month)[1]
        population_n.generate_trajectory(agent, 
                                         T=datetime(2024, month, days, hour=23, minute=59), 
                                         seed=seed)
    
        agent.sample_traj_hier_nhpp(*param, 
                                    seed=seed,
                                    reset_traj=True)
    
    garden_city_to_lat_long(agent,
                            sparse_traj=True,
                            full_traj=False)
    
    return agent_id, copy.deepcopy(agent)

manager = multiprocessing.Manager()
shared_roster = manager.dict(population_n.roster)

start = 6001  # 12001  # can modify
end = 12001   # 18001  # can modify
roster = dict(population_n.roster)
batch = islice(roster.items(), start, end)

with ProcessPoolExecutor() as executor:
    with tqdm(total=(end-start), desc="Processing agents") as pbar:
        futures = [
            executor.submit(generate_agent_trajectory, agent_id, agent, i+15000)
            for i, (agent_id, agent) in enumerate(batch, start=start)
        ]
        results = []
        for future in futures:
            results.append(future.result())
            pbar.update(1)

for agent_id, agent in results:
    population_n.roster[agent_id] = agent

This code saves the generated trajectories in a parquet file, using the date as the partition column.

In [None]:
partition_cols = {
    'sparse_traj': ['date'],
    'diaries': ['identifier']
}

roster = dict(islice(population_n.roster.items(), start, end))

population.save_pop(bucket="synthetic-raw-data",
                    prefix=f"agents-{start+15000}-{end+15000-1}/",
                    save_full_traj=False,
                    save_sparse_traj=True,
                    save_homes=True,
                    save_diaries=True,
                    partition_cols=partition_cols,
                    roster=roster)

In [None]:
# Load the Parquet files
s3_path = "s3://synthetic-raw-data/agents-1-1001/sparse_trajectories.parquet/"
df1 = pd.read_parquet(s3_path)
s3_path = "s3://synthetic-raw-data/agents-1001-2000/sparse_trajectories.parquet/"
df2 = pd.read_parquet(s3_path)