In [None]:
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from humobi.structures.trajectory import TrajectoriesFrame
from humobi.measures.individual import *
from humobi.tools.processing import *
from humobi.tools.user_statistics import *

import seaborn as sns
sns.set()
sns.set_style("whitegrid")
from distfit import distfit

import skmob
from shapely import Point
import geopandas as gpd
from tqdm import tqdm
from scipy.optimize import minimize, curve_fit
tqdm.pandas()
import csv
from skmob.measures.individual import k_radius_of_gyration, max_distance_from_home
from skmob.measures.individual import radius_of_gyration as skmobrog
from shapely.geometry import Point

In [None]:
def start_end(trajectories_frame):
	"""
	Compresses stops in TrajectoriesFrame by adding start and end of visits in locations.

	For every user (index level 0), consecutive records sharing the same ``geometry``
	are collapsed into the first record of the run. That record receives two new
	columns: ``start`` (``date`` of the first record of the run) and ``end`` (``date``
	of the last record of the run). Runs consisting of a single record
	(``start == end``) are dropped.

	Note: when the input has no ``date`` column, one is added to the input frame
	in place, filled with the values of index level 1.

	Args:
		trajectories_frame: TrajectoriesFrame object class

	Returns:
		compressed TrajectoriesFrame; an empty frame carrying ``start`` and ``end``
		columns when the input contains no users
	"""
	if 'date' not in trajectories_frame.columns:
		trajectories_frame['date'] = trajectories_frame.index.get_level_values(1)
	grouped = trajectories_frame.groupby(level=0)
	to_concat = []
	for _, user_frame in tqdm(grouped, total=len(grouped)):
		geometry = user_frame['geometry']
		# A visit starts where the location differs from the previous record ...
		# (explicit copy: new columns are added below, never write onto a slice)
		firsts = user_frame[geometry.shift() != geometry].copy()
		# ... and ends where it differs from the next record.
		lasts = user_frame[geometry.shift(-1) != geometry]
		firsts['start'] = firsts['date']
		# The i-th run start pairs with the i-th run end, so assign positionally
		# (``.array`` keeps the datetime dtype, including any time zone).
		firsts['end'] = lasts['date'].array
		to_concat.append(firsts[firsts['start'] != firsts['end']])
	if not to_concat:
		# No users at all: pd.concat([]) would raise, return an empty result instead.
		empty = trajectories_frame.iloc[:0].copy()
		empty['start'] = empty['date']
		empty['end'] = empty['date']
		return empty
	return pd.concat(to_concat)

In [None]:
def _recalcuate_centres(single_trajectory):
    """
    Based on labels, recalculates spatial coordinates of points to their clusters centers.

    Args:
        single_trajectory: single movement trajectory

    Returns:
        a TrajectoriesFrame with overwritten coordinates in lon and lat columns
    """
    centres = single_trajectory.groupby(by='labels').mean()
    single_trajectory = single_trajectory.join(centres, on='labels', rsuffix='_')
    single_trajectory['lat'] = single_trajectory['lat_']
    single_trajectory['long'] = single_trajectory['long_']
    return single_trajectory

In [None]:
def radius_of_gyration(trajectories_frame, time_evolution=True):
	"""
	Calculates radii of gyration for each user. Optionally uses time steps to express their growth.

	For every user (index level 0) rows with missing values are dropped and the squared
	distance of each remaining record to that user's entry in ``center_of_mass`` is taken.

	Args:
		trajectories_frame: TrajectoriesFrame class object
		time_evolution: If true, the per-record distances (square root of the squared
			distances) are passed through ``groupwise_expansion`` and the per-user
			results are concatenated. If false, a single value per user is computed
			as the square root of the mean squared distance.

	Returns:
		the concatenated per-user results when ``time_evolution`` is true, otherwise
		a DataFrame indexed by user with one radius per user
	"""
	centres = center_of_mass(trajectories_frame)
	per_user = trajectories_frame.groupby(level=0)
	# list of per-user results when tracking evolution, user -> radius mapping otherwise
	collected = [] if time_evolution else {}
	for user_id, records in tqdm(per_user, total=len(per_user)):
		records = records.dropna()
		squared_distance = records.distance(centres.loc[user_id]) ** 2
		if time_evolution:
			collected.append(groupwise_expansion(np.sqrt(squared_distance)))
		else:
			collected[user_id] = np.sqrt(squared_distance.mean())
	if time_evolution:
		return pd.concat(collected)
	return pd.DataFrame.from_dict(collected, orient='index')

In [None]:
# Define the function to calculate the time spent between 22:00 and 07:00
def calculate_night_time(start, end, tz='America/Sao_Paulo', night_start_hour=22, night_end_hour=7):
    """
    Returns how much of the interval [start, end] falls into night hours.

    A night runs from ``night_start_hour`` of one calendar day to ``night_end_hour``
    of the following day, both expressed in the time zone ``tz``.

    Args:
        start: tz-aware timestamp, beginning of the interval
        end: tz-aware timestamp, end of the interval
        tz: time zone in which the night hours are defined
        night_start_hour: hour (0-23) at which the night begins in the evening
        night_end_hour: hour (0-23) at which the night ends on the next morning

    Returns:
        pd.Timedelta with the total overlap of the interval with all nights;
        zero when there is no overlap (including when ``end`` is not after ``start``)

    Raises:
        TypeError: if ``start`` or ``end`` is tz-naive
    """
    time_spent = pd.Timedelta(0)

    start = pd.Timestamp(start)
    end = pd.Timestamp(end)

    # Calendar days are taken in the night's own time zone.
    first_evening = start.tz_convert(tz).tz_localize(None).normalize() - pd.Timedelta(days=1)
    last_evening = end.tz_convert(tz).tz_localize(None).normalize()

    # Start one evening BEFORE the start day: the night that began the previous
    # evening covers the early-morning hours of the start day.
    for day in pd.date_range(start=first_evening, end=last_evening, freq='D'):
        night_start = (day + pd.Timedelta(hours=night_start_hour)).tz_localize(tz)
        night_end = (day + pd.Timedelta(days=1, hours=night_end_hour)).tz_localize(tz)

        # Clip the interval to this night
        current_start = max(start, night_start)
        current_end = min(end, night_end)

        # If there is an overlap, accumulate the time spent
        if current_start < current_end:
            time_spent += current_end - current_start

    return time_spent

# # Calculate time spent between 22:00 and 07:00 for each row
# df.groupby(level=0).apply(lambda row: calculate_night_time(row['start'], row['end']), axis=1)

# # Group by 'labels' and sum the night time spent for each label
# night_time_spent_per_label = df.groupby('labels')['night_time_spent'].sum()

In [None]:
# Load the raw records, then filter users and records step by step.
# Every step below overwrites `df`, so the statements must run in this order.
#Read data
parquet_path = r'YOUR_DATA_PATH'
df = pd.read_parquet(parquet_path)
#Convert to TrajectoriesFrame
df = TrajectoriesFrame(df)
#Compress
# NOTE(review): `startended` is not used again in the cells visible here and `df`
# is not replaced by it, although later statements read `start`/`end` columns
# from `df` - confirm whether `df = start_end(df)` was intended.
startended = start_end(df)
#Filter
# Per user: last `end` minus first `start`. `periods` is not used in the cells visible here.
periods = df.groupby(level=0).apply(lambda x: x.end.iloc[-1] - x.start.iloc[0])
# NOTE(review): `known_location` is not defined in any cell visible here; this line
# fails on a fresh kernel unless it is defined elsewhere. Entries with value >= .6 are kept.
known_location_selected = known_location[known_location >= .6].index
df = df[df['new_user_id'].isin(known_location_selected)]
#Convert datetime
# `unix` is parsed as seconds since the epoch
df['datetime'] = pd.to_datetime(df['unix'],unit='s')
# Localised to UTC and converted to America/Sao_Paulo, then converted back to UTC
# on the next line - the column ends up tz-aware in UTC.
df['datetime'] = df['datetime'].dt.tz_localize('UTC').dt.tz_convert('America/Sao_Paulo')
df['datetime'] = df['datetime'].dt.tz_convert('UTC')
#Filter time
# Keep records inside the closed interval [start_period, end_period] (UTC)
start_period = '2021-01-30 00:00:00+00:00'
end_period = '2021-03-01 00:00:00+00:00'
df = df[(df.datetime >= start_period) & (df.datetime <= end_period)]
# Per user: number of records in each hourly bin
resampled = df.groupby('new_user_id').apply(lambda x: x.resample('1H',on='datetime').lat.count())
# Counts above 1 are set to 1, so each bin holds 0 (no record) or 1 (at least one record)
resampled[resampled > 1] = 1
# Keep users for whom at least 40% of their hourly bins contain a record
complete_mask = resampled.groupby(level=0).apply(lambda x: x.sum()/x.shape[0] >= .4)
df = df[df.new_user_id.isin(complete_mask[complete_mask].index)]
df = TrajectoriesFrame(df)
#Filter to pick moving objects
# True for users having more than one distinct value in `labels`
labels_mask = ~df.groupby(level=0).apply(lambda x: x.labels.unique().size == 1)
df = df.loc[labels_mask[labels_mask].index]
df = TrajectoriesFrame(df)
# True for users whose records (index level 1) span 21 days or less
time_mask = df.groupby(level=0).apply(lambda x: x.index.get_level_values(1).max() - x.index.get_level_values(1).min() <= pd.Timedelta('21D'))
# Prints the number of users and how many of them span 21 days or less
print(df.get_users().size,time_mask.sum())
# Keep only the users with a span longer than 21 days
time_mask = time_mask[~time_mask]
df = df.loc[time_mask.index]
# Declare the geometry as EPSG:4326, sort by index, then reproject to EPSG:3857
df = df.set_crs(4326)
df = df.sort_index()
df = df.to_crs(3857)

In [None]:
#Detect homes
# Home of a user = the label with the largest total night time (see calculate_night_time).
# NOTE(review): this cell reads `start` and `end` columns from `df`; confirm they exist
# at this point on a fresh kernel.
actual_time_df = df.copy().reset_index()
# `datetime` is converted to America/Sao_Paulo local time
actual_time_df['datetime'] = df.reset_index()['datetime'].dt.tz_convert('America/Sao_Paulo')
# `start`/`end` are localised as UTC (tz_localize requires them to be tz-naive here)
# and then converted to America/Sao_Paulo
actual_time_df['start'] = actual_time_df['start'].dt.tz_localize('UTC').dt.tz_convert('America/Sao_Paulo')
actual_time_df['end'] = actual_time_df['end'].dt.tz_localize('UTC').dt.tz_convert('America/Sao_Paulo')
actual_time_df = TrajectoriesFrame(actual_time_df)
# Row-wise: night time contained in each [start, end] interval
actual_time_df['nighttime'] = actual_time_df.apply(lambda row: calculate_night_time(row['start'], row['end']), axis=1)
# Per user and label: total night time
most_popular_place = actual_time_df.groupby(level=0).apply(lambda x: x.groupby('labels').nighttime.sum())
# Per user: idxmax returns an index tuple; element [1] is taken as the label
# (presumably the index is (user, label) - TODO confirm)
most_popular_place = most_popular_place.groupby(level=0).apply(lambda x: x.idxmax()[1])
to_concat ={}
for uid,vals in actual_time_df.groupby(level=0):
    # First record of the user at the selected label
    to_concat[uid] = vals[vals.labels == most_popular_place.loc[uid]].iloc[0]
# One row per user with the label and coordinates of the selected record
homes = pd.DataFrame(to_concat).T[['labels','lat','long']]

In [None]:
#Calculate parameters
# Per-user mobility measures; the results are module-level names read by later cells.
#Jump Length
humobi_jl = jump_lengths(df)
# Drop missing and zero-length jumps
humobi_jl = humobi_jl[(~humobi_jl.isna()) & (humobi_jl != 0)]
#Number of distinct locations
nodl = num_of_distinct_locations(df)
#Number of days
# Per user: time between the first and last value of index level 1 (a Timedelta)
nod = df.groupby(level=0).apply(lambda x: x.index.get_level_values(1).max() - x.index.get_level_values(1).min())
#Locations per day
# Uses whole days only (`.dt.days`). `lpd` is not used in the cells visible here.
lpd = nodl/nod.dt.days
#Frequency top 1
# Per user and label: summed visit duration (end - start)
time_at_loc = df.groupby(level=0).progress_apply(lambda x: x.groupby('labels').apply(lambda x: (x.end - x.start).sum()))
# Same expression as `nod` above
total_time = df.groupby(level=0).apply(lambda x: x.index.get_level_values(1).max() - x.index.get_level_values(1).min())
# Share of the user's observed period spent at each label
time_at_loc = time_at_loc/total_time
# Largest share per user; the extra index levels added by groupby/nlargest are dropped
vf_top1 = time_at_loc.groupby(level=0).nlargest(1).reset_index(level=[1,2],drop=True)
#Entropies
# NOTE(review): the name `re` is also the name of the standard-library regex module
re = random_entropy(df)
# `real_ent` is not used in the cells visible here
real_ent = real_entropy(df)
#KROG and ROG
com = df[['lat','long']].groupby(level=0).apply(lambda x: x[['lat','long']].mean())  # center of mass
# Per user and label: share of the user's records carrying that label, sorted descending
freq_rank = df.groupby(level=0).apply(lambda x: x.groupby('labels')['labels'].count().sort_values(ascending=False)/x.groupby('labels')['labels'].count().sum())
# NOTE(review): if `freq_rank` is a Series this only sets an attribute; the loops below
# name the per-user slice 'freq' themselves
freq_rank.columns = ['freq']

# Frequency-weighted radius of gyration per user, over all of the user's labels.
# The CRS does not change inside the loop, so it is checked once up front.
# NOTE(review): the check suggests `lat`/`long` are expected to hold EPSG:3857
# coordinates - confirm, since to_crs only reprojects the geometry column.
if not df.crs == 3857:
    raise ValueError('df must be in EPSG:3857 to compute the radius of gyration')
rogfull = {}
for uid,vals in df.groupby(level=0):
    freq_user = freq_rank.loc[uid]
    freq_user.name = 'freq'
    # Attach to every record the visit frequency of its label
    vals = vals.join(freq_user,on='labels',rsuffix='r_')
    vals = vals[['freq','lat','long']]
    # Use this user's own centre of mass explicitly instead of subtracting the
    # whole `com` table and relying on implicit index alignment across users
    com_user = com.loc[uid]
    lat_diff = ((vals['lat'] - com_user['lat'])**2 + (vals['long'] - com_user['long'])**2)*vals['freq']
    lat_diff = lat_diff.mean()**.5
    rogfull[uid] = lat_diff
rogfull = pd.DataFrame.from_dict(rogfull,orient='index')

def _k_radius_of_gyration(frame, freq_rank, k):
    """
    Frequency-weighted radius of gyration per user, restricted to k labels.

    For every user (index level 0 of ``frame``) the first ``k`` entries of the user's
    slice of ``freq_rank`` are kept, the user's records are right-joined to them on
    ``labels`` (so only records at those labels remain), and the square root of the
    mean of ``freq * squared distance to the mean lat/long of the kept records``
    is returned.

    Args:
        frame: trajectories with ``labels``, ``lat`` and ``long`` columns, in EPSG:3857
        freq_rank: per-user, per-label visit frequencies; assumed to be sorted by
            descending frequency within each user so that ``head(k)`` selects the
            k most frequent labels
        k: number of labels to keep per user

    Returns:
        DataFrame indexed by user with a single column holding the k-radius

    Raises:
        ValueError: if ``frame`` is not in EPSG:3857
    """
    # The CRS does not change inside the loop, so it is checked once
    if not frame.crs == 3857:
        raise ValueError('frame must be in EPSG:3857 to compute the radius of gyration')
    radii = {}
    for uid, vals in frame.groupby(level=0):
        freq_user = freq_rank.loc[uid]
        freq_user.name = 'freq'
        freq_user = freq_user.head(k)
        # how='right': keep only the records located at one of the k kept labels
        vals = vals.join(freq_user, on='labels', rsuffix='r_', how='right')
        com_user = vals[['lat', 'long']].mean()
        vals = vals[['freq', 'lat', 'long']]
        weighted_sq = ((vals['lat'] - com_user['lat'])**2 + (vals['long'] - com_user['long'])**2)*vals['freq']
        radii[uid] = weighted_sq.mean()**.5
    return pd.DataFrame.from_dict(radii, orient='index')

# Same measure for the 2, 4 and 8 most frequent labels of each user
krog2 = _k_radius_of_gyration(df, freq_rank, k=2)
krog4 = _k_radius_of_gyration(df, freq_rank, k=4)
krog8 = _k_radius_of_gyration(df, freq_rank, k=8)

In [None]:
#MOVE TO SCIKITMOBILITY
# Reproject to EPSG:4326 and wrap the REPROJECTED frame; building the TrajDataFrame
# from `df` would silently discard the reprojection done on the line above.
t_df = df.to_crs(4326)
t_df = skmob.TrajDataFrame(t_df)
t_df = t_df.reset_index()
# NOTE(review): assumes exactly these seven columns, in this order, after reset_index
t_df.columns = ['user_id','datetime','lat','lng','labels','geometry','date']

#AVGFROMHOME
# Per user: median of the non-zero distances between the user's records and the home point.
# Despite the names `mean_from_home` / AVGFROMHOME, the statistic stored is the median.
mean_from_home = {}
for uid, vals in df.groupby(level=0):
    home_loc = homes.loc[uid]
    # NOTE(review): Point takes (x, y); here `lat` is passed as x and `long` as y.
    # Also confirm these columns are in the same CRS as df's geometry before trusting
    # the distances.
    home_loc = Point(home_loc['lat'],home_loc.long)
    all_dist = vals.distance(home_loc)
    # Zero distances (records located exactly at the home point) are excluded
    all_dist = all_dist[all_dist != 0]
    mean_from_home[uid] = all_dist.median()
mfh = pd.DataFrame.from_dict(mean_from_home,orient='index')

In [None]:
# Side-by-side table of the per-user measures, aligned on the user index.
# The ten column names below are assigned positionally, so they must stay in the same
# order as the ten objects passed to concat (assumes each contributes exactly one column).
combined = pd.concat([rogfull,humobi_jl.groupby(level=0).median(),nodl,nod,vf_top1,re,krog2,krog4,krog8,mfh],axis=1)
combined.columns = ['RoG','Jump (median)', 'Num_of_loc', 'Num_of_days', 'Frequency (top1)', 'Random Entropy',
                    'RoG k2', 'RoG k4', 'RoG k8', 'Median from Home',]