In [None]:
import os
import pandas as pd
import numpy as np
from geopy.distance import great_circle
import matplotlib.pyplot as plt

## 1. Load Pseudo PFLOW data from CSV files

In [None]:
# Folder containing the Pseudo PFLOW activity CSV files.
# Replace with the actual folder path containing your CSV files.
folder_path = '/mnt/large/data/PseudoPFLOW/ver2.0/activity/13/'

# Read every CSV file in the folder; the per-file frames are concatenated below.
pflow_data_list = []

# os.listdir() returns entries in arbitrary order, so sort the names to make the
# row order of the combined DataFrame reproducible from run to run.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        # Because `names` is passed, pandas treats every row (including the first)
        # as data. 'activity_longtitude' (sic) is the column name the later cells
        # select by, so the spelling must stay as it is.
        df = pd.read_csv(file_path, names=['pid', 'age', 'gender', 'occupation', 'activity_type', 'activity_starttime', 'activity_duration', 'activity_longtitude', 'activity_latitude', 'address_code'])
        pflow_data_list.append(df)

# Fail with a message that names the folder instead of letting pd.concat raise
# its generic "No objects to concatenate" ValueError on an empty list.
if not pflow_data_list:
    raise ValueError('No CSV files found in {!r}'.format(folder_path))

# Concatenate all per-file frames into a single DataFrame with a fresh 0..n-1 index.
pflow_data = pd.concat(pflow_data_list, ignore_index=True)

In [None]:
def calculate_radius_of_gyration(group):
    """Compute the radius of gyration, in meters, for one individual's activities.

    The centroid is the arithmetic mean of the 'activity_latitude' and
    'activity_longtitude' columns. The result is the root-mean-square of the
    great-circle distances from every activity point to that centroid.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows containing 'activity_latitude' and 'activity_longtitude' columns.

    Returns
    -------
    numpy.float64
        Root-mean-square distance to the centroid, in meters.
    """
    points = group[['activity_latitude', 'activity_longtitude']].to_numpy()
    center = points.mean(axis=0)  # column-wise mean -> (mean latitude, mean longitude)

    squared_distances = []
    for point in points:
        distance_m = great_circle(point, center).meters
        squared_distances.append(distance_m**2)

    return np.sqrt(np.mean(squared_distances))

def count_visited_locations(group):
    """Count the distinct (latitude, longitude) pairs among one individual's activities.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows containing 'activity_latitude' and 'activity_longtitude' columns.

    Returns
    -------
    int
        Number of rows left after dropping duplicated coordinate pairs.
    """
    location_columns = ['activity_latitude', 'activity_longtitude']
    coordinates = group[location_columns]
    distinct_coordinates = coordinates.drop_duplicates()
    return distinct_coordinates.shape[0]

def calculate_stay_time_distribution(group):
    """Return the 'activity_duration' values recorded for one individual.

    The column is handed back unchanged via ``Series.values``: no sorting,
    filtering or unit conversion is applied.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows containing an 'activity_duration' column.
    """
    durations = group['activity_duration']
    return durations.values

def calculate_jump_lengths(group):
    """Return the great-circle distances, in meters, between consecutive activities.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for one individual containing 'activity_starttime',
        'activity_latitude' and 'activity_longtitude' columns.

    Returns
    -------
    list of float
        One distance per pair of consecutive activity points after ordering by
        'activity_starttime'. Empty when the group has fewer than two rows.
    """
    # Use a stable sort: pandas' default quicksort gives no ordering guarantee for
    # rows that share a start time, which would make the jump sequence (and thus
    # the distances) depend on an arbitrary tie-break. With mergesort, tied rows
    # keep their input order.
    ordered = group.sort_values(by='activity_starttime', kind='mergesort')
    coords = ordered[['activity_latitude', 'activity_longtitude']].to_numpy()

    # Pair every point with its successor; both slices are empty for 0 or 1 rows.
    return [
        great_circle(origin, destination).meters
        for origin, destination in zip(coords[:-1], coords[1:])
    ]

# Per-individual metrics, keyed by output column name; insertion order fixes both
# the order in which the functions are called and the column order of the result.
metric_functions = {
    'radius_of_gyration': calculate_radius_of_gyration,
    'num_locations': count_visited_locations,
    'stay_times': calculate_stay_time_distribution,
    'jump_lengths': calculate_jump_lengths,
}

# Column-oriented accumulator: one list per output column, 'pid' first
results = {'pid': []}
for metric_name in metric_functions:
    results[metric_name] = []

# Walk the data one individual (pid) at a time
for pid, group in pflow_data.groupby('pid'):
    # Evaluate every metric before appending anything, so the lists never get
    # out of step with each other if one of the metric functions raises.
    metrics = {name: compute(group) for name, compute in metric_functions.items()}

    results['pid'].append(pid)
    for name, value in metrics.items():
        results[name].append(value)

# One row per individual; 'stay_times' and 'jump_lengths' cells hold sequences
results_df = pd.DataFrame(results)

def _draw_histogram(ax, values, title, xlabel):
    """Draw a 30-bin, black-edged frequency histogram of `values` on the axes `ax`."""
    ax.hist(values, bins=30, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')


# One figure with four vertically stacked panels, one per mobility metric
fig, axes = plt.subplots(4, 1, figsize=(12, 10))

# Panel 1: radius of gyration, one value per individual
_draw_histogram(
    axes[0],
    results_df['radius_of_gyration'],
    'Distribution of Radius of Gyration',
    'Radius of Gyration (meters)',
)

# Panel 2: number of distinct locations, one value per individual
_draw_histogram(
    axes[1],
    results_df['num_locations'],
    'Distribution of Number of Locations Visited',
    'Number of Locations',
)

# Panel 3: stay times, pooled over all individuals into one flat array
stay_times_all = np.concatenate(results_df['stay_times'].values)
_draw_histogram(
    axes[2],
    stay_times_all,
    'Distribution of Stay Times',
    'Stay Time (minutes)',
)

# Panel 4: jump lengths, pooled over all individuals into one flat array
jump_lengths_all = np.concatenate(results_df['jump_lengths'].values)
_draw_histogram(
    axes[3],
    jump_lengths_all,
    'Distribution of Jump Lengths',
    'Jump Length (meters)',
)

fig.tight_layout()
plt.show()