# Example notebook: load and filter raw data stored in S3

In [None]:
import sys
import os

In [None]:
# Put the parent of the current working directory at the front of the import
# path (presumably where the `core` package imported below lives).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import core.daphmeIO as loader
import core.filters as filters

We load a sample of Gravy trajectory data for the Philadelphia area

In [None]:
# Two candidate partition files. Previously both were assigned to `part_path`
# back to back, so the first assignment was dead; they now have separate names
# and the active one is chosen explicitly.
local_part_path = '../data/sample3/date=2024-01-07/aad4a23f7a90441aa0f55f06e5e4313d-0.parquet'
s3_part_path = "s3://phl-pings/gravy_clean/date=2019-11-01/part-00007-a7eb387d-1b0c-4aa7-b6a1-47023f1940bd.c000.snappy.parquet"

# The notebook reads the S3 partition; point this at `local_part_path` instead
# to use the local sample file.
part_path = s3_part_path

# Maps trajectory field names (keys) to the column names used in this dataset
# (values); the values are used as DataFrame column labels further down.
# NOTE(review): latitude is mapped to column "x" and longitude to "y", while
# "x" conventionally denotes longitude -- confirm against the file's schema.
traj_cols = {
    "user_id": "identifier",
    "latitude": "x",
    "longitude": "y",
    "datetime": "local_timestamp",
    "timestamp": "timestamp",
}

### Get a sample of users

In [None]:
# Sample a fraction (frac_users=0.2) of the users found in the partition file.
# The user-id column is taken from `traj_cols` rather than repeated as a
# literal, so it cannot drift from the mapping used by the other cells.
u_sample = loader.sample_users(
    part_path,
    format='parquet',
    frac_users=0.2,
    user_id=traj_cols['user_id'],
)

### Load data for users in u_sample for 3 days

In [None]:
# Daily partition directories for 2019-11-01 through 2019-11-04.
# NOTE(review): `filepath` is built here but never passed to the loader; the
# call below still reads only the single partition file `part_path`. Confirm
# whether `sample_from_file` accepts a list of directories before switching.
filepath = ['s3://phl-pings/gravy_clean/date=2019-11-01/',
            's3://phl-pings/gravy_clean/date=2019-11-02/',
            's3://phl-pings/gravy_clean/date=2019-11-03/',
            's3://phl-pings/gravy_clean/date=2019-11-04/']

# Load the rows belonging to the sampled users. The user-id column is looked
# up in `traj_cols` instead of being hard-coded a second time.
data = loader.sample_from_file(
    part_path,
    users=u_sample,
    format='parquet',
    traj_cols=traj_cols,
    user_id=traj_cols['user_id'],
)

In [None]:
# Derive an integer epoch-seconds column from the datetime column.
# The cast uses an explicit 'int64': plain `int` can resolve to a 32-bit
# integer on some platforms, and pandas refuses to cast datetimes to int32.
# The target column name comes from `traj_cols` so it matches the column the
# later cells read.
# NOTE(review): assumes the datetime column has dtype datetime64[ns], so the
# cast yields nanoseconds since the epoch -- confirm, and confirm whether
# "local_timestamp" values are timezone-aware.
data[traj_cols['timestamp']] = data[traj_cols['datetime']].astype('int64') // 10**9

### Project coordinates to Web Mercator

In [None]:
# Project the coordinate columns (to Web Mercator, per the section heading).
# Column names are read from `traj_cols` instead of being repeated as literals,
# keeping this call in step with the mapping defined earlier.
data = filters.to_projection(
    data,
    latitude=traj_cols['latitude'],
    longitude=traj_cols['longitude'],
)

### Compute the q-statistic for the users in this sample

In [None]:
# Compute the q statistic from the user-id and timestamp columns named in
# `traj_cols`; the result is indexed by 'q_stat' when plotted below.
q_stats = filters.q_stats(
    data,
    user_id=traj_cols['user_id'],
    timestamp=traj_cols['timestamp'],
)

In [None]:
# Histogram of the q statistic, drawn through the explicit Figure/Axes
# interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(4, 3))
ax.hist(q_stats['q_stat'], bins=20, edgecolor='black')

# Axis labels and a light dashed horizontal grid for readability.
ax.set_xlabel('Q Statistic')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()