# Imports

In [None]:
import os

import pandas as pd
from tqdm.autonotebook import tqdm

from srai.datasets import PortoTaxiDataset

# Parameters

In [None]:
# Column name handed to the split as `target_column` and used as the
# group-by key for the per-group duration statistics.
target_column = "trip_id"
# Forwarded as `bucket_number` to the bucket-based train/dev/test split.
bucket_number = 3

# Number of leading rows kept when the temporary subset is taken.
subset_size = 10_000

# Data Loading

In [None]:
# Load the IPython extension named `dotenv` so the `%dotenv` magic used below
# is registered in this kernel.
%load_ext dotenv

# Run the `%dotenv` magic. Presumably this reads a local .env file into the
# process environment so that the later os.getenv("HF_TOKEN") lookup finds the
# token -- TODO confirm against the python-dotenv documentation.
%dotenv

In [None]:
# Token is read from the environment; os.getenv yields None when the
# HF_TOKEN variable is not set, and that value is passed through as-is.
hf_token = os.getenv("HF_TOKEN")

# Keep the dataset object around: it is reused later for the split.
porto_taxi = PortoTaxiDataset()
gdf_porto_taxi = porto_taxi.load(hf_token=hf_token)

# [Temp] Use subset

In [None]:
# Temporary: keep only the first `subset_size` rows for the steps below.
gdf_porto_taxi = gdf_porto_taxi.head(n=subset_size)

# Data Splitting

In [None]:
# Split the loaded frame into train / dev / test parts using the dataset's
# bucket-based trajectory split, driven by the notebook parameters.
train_gdf, dev_gdf, test_gdf = porto_taxi.train_dev_test_split_bucket_trajectory(
    gdf_porto_taxi,
    bucket_number=bucket_number,
    target_column=target_column,
)

In [None]:
def calculate_trajectory_duration(df: pd.DataFrame):
    """
    Return the time span covered by a trajectory, in seconds.

    The span is the gap between the earliest and the latest value found in
    the 'timestamp' column; row order does not matter.

    Parameters:
    df (pandas.DataFrame): Rows of one trajectory, with a 'timestamp'
                           column holding datetime values.

    Returns:
    float: Seconds elapsed between the earliest and latest timestamp.
    """
    timestamps = df["timestamp"]
    # Subtracting the extremes yields a timedelta, converted to seconds below.
    elapsed = timestamps.max() - timestamps.min()
    return elapsed.total_seconds()

In [None]:
# Turn on tqdm's pandas integration; `progress_apply` below relies on it.
tqdm.pandas()

# Per-group durations for the training split, summarised with describe().
# The expression is left as the last statement so the notebook displays it.
(
    train_gdf.groupby(target_column)
    .progress_apply(calculate_trajectory_duration)
    .reset_index(name="duration")
    .describe()
)

In [None]:
# Per-group durations for the dev split, summarised with describe().
# The expression is left as the last statement so the notebook displays it.
(
    dev_gdf.groupby(target_column)
    .progress_apply(calculate_trajectory_duration)
    .reset_index(name="duration")
    .describe()
)

In [None]:
# Per-group durations for the test split, summarised with describe().
# The expression is left as the last statement so the notebook displays it.
(
    test_gdf.groupby(target_column)
    .progress_apply(calculate_trajectory_duration)
    .reset_index(name="duration")
    .describe()
)