# Imports

In [None]:
import os
from datetime import datetime

import geopandas as gpd
import movingpandas as mpd
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from shapely.geometry import Point
from sklearn.metrics import mean_absolute_error, mean_squared_error
from srai.datasets import PortoTaxiDataset
from srai.embedders import Hex2VecEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import HEX2VEC_FILTER
from srai.neighbourhoods import H3Neighbourhood
from srai.regionalizers import H3Regionalizer
from srai.regionalizers import geocode_to_region_gdf
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [None]:
# Select "pyogrio" as the GeoPandas I/O engine for the rest of the notebook.
gpd.options.io_engine = "pyogrio"

# Enable loading Environment Variables

In [None]:
# Load the dotenv IPython extension, then run its %dotenv magic
# (presumably reads variables from a local .env file — confirm against the
# python-dotenv documentation).
%load_ext dotenv

%dotenv

In [None]:
# Token passed to PortoTaxiDataset.load; None when HF_TOKEN is not set.
hf_token = os.environ.get("HF_TOKEN")

# Data Loading

In [None]:
# Number of leading rows kept when the reduced dataset is used.
subset_size = 50_000
# Switch between the reduced dataset (True) and the full one (False).
use_subset = True

# Feather cache locations; the subset file name embeds the row count so
# different subset sizes do not overwrite each other.
gdf_porto_taxi_full_path = os.path.join("data", "porto_taxi.feather")
gdf_porto_taxi_subset_path = os.path.join(
    "data", "porto_taxi_subset_" + str(subset_size) + ".feather"
)

In [None]:
# Load the Porto taxi GeoDataFrame, using feather files as a local cache so
# that PortoTaxiDataset is only queried when no cached copy exists.
#
# - use_subset is False: read or create the full cache.
# - use_subset is True: read the subset cache if present; otherwise obtain the
#   full data (from cache or PortoTaxiDataset), keep the first `subset_size`
#   rows and cache that subset.

# Writing a feather file fails if its parent directory is missing, so make
# sure the cache directories exist before any write can happen.
for cache_path in (gdf_porto_taxi_full_path, gdf_porto_taxi_subset_path):
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

if use_subset and os.path.exists(gdf_porto_taxi_subset_path):
    gdf_porto_taxi = gpd.read_feather(gdf_porto_taxi_subset_path)
else:
    if os.path.exists(gdf_porto_taxi_full_path):
        gdf_porto_taxi = gpd.read_feather(gdf_porto_taxi_full_path)
    else:
        porto_taxi_dataset = PortoTaxiDataset()
        gdf_porto_taxi = porto_taxi_dataset.load(hf_token=hf_token)
        gdf_porto_taxi.to_feather(gdf_porto_taxi_full_path)

    if use_subset:
        # The subset is simply the first `subset_size` rows of the full data.
        gdf_porto_taxi = gdf_porto_taxi.head(subset_size)
        gdf_porto_taxi.to_feather(gdf_porto_taxi_subset_path)

In [None]:
# Drop, in place, the trip attributes listed below; the remaining columns are
# the ones the following cells read (trip_id, timestamp, geometry).
gdf_porto_taxi.drop(
    columns=[
        "taxi_id",
        "call_type",
        "origin_call",
        "origin_stand",
        "day_type",
        "travel_time_seconds",
    ],
    inplace=True,
)

# Convert LineString to Point

In [None]:
# Expand every trip LineString into one record per vertex. Each record copies
# the trip's attributes, replaces the geometry with the vertex as a Point and
# sets the timestamp to the trip start plus 15 per vertex position.
exploded_rows = []

for _, trip in tqdm(gdf_porto_taxi.iterrows(), total=len(gdf_porto_taxi)):
    trip_fields = trip.to_dict()
    for step, xy in enumerate(trip.geometry.coords):
        exploded_rows.append(
            {
                **trip_fields,
                "geometry": Point(xy),
                "timestamp": trip.timestamp + 15 * step,
            }
        )

In [None]:
# Build a GeoDataFrame from the per-point records, tagged with CRS EPSG:4326.
gdf_porto_taxi_points = gpd.GeoDataFrame(exploded_rows, crs="EPSG:4326")

In [None]:
# Convert the numeric timestamps (treated as seconds since the Unix epoch)
# into datetimes. pd.to_datetime yields timezone-naive UTC values in one
# vectorized call, so the result does not depend on the local timezone of the
# machine running the notebook and is unaffected by DST transitions, which
# matters because later cells take differences between these timestamps.
gdf_porto_taxi_points["timestamp"] = pd.to_datetime(
    gdf_porto_taxi_points["timestamp"], unit="s"
)

### Restricting to Porto Area

In [None]:
# Geocode the place name into a region GeoDataFrame that serves as the study-area boundary.
porto_area = geocode_to_region_gdf("Porto District, Portugal")

In [None]:
# Spatially join the points with the Porto region using sjoin's defaults
# (presumably an inner "intersects" join, so only points inside the region
# remain — confirm against the installed GeoPandas version).
gdf_porto_taxi_points_inside_porto = gdf_porto_taxi_points.sjoin(porto_area)

In [None]:
# Anti-join: left-merge all points with the points found inside Porto and keep
# the rows flagged "left_only" by the merge indicator, i.e. those with no
# match among the inside points.
gdf_merged = gdf_porto_taxi_points.merge(
    gdf_porto_taxi_points_inside_porto, how="left", indicator=True
)
df_porto_taxi_points_outside_porto = gdf_merged.loc[
    gdf_merged["_merge"] == "left_only"
]

In [None]:
# IDs of every trip that has at least one point outside the Porto area.
trajectories_outside_porto = list(
    df_porto_taxi_points_outside_porto["trip_id"].unique()
)

In [None]:
# Discard whole trips (not only the offending points) that leave the Porto area.
gdf_porto_taxi_points = gdf_porto_taxi_points[
    ~gdf_porto_taxi_points["trip_id"].isin(trajectories_outside_porto)
]

# Trajectory Collection

In [None]:
# Group the points into trajectories keyed by "trip_id", with "timestamp" as the time column.
trajectory_collection = mpd.TrajectoryCollection(
    data=gdf_porto_taxi_points, traj_id_col="trip_id", t="timestamp"
)

### Speed calculation

In [None]:
# Add speed values in km/h to every trajectory, overwriting any existing ones.
# NOTE(review): n_threads=24 is hard-coded — confirm it suits the machine running this.
trajectory_collection.add_speed(units=("km", "h"), n_threads=24, overwrite=True)

### Outlier removal

In [None]:
# Clean the trajectories with a 120 km/h speed ceiling (presumably points that
# imply a higher speed are removed — confirm against the movingpandas docs).
trajectory_collection = mpd.OutlierCleaner(trajectory_collection).clean(
    v_max=120, units=("km", "h")
)

In [None]:
# Keep only trajectories whose size() is at least 10 after cleaning; the
# result is a plain list of trajectories.
filtered_trajectory_collection = [
    trajectory
    for trajectory in trajectory_collection.trajectories
    if trajectory.size() >= 10
]

In [None]:
# Wrap the surviving trajectories in a TrajectoryCollection again.
trajectory_collection = mpd.TrajectoryCollection(filtered_trajectory_collection)

### Generalization

In [None]:
# Simplify each trajectory with the Douglas-Peucker generalizer.
# NOTE(review): the tolerance is presumably expressed in CRS units (degrees
# for EPSG:4326) — confirm. Generalization can reduce trajectories below the
# 10-point minimum enforced earlier.
trajectory_collection = mpd.DouglasPeuckerGeneralizer(trajectory_collection).generalize(
    tolerance=0.0001
)

# Conversion to Point GeoDataFrame

In [None]:
# Flatten the collection to one row per point, ordered by trip and then by time.
gdf_trajectory_point_collection = trajectory_collection.to_point_gdf().sort_values(
    by=["trip_id", "timestamp"]
)

In [None]:
# scaler = MinMaxScaler()
# gdf_trajectory_point_collection[["speed", "distance"]] = scaler.fit_transform(
#     gdf_trajectory_point_collection[["speed", "distance"]]
# )

# Add Spatial Embedding

In [None]:
# Derive H3 regions at resolution 9 from the trajectory points.
regionalizer = H3Regionalizer(resolution=9)
gdf_regions = regionalizer.transform(gdf_trajectory_point_collection)

In [None]:
# Load OSM features for the regions, restricted to the HEX2VEC_FILTER tag filter.
loader = OSMPbfLoader()
gdf_features = loader.load(gdf_regions, HEX2VEC_FILTER)

In [None]:
# Join regions with features (presumably pairing each region with the features
# it intersects — confirm against the srai IntersectionJoiner docs).
joiner = IntersectionJoiner()
gdf_joint = joiner.transform(gdf_regions, gdf_features)

In [None]:
# H3 neighbourhood built over the regions; passed to the embedder's fit_transform.
neighbourhood = H3Neighbourhood(gdf_regions)

In [None]:
# Prefer the GPU when available; the same string is passed as the trainer
# accelerator below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Layer sizes for the Hex2Vec encoder. The final size (10) is presumably the
# embedding dimension: later cells rename 10 columns and feed 10 features to
# the LSTM — confirm.
embedder_hidden_sizes = [150, 100, 50, 10]
embedder = Hex2VecEmbedder(embedder_hidden_sizes)

# Train the embedder for 15 epochs and produce one embedding row per region.
df_embeddings = embedder.fit_transform(
    gdf_regions,
    gdf_features,
    gdf_joint,
    neighbourhood,
    trainer_kwargs={"max_epochs": 15, "accelerator": device},
    batch_size=64,
)

In [None]:
# Give the integer-labelled embedding columns 0..9 descriptive names
# ("embedding_0" ... "embedding_9"), modifying the frame in place.
df_embeddings.rename(
    columns={dim: f"embedding_{dim}" for dim in range(10)},
    inplace=True,
)

In [None]:
# Left spatial join of trajectory points with the H3 regions; how="left" keeps
# points even when no region matches.
gdf_joined = gpd.sjoin(gdf_trajectory_point_collection, gdf_regions, how="left")

In [None]:
# Expose the matched region's index as "region_id" so the embeddings can be
# merged on it. The keyword must be `columns`; the previous misspelling
# raised a TypeError. If the join already produced a "region_id" column
# (depends on the GeoPandas version — confirm), this rename is a no-op.
gdf_joined.rename(columns={"index_right": "region_id"}, inplace=True)
# Turn the current index into a regular column and restore a default
# integer index.
gdf_joined.reset_index(inplace=True)

In [None]:
# Left-merge the region embeddings onto the points via "region_id"; points
# whose region has no embedding row get NaN embedding values.
gdf_points_embeddings = gdf_joined.merge(df_embeddings, on="region_id", how="left")

# Preparing the Dataset

In [None]:
class TrajectoryDataset(Dataset):
    """Sliding-window dataset of point embeddings for travel-time regression.

    Every trajectory (the rows sharing a ``trip_id``) is cut into all
    contiguous windows of ``seq_length`` rows. A sample is the window's
    ``(seq_length, 10)`` embedding matrix; its label is the time elapsed, in
    seconds, between the first and the last row of that window.

    Args:
        df: Table with ``trip_id``, ``timestamp`` and ``embedding_0`` ...
            ``embedding_9`` columns. Rows are consumed in their existing
            order, so they should already be time-sorted within each trip.
        seq_length: Number of consecutive points per sample (at least 1).

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    # Feature columns fed to the model, in column order.
    FEATURE_COLUMNS = [f"embedding_{i}" for i in range(10)]

    def __init__(self, df, seq_length):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        self.sequences = []
        self.labels = []
        self.seq_length = seq_length
        self._create_sequences(df)

    def _create_sequences(self, df):
        """Fill ``self.sequences`` and ``self.labels`` with every window of ``df``."""
        # A single groupby pass replaces one full-frame boolean scan per trip;
        # sort=False keeps trips in order of first appearance.
        for _, trajectory in df.groupby("trip_id", sort=False):
            features = trajectory[self.FEATURE_COLUMNS].to_numpy()
            timestamps = trajectory["timestamp"]
            # n points contain n - seq_length + 1 full windows. The label comes
            # from inside the window, so the final window is a valid sample too
            # and a trajectory of exactly seq_length points yields one sample.
            n_windows = len(trajectory) - self.seq_length + 1
            for start in range(n_windows):
                stop = start + self.seq_length
                elapsed = timestamps.iloc[stop - 1] - timestamps.iloc[start]
                self.sequences.append(features[start:stop])
                self.labels.append(elapsed.total_seconds())

    def __len__(self):
        """Return the number of windows."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(features, travel_time)`` as float32 tensors for window ``idx``."""
        return torch.tensor(self.sequences[idx], dtype=torch.float32), torch.tensor(
            self.labels[idx], dtype=torch.float32
        )

In [None]:
# Window length (number of consecutive points per sample) and the dataset
# built from the points-with-embeddings table.
seq_length = 10
dataset = TrajectoryDataset(gdf_points_embeddings, seq_length)

In [None]:
# Random 80/20 train/test split over individual windows.
# NOTE(review): windows are generated with stride 1, so neighbouring windows
# of the same trip share rows; a random per-window split can place overlapping
# windows on both sides. Consider splitting by trip_id — confirm intent.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size]
)

In [None]:
# Batches of 64; only the training loader shuffles.
# NOTE(review): num_workers=24 is hard-coded — confirm it suits the machine running this.
train_loader = DataLoader(train_dataset, batch_size=64, num_workers=24, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, num_workers=24, shuffle=False)

In [None]:
class TravelTimeLSTM(nn.Module):
    """Stacked LSTM whose last time-step output is mapped by a linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch-first input ``x`` to a ``(batch, output_size)`` tensor."""
        # Zero initial hidden and cell states, created on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_state = (
            torch.zeros(state_shape, device=x.device),
            torch.zeros(state_shape, device=x.device),
        )
        hidden_seq, _ = self.lstm(x, initial_state)
        # Only the representation at the final time step feeds the head.
        last_step = hidden_seq[:, -1, :]
        return self.fc(last_step)


# Model hyper-parameters: 10 input features per step (one per embedding
# column) and a single regression output (the travel time).
input_size = 10
hidden_size = 64
num_layers = 2
output_size = 1

# Instantiate the network and move it to the selected device.
model = TravelTimeLSTM(input_size, hidden_size, num_layers, output_size)
model = model.to(device)

In [None]:
import torch.optim as optim

# Rebinds `device` (previously a plain string) to a torch.device object.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mean-squared-error loss, optimised with Adam at a learning rate of 1e-4.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Number of passes over the training loader.
num_epochs = 10

In [None]:
# Train for `num_epochs` passes over the training loader. The value printed
# after each epoch is the loss of that epoch's final batch.
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        # Add a trailing dimension so the targets line up with the model output.
        batch_targets = batch_targets.to(device).unsqueeze(1)

        # Clear stale gradients, then forward, backward and update.
        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

In [None]:
# Evaluate on the test loader without tracking gradients and report the
# average of the per-batch losses.
model.eval()
test_loss = 0
with torch.no_grad():
    for batch_inputs, batch_targets in test_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device).unsqueeze(1)
        test_loss += criterion(model(batch_inputs), batch_targets).item()

test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.4f}")

In [None]:
# Collect model outputs and ground-truth labels for the whole test set as
# NumPy values on the CPU.
# NOTE: this cell does not call model.eval() itself; it relies on the
# evaluation cell having been run first.
predictions = []
actuals = []

with torch.no_grad():
    for sequences, labels in test_loader:
        sequences, labels = sequences.to(device), labels.to(device)
        outputs = model(sequences)
        # Extending with a 2-D batch appends one length-1 array per sample,
        # while `actuals` receives scalars.
        predictions.extend(outputs.cpu().numpy())
        actuals.extend(labels.cpu().numpy())

In [None]:
import matplotlib.pyplot as plt

# Overlay actual and predicted travel times, plotted against their position
# in the test set.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(actuals, color="blue", label="Actual Travel Time")
ax.plot(predictions, color="red", label="Predicted Travel Time")
ax.set_title("Travel Time Prediction")
ax.set_xlabel("Time")
ax.set_ylabel("Travel Time")
ax.legend()
plt.show()

In [None]:
# Mean of the true travel times, printed as a reference for the error metrics below.
mean_time_travelled = np.mean(actuals)
print(f"Mean average travelled time: {mean_time_travelled}")

# Mean absolute error between labels and predictions.
mae = mean_absolute_error(actuals, predictions)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Mean squared error between labels and predictions.
mse = mean_squared_error(actuals, predictions)
print(f"Mean Squared Error (MSE): {mse:.4f}")

# Root of the MSE, which has the same units as the labels.
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

In [None]:
# Summary statistics (min, max, mean, median) of the true travel times.
np.min(actuals), np.max(actuals), np.mean(actuals), np.median(actuals)

In [None]:
# Summary statistics (min, max, mean, median) of the predicted travel times.
np.min(predictions), np.max(predictions), np.mean(predictions), np.median(predictions)