In [1]:
# Bus Time Prediction Model
# This notebook implements an encoder-decoder architecture to predict bus arrival times

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import random
import math
from tqdm import tqdm

In [2]:
# Set random seeds for reproducibility.
# Each library keeps its own generator, so torch, NumPy and the stdlib `random`
# module are seeded separately with the same value.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

In [3]:
# 1. Load and preprocess the data
def load_data(file_path):
    """Read the bus stop CSV at ``file_path`` into a DataFrame.

    The file is parsed with ``pd.read_csv`` using its default settings, and a
    one-line summary (row count and column names) is printed.

    Args:
        file_path: Path (or any object ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path)
    n_rows, column_names = len(frame), frame.columns.tolist()
    print(f"Loaded data with {n_rows} rows and {column_names} columns")
    return frame

In [4]:
# Colab-only cell: `google.colab` is not importable in a plain Jupyter kernel.
from google.colab import files
# Opens the browser upload dialog and blocks until a file has been chosen.
uploaded = files.upload()
# Only the first uploaded entry is used; the printed value below shows it is the file name.
file_path = next(iter(uploaded))
print(file_path)
# `df` is the notebook-wide raw dataframe that the later cells read.
df = load_data(file_path)

Saving test.csv to test.csv
test.csv
Loaded data with 87545 rows and ['id', 'route_name', 'day', 'time', 'stop_id', 'scheduled_arrival_time', 'scheduled_departure_time', 'distance_to_stop', 'time_to_stop', 'residual_stop_time'] columns


## Fields
- id [string]
- route_name [string, categorical]
- day [int, categorical]
- time [minutes, linear]
- distance_to_stop (stop distance) [meters, linear]
- time_to_stop (scheduled stop time) [seconds, linear]
- residual_stop_time [seconds, linear]

In [5]:
# 2. Group data by trip_id
def preprocess_data(df):
    """Fit the encoders and scalers used to turn raw trip rows into model inputs.

    Every transformer is fitted on the full dataframe that is passed in.

    Args:
        df: DataFrame with the columns ``id``, ``route_name``, ``day``,
            ``time``, ``distance_to_stop``, ``time_to_stop`` and
            ``residual_stop_time``.

    Returns:
        A 7-tuple ``(trip_ids, route_encoder, day_encoder, time_scaler,
        distance_scaler, scheduled_time_scaler, residual_time_scaler)`` where
        ``trip_ids`` is the array of unique ``id`` values and the remaining
        entries are fitted transformers.
    """
    def fit_on(transformer, column):
        # Fit on a single-column frame (double brackets keep it 2-D).
        transformer.fit(df[[column]])
        return transformer

    # Trips
    trip_ids = df['id'].unique()
    print(f"Found {len(trip_ids)} unique trips")

    # Categorical trip-level features: route name and day, both one-hot encoded
    route_names = df['route_name'].unique()
    print(f"Found {len(route_names)} unique routes: {route_names}")
    route_encoder = fit_on(OneHotEncoder(sparse_output=False), 'route_name')

    days = df['day'].unique()
    print(f"Found {len(days)} unique days: {days}")
    day_encoder = fit_on(OneHotEncoder(sparse_output=False), 'day')

    # Continuous per-stop features, each standardised independently
    time_scaler = fit_on(StandardScaler(), 'time')
    distance_scaler = fit_on(StandardScaler(), 'distance_to_stop')
    scheduled_time_scaler = fit_on(StandardScaler(), 'time_to_stop')
    residual_time_scaler = fit_on(StandardScaler(), 'residual_stop_time')

    return (
        trip_ids,
        route_encoder,
        day_encoder,
        time_scaler,
        distance_scaler,
        scheduled_time_scaler,
        residual_time_scaler,
    )

In [6]:
# Fit all encoders/scalers once on the loaded dataframe; the resulting names are
# notebook-level globals that later cells (dataset construction, main) read.
trip_ids, route_encoder, day_encoder, time_scaler, distance_scaler, scheduled_time_scaler, residual_time_scaler = preprocess_data(df)

Found 4187 unique trips
Found 23 unique routes: ['206' '203' '220' '202A' '202' '207' '216' '215' '214' '208' '207A' '205'
 '220X' '223' '219' '201' '215A' '225L' '225' '209A' '209' '223X' '212']
Found 7 unique days: [6 0 1 2 3 4 5]


In [7]:
# 3. Create a custom dataset for the bus trip data
class BusTripDataset(Dataset):
    """One item per trip: trip-level features plus observed/target stop sequences.

    Each trip (rows of ``df`` sharing an ``id``) is truncated to ``max_stops``
    rows and split into an observed prefix and a target suffix. The target
    sequence additionally starts with up to ``overlap`` of the last observed
    stops, so the first ``overlap`` target positions are already-seen stops.
    All sequences are zero-padded to a fixed length so that items can be
    batched by the default ``DataLoader`` collate function.
    """

    def __init__(self, df,
                 trip_ids, route_encoder, day_encoder, time_scaler,
                 distance_scaler, scheduled_time_scaler, residual_time_scaler,
                 observed_ratio_range=(0.3, 0.9), max_target=15, max_stops=50, overlap=2):
        """
        Args:
            df: DataFrame with one row per (trip, stop); must contain the
                columns ``id``, ``route_name``, ``day``, ``time``,
                ``distance_to_stop``, ``time_to_stop`` and ``residual_stop_time``.
            trip_ids: Sequence of trip ids; item ``i`` is built from ``trip_ids[i]``.
            route_encoder, day_encoder: Fitted transformers whose ``transform``
                maps a one-row frame to a 2-D array of encoded features.
            time_scaler, distance_scaler, scheduled_time_scaler,
            residual_time_scaler: Fitted transformers applied to the matching
                single-column frames.
            observed_ratio_range: Stored but not used by the current split logic.
            max_target: Maximum number of stops placed in the target part.
            max_stops: Maximum number of rows used per trip (extra rows are dropped).
            overlap: Number of trailing observed stops repeated at the start of
                the target sequence.
        """
        # Dataframe
        self.df = df
        # General Features
        self.trip_ids = trip_ids
        self.route_encoder = route_encoder
        self.day_encoder = day_encoder
        # Stop specific features
        self.time_scaler = time_scaler
        self.distance_scaler = distance_scaler
        self.scheduled_time_scaler = scheduled_time_scaler
        self.residual_time_scaler = residual_time_scaler
        # Config
        self.observed_ratio_range = observed_ratio_range
        self.max_stops = max_stops
        self.max_target = max_target
        self.overlap = overlap
        # Row positions of every trip, computed once. Filtering the whole frame
        # with `df['id'] == trip_id` on each __getitem__ is a full scan per item.
        # Positions keep the original row order within each trip.
        self._trip_rows = df.groupby('id', sort=False).indices

    def __len__(self):
        """Number of trips in the dataset."""
        return len(self.trip_ids)

    @staticmethod
    def _pad_to_tensor(values, length):
        """Right-pad a 1-D array with zeros to ``length`` and return a float32 tensor."""
        missing = length - len(values)
        if missing > 0:
            values = np.pad(values, (0, missing), 'constant', constant_values=0)
        return torch.tensor(values, dtype=torch.float32)

    def __getitem__(self, idx):
        """Build the padded tensors for trip ``self.trip_ids[idx]``.

        Returns:
            dict with ``trip_features``; ``observed_*`` sequences of length
            ``max_stops``; ``target_*`` sequences and ``target_mask`` of length
            ``max_target + overlap``; and the integers ``num_observed`` and
            ``num_target`` (``num_target`` excludes the overlap stops).

        Raises:
            KeyError: if the trip id has no rows in ``df``.
        """
        trip_id = self.trip_ids[idx]
        trip_df = self.df.iloc[self._trip_rows[trip_id]]

        # Ensure we don't exceed max_stops
        if len(trip_df) > self.max_stops:
            trip_df = trip_df.head(self.max_stops)

        # At most half of the trip (capped at max_target) is held out as target.
        n_stops = len(trip_df)
        num_target = int(min(self.max_target, n_stops // 2))
        num_observed = int(n_stops - num_target)
        # Clamp at 0: a negative start would wrap around and slice from the end
        # of the trip instead of prepending the overlap stops.
        target_start = max(0, num_observed - self.overlap)

        # Trip-level features, encoded with the transformers given to this
        # instance (not with whatever encoder happens to exist as a global).
        route_name = trip_df['route_name'].iloc[0]
        day = trip_df['day'].iloc[0]
        route_encoded = self.route_encoder.transform(
            pd.DataFrame([[route_name]], columns=["route_name"]))[0]
        day_encoded = self.day_encoder.transform(
            pd.DataFrame([[day]], columns=["day"]))[0]
        trip_features = np.concatenate([route_encoded, day_encoded])

        # Scaled per-stop features, keyed by the suffix used in the output dict.
        feature_sources = (
            ('times', self.time_scaler, 'time'),
            ('distances', self.distance_scaler, 'distance_to_stop'),
            ('scheduled_times', self.scheduled_time_scaler, 'time_to_stop'),
            ('residual_times', self.residual_time_scaler, 'residual_stop_time'),
        )
        scaled = {
            name: np.asarray(scaler.transform(trip_df[[column]])).flatten()
            for name, scaler, column in feature_sources
        }

        max_observed = self.max_stops
        max_target = self.max_target + self.overlap

        sample = {'trip_features': torch.tensor(trip_features, dtype=torch.float32)}
        for name, values in scaled.items():
            sample[f'observed_{name}'] = self._pad_to_tensor(values[:num_observed], max_observed)
        for name, values in scaled.items():
            sample[f'target_{name}'] = self._pad_to_tensor(values[target_start:], max_target)

        # Mask of real (non-padded) target positions, derived from lengths so it
        # does not depend on the dtype of the stop ids.
        target_mask = torch.zeros(max_target, dtype=torch.float32)
        target_mask[:n_stops - target_start] = 1.0

        sample['target_mask'] = target_mask
        sample['num_observed'] = num_observed
        sample['num_target'] = n_stops - num_observed
        return sample

In [8]:
# Build a dataset over all trips using the notebook-level dataframe and the
# fitted encoders/scalers; optional settings keep their defaults.
print("Creating datasets...")
dataset = BusTripDataset(df,
                 trip_ids, route_encoder, day_encoder, time_scaler,
                 distance_scaler, scheduled_time_scaler, residual_time_scaler)
print(f"trips in dataset: {len(dataset)}")

Creating datasets...
trips in dataset: 4187


In [9]:
# Sanity check: materialise the first trip and dump its tensors.
# The printed output is long because every sequence is zero-padded to a fixed length.
example_trip = dataset[0]
print(example_trip)

{'trip_features': tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), 'observed_times': tensor([1.2548, 1.2564, 1.2580, 1.2597, 1.2613, 1.2629, 1.2661, 1.2693, 1.2710,
        1.2726, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000]), 'observed_distances': tensor([-0.9600, -0.0217, -0.3413, -0.4399, -0.1216, -0.2694,  0.0071, -0.4887,
        -0.1056, -0.4989,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,

In [10]:
# 4. Define the encoder-decoder model
class BusTimeEncoder(nn.Module):
    """Encode the observed stops of a trip into an LSTM state.

    Trip-level features are projected to ``hidden_dim`` and used as the initial
    hidden state; the four per-stop features are embedded and fed through a
    single-layer, batch-first LSTM.
    """

    def __init__(self, trip_feature_dim, hidden_dim):
        """
        Args:
            trip_feature_dim: Size of the trip-level feature vector.
            hidden_dim: Width of the embeddings and of the LSTM state.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Trip features -> initial hidden state
        self.trip_fc = nn.Sequential(
            nn.Linear(trip_feature_dim, hidden_dim),
            nn.ReLU(),
        )

        # Per-stop features (time, distance, scheduled, residual) -> embedding
        self.stop_embedding = nn.Linear(4, hidden_dim)

        # Sequence model over the embedded stops
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)

    def forward(self, trip_features, observed_times, observed_distances, observed_scheduled_times, observed_residual_times):
        """Run the encoder.

        The four ``observed_*`` tensors must share one 2-D shape (presumably
        ``(batch, seq_len)``); they are joined along a new trailing axis.

        Returns:
            ``(output, (hn, cn))`` exactly as produced by ``nn.LSTM``.
        """
        per_stop = (
            observed_times,
            observed_distances,
            observed_scheduled_times,
            observed_residual_times,
        )
        # Add a feature axis at position 2 to each tensor and join them there.
        stop_features = torch.cat([feature.unsqueeze(2) for feature in per_stop], dim=2)
        stop_inputs = self.stop_embedding(stop_features)

        # Initial state: hidden comes from the trip features, cell starts at zero.
        initial_hidden = self.trip_fc(trip_features).unsqueeze(0)
        initial_state = (initial_hidden, torch.zeros_like(initial_hidden))

        sequence_out, final_state = self.lstm(stop_inputs, initial_state)
        final_hidden, final_cell = final_state
        return sequence_out, (final_hidden, final_cell)

In [11]:
class BusTimeDecoder(nn.Module):
    """Predict one value per target stop from the encoder state.

    The three known target features (time, distance, scheduled time) are
    embedded, run through a single-layer batch-first LSTM initialised with the
    encoder state, and projected to ``output_dim`` values per stop.
    """

    def __init__(self, hidden_dim, output_dim=1):
        """
        Args:
            hidden_dim: Width of the embedding and of the LSTM state; must
                match the encoder state passed to ``forward``.
            output_dim: Number of values predicted per stop.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Per-stop target features (time, distance, scheduled) -> embedding
        self.stop_embedding = nn.Linear(3, hidden_dim)

        # Sequence model over the embedded target stops
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)

        # Projection from LSTM output to the prediction
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, target_times, target_distances, target_scheduled_times, encoder_hidden):
        """Decode predictions for the target stops.

        Args:
            target_times, target_distances, target_scheduled_times: Tensors of
                one shared 2-D shape (presumably ``(batch, seq_len)``).
            encoder_hidden: ``(h, c)`` state used to initialise the LSTM.

        Returns:
            The projected LSTM output with axis 2 squeezed, i.e. one value per
            stop when ``output_dim`` is 1.
        """
        stop_features = torch.cat(
            [
                target_times.unsqueeze(2),
                target_distances.unsqueeze(2),
                target_scheduled_times.unsqueeze(2),
            ],
            dim=2,
        )
        decoded, _ = self.lstm(self.stop_embedding(stop_features), encoder_hidden)
        return self.fc_out(decoded).squeeze(2)

In [12]:
class BusTimeEncoderDecoder(nn.Module):
    """Full model: a ``BusTimeEncoder`` followed by a ``BusTimeDecoder``."""

    def __init__(self, trip_feature_dim, hidden_dim):
        """
        Args:
            trip_feature_dim: Size of the trip-level feature vector.
            hidden_dim: Shared hidden width of encoder and decoder.
        """
        super().__init__()
        self.encoder = BusTimeEncoder(trip_feature_dim, hidden_dim)
        self.decoder = BusTimeDecoder(hidden_dim)

    def forward(self, trip_features, observed_times, observed_distances, observed_scheduled_times, observed_residual_times, target_times, target_distances, target_scheduled_times):
        """Encode the observed stops, then decode predictions for the target stops.

        Only the encoder's final state is handed to the decoder; the encoder's
        per-step outputs are discarded.

        Returns:
            Whatever the decoder returns for the given target features.
        """
        encoder_result = self.encoder(
            trip_features,
            observed_times,
            observed_distances,
            observed_scheduled_times,
            observed_residual_times,
        )
        final_state = encoder_result[1]
        return self.decoder(target_times, target_distances, target_scheduled_times, final_state)

In [13]:
# 5. Training function
def train_model(model, train_loader, val_loader, epochs=50, learning_rate=0.001):
    """Train the encoder-decoder model with a masked MSE loss.

    The model is moved to CUDA when available, optimised with Adam, and
    evaluated on ``val_loader`` after every epoch. Loss curves are plotted at
    the end.

    Args:
        model: Module called with the eight input tensors of a batch.
        train_loader: Iterable of batch dicts used for optimisation; must support ``len``.
        val_loader: Iterable of batch dicts used for validation; must support ``len``.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Adam learning rate.

    Returns:
        ``(model, train_losses, val_losses)`` where the loss lists hold one
        per-epoch mean of the batch losses.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss(reduction='none')

    # Batch entries passed to the model, in the model's positional order.
    input_keys = (
        'trip_features',
        'observed_times',
        'observed_distances',
        'observed_scheduled_times',
        'observed_residual_times',
        'target_times',
        'target_distances',
        'target_scheduled_times',
    )

    def masked_batch_loss(batch):
        # Mean squared error over the non-padded target positions of one batch.
        inputs = [batch[key].to(device) for key in input_keys]
        truth = batch['target_residual_times'].to(device)
        mask = batch['target_mask'].to(device)
        per_position = criterion(model(*inputs), truth)
        return (per_position * mask).sum() / (mask.sum() + 1e-8)

    train_losses, val_losses = [], []

    for epoch in range(epochs):
        # Training pass
        model.train()
        running_train = 0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} - Training'):
            batch_loss = masked_batch_loss(batch)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            running_train += batch_loss.item()

        avg_train_loss = running_train / len(train_loader)
        train_losses.append(avg_train_loss)

        # Validation pass (no gradients)
        model.eval()
        running_val = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f'Epoch {epoch+1}/{epochs} - Validation'):
                running_val += masked_batch_loss(batch).item()

        avg_val_loss = running_val / len(val_loader)
        val_losses.append(avg_val_loss)

        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    # Loss curves
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

    return model, train_losses, val_losses

In [14]:
# 6. Function to make predictions
def predict(model, test_loader, residual_time_scaler):
    """Generate predictions for test data, in the original residual scale.

    Args:
        model: Trained module called with the eight input tensors of a batch.
        test_loader: Iterable of batch dicts as produced by ``BusTripDataset``
            through a ``DataLoader``.
        residual_time_scaler: Fitted scaler whose ``inverse_transform`` maps
            scaled residuals (as an ``(n, 1)`` array) back to original units.

    Returns:
        ``(all_predictions, all_targets, trip_info)``: two lists of 1-D arrays
        (one per trip, each of length ``num_target``) and a list of dicts with
        the trip's ``num_observed`` and ``num_target``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # The batches are sent to `device`, so the model has to live there too.
    model = model.to(device)
    model.eval()

    input_keys = (
        'trip_features',
        'observed_times',
        'observed_distances',
        'observed_scheduled_times',
        'observed_residual_times',
        'target_times',
        'target_distances',
        'target_scheduled_times',
    )

    def to_original_scale(tensor):
        # The scaler expects a column vector; restore the batch shape afterwards.
        values = tensor.cpu().numpy()
        return residual_time_scaler.inverse_transform(values.reshape(-1, 1)).reshape(values.shape)

    all_predictions = []
    all_targets = []
    trip_info = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Generating predictions'):
            inputs = [batch[key].to(device) for key in input_keys]

            num_observed = batch['num_observed']
            num_target = batch['num_target']
            # Number of real (non-padded) positions in each target sequence.
            valid_counts = batch['target_mask'].sum(dim=1)

            predicted_orig = to_original_scale(model(*inputs))
            target_orig = to_original_scale(batch['target_residual_times'])

            for i in range(len(num_observed)):
                n_obs = num_observed[i].item()
                n_target = num_target[i].item()

                # The target sequence starts with already-observed overlap
                # stops. They are the valid positions beyond `n_target`, so
                # skip them: slicing `[:n_target]` would score the overlap
                # stops and drop the last real targets.
                n_valid = int(round(valid_counts[i].item()))
                first_target = max(0, n_valid - n_target)
                last_target = first_target + n_target

                all_predictions.append(predicted_orig[i, first_target:last_target])
                all_targets.append(target_orig[i, first_target:last_target])
                trip_info.append({
                    'num_observed': n_obs,
                    'num_target': n_target
                })

    return all_predictions, all_targets, trip_info

In [15]:
# 7. Evaluate predictions
def evaluate_predictions(predictions, targets, trip_info):
    """Report error metrics and plot predictions against the actual values.

    Prints the overall MAE and RMSE, draws up to 20 per-trip comparison plots,
    and plots the mean absolute error as a function of the position within the
    target sequence.

    Args:
        predictions: List of 1-D arrays, one per trip.
        targets: List of 1-D arrays aligned with ``predictions``.
        trip_info: List of dicts with ``num_observed`` and ``num_target`` for
            each trip; used to place the points on the x-axis.
    """
    # Overall metrics over every predicted stop of every trip
    flat_pred = np.concatenate([p.flatten() for p in predictions])
    flat_target = np.concatenate([t.flatten() for t in targets])
    residual = flat_pred - flat_target

    mae = np.mean(np.abs(residual))
    rmse = np.sqrt(np.mean(residual ** 2))

    print(f'Overall MAE: {mae:.2f} seconds')
    print(f'Overall RMSE: {rmse:.2f} seconds')

    # One subplot per example trip
    num_examples = min(20, len(predictions))
    plt.figure(figsize=(15, 2*num_examples))

    for i in range(num_examples):
        info = trip_info[i]
        n_obs, n_target = info['num_observed'], info['num_target']

        plt.subplot(num_examples, 1, i+1)

        # Observed stops are drawn at zero purely as position markers.
        plt.scatter(np.arange(n_obs), np.zeros(n_obs), color='gray', label='Observed stops')

        # Target stops continue on the x-axis right after the observed ones.
        x_target = np.arange(n_obs, n_obs + n_target)
        plt.plot(x_target, predictions[i], 'o-', label='Predicted')
        plt.plot(x_target, targets[i], 'x-', label='Actual')

        plt.title(f'Trip Example {i+1}')
        plt.xlabel('Stop Index')
        plt.ylabel('Time (seconds)')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

    # Mean absolute error at each position of the target sequence
    max_targets = max(len(t) for t in targets)
    errors_by_position = []
    for pos in range(max_targets):
        pos_errors = [
            abs(predictions[i][pos] - targets[i][pos])
            for i in range(len(predictions))
            if pos < len(predictions[i])
        ]
        if pos_errors:
            errors_by_position.append(np.mean(pos_errors))

    plt.figure(figsize=(10, 6))
    plt.plot(range(len(errors_by_position)), errors_by_position, 'o-')
    plt.title('Mean Absolute Error by Stop Position')
    plt.xlabel('Stops into the Future')
    plt.ylabel('MAE (seconds)')
    plt.grid(True)
    plt.show()

In [None]:
# 8. Main execution flow
def main():
    """Run the full pipeline: build datasets, train, evaluate and save the model.

    Reads the notebook-level globals ``df``, ``trip_ids`` and the fitted
    encoders/scalers, so the earlier cells must have been executed first.
    Writes the checkpoint to ``bus_time_prediction_model.pth`` in the working
    directory.
    """
    # Create dataset
    print("Creating datasets...")
    dataset = BusTripDataset(df,
                 trip_ids, route_encoder, day_encoder, time_scaler,
                 distance_scaler, scheduled_time_scaler, residual_time_scaler)

    # Split into train, validation, and test sets (70 / 15 / remainder)
    train_size = int(0.7 * len(dataset))
    val_size = int(0.15 * len(dataset))
    test_size = len(dataset) - train_size - val_size

    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size, test_size])

    # Create data loaders
    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Calculate input dimensions from one collated batch
    sample = next(iter(train_loader))
    trip_feature_dim = sample['trip_features'].shape[1]

    # Create and train the model
    print(f"Creating model with trip_feature_dim={trip_feature_dim}...")
    hidden_dim = 120
    model = BusTimeEncoderDecoder(trip_feature_dim, hidden_dim)

    print("Training model...")
    model, train_losses, val_losses = train_model(
        model, train_loader, val_loader, epochs=40)

    # Make predictions
    print("Generating predictions...")
    predictions, targets, trip_info = predict(model, test_loader, residual_time_scaler)

    # Evaluate predictions
    print("Evaluating predictions...")
    evaluate_predictions(predictions, targets, trip_info)

    # Save the model.
    # The architecture hyperparameters are stored next to the weights: a
    # consumer that rebuilds the network with a different hidden size cannot
    # load this state dict, so it should read these values, not hard-code them.
    torch.save({
        'model_state_dict': model.state_dict(),
        'trip_feature_dim': trip_feature_dim,
        'hidden_dim': hidden_dim,
        'route_encoder': route_encoder,
        'day_encoder': day_encoder,
        'time_scaler': time_scaler,
        'distance_scaler': distance_scaler,
        'scheduled_time_scaler': scheduled_time_scaler,
        'residual_time_scaler': residual_time_scaler,
    }, 'bus_time_prediction_model.pth')

    print("Model saved to 'bus_time_prediction_model.pth'")

if __name__ == "__main__":
    main()

Creating datasets...
Creating model with trip_feature_dim=30...
Training model...
Using device: cpu


Epoch 1/50 - Training: 100%|██████████| 92/92 [00:55<00:00,  1.65it/s]
Epoch 1/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.78it/s]


Epoch 1/50, Train Loss: 1.1867, Val Loss: 1.4579


Epoch 2/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.64it/s]
Epoch 2/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.77it/s]


Epoch 2/50, Train Loss: 1.1764, Val Loss: 1.4537


Epoch 3/50 - Training: 100%|██████████| 92/92 [00:58<00:00,  1.57it/s]
Epoch 3/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.76it/s]


Epoch 3/50, Train Loss: 1.1380, Val Loss: 1.4375


Epoch 4/50 - Training: 100%|██████████| 92/92 [00:55<00:00,  1.67it/s]
Epoch 4/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.77it/s]


Epoch 4/50, Train Loss: 1.1197, Val Loss: 1.4050


Epoch 5/50 - Training: 100%|██████████| 92/92 [00:57<00:00,  1.61it/s]
Epoch 5/50 - Validation: 100%|██████████| 20/20 [00:10<00:00,  1.87it/s]


Epoch 5/50, Train Loss: 1.1329, Val Loss: 1.1489


Epoch 6/50 - Training: 100%|██████████| 92/92 [00:55<00:00,  1.64it/s]
Epoch 6/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.77it/s]


Epoch 6/50, Train Loss: 0.6602, Val Loss: 0.8377


Epoch 7/50 - Training: 100%|██████████| 92/92 [00:54<00:00,  1.67it/s]
Epoch 7/50 - Validation: 100%|██████████| 20/20 [00:12<00:00,  1.59it/s]


Epoch 7/50, Train Loss: 0.4954, Val Loss: 0.6643


Epoch 8/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.63it/s]
Epoch 8/50 - Validation: 100%|██████████| 20/20 [00:10<00:00,  1.85it/s]


Epoch 8/50, Train Loss: 0.4231, Val Loss: 0.6540


Epoch 9/50 - Training: 100%|██████████| 92/92 [00:55<00:00,  1.65it/s]
Epoch 9/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.77it/s]


Epoch 9/50, Train Loss: 0.3676, Val Loss: 0.5883


Epoch 10/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.63it/s]
Epoch 10/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Epoch 10/50, Train Loss: 0.3570, Val Loss: 0.6247


Epoch 11/50 - Training: 100%|██████████| 92/92 [00:57<00:00,  1.61it/s]
Epoch 11/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.76it/s]


Epoch 11/50, Train Loss: 0.3442, Val Loss: 0.5362


Epoch 12/50 - Training: 100%|██████████| 92/92 [00:55<00:00,  1.65it/s]
Epoch 12/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.71it/s]


Epoch 12/50, Train Loss: 0.3104, Val Loss: 0.5189


Epoch 13/50 - Training: 100%|██████████| 92/92 [00:58<00:00,  1.58it/s]
Epoch 13/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.75it/s]


Epoch 13/50, Train Loss: 0.3207, Val Loss: 0.5304


Epoch 14/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.63it/s]
Epoch 14/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.75it/s]


Epoch 14/50, Train Loss: 0.3012, Val Loss: 0.4967


Epoch 15/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.63it/s]
Epoch 15/50 - Validation: 100%|██████████| 20/20 [00:12<00:00,  1.60it/s]


Epoch 15/50, Train Loss: 0.2911, Val Loss: 0.4637


Epoch 16/50 - Training: 100%|██████████| 92/92 [00:55<00:00,  1.65it/s]
Epoch 16/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]


Epoch 16/50, Train Loss: 0.2885, Val Loss: 0.4616


Epoch 17/50 - Training: 100%|██████████| 92/92 [00:55<00:00,  1.65it/s]
Epoch 17/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.75it/s]


Epoch 17/50, Train Loss: 0.2803, Val Loss: 0.4566


Epoch 18/50 - Training: 100%|██████████| 92/92 [00:58<00:00,  1.58it/s]
Epoch 18/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.71it/s]


Epoch 18/50, Train Loss: 0.2460, Val Loss: 0.4066


Epoch 19/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.63it/s]
Epoch 19/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]


Epoch 19/50, Train Loss: 0.2265, Val Loss: 0.3926


Epoch 20/50 - Training: 100%|██████████| 92/92 [00:57<00:00,  1.61it/s]
Epoch 20/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]


Epoch 20/50, Train Loss: 0.2103, Val Loss: 0.3715


Epoch 21/50 - Training: 100%|██████████| 92/92 [00:57<00:00,  1.59it/s]
Epoch 21/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.72it/s]


Epoch 21/50, Train Loss: 0.1920, Val Loss: 0.3625


Epoch 22/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.61it/s]
Epoch 22/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.76it/s]


Epoch 22/50, Train Loss: 0.1807, Val Loss: 0.3450


Epoch 23/50 - Training: 100%|██████████| 92/92 [00:57<00:00,  1.61it/s]
Epoch 23/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Epoch 23/50, Train Loss: 0.1980, Val Loss: 0.3648


Epoch 24/50 - Training: 100%|██████████| 92/92 [00:57<00:00,  1.60it/s]
Epoch 24/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.77it/s]


Epoch 24/50, Train Loss: 0.1857, Val Loss: 0.3245


Epoch 25/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.62it/s]
Epoch 25/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.71it/s]


Epoch 25/50, Train Loss: 0.1692, Val Loss: 0.3118


Epoch 26/50 - Training: 100%|██████████| 92/92 [00:58<00:00,  1.57it/s]
Epoch 26/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.71it/s]


Epoch 26/50, Train Loss: 0.1628, Val Loss: 0.3245


Epoch 27/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.63it/s]
Epoch 27/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.75it/s]


Epoch 27/50, Train Loss: 0.1605, Val Loss: 0.2971


Epoch 28/50 - Training: 100%|██████████| 92/92 [00:57<00:00,  1.61it/s]
Epoch 28/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.68it/s]


Epoch 28/50, Train Loss: 0.1457, Val Loss: 0.2881


Epoch 29/50 - Training: 100%|██████████| 92/92 [00:57<00:00,  1.60it/s]
Epoch 29/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Epoch 29/50, Train Loss: 0.1440, Val Loss: 0.2821


Epoch 30/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.62it/s]
Epoch 30/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.75it/s]


Epoch 30/50, Train Loss: 0.1324, Val Loss: 0.2795


Epoch 31/50 - Training: 100%|██████████| 92/92 [00:58<00:00,  1.58it/s]
Epoch 31/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.72it/s]


Epoch 31/50, Train Loss: 0.1254, Val Loss: 0.2635


Epoch 32/50 - Training: 100%|██████████| 92/92 [00:58<00:00,  1.58it/s]
Epoch 32/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.72it/s]


Epoch 32/50, Train Loss: 0.1207, Val Loss: 0.2679


Epoch 33/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.62it/s]
Epoch 33/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]


Epoch 33/50, Train Loss: 0.1268, Val Loss: 0.2610


Epoch 34/50 - Training: 100%|██████████| 92/92 [00:58<00:00,  1.59it/s]
Epoch 34/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.76it/s]


Epoch 34/50, Train Loss: 0.1265, Val Loss: 0.2658


Epoch 35/50 - Training: 100%|██████████| 92/92 [00:57<00:00,  1.61it/s]
Epoch 35/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.75it/s]


Epoch 35/50, Train Loss: 0.1206, Val Loss: 0.2656


Epoch 36/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.61it/s]
Epoch 36/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Epoch 36/50, Train Loss: 0.1183, Val Loss: 0.3947


Epoch 37/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.62it/s]
Epoch 37/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]


Epoch 37/50, Train Loss: 0.1595, Val Loss: 0.3663


Epoch 38/50 - Training: 100%|██████████| 92/92 [00:55<00:00,  1.65it/s]
Epoch 38/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.71it/s]


Epoch 38/50, Train Loss: 0.1340, Val Loss: 0.2854


Epoch 39/50 - Training: 100%|██████████| 92/92 [00:57<00:00,  1.60it/s]
Epoch 39/50 - Validation: 100%|██████████| 20/20 [00:10<00:00,  1.84it/s]


Epoch 39/50, Train Loss: 0.1203, Val Loss: 0.2816


Epoch 40/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.62it/s]
Epoch 40/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.76it/s]


Epoch 40/50, Train Loss: 0.1130, Val Loss: 0.4554


Epoch 41/50 - Training: 100%|██████████| 92/92 [00:55<00:00,  1.66it/s]
Epoch 41/50 - Validation: 100%|██████████| 20/20 [00:12<00:00,  1.59it/s]


Epoch 41/50, Train Loss: 0.1370, Val Loss: 0.3646


Epoch 42/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.63it/s]
Epoch 42/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Epoch 42/50, Train Loss: 0.1325, Val Loss: 0.3341


Epoch 43/50 - Training: 100%|██████████| 92/92 [00:55<00:00,  1.66it/s]
Epoch 43/50 - Validation: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]


Epoch 43/50, Train Loss: 0.1116, Val Loss: 0.3366


Epoch 44/50 - Training: 100%|██████████| 92/92 [00:56<00:00,  1.63it/s]
Epoch 44/50 - Validation:  75%|███████▌  | 15/20 [00:08<00:02,  1.88it/s]

In [1]:
# 9. Function for inference on new data
def predict_bus_times(model_path, new_trip_data, observed_stops):
    """
    Predict residual stop times for the remaining stops of a single trip.

    Rows of ``new_trip_data`` with a non-null ``residual_stop_time`` are used
    as the observed context; rows where it is NaN are the stops to predict.

    Args:
        model_path: Path to the saved checkpoint. The checkpoint must hold
            ``model_state_dict`` plus the fitted ``route_encoder``,
            ``day_encoder``, ``time_scaler``, ``distance_scaler``,
            ``scheduled_time_scaler`` and ``residual_time_scaler``.
        new_trip_data: DataFrame with the stops of one trip. Required columns:
            ``route_name``, ``day``, ``time``, ``residual_stop_time``, a
            distance column (``stop_distance`` or ``distance_to_stop``) and a
            scheduled-time column (``scheduled_stop_time`` or ``time_to_stop``).
        observed_stops: Currently unused; the observed/remaining split is
            derived from ``residual_stop_time``. Kept so existing callers
            keep working.

    Returns:
        DataFrame with all stops sorted by the distance column. Predicted rows
        carry the rescaled model output in ``predicted_time``; observed rows
        have NaN there. Returns None (after printing a message) when there
        are no observed stops or no remaining stops.

    Raises:
        KeyError: If a required column is missing under every accepted name.
    """
    def _pick_column(frame, candidates):
        # Return the first candidate that is a column of `frame`.
        for name in candidates:
            if name in frame.columns:
                return name
        raise KeyError(f"None of the columns {list(candidates)} found in trip data")

    def _to_batch(values):
        # float32 tensor with a leading batch dimension of size 1.
        # NOTE(review): BusTimesInference builds its inputs as (1, n) via
        # transpose(0, 1), whereas unsqueeze(0) on an (n, 1) array yields
        # (1, n, 1) -- confirm which layout BusTimeEncoderDecoder expects.
        return torch.tensor(values, dtype=torch.float32).unsqueeze(0)

    # Load the model and preprocessing objects.
    # weights_only=False is required because the checkpoint also stores
    # pickled encoder/scaler objects; this unpickles arbitrary Python objects,
    # so only load checkpoints from a trusted source.
    # map_location="cpu": the model is built and evaluated on CPU below, so
    # a checkpoint saved from a GPU session must still load without CUDA.
    checkpoint = torch.load(model_path, map_location="cpu", weights_only=False)

    # Get the preprocessing objects
    route_encoder = checkpoint['route_encoder']
    day_encoder = checkpoint['day_encoder']
    time_scaler = checkpoint['time_scaler']
    distance_scaler = checkpoint['distance_scaler']
    scheduled_time_scaler = checkpoint['scheduled_time_scaler']
    residual_time_scaler = checkpoint['residual_time_scaler']

    # Create model
    trip_feature_dim = len(route_encoder.categories_[0]) + len(day_encoder.categories_[0])  # routes + days
    # NOTE(review): BusTimesInference uses HIDDEN_DIM = 120; this value
    # presumably has to match the size the checkpoint was trained with.
    hidden_dim = 100
    model = BusTimeEncoderDecoder(trip_feature_dim, hidden_dim)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # The notebook's CSV names these columns distance_to_stop / time_to_stop,
    # older data used stop_distance / scheduled_stop_time; accept either.
    distance_col = _pick_column(new_trip_data, ('stop_distance', 'distance_to_stop'))
    scheduled_col = _pick_column(new_trip_data, ('scheduled_stop_time', 'time_to_stop'))

    # Extract trip details
    route_name = new_trip_data['route_name'].iloc[0]
    day = new_trip_data['day'].iloc[0]

    # Split into observed / remaining stops. The remaining part is copied
    # because a new column is written to it below; assigning into a filtered
    # slice triggers pandas' SettingWithCopyWarning.
    is_remaining = new_trip_data['residual_stop_time'].isna()
    observed_df = new_trip_data[~is_remaining]
    remaining_df = new_trip_data[is_remaining].copy()

    if observed_df.empty or remaining_df.empty:
        print("Error: No observed stops or no remaining stops to predict")
        return None

    # Trip-level features: one-hot route followed by one-hot day
    route_name_df = pd.DataFrame([[route_name]], columns=['route_name'])
    route_encoded = route_encoder.transform(route_name_df)[0]
    day_df = pd.DataFrame([[day]], columns=['day'])
    day_encoded = day_encoder.transform(day_df)[0]
    trip_features = _to_batch(np.concatenate([route_encoded, day_encoded]))

    # Observed stops
    observed_times = _to_batch(time_scaler.transform(observed_df[['time']]))
    observed_distances = _to_batch(distance_scaler.transform(observed_df[[distance_col]]))
    observed_scheduled_times = _to_batch(scheduled_time_scaler.transform(observed_df[[scheduled_col]]))
    observed_residual_times = _to_batch(residual_time_scaler.transform(observed_df[['residual_stop_time']]))

    # Target stops (for which we want predictions)
    target_times = _to_batch(time_scaler.transform(remaining_df[['time']]))
    target_distances = _to_batch(distance_scaler.transform(remaining_df[[distance_col]]))
    target_scheduled_times = _to_batch(scheduled_time_scaler.transform(remaining_df[[scheduled_col]]))

    # Generate predictions
    with torch.no_grad():
        predicted_time_residuals = model(trip_features,
                                         observed_times, observed_distances, observed_scheduled_times, observed_residual_times,
                                         target_times, target_distances, target_scheduled_times)

    # Convert predictions back to original scale
    predicted_time_residuals_np = predicted_time_residuals.numpy().reshape(-1, 1)
    predicted_time_residuals_orig = residual_time_scaler.inverse_transform(predicted_time_residuals_np).flatten()

    # Add predictions to the (copied) remaining-stops frame
    remaining_df['predicted_time'] = predicted_time_residuals_orig

    # Combine all stops for display
    all_stops_df = pd.concat([
        observed_df,
        remaining_df
    ]).sort_values(distance_col)

    return all_stops_df

In [None]:
class BusTimesInference():
    """
    Inference wrapper around a trained ``BusTimeEncoderDecoder`` checkpoint.

    The constructor loads the model weights together with the fitted
    encoders and scalers stored in the same checkpoint. ``predict_trip``
    then runs the model for one trip and hands the rescaled predictions back
    to the trip object.
    """

    # Hidden size passed to BusTimeEncoderDecoder; presumably has to match
    # the size the checkpoint was trained with.
    HIDDEN_DIM = 120
    # Number of trailing observed stops that are prepended to the target
    # stops before prediction; their predictions are discarded afterwards.
    OVERLAP = 3

    def __init__(self, model_filename):
        """
        Load the checkpoint ``get_root() / "models" / model_filename``.

        Args:
            model_filename: File name of the checkpoint inside the
                ``models`` directory.
        """
        model_path = get_root() / "models" / model_filename
        # weights_only=False: the checkpoint also stores pickled
        # encoder/scaler objects, so only load trusted files.
        # map_location="cpu": the model is built and evaluated on CPU here,
        # so a checkpoint saved from a GPU session must load without CUDA.
        checkpoint = torch.load(model_path, map_location="cpu", weights_only=False)

        # Preprocessing objects fitted at training time
        self.route_encoder = checkpoint['route_encoder']
        self.day_encoder = checkpoint['day_encoder']
        self.time_scaler = checkpoint['time_scaler']
        self.distance_scaler = checkpoint['distance_scaler']
        self.scheduled_time_scaler = checkpoint['scheduled_time_scaler']
        self.residual_time_scaler = checkpoint['residual_time_scaler']

        # Trip feature vector = one-hot routes + one-hot days
        self.trip_feature_dim = (len(self.route_encoder.categories_[0])
                                 + len(self.day_encoder.categories_[0]))
        self.model = BusTimeEncoderDecoder(self.trip_feature_dim, self.HIDDEN_DIM)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

    @staticmethod
    def _scaled_row(scaler, stops_df, column):
        """Scale one column of ``stops_df`` and return it as a float32 tensor
        transposed so the stops run along the second dimension."""
        scaled = scaler.transform(stops_df[[column]])
        return torch.tensor(scaled, dtype=torch.float32).transpose(0, 1)

    def get_trip_features(self, new_trip_data):
        """
        Build the trip-level feature tensor from the first row of the frame.

        Args:
            new_trip_data: DataFrame with ``route_name`` and ``day`` columns.

        Returns:
            float32 tensor holding the one-hot route followed by the one-hot
            day, with a leading batch dimension of size 1.
        """
        route_name = new_trip_data['route_name'].iloc[0]
        day = new_trip_data['day'].iloc[0]

        # The encoders are fed single-row frames named like the columns above
        route_name_df = pd.DataFrame([[route_name]], columns=['route_name'])
        route_encoded = self.route_encoder.transform(route_name_df)[0]

        day_df = pd.DataFrame([[day]], columns=['day'])
        day_encoded = self.day_encoder.transform(day_df)[0]

        trip_features = np.concatenate([route_encoded, day_encoded])
        return torch.tensor(trip_features, dtype=torch.float32).unsqueeze(0)

    def get_observed_data(self, observed_df):
        """
        Scale the features of the observed stops.

        Args:
            observed_df: DataFrame with ``time``, ``distance_to_stop``,
                ``time_to_stop`` and ``residual_stop_time`` columns.

        Returns:
            Tuple ``(times, distances, scheduled_times, residual_times)`` of
            float32 tensors.
        """
        return (
            self._scaled_row(self.time_scaler, observed_df, 'time'),
            self._scaled_row(self.distance_scaler, observed_df, 'distance_to_stop'),
            self._scaled_row(self.scheduled_time_scaler, observed_df, 'time_to_stop'),
            self._scaled_row(self.residual_time_scaler, observed_df, 'residual_stop_time'),
        )

    def get_target_data(self, remaining_df):
        """
        Scale the features of the stops to predict.

        Args:
            remaining_df: DataFrame with ``time``, ``distance_to_stop`` and
                ``time_to_stop`` columns.

        Returns:
            Tuple ``(times, distances, scheduled_times)`` of float32 tensors.
        """
        return (
            self._scaled_row(self.time_scaler, remaining_df, 'time'),
            self._scaled_row(self.distance_scaler, remaining_df, 'distance_to_stop'),
            self._scaled_row(self.scheduled_time_scaler, remaining_df, 'time_to_stop'),
        )

    def split_dataframes(self, new_trip_data):
        """
        Split a trip frame into observed and target rows.

        Rows with a non-null ``residual_stop_time`` are observed, rows where
        it is NaN are targets.

        Returns:
            ``(observed_df, remaining_df)``, or ``[None, None]`` when either
            part is empty (list kept for backward compatibility).
        """
        is_remaining = new_trip_data['residual_stop_time'].isna()
        observed_df = new_trip_data[~is_remaining]
        remaining_df = new_trip_data[is_remaining]
        if observed_df.empty or remaining_df.empty:
            return [None, None]
        return observed_df, remaining_df

    def add_overlap(self, observed_df, remaining_df):
        """
        Prepend the last ``OVERLAP`` observed stops to the target stops.

        If fewer than ``OVERLAP`` stops were observed, only those are
        prepended. The returned frame has a fresh 0..n-1 index.
        """
        overlap_rows = observed_df.tail(self.OVERLAP)
        remaining_df = pd.concat([overlap_rows, remaining_df], ignore_index=True)
        return remaining_df

    def remove_overlap(self, predicted_time_residuals, overlap=None):
        """
        Drop the predictions that belong to the overlapped stops.

        Args:
            predicted_time_residuals: Sequence of predictions whose leading
                entries correspond to the overlap rows.
            overlap: Number of leading entries to drop. Defaults to
                ``OVERLAP``; pass the real count when fewer rows were
                prepended.
        """
        if overlap is None:
            overlap = self.OVERLAP
        return predicted_time_residuals[overlap:]

    def rescale_predictions(self, predicted_time_residuals):
        """Convert a tensor of scaled predictions back to the original
        residual-time scale and return it as a flat NumPy array."""
        predicted_time_residuals_np = predicted_time_residuals.numpy().reshape(-1, 1)
        predicted_time_residuals_orig = self.residual_time_scaler.inverse_transform(predicted_time_residuals_np).flatten()
        return predicted_time_residuals_orig

    def predict_trip(self, trip):
        """
        Predict the residual times of a trip's target stops.

        Args:
            trip: Object exposing ``inference_eligible()``, ``observed_df``,
                ``target_df`` and ``add_predictions(predictions)``.

        Returns:
            True once the predictions have been passed to
            ``trip.add_predictions``.

        Raises:
            ValueError: If the trip is not eligible for inference or its
                observed/target frames are missing or the observed frame is
                empty.
        """
        if not trip.inference_eligible():
            raise ValueError("Trip did not pass enough stops (3)")
        observed_df, remaining_df = trip.observed_df, trip.target_df
        if observed_df is None or remaining_df is None:
            raise ValueError("Failed in resolving the dataframes")
        if observed_df.empty:
            # Fail fast with a clear message instead of deep inside a scaler
            raise ValueError("Failed in resolving the dataframes")

        # tail() prepends at most len(observed_df) rows, so remember how many
        # were really added; dropping a fixed OVERLAP would otherwise throw
        # away genuine target predictions.
        overlap_count = min(self.OVERLAP, len(observed_df))
        remaining_df = self.add_overlap(observed_df, remaining_df)

        # Route and day are read from the first observed row
        trip_features = self.get_trip_features(observed_df)
        observed_times, observed_distances, observed_scheduled_times, observed_residual_times = self.get_observed_data(observed_df)
        target_times, target_distances, target_scheduled_times = self.get_target_data(remaining_df)
        with torch.no_grad():
            predicted_time_residuals = self.model(trip_features,
                                                  observed_times, observed_distances, observed_scheduled_times, observed_residual_times,
                                                  target_times, target_distances, target_scheduled_times)
        predicted_time_residuals = self.rescale_predictions(predicted_time_residuals)
        predicted_time_residuals = self.remove_overlap(predicted_time_residuals, overlap_count)
        trip.add_predictions(predicted_time_residuals)
        return True


In [None]:
# Example of how to use the inference function:
# new_trip_data = pd.read_csv('new_trip.csv')
# observed_stops = ['8380B2407401', '8380B2407501', '8380B2407601']
# predictions = predict_bus_times('bus_time_prediction_model.pth', new_trip_data, observed_stops)
# print(predictions)