# Predictor Model Playground

### Import Models

In [1]:
# Preprocess import
import pandas as pd
import numpy as np

# pytorch import
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split

### Run and test the simple 2-layer lstm predictor

Before that, go to `/utils` and run `preparation.py` then `rnn_traj_token.py`, make sure your data is located at `/data/taxi`

In [2]:
# Specify the file path
file_path = "../data/token_traj.csv"  # Replace with your actual file path

# Load the CSV file
data = pd.read_csv(file_path)

# Display the first five rows
print(data.head(5))

# Get the number of rows
num_rows = len(data)

print(f"The dataset contains {num_rows} rows.")


                                processed_trajectory  time_elapsed
0  [[100, 100], [2.6204999999990264, 5.3615999999...         247.0
1  [[100, 100], [2.6082999999999856, 5.0733000000...        1031.0
2  [[100, 100], [-2.9032000000000835, 8.980099999...         203.0
3  [[100, 100], [-3.342600000000573, 8.4917999999...        3890.0
4  [[100, 100], [-2.2126999999997565, 9.253199999...         212.0
The dataset contains 290077 rows.


### Load, Split and Train
To make it easier to play the model, just sample 10000 rows in the csv file. Remember set sample = 0 if you want to use the whole dataset.

In [4]:
class TrajectoryDataset(Dataset):
    def __init__(self, data_path, sample=1, max_rows=10000):
        if sample:
            self.data = pd.read_csv(data_path, nrows=max_rows)
        else:
            self.data = pd.read_csv(data_path)
        self.x = self.data['processed_trajectory'].apply(eval).tolist()
        self.y = self.data['time_elapsed'].values.astype(np.float32)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        x = torch.tensor(self.x[idx], dtype=torch.float32)
        y = torch.tensor(self.y[idx], dtype=torch.float32)
        return x, y

In [5]:
data_path = "../data/token_traj.csv"
dataset = TrajectoryDataset(data_path, sample = 1)

# Split the dataset into train/val/test = 7/1/2
train_size = int(0.7 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False)

In [23]:
def train_model(model, train_loader, val_loader, epochs=10, lr=0.001, device="cpu", batch_size=32):
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0
        train_bar = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}")
        cnt = 0
        for x_batch, y_batch in train_bar:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            predictions = model(x_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += (loss.item() / batch_size)
            cnt += 1
            # print("Loss item: ", loss.item(), ", train_loss: ", train_loss)
            train_bar.set_postfix({"Train Loss": (train_loss / cnt) / (len(train_bar) + 1)})

        # Validation phase
        model.eval()
        val_loss = 0
        val_bar = tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{epochs}")
        cnt_val = 0
        with torch.no_grad():
            for x_batch, y_batch in val_bar:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                predictions = model(x_batch)
                loss = criterion(predictions, y_batch)
                val_loss += loss.item() / batch_size
                cnt += 1
                val_bar.set_postfix({"Val Loss": (val_loss / cnt) / (len(val_bar) + 1)})

In [24]:
def test_model(model, test_loader, device="cpu", batch_size = 32):
    model = model.to(device)
    model.eval()
    test_loss = 0
    criterion = nn.MSELoss()
    test_bar = tqdm(test_loader, desc="Testing")
    cnt = 0
    with torch.no_grad():
        for x_batch, y_batch in test_bar:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            predictions = model(x_batch)
            loss = criterion(predictions, y_batch)
            test_loss += loss.item() / batch_size
            cnt += 1
            test_bar.set_postfix({"Test Loss": (test_loss / cnt) / (len(test_bar) + 1)})
    print(f"Test Loss: {test_loss/len(test_loader):.4f}")

### Bi-LSTM

In [26]:
# Define the bi-LSTM model
class BiLSTMTimePredictor(nn.Module):
    def __init__(self, input_dim=2, hidden_dim=128, num_layers=2):
        super(BiLSTMTimePredictor, self).__init__()
        self.encoder = nn.LSTM(
            input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True
        )
        self.classifier = nn.Linear(hidden_dim * 4, 1)  # *2 because bidirectional
        
    def forward(self, x):
        # print("DEBUG: x shape: ", x.shape)
        _, (hidden, _) = self.encoder(x)  # hidden shape: (num_layers*2, batch_size, hidden_dim)
        # print("DEBUG: hidden shape before: ", hidden.shape)
        hidden = hidden.permute(1, 0, 2).reshape(x.size(0), -1)  # Flatten hidden states
        # print("DEBUG: hidden shape: ", hidden.shape)
        return self.classifier(hidden).squeeze(-1)

In [25]:
# Train the model on both CPU and GPU
model = BiLSTMTimePredictor()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, train_loader, val_loader, epochs=10, device=device)

test_model(model, test_loader, device=device)

Training Epoch 1/10: 100%|██████████| 219/219 [00:12<00:00, 17.35it/s, Train Loss=172]
Validation Epoch 1/10: 100%|██████████| 32/32 [00:01<00:00, 30.66it/s, Val Loss=140] 
Training Epoch 2/10: 100%|██████████| 219/219 [00:12<00:00, 17.78it/s, Train Loss=147]
Validation Epoch 2/10: 100%|██████████| 32/32 [00:01<00:00, 30.96it/s, Val Loss=123] 
Training Epoch 3/10: 100%|██████████| 219/219 [00:12<00:00, 17.85it/s, Train Loss=128]
Validation Epoch 3/10: 100%|██████████| 32/32 [00:01<00:00, 31.24it/s, Val Loss=108] 
Training Epoch 4/10: 100%|██████████| 219/219 [00:12<00:00, 17.67it/s, Train Loss=114]
Validation Epoch 4/10: 100%|██████████| 32/32 [00:01<00:00, 31.56it/s, Val Loss=97.6]
Training Epoch 5/10: 100%|██████████| 219/219 [00:12<00:00, 17.85it/s, Train Loss=102]
Validation Epoch 5/10: 100%|██████████| 32/32 [00:01<00:00, 31.87it/s, Val Loss=89.3]
Training Epoch 6/10: 100%|██████████| 219/219 [00:12<00:00, 17.67it/s, Train Loss=93.9]
Validation Epoch 6/10: 100%|██████████| 32/32 [

Test Loss: 16603.6074





### Transformer Encoder

In [19]:
class TransformerTimePredictor(nn.Module):
    def __init__(self, input_dim=2, hidden_dim=128, nhead=4, num_encoder_layers=4, mlp_hidden_dim=256):
        """
        Transformer-based model for time prediction.

        Args:
            input_dim (int): Number of features in each input step (e.g., 2 for [x, y]).
            hidden_dim (int): Embedding dimension for the Transformer.
            nhead (int): Number of attention heads in the Transformer.
            num_encoder_layers (int): Number of Transformer encoder layers.
            mlp_hidden_dim (int): Number of hidden units in the MLP classifier.
        """
        super(TransformerTimePredictor, self).__init__()

        # Input embedding layer
        self.embedding = nn.Linear(input_dim, hidden_dim)

        # Transformer encoder
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=nhead, dim_feedforward=hidden_dim * 4, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_encoder_layers)

        # MLP classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, mlp_hidden_dim),
            nn.ReLU(),
            nn.Linear(mlp_hidden_dim, mlp_hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(mlp_hidden_dim // 2, 1),  # Output a single value
        )

    def forward(self, x):
        # Embed input
        x = self.embedding(x)  # Shape: (batch_size, seq_length, hidden_dim)
        
        # Transformer encoding
        x = self.transformer_encoder(x)  # Shape: (batch_size, seq_length, hidden_dim)
        
        # Use only the first token's representation for classification
        x = x[:, 0, :]  # Shape: (batch_size, hidden_dim)
        
        # Classifier
        output = self.classifier(x)  # Shape: (batch_size, 1)
        # print(output)
        return output.squeeze(-1)

In [None]:
model = TransformerTimePredictor(input_dim=2, hidden_dim=256, nhead=8, num_encoder_layers=6, mlp_hidden_dim=256)

train_model(model, train_loader, val_loader, epochs=20, device=device)
test_model(model, test_loader, device=device)

Training Epoch 1/20:  23%|██▎       | 51/219 [00:43<02:24,  1.16it/s, Train Loss=146]