# Predictor Model Playground

### Import Models

In [1]:
# Preprocess import
import pandas as pd
import numpy as np

# pytorch import
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split

### Run and test the simple 2-layer lstm predictor

Before that, go to `/utils` and run `preparation.py` then `rnn_traj_token.py`, make sure your data is located at `/data/taxi`

In [2]:
# Specify the file path
file_path = "../data/token_traj.csv"  # Replace with your actual file path

# Load the CSV file
data = pd.read_csv(file_path)

# Display the first five rows
print(data.head(5))

# Get the number of rows
num_rows = len(data)

print(f"The dataset contains {num_rows} rows.")


                                processed_trajectory  time_elapsed
0  [[100, 100], [2.6204999999990264, 5.3615999999...         247.0
1  [[100, 100], [2.6082999999999856, 5.0733000000...        1031.0
2  [[100, 100], [-2.9032000000000835, 8.980099999...         203.0
3  [[100, 100], [-3.342600000000573, 8.4917999999...        3890.0
4  [[100, 100], [-2.2126999999997565, 9.253199999...         212.0
The dataset contains 290077 rows.


## Build the model

In [3]:
# Define the bi-LSTM model
class BiLSTMTimePredictor(nn.Module):
    def __init__(self, input_dim=2, hidden_dim=128, num_layers=2):
        super(BiLSTMTimePredictor, self).__init__()
        self.encoder = nn.LSTM(
            input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True
        )
        self.classifier = nn.Linear(hidden_dim * 4, 1)  # *2 because bidirectional
        
    def forward(self, x):
        # print("DEBUG: x shape: ", x.shape)
        _, (hidden, _) = self.encoder(x)  # hidden shape: (num_layers*2, batch_size, hidden_dim)
        # print("DEBUG: hidden shape before: ", hidden.shape)
        hidden = hidden.permute(1, 0, 2).reshape(x.size(0), -1)  # Flatten hidden states
        # print("DEBUG: hidden shape: ", hidden.shape)
        return self.classifier(hidden).squeeze(-1)

### Load, Split and Train
To make it easier to play the model, just sample 10000 rows in the csv file. Remember set sample = 0 if you want to use the whole dataset.

In [4]:
class TrajectoryDataset(Dataset):
    def __init__(self, data_path, sample=1, max_rows=10000):
        if sample:
            self.data = pd.read_csv(data_path, nrows=max_rows)
        else:
            self.data = pd.read_csv(data_path)
        self.x = self.data['processed_trajectory'].apply(eval).tolist()
        self.y = self.data['time_elapsed'].values.astype(np.float32)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        x = torch.tensor(self.x[idx], dtype=torch.float32)
        y = torch.tensor(self.y[idx], dtype=torch.float32)
        return x, y

In [5]:
data_path = "../data/token_traj.csv"
dataset = TrajectoryDataset(data_path, sample = 1)

# Split the dataset into train/val/test = 7/1/2
train_size = int(0.7 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False)

In [9]:
def train_model(model, train_loader, val_loader, epochs=10, lr=0.0001, device="cpu"):
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0
        train_bar = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}")
        for x_batch, y_batch in train_bar:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            predictions = model(x_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_bar.set_postfix({"Train Loss": train_loss / (len(train_bar) + 1)})

        # Validation phase
        model.eval()
        val_loss = 0
        val_bar = tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{epochs}")
        with torch.no_grad():
            for x_batch, y_batch in val_bar:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                predictions = model(x_batch)
                loss = criterion(predictions, y_batch)
                val_loss += loss.item()
                val_bar.set_postfix({"Val Loss": val_loss / (len(val_bar) + 1)})

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, "
              f"Val Loss: {val_loss/len(val_loader):.4f}")

In [7]:
def test_model(model, test_loader, device="cpu"):
    model = model.to(device)
    model.eval()
    test_loss = 0
    criterion = nn.MSELoss()
    test_bar = tqdm(test_loader, desc="Testing")
    with torch.no_grad():
        for x_batch, y_batch in test_bar:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            predictions = model(x_batch)
            loss = criterion(predictions, y_batch)
            test_loss += loss.item()
            test_bar.set_postfix({"Test Loss": test_loss / (len(test_bar) + 1)})
    print(f"Test Loss: {test_loss/len(test_loader):.4f}")

In [10]:
# Train the model on both CPU and GPU
model = BiLSTMTimePredictor()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, train_loader, val_loader, epochs=10, device=device)

test_model(model, test_loader, device=device)

Training Epoch 1/10:  31%|███       | 68/219 [03:40<08:10,  3.25s/it, Train Loss=3.92e+5]


KeyboardInterrupt: 