In [1]:
!pip install transformers
!pip install torch
!pip install osmnx

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [5]:
import pandas as pd
import numpy as np
import osmnx as ox
import matplotlib.pyplot as plt
import math
from pathlib import Path


In [6]:
eved_folder = "./eVED"

def load_csv_files(folder_path, chunk_size=100000):
    folder_path = Path(folder_path)
    csv_files = list(folder_path.glob('*.csv'))
    print(f"Found {len(csv_files)} CSV files")

    for file in csv_files:
        print(f"  - {file.name} ({file.stat().st_size / (1024*1024):.2f} MB)")

    dataframes = []
    for file in csv_files:
        try:
            file_size_mb = file.stat().st_size / (1024*1024)

            if file_size_mb > 500:
                print(f"Loading {file.name} in chunks...")

                chunks = pd.read_csv(file, chunksize=chunk_size, low_memory=False)
                df_chunks = []
                for chunk in chunks:

                    df_chunks.append(chunk)
                df = pd.concat(df_chunks, ignore_index=True)
            else:
                df = pd.read_csv(file, low_memory=False)

            df['source_file'] = file.name
            dataframes.append(df)
            print(f"Successfully loaded {file.name}")

        except Exception as e:
            print(f"Error loading {file}: {e}")

    if not dataframes:
        print("No files were successfully loaded")
        return pd.DataFrame()

    return pd.concat(dataframes, ignore_index=True)

ev_df = load_csv_files(eved_folder)
print(f"Total rows in combined dataset: {len(ev_df)}")
print(f"Columns: {ev_df.columns.tolist()}")
ev_df.head()

Found 21 CSV files
  - eVED_181003_week.csv (86.24 MB)
  - eVED_180822_week.csv (32.00 MB)
  - eVED_180321_week.csv (100.22 MB)
  - eVED_181010_week.csv (78.33 MB)
  - eVED_180725_week.csv (33.00 MB)
  - eVED_180905_week.csv (80.00 MB)
  - eVED_181024_week.csv (108.75 MB)
  - eVED_181017_week.csv (95.58 MB)
  - eVED_180926_week.csv (94.47 MB)
  - eVED_180801_week.csv (32.00 MB)
  - eVED_180328_week.csv (91.37 MB)
  - eVED_180912_week.csv (49.00 MB)
  - eVED_180829_week.csv (84.26 MB)
  - eVED_181107_week.csv (29.06 MB)
  - eVED_180919_week.csv (92.77 MB)
  - eVED_180314_week.csv (105.02 MB)
  - eVED_180411_week.csv (108.36 MB)
  - eVED_180425_week.csv (111.57 MB)
  - eVED_180404_week.csv (114.65 MB)
  - eVED_180815_week.csv (40.00 MB)
  - eVED_181031_week.csv (91.56 MB)
Successfully loaded eVED_181003_week.csv
Successfully loaded eVED_180822_week.csv
Successfully loaded eVED_180321_week.csv
Successfully loaded eVED_181010_week.csv
Successfully loaded eVED_180725_week.csv
Successfully l

Unnamed: 0,DayNum,VehId,Trip,Timestamp(ms),Latitude[deg],Longitude[deg],Vehicle Speed[km/h],MAF[g/sec],Engine RPM[RPM],Absolute Load[%],...,Matchted Latitude[deg],Matched Longitude[deg],Match Type,Class of Speed Limit,Speed Limit[km/h],Speed Limit with Direction[km/h],Intersection,Bus Stops,Focus Points,source_file
0,337.031765,135.0,2253.0,0.0,42.272892,-83.74937,35.0,3.67,1062.0,14.117647,...,42.272891,-83.749365,0.0,0.0,40,40.0,,,crossing,eVED_181003_week.csv
1,337.031765,135.0,2253.0,500.0,42.272892,-83.74937,35.0,9.46,1320.0,14.117647,...,42.272891,-83.749365,1.0,0.0,40,40.0,,,crossing,eVED_181003_week.csv
2,337.031765,135.0,2253.0,1100.0,42.272892,-83.74937,34.0,9.46,1320.0,14.117647,...,42.272891,-83.749365,1.0,0.0,40,40.0,,,crossing,eVED_181003_week.csv
3,337.031765,135.0,2253.0,2600.0,42.272892,-83.74937,33.0,11.48,1500.0,14.117647,...,42.272891,-83.749365,1.0,0.0,40,40.0,,,crossing,eVED_181003_week.csv
4,337.031765,135.0,2253.0,3100.0,42.272633,-83.74945,33.0,11.48,1500.0,14.117647,...,42.272633,-83.74945,0.0,-1.0,40-48,40.0,,,,eVED_181003_week.csv


In [7]:
grouped = ev_df.groupby(['VehId', 'Trip'])

# Extract complete routes as sequences
route_sequences = []
routes_total = 0;

for (vehid, trip), group in grouped:
    # Sort by timestamp to ensure correct sequence
    group = group.sort_values('Timestamp(ms)')

    # Extract coordinates as sequence
    route = list(zip(group['Latitude[deg]'], group['Longitude[deg]']))

    # Store as (source, destination, full_route)
    source = route[0]
    destination = route[-1]
    route_sequences.append({
        'source': source,
        'destination': destination,
        'full_route': route,
        'vehicle': vehid,
        'trip': trip,
    })

    routes_total += 1


print(routes_total)

10486


In [8]:
def create_features(route_data):
    features = []
    for data in route_data:
        src_lat, src_lng = data['source']
        dst_lat, dst_lng = data['destination']

        feature = f"[SRC] {src_lat:.6f} {src_lng:.6f} [DST] {dst_lat:.6f} {dst_lng:.6f}"

        features.append(feature)

    return features


In [9]:
LAT_MIN, LAT_MAX = 33.0, 38.0
LNG_MIN, LNG_MAX = -123.0, -117.0

def normalize(lat, lng):
    norm_lat = (lat - LAT_MIN) / (LAT_MAX - LAT_MIN)
    norm_lng = (lng - LNG_MIN) / (LNG_MAX - LNG_MIN)
    return norm_lat, norm_lng

In [10]:
from torch.utils.data import Dataset, DataLoader

class RouteDataset(Dataset):
  def __init__(self, route_data, tokenizer, max_points=100):
    self.route_data = route_data
    self.tokenizer = tokenizer
    self.max_points = max_points
    self.features = create_features(route_data)
    self.route_targets = self.tokenize_routes(route_data)

  def __len__(self):
    return len(self.route_data)

  def __getitem__(self, idx):
    item_ft = self.features[idx]
    item_tokenized = self.tokenizer(
        item_ft, padding="max_length", truncation=True, max_length=128,
        return_tensors="pt"
    )

    target = self.route_targets[idx]
    item = {
        'input_ids': item_tokenized['input_ids'].squeeze(),
        'attention_mask': item_tokenized['attention_mask'].squeeze(),
        'target_routes': torch.tensor(target, dtype=torch.float32)
    }

    return item

  def tokenize_routes(self, route_data):
    tokenized_routes = []
    for data in route_data:
        route = data['full_route']
        if len(route) > self.max_points:
            indices = np.linspace(0, len(route)-1, self.max_points).astype(int)
            route = [route[i] for i in indices]
        else:
            route = route + [(0, 0)] * (self.max_points - len(route))

        flat_route = []
        for lat, lng in route:
          norm_lat, norm_lng = normalize(lat, lng)
          flat_route.append(norm_lat)
          flat_route.append(norm_lng)

        tokenized_routes.append(flat_route)
    return tokenized_routes


In [11]:
def get_dataloader(route_data, tokenizer, batch_size=8, shuffle=True, max_points=100):
  dataset = RouteDataset(route_data, tokenizer, max_points=max_points)
  loader = DataLoader(
      dataset, batch_size=batch_size,
      shuffle=shuffle,
      num_workers=0,
      pin_memory=True
  )

  return loader

In [12]:
from transformers import BertModel, BertConfig, BertTokenizer
import torch.nn as nn
import torch

class RoutePredictor(nn.Module):
    def __init__(self):
        super(RoutePredictor, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        hidden_size = self.bert.config.hidden_size

        # Adjust the output dimension to match your target shape (100 points × 2 coordinates)
        self.route_head = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Linear(512, 200)  # 100 points with lat/lng for each point
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0]  # Use [CLS] token representation
        route_prediction = self.route_head(cls_output)
        return route_prediction.view(route_prediction.size(0), -1, 2)  # Reshape to (batch, points, 2)


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [14]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(route_sequences, test_size=0.2, random_state=42)

def train_model(model, train_data, val_data, tokenizer, epochs=10):
    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_loader = get_dataloader(train_data, tokenizer)
    val_loader = get_dataloader(val_data, tokenizer)

    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    loss_fn = nn.MSELoss()

    for epoch in range(epochs):
      model.train()
      train_loss = 0

      for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target_routes = batch['target_routes'].to(device)
        target_routes = target_routes.view(target_routes.size(0), 100, 2)

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, target_routes)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

      model.eval()
      val_loss = 0

      with torch.no_grad():
          for batch in val_loader:
              input_ids = batch['input_ids'].to(device)
              attention_mask = batch['attention_mask'].to(device)
              target_routes = batch['target_routes'].to(device)
              target_routes = target_routes.view(target_routes.size(0), 100, 2)

              outputs = model(input_ids, attention_mask)
              loss = loss_fn(outputs, target_routes)
              val_loss += loss.item()

      print(f"Epoch {epoch+1}/{epochs}")
      print(f"Train Loss: {train_loss/len(train_loader):.4f}")
      print(f"Val Loss: {val_loss/len(val_loader):.4f}")

In [15]:
model = RoutePredictor()
model.to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_model(model, train_data, val_data, tokenizer, epochs=50)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Epoch 1/50
Train Loss: 2.5191
Val Loss: nan
Epoch 2/50
Train Loss: 0.0090
Val Loss: nan
Epoch 3/50
Train Loss: 0.0090
Val Loss: nan
Epoch 4/50
Train Loss: 0.0090
Val Loss: nan
Epoch 5/50
Train Loss: 0.0090
Val Loss: nan


KeyboardInterrupt: 

In [16]:
def predict_route(model, tokenizer, source, destination):
    model.eval()

    feature = f"[SRC] {source[0]:.6f} {source[1]:.6f} [DST] {destination[0]:.6f} {destination[1]:.6f}"
    inputs = tokenizer(feature, return_tensors="pt", padding="max_length", truncation=True, max_length=128)

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        predicted_coords = model(input_ids=input_ids, attention_mask=attention_mask)

    predicted_route = [(coord[0].item(), coord[1].item()) for coord in predicted_coords[0]]
    return predicted_route


In [17]:
def visualize_control_and_predicted_route(df, veh_id, trip_id, model, tokenizer):
    route_df = df[(df['VehId'] == veh_id) & (df['Trip'] == trip_id)]
    route_df = route_df.sort_values('Timestamp(ms)')

    lats = route_df['Latitude[deg]'].tolist()
    lngs = route_df['Longitude[deg]'].tolist()

    if len(lats) == 0:
        print(f"No data found for Vehicle {veh_id}, Trip {trip_id}")
        return

    source = (lats[0], lngs[0])
    destination = (lats[-1], lngs[-1])

    predicted_route = predict_route(model, tokenizer, source, destination)
    pred_lats = [coord[0] for coord in predicted_route]
    pred_lngs = [coord[1] for coord in predicted_route]

    all_lats = lats + pred_lats
    all_lngs = lngs + pred_lngs
    north, south = max(all_lats) + 0.01, min(all_lats) - 0.01
    east, west = max(all_lngs) + 0.01, min(all_lngs) - 0.01
    bbox = (west, south, east, north)

    G = ox.graph.graph_from_bbox(
        bbox=bbox,
        network_type='drive_service',
        simplify=True
    )

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

    ox.plot_graph(G, ax=ax1, show=False, close=False, node_size=0, edge_linewidth=0.5)
    ax1.plot(lngs, lats, 'r-', linewidth=2, label='Actual Route')
    ax1.plot(lngs[0], lats[0], 'go', markersize=10, label='Start')
    ax1.plot(lngs[-1], lats[-1], 'bo', markersize=10, label='End')
    ax1.set_title(f'Actual Route for Vehicle {veh_id}, Trip {trip_id}')
    ax1.legend()

    ox.plot_graph(G, ax=ax2, show=False, close=False, node_size=0, edge_linewidth=0.5)
    ax2.plot(pred_lngs, pred_lats, 'r-', linewidth=2, label='Predicted Route')
    ax2.plot(lngs[0], lats[0], 'go', markersize=10, label='Start')
    ax2.plot(lngs[-1], lats[-1], 'bo', markersize=10, label='End')
    ax2.set_title(f'Model Prediction for Vehicle {veh_id}, Trip {trip_id}')
    ax2.legend()

    plt.tight_layout()
    plt.show()


In [19]:
visualize_control_and_predicted_route(ev_df, veh_id=135, trip_id=2253, model=model, tokenizer=tokenizer)

  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


KeyboardInterrupt: 