In [None]:
# !pip install -q supervision "ultralytics<=8.3.40"

In [3]:
import cv2

import numpy as np
import pandas as pd
import supervision as sv

from tqdm import tqdm
from ultralytics import YOLO
from supervision.assets import VideoAssets, download_assets
from collections import defaultdict, deque

from process_video import process_video

import torch
from torch.nn.utils.rnn import pad_sequence
from torch import nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset, random_split
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import TQDMProgressBar, RichProgressBar
torch.manual_seed(47)

<torch._C.Generator at 0x7acd43b1d850>

### Process Video

In [1]:
# File locations shared by the video-processing and training cells.
SOURCE_VIDEO_PATH = "../media/vehicles.mp4"          # input footage
TARGET_VIDEO_PATH = "../media/vehicles-result.mp4"   # output video path
MODEL_NAME = "../models/yolov8x.pt"                  # detector weights
OUTPUT_CSV = "data/output.csv"                       # CSV consumed by the training cells

# Gather the keyword arguments first so the call itself stays short.
# NOTE(review): process_video comes from the local process_video module; what
# it writes is not visible from this notebook — confirm against that module.
process_video_kwargs = {
    "source_video_path": SOURCE_VIDEO_PATH,
    "target_video_path": TARGET_VIDEO_PATH,
    "model_name": MODEL_NAME,
    "output_csv": OUTPUT_CSV,
}
process_video(**process_video_kwargs)

In [2]:
# --- Detection / tracking configuration --------------------------------------
# NOTE(review): MODEL_RESOLUTION, CONFIDENCE_THRESHOLD, IOU_THRESHOLD and
# ViewTransformer were used below without being defined or imported anywhere in
# this notebook, so the cell raised NameError on a fresh kernel. They are now
# defined here; the values follow the supervision speed-estimation example and
# should be kept in sync with process_video.py — TODO confirm.
MODEL_RESOLUTION = 1280      # inference image size passed to the detector
CONFIDENCE_THRESHOLD = 0.3   # detections at or below this confidence are dropped
IOU_THRESHOLD = 0.5          # threshold handed to non-max suppression
VEHICLE_CLASS_ID = 2         # only this class id is kept (presumably COCO "car" — confirm)

# --- Perspective geometry ----------------------------------------------------
# SOURCE is the road polygon in image pixels, TARGET the rectangle it is mapped
# onto. NOTE(review): TARGET is presumably in metres, since the speed below is
# multiplied by 3.6 and labelled km/h — confirm.
TARGET_WIDTH = 25
TARGET_HEIGHT = 250
SOURCE = np.array([[1252, 787], [2298, 803], [5039, 2159], [-550, 2159]])
TARGET = np.array(
    [
        [0, 0],
        [TARGET_WIDTH - 1, 0],
        [TARGET_WIDTH - 1, TARGET_HEIGHT - 1],
        [0, TARGET_HEIGHT - 1],
    ]
)


class ViewTransformer:
    """Map image points into the target plane with a perspective transform."""

    def __init__(self, source: np.ndarray, target: np.ndarray) -> None:
        """Compute the homography taking the 4 ``source`` corners to ``target``."""
        # OpenCV requires float32 corner arrays.
        self.m = cv2.getPerspectiveTransform(
            source.astype(np.float32), target.astype(np.float32)
        )

    def transform_points(self, points: np.ndarray) -> np.ndarray:
        """Return ``points`` (N x 2) projected into the target plane."""
        if points.size == 0:
            # Nothing to project; cv2.perspectiveTransform rejects empty input.
            return points
        reshaped = points.reshape(-1, 1, 2).astype(np.float32)
        return cv2.perspectiveTransform(reshaped, self.m).reshape(-1, 2)


view_transformer = ViewTransformer(source=SOURCE, target=TARGET)

# Named `detector` (not `model`) so it cannot be confused with the LSTM that a
# later cell binds to `model`.
detector = YOLO(MODEL_NAME)

video_info = sv.VideoInfo.from_video_path(video_path=SOURCE_VIDEO_PATH)
frame_generator = sv.get_video_frames_generator(source_path=SOURCE_VIDEO_PATH)

# tracker initiation
byte_track = sv.ByteTrack(frame_rate=video_info.fps)

# annotators configuration
thickness = sv.calculate_optimal_line_thickness(resolution_wh=video_info.resolution_wh)
text_scale = sv.calculate_optimal_text_scale(resolution_wh=video_info.resolution_wh)
bounding_box_annotator = sv.BoundingBoxAnnotator(thickness=thickness)
label_annotator = sv.LabelAnnotator(
    text_scale=text_scale,
    text_thickness=thickness,
    text_position=sv.Position.BOTTOM_CENTER
)
trace_annotator = sv.TraceAnnotator(
    thickness=thickness,
    trace_length=video_info.fps * 2,
    position=sv.Position.BOTTOM_CENTER
)
polygon_zone = sv.PolygonZone(polygon=SOURCE)

# Per-tracker history of transformed y positions, capped at `fps` entries
# (i.e. at most the last second of observations).
coordinates = defaultdict(lambda: deque(maxlen=video_info.fps))

# One row per (frame, tracked vehicle):
# [tracker_id, history length, x1, y1, x2, y2, speed]
output_data = []

# open target video
with sv.VideoSink(TARGET_VIDEO_PATH, video_info) as sink:

    # loop over source video frame
    for frame in tqdm(frame_generator, total=video_info.total_frames):

        result = detector(frame, imgsz=MODEL_RESOLUTION, verbose=False)[0]
        detections = sv.Detections.from_ultralytics(result)

        # filter out detections by class and confidence
        detections = detections[detections.confidence > CONFIDENCE_THRESHOLD]
        detections = detections[detections.class_id == VEHICLE_CLASS_ID]

        # filter out detections outside the zone
        detections = detections[polygon_zone.trigger(detections)]

        # refine detections using non-max suppression
        detections = detections.with_nms(IOU_THRESHOLD)

        # pass detection through the tracker
        detections = byte_track.update_with_detections(detections=detections)

        points = detections.get_anchors_coordinates(
            anchor=sv.Position.BOTTOM_CENTER
        )

        # calculate the detections position inside the target RoI
        points = view_transformer.transform_points(points=points).astype(int)

        # store detections position
        for tracker_id, [_, y] in zip(detections.tracker_id, points):
            coordinates[tracker_id].append(y)

        # format labels and collect one CSV row per tracked vehicle
        labels = []

        # Pair each tracker id with its own box directly instead of re-filtering
        # `detections` by id for every row.
        for tracker_id, box in zip(detections.tracker_id, detections.xyxy):
            history = coordinates[tracker_id]
            if len(history) < video_info.fps / 2:
                # Fewer than half a second of samples: no speed estimate yet.
                labels.append(f"#{tracker_id}")
                speed = -1
            else:
                # calculate speed: displacement over the stored history,
                # scaled by 3.6 for the km/h label
                distance = abs(history[-1] - history[0])
                elapsed = len(history) / video_info.fps
                speed = distance / elapsed * 3.6
                labels.append(f"#{tracker_id} {int(speed)} km/h")
            # NOTE: the "frames" column is the history length (capped at fps),
            # not a global frame index.
            output_data.append([tracker_id, len(history), *box.tolist(), speed])

        # annotate frame
        annotated_frame = frame.copy()
        annotated_frame = trace_annotator.annotate(
            scene=annotated_frame, detections=detections
        )
        annotated_frame = bounding_box_annotator.annotate(
            scene=annotated_frame, detections=detections
        )
        annotated_frame = label_annotator.annotate(
            scene=annotated_frame, detections=detections, labels=labels
        )

        # add frame to target video
        sink.write_frame(annotated_frame)

df = pd.DataFrame(data=output_data, columns=["tracker_id", "frames", "x1", "y1", "x2", "y2", "speed"])
df.to_csv(OUTPUT_CSV, index=False)
print("Output saved to ", OUTPUT_CSV)

NameError: name 'np' is not defined

In [18]:
# --- 2. Dataset
class TrackingDataset(Dataset):
    """Sliding-window dataset of bounding-box centres with a speed target.

    The CSV must contain the columns ``frames``, ``x1``, ``y1``, ``x2``,
    ``y2``, ``speed`` and a per-vehicle id column named ``tracker_id`` (the
    name written by the video-processing cell) or ``track_id`` (legacy name).

    For every vehicle, each window of ``seq_length`` consecutive box centres
    becomes one sample; its label is the speed of the row that immediately
    follows the window.

    Args:
        csv_path: Path of the tracking CSV.
        seq_length: Number of consecutive observations per input sequence.

    Raises:
        KeyError: If the CSV has neither a ``tracker_id`` nor a ``track_id``
            column.
    """

    # Accepted names for the per-vehicle id column, in order of preference.
    _ID_COLUMNS = ("tracker_id", "track_id")

    def __init__(self, csv_path, seq_length=10):
        df = pd.read_csv(csv_path)

        id_column = next((name for name in self._ID_COLUMNS if name in df.columns), None)
        if id_column is None:
            raise KeyError(
                f"{csv_path} must contain one of the columns {self._ID_COLUMNS}; "
                f"found {list(df.columns)}"
            )

        # Drop rows without an estimated speed. `.copy()` makes the filtered
        # frame independent, so the column assignments below do not trigger
        # SettingWithCopyWarning.
        df = df[df["speed"] != -1].copy()

        # Box centre computed from the corner coordinates x1, y1, x2, y2.
        df["center_x"] = (df["x1"] + df["x2"]) / 2.0
        df["center_y"] = (df["y1"] + df["y2"]) / 2.0

        # Group by vehicle id to extract one sequence per vehicle.
        self.track_groups = df.groupby(id_column)
        self.seq_length = seq_length
        self.samples, self.labels = [], []

        for _, group in self.track_groups:
            # "frames" contains many ties, so a stable sort is required to keep
            # the CSV row order among equal values.
            # NOTE(review): assumes rows are written in temporal order — confirm.
            group = group.sort_values("frames", kind="mergesort")

            # The centres are the model input, the speed is the target.
            centers = group[["center_x", "center_y"]].to_numpy()
            speeds = group["speed"].to_numpy()

            # Sliding windows; the range is empty when the track has at most
            # `seq_length` rows, so short tracks contribute nothing.
            for start in range(len(centers) - seq_length):
                stop = start + seq_length
                self.samples.append(centers[start:stop])
                self.labels.append(speeds[stop])

    def __len__(self):
        """Return the number of (sequence, label) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(sequence, speed)`` float32 tensors.

        The sequence has shape ``[seq_length, 2]``; the label is a scalar.
        """
        return (torch.tensor(self.samples[idx], dtype=torch.float32),
                torch.tensor(self.labels[idx], dtype=torch.float32))


In [19]:
# --- 3. DataModule for PyTorch Lightning ---
class TrackingDataModule(pl.LightningDataModule):
    """Lightning data module that splits a ``TrackingDataset`` into train/val/test.

    Args:
        csv_path: Path of the tracking CSV passed to ``TrackingDataset``.
        seq_length: Window length passed to ``TrackingDataset``.
        batch_size: Batch size used by all three dataloaders.
        split_ratio: ``(train, val, test)`` fractions. Only the first two are
            used; the test split receives whatever remains.
        seed: Seed of the dedicated generator used for the random split, so the
            split does not depend on the state of the global RNG.

    NOTE(review): windows are split at random, so overlapping windows of the
    same vehicle can land in different splits; consider splitting by vehicle.
    """

    def __init__(self, csv_path, seq_length=10, batch_size=16, split_ratio=(0.7, 0.15, 0.15), seed=47):
        super().__init__()
        self.csv_path = csv_path
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.split_ratio = split_ratio
        self.seed = seed
        # Filled in by the first call to `setup`.
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def setup(self, stage=None):
        """Build the dataset and split it once.

        Lightning calls ``setup`` for every trainer entry point (fit, test,
        ...). Re-splitting on each call would draw a new permutation and let
        training samples end up in the test split, so later calls are no-ops.
        """
        if self.train_dataset is not None:
            return

        # Build the full dataset.
        dataset = TrackingDataset(self.csv_path, seq_length=self.seq_length)
        total = len(dataset)
        train_size = int(total * self.split_ratio[0])
        val_size = int(total * self.split_ratio[1])
        test_size = total - train_size - val_size  # remainder absorbs rounding

        split_generator = torch.Generator().manual_seed(self.seed)
        self.train_dataset, self.val_dataset, self.test_dataset = random_split(
            dataset,
            [train_size, val_size, test_size],
            generator=split_generator,
        )

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return the validation dataloader (no shuffling)."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return the test dataloader (no shuffling)."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [20]:
class TrackingLSTM(pl.LightningModule):
    """LSTM regressor that predicts a speed from a sequence of inputs.

    The input is a batch of sequences ``[batch, seq_length, input_size]``; the
    LSTM output at the last timestep is fed to a linear layer producing
    ``output_size`` values per sequence.
    """

    def __init__(self, input_size=2, hidden_size=50, num_layers=2, output_size=1, lr=0.001, threshold=5.0):
        """
        input_size: number of features per timestep.
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
        output_size: number of values predicted per sequence.
        lr: learning rate of the Adam optimizer.
        threshold: error margin (km/h) within which a prediction counts as correct.
        """
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        # The target is a continuous speed, so this is a regression problem:
        # use mean squared error. CrossEntropyLoss would treat the batch of
        # predictions as class logits and produce a meaningless loss.
        self.loss_fn = nn.MSELoss()
        self.lr = lr
        self.threshold = threshold

    def forward(self, x):
        """Return predictions ``[batch, output_size]`` for ``x`` ``[batch, seq_length, input_size]``."""
        lstm_out, _ = self.lstm(x)
        # Only the output of the last timestep is used.
        return self.fc(lstm_out[:, -1, :])

    def compute_accuracy(self, y_hat, y):
        """
        Return the fraction of predictions whose absolute error is below the threshold.
        """
        # Flatten both sides so a batch of one is not collapsed to a 0-d tensor.
        diff = torch.abs(y_hat.reshape(-1) - y.reshape(-1))
        return (diff < self.threshold).float().mean()

    def _shared_step(self, batch, stage, prog_bar):
        """Compute loss and accuracy for one batch and log them as ``{stage}_loss`` / ``{stage}_acc``."""
        x, y = batch
        y_hat = self(x)
        # squeeze(-1) drops only the output dimension; a bare squeeze() would
        # also drop the batch dimension when the batch holds a single sample.
        loss = self.loss_fn(y_hat.squeeze(-1), y)
        acc = self.compute_accuracy(y_hat, y)
        self.log(f"{stage}_loss", loss, prog_bar=prog_bar)
        self.log(f"{stage}_acc", acc, prog_bar=prog_bar)
        return loss

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` / ``train_acc`` and return the loss to optimize."""
        return self._shared_step(batch, "train", prog_bar=True)

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` / ``val_acc`` for one validation batch."""
        self._shared_step(batch, "val", prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` / ``test_acc`` for one test batch."""
        self._shared_step(batch, "test", prog_bar=False)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with learning rate ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [21]:
# Training hyper-parameters shared by the cells below.
seq_length, batch_size, n_epochs = 10, 16, 20
# (train, val, test) fractions handed to TrackingDataModule.
split_ratio = (0.7, 0.15, 0.15)

In [22]:
# Data module reading the tracking CSV with the hyper-parameters chosen above.
data_module = TrackingDataModule(
    csv_path=OUTPUT_CSV, seq_length=seq_length, batch_size=batch_size, split_ratio=split_ratio
)

In [23]:
# Network configuration, kept in one mapping so it is easy to tweak.
lstm_config = {
    "input_size": 2,
    "hidden_size": 50,
    "num_layers": 2,
    "output_size": 1,
    "lr": 0.001,
}
model = TrackingLSTM(**lstm_config)

# Metrics are written as CSV files under logs/tracking_lstm.
logger = CSVLogger("logs/", name="tracking_lstm")

# Keep only the single checkpoint with the highest validation accuracy.
callback_checkpoint = ModelCheckpoint(monitor='val_acc', mode='max', save_top_k=1)
callback_tqdm = TQDMProgressBar(leave=True)

In [None]:
# Assemble the trainer from the pieces defined in the previous cells, then fit.
trainer_callbacks = [callback_checkpoint, callback_tqdm]
trainer = pl.Trainer(
    logger=logger,
    callbacks=trainer_callbacks,
    max_epochs=n_epochs,
    accelerator="auto",
    devices="auto",
)
trainer.fit(model, datamodule=data_module)

INFO:pytorch_lightning.utilities.rank_zero:Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory logs/tracking_lstm/version_2/checkpoints exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | lstm    | LSTM    | 31.2 K | train
1 | fc      | Linear  | 51     | train
2 | loss_fn | MSELoss | 0      | train
--------------------------------------------
31.3 K    Trainable params
0         Non-trainable params
31.3 K    Total params
0.125     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.


In [None]:
# Evaluate the checkpoint with the best monitored `val_acc` (kept by the
# ModelCheckpoint callback) instead of the weights left by the last epoch.
trainer.test(model, datamodule=data_module, ckpt_path="best")