In [None]:
pip install torch torchvision opencv-python numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
pip install torch torchvision torchaudio





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
pip install pandas





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import numpy as np
import pandas as pd
from PIL import Image

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define Diffusion Parameters
def get_alpha_schedule(T, start=1e-4, end=0.02):
    beta_t = torch.linspace(start, end, T).to(device)
    alpha_t = 1 - beta_t
    alpha_bar_t = torch.cumprod(alpha_t, dim=0)
    return beta_t, alpha_t, alpha_bar_t

T = 1000  # Number of diffusion steps
beta_t, alpha_t, alpha_bar_t = get_alpha_schedule(T)

# Load Video Frames and Ground Truth for MOT
class MOTDataset(Dataset):
    def __init__(self, image_folder, gt_file, transform=None):
        self.image_folder = image_folder
        self.transform = transform
        self.gt_data = pd.read_csv(gt_file, header=None, names=["frame", "id", "x", "y", "w", "h", "conf", "class", "visibility"])
        self.frames = sorted([f for f in os.listdir(image_folder) if f.endswith(('.jpg', '.png'))])
    
    def __len__(self):
        return len(self.frames)
    
    def __getitem__(self, idx):
        frame_name = self.frames[idx]
        frame_number = int(os.path.splitext(frame_name)[0].split('-')[0])  
        img_path = os.path.join(self.image_folder, frame_name)
        image = Image.open(img_path).convert("RGB")
        
        
        bboxes = self.gt_data[self.gt_data["frame"] == frame_number][["x", "y", "w", "h"]].values
        
        
        bbox_tensor = torch.zeros((7, 4), dtype=torch.float32)
        num_bboxes = min(len(bboxes), 7)
        bbox_tensor[:num_bboxes] = torch.tensor(bboxes[:num_bboxes], dtype=torch.float32)
        
        if self.transform:
            image = self.transform(image)
        
        return image, bbox_tensor


transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])


class Pro2DiffModel(nn.Module):
    def __init__(self):
        super(Pro2DiffModel, self).__init__()
        self.backbone = models.resnet50(pretrained=True)
        self.backbone.fc = nn.Linear(2048, 28)  
    
    def forward(self, x):
        x = self.backbone(x)
        x = x.view(x.size(0), 7, 4) 
        return x

model = Pro2DiffModel().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


@torch.no_grad()
def reverse_diffusion(model, noisy_images, noisy_bboxes, steps=T):
    model.eval()
    
    for t in range(steps - 1, -1, -1):
        alpha_bar_t_selected = alpha_bar_t[t].view(-1, 1, 1, 1)
        alpha_bar_t_bbox = alpha_bar_t[t].view(-1, 1)
        predicted_noise = model(noisy_images)
        mean_image = (noisy_images - torch.sqrt(1 - alpha_bar_t_selected) * predicted_noise) / torch.sqrt(alpha_bar_t_selected)
        
        if t > 0:
            noise = torch.randn_like(noisy_images)
            sigma_t = torch.sqrt(beta_t[t])
            noisy_images = mean_image + sigma_t * noise
        else:
            noisy_images = mean_image

        mean_bbox = (noisy_bboxes - torch.sqrt(1 - alpha_bar_t_bbox) * torch.randn_like(noisy_bboxes) * 0.1) / torch.sqrt(alpha_bar_t_bbox)
        
        if t > 0:
            noisy_bboxes = mean_bbox + sigma_t.view(-1, 1) * torch.randn_like(noisy_bboxes) * 0.1
        else:
            noisy_bboxes = mean_bbox

    return noisy_images, noisy_bboxes


def train(model, image_folder, gt_file, epochs=10, save_path="pro2diff_mot_model.pth"):
    dataset = MOTDataset(image_folder, gt_file, transform)
    dataloader = DataLoader(dataset, batch_size=7, shuffle=True) 
    
    if os.path.exists(save_path):
        print("Loading existing model...")
        model.load_state_dict(torch.load(save_path))
    
    model.train()
    for epoch in range(epochs):
        loop = tqdm(dataloader, leave=True)
        for images, bbox_gt in loop:
            images, bbox_gt = images.to(device), bbox_gt.to(device)
            
            optimizer.zero_grad()
            bbox_pred = model(images)
            loss = criterion(bbox_pred, bbox_gt)
            loss.backward()
            optimizer.step()
            
            loop.set_description(f"Epoch [{epoch+1}/{epochs}]")
            loop.set_postfix(loss=loss.item())
    
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

train(model, "train1/train1/dancetrack0001/img1/", "train1/train1/dancetrack0001/gt/gt.txt", epochs=10)
train(model, "train1/train1/dancetrack0002/img1/", "train1/train1/dancetrack0002/gt/gt.txt", epochs=10)


noisy_images = torch.randn(1, 3, 224, 224).to(device)
noisy_bboxes = torch.randn(1, 7, 4).to(device)
denoised_image, denoised_bbox = reverse_diffusion(model, noisy_images, noisy_bboxes, steps=1000)


Epoch [1/10]: 100%|██████████| 101/101 [17:15<00:00, 10.25s/it, loss=6.11e+4]
Epoch [2/10]: 100%|██████████| 101/101 [15:58<00:00,  9.49s/it, loss=1.36e+4]
Epoch [3/10]: 100%|██████████| 101/101 [14:22<00:00,  8.54s/it, loss=9.61e+3]
Epoch [4/10]: 100%|██████████| 101/101 [15:32<00:00,  9.24s/it, loss=4.25e+3]
Epoch [5/10]:   3%|▎         | 3/101 [00:32<17:44, 10.86s/it, loss=4.96e+3]


KeyboardInterrupt: 

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import numpy as np
import pandas as pd
from PIL import Image

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def get_alpha_schedule(T, start=1e-4, end=0.02):
    beta_t = torch.linspace(start, end, T).to(device)
    alpha_t = 1 - beta_t
    alpha_bar_t = torch.cumprod(alpha_t, dim=0)
    return beta_t, alpha_t, alpha_bar_t

T = 1000  
beta_t, alpha_t, alpha_bar_t = get_alpha_schedule(T)


class MOTDataset(Dataset):
    def __init__(self, image_folder, gt_file, transform=None):
        self.image_folder = image_folder
        self.transform = transform
        self.gt_data = pd.read_csv(gt_file, header=None, names=["frame", "id", "x", "y", "w", "h", "conf", "class", "visibility"])
        self.frames = sorted([f for f in os.listdir(image_folder) if f.endswith(('.jpg', '.png'))])
    
    def __len__(self):
        return len(self.frames)
    
    def __getitem__(self, idx):
        frame_name = self.frames[idx]
        frame_number = int(os.path.splitext(frame_name)[0].split('-')[0]) 
        img_path = os.path.join(self.image_folder, frame_name)
        image = Image.open(img_path).convert("RGB")
        
       
        bboxes = self.gt_data[self.gt_data["frame"] == frame_number][["x", "y", "w", "h"]].values
        
       
        bbox_tensor = torch.zeros((7, 4), dtype=torch.float32)
        num_bboxes = min(len(bboxes), 7)
        bbox_tensor[:num_bboxes] = torch.tensor(bboxes[:num_bboxes], dtype=torch.float32)
        
        
        ids = self.gt_data[self.gt_data["frame"] == frame_number]["id"].values
        id_tensor = torch.zeros(7, dtype=torch.long) 
        id_tensor[:num_bboxes] = torch.tensor(ids[:num_bboxes], dtype=torch.long)
        
        if self.transform:
            image = self.transform(image)
        
        return image, bbox_tensor, id_tensor


transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])


class Pro2DiffModel(nn.Module):
    def __init__(self, num_classes=10):  
        super(Pro2DiffModel, self).__init__()
        self.backbone = models.resnet50(pretrained=True)
        self.backbone.fc = nn.Identity() 
        self.bbox_head = nn.Linear(2048, 28) 
        self.id_head = nn.Linear(2048, 7 * num_classes)  
    
    def forward(self, x):
        features = self.backbone(x)  
        bbox_pred = self.bbox_head(features).view(x.size(0), 7, 4)  
        id_pred = self.id_head(features).view(x.size(0), 7, -1)  
        return bbox_pred, id_pred

model = Pro2DiffModel(num_classes=10).to(device)

# Generalized IoU Implementation
def generalized_iou(box1, box2):
    # Calculate intersection
    inter_x1 = torch.max(box1[..., 0], box2[..., 0])
    inter_y1 = torch.max(box1[..., 1], box2[..., 1])
    inter_x2 = torch.min(box1[..., 0] + box1[..., 2], box2[..., 0] + box2[..., 2])
    inter_y2 = torch.min(box1[..., 1] + box1[..., 3], box2[..., 1] + box2[..., 3])
    inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(inter_y2 - inter_y1, min=0)
    
    # Calculate union
    box1_area = box1[..., 2] * box1[..., 3]
    box2_area = box2[..., 2] * box2[..., 3]
    union_area = box1_area + box2_area - inter_area
    
    # Calculate IoU
    iou = inter_area / (union_area + 1e-7)
    
    # Calculate generalized IoU
    enclose_x1 = torch.min(box1[..., 0], box2[..., 0])
    enclose_y1 = torch.min(box1[..., 1], box2[..., 1])
    enclose_x2 = torch.max(box1[..., 0] + box1[..., 2], box2[..., 0] + box2[..., 2])
    enclose_y2 = torch.max(box1[..., 1] + box1[..., 3], box2[..., 1] + box2[..., 3])
    enclose_area = (enclose_x2 - enclose_x1) * (enclose_y2 - enclose_y1)
    giou = iou - (enclose_area - union_area) / (enclose_area + 1e-7)
    
    return giou

# Define the full loss function (Equation 6 and 7 in the paper)
def compute_loss(bbox_pred, bbox_gt, id_pred, id_gt):
    # Reshape id_pred and id_gt for classification
    id_pred = id_pred.view(-1, id_pred.size(-1))  
    id_gt = id_gt.view(-1)  
    
   
    cls_loss = nn.functional.cross_entropy(id_pred, id_gt)
    
    # L1 loss for bounding boxes
    l1_loss = nn.functional.l1_loss(bbox_pred, bbox_gt)
    
    # Generalized IoU loss
    giou_loss = 1 - generalized_iou(bbox_pred, bbox_gt).mean()
    
    # Weighted sum of losses
    lambda_cls, lambda_l1, lambda_giou = 2, 5, 2
    total_loss = lambda_cls * cls_loss + lambda_l1 * l1_loss + lambda_giou * giou_loss
    return total_loss


@torch.no_grad()
def reverse_diffusion(model, noisy_images, noisy_bboxes, steps=T):
    model.eval()
    
    for t in range(steps - 1, -1, -1):
        alpha_bar_t_selected = alpha_bar_t[t].view(-1, 1, 1, 1)
        alpha_bar_t_bbox = alpha_bar_t[t].view(-1, 1)

        predicted_noise = model(noisy_images)
        mean_image = (noisy_images - torch.sqrt(1 - alpha_bar_t_selected) * predicted_noise) / torch.sqrt(alpha_bar_t_selected)
        
        if t > 0:
            noise = torch.randn_like(noisy_images)
            sigma_t = torch.sqrt(beta_t[t])
            noisy_images = mean_image + sigma_t * noise   
        else:
            noisy_images = mean_image

        mean_bbox = (noisy_bboxes - torch.sqrt(1 - alpha_bar_t_bbox) * torch.randn_like(noisy_bboxes)) / torch.sqrt(alpha_bar_t_bbox)
        
        if t > 0:
            noisy_bboxes = mean_bbox + sigma_t.view(-1, 1) * torch.randn_like(noisy_bboxes)
        else:
            noisy_bboxes = mean_bbox

    return noisy_images, noisy_bboxes


def scpp(prev_bboxes):
  
    return prev_bboxes

# Training Function
def train(model, image_folder, gt_file, epochs=10, save_path="pro2diff_mot_model.pth"):
    dataset = MOTDataset(image_folder, gt_file, transform)
    dataloader = DataLoader(dataset, batch_size=7, shuffle=True)  
    
    if os.path.exists(save_path):
        print("Loading existing model...")
        model.load_state_dict(torch.load(save_path))
    
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    model.train()
    for epoch in range(epochs):
        loop = tqdm(dataloader, leave=True)
        for images, bbox_gt, id_gt in loop:
            images, bbox_gt, id_gt = images.to(device), bbox_gt.to(device), id_gt.to(device)
            
            optimizer.zero_grad()
            bbox_pred, id_pred = model(images)  
            
           
            loss = compute_loss(bbox_pred, bbox_gt, id_pred, id_gt)
            loss.backward()
            optimizer.step()
            
            loop.set_description(f"Epoch [{epoch+1}/{epochs}]")
            loop.set_postfix(loss=loss.item())
    
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")


train(model, "train1/train1/dancetrack0001/img1/", "train1/train1/dancetrack0001/gt/gt.txt", epochs=10)
train(model, "train1/train1/dancetrack0002/img1/", "train1/train1/dancetrack0002/gt/gt.txt", epochs=10)


noisy_images = torch.randn(1, 3, 224, 224).to(device)
noisy_bboxes = torch.randn(1, 7, 4).to(device)
denoised_image, denoised_bbox = reverse_diffusion(model, noisy_images, noisy_bboxes, steps=1000)



Loading existing model...


  model.load_state_dict(torch.load(save_path))
Epoch [1/10]:   1%|          | 1/101 [00:16<28:05, 16.85s/it, loss=347]


KeyboardInterrupt: 

In [2]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\dharm\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.10.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.1-cp311-cp311-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.56.0-cp311-cp311-win_amd64.whl.metadata (103 kB)
     ---------------------------------------- 0.0/104.0 kB ? eta -:--:--
     ----------- --------------------------- 30.7/104.0 kB 1.3 MB/s eta 0:00:01
     -------------- ---------------------- 41.0/104.0 kB 653.6 kB/s eta 0:00:01
     --------------------- --------------- 61.4/104.0 kB 409.6 kB/s eta 0:00:01
     -----------------------------------  102.4/104.0 kB 587.0 kB/s eta 0:00:01
     ------------------------------------ 104.0/104.0 kB 498.5 kB/s eta 0:00:00
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp311


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\dharm\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset Class
class MOTDataset(Dataset):
    def __init__(self, image_folder, gt_file, transform=None, max_images=50):
        self.image_folder = image_folder
        self.transform = transform
        self.gt_data = pd.read_csv(gt_file, header=None, names=["frame", "id", "x", "y", "w", "h", "conf", "class", "visibility"])
        self.frames = sorted([f for f in os.listdir(image_folder) if f.endswith(('.jpg', '.png'))])[:max_images]
    
    def __len__(self):
        return len(self.frames)
    
    def __getitem__(self, idx):
        frame_name = self.frames[idx]                                                              
        frame_number = int(os.path.splitext(frame_name)[0].split('-')[0])  
        img_path = os.path.join(self.image_folder, frame_name)
        image = Image.open(img_path).convert("RGB")
        
        # Bounding boxes
        bboxes = self.gt_data[self.gt_data["frame"] == frame_number][["x", "y", "w", "h"]].values
        bbox_tensor = torch.zeros((7, 4), dtype=torch.float32)
        num_bboxes = min(len(bboxes), 7)
        bbox_tensor[:num_bboxes] = torch.tensor(bboxes[:num_bboxes], dtype=torch.float32)
        
        # IDs
        ids = self.gt_data[self.gt_data["frame"] == frame_number]["id"].values
        id_tensor = torch.zeros(7, dtype=torch.long)  
        id_tensor[:num_bboxes] = torch.tensor(ids[:num_bboxes], dtype=torch.long)
        
        if self.transform:
            image = self.transform(image)
        
        return image, bbox_tensor, id_tensor, img_path   

# Transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Model
class Pro2DiffModel(nn.Module):
    def __init__(self, num_classes=10):  
        super(Pro2DiffModel, self).__init__()
        self.backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        self.backbone.fc = nn.Identity()  
        self.bbox_head = nn.Linear(2048, 28)  
        self.id_head = nn.Linear(2048, 7 * num_classes)  
    
    def forward(self, x):
        features = self.backbone(x)  
        bbox_pred = self.bbox_head(features).view(x.size(0), 7, 4)  
        id_pred = self.id_head(features).view(x.size(0), 7, -1)  
        return bbox_pred, id_pred 

# Initialize Model
model = Pro2DiffModel(num_classes=10).to(device)

# Generalized IoU
def generalized_iou(box1, box2):
    inter_x1 = torch.max(box1[..., 0], box2[..., 0])
    inter_y1 = torch.max(box1[..., 1], box2[..., 1])
    inter_x2 = torch.min(box1[..., 0] + box1[..., 2], box2[..., 0] + box2[..., 2])
    inter_y2 = torch.min(box1[..., 1] + box1[..., 3], box2[..., 1] + box2[..., 3])
    inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(inter_y2 - inter_y1, min=0)

    box1_area = box1[..., 2] * box1[..., 3]
    box2_area = box2[..., 2] * box2[..., 3]
    union_area = box1_area + box2_area - inter_area
    
    iou = inter_area / (union_area + 1e-7)

    enclose_x1 = torch.min(box1[..., 0], box2[..., 0])
    enclose_y1 = torch.min(box1[..., 1], box2[..., 1])
    enclose_x2 = torch.max(box1[..., 0] + box1[..., 2], box2[..., 0] + box2[..., 2])
    enclose_y2 = torch.max(box1[..., 1] + box1[..., 3], box2[..., 1] + box2[..., 3])
    enclose_area = (enclose_x2 - enclose_x1) * (enclose_y2 - enclose_y1)
    giou = iou - (enclose_area - union_area) / (enclose_area + 1e-7)

    return giou

# Compute Loss
def compute_loss(bbox_pred, bbox_gt, id_pred, id_gt):
    id_pred = id_pred.view(-1, id_pred.size(-1))  
    id_gt = id_gt.view(-1)  

    cls_loss = nn.functional.cross_entropy(id_pred, id_gt)
    l1_loss = nn.functional.l1_loss(bbox_pred, bbox_gt)
    giou_loss = 1 - generalized_iou(bbox_pred, bbox_gt).mean()

    lambda_cls, lambda_l1, lambda_giou = 2, 5, 2
    total_loss = lambda_cls * cls_loss + lambda_l1 * l1_loss + lambda_giou * giou_loss
    return total_loss, cls_loss

# Training function
def train_sequentially(model, dataset_paths, epochs=10, save_path="pro21.pth"):
    optimizer_bbox = torch.optim.Adam(model.bbox_head.parameters(), lr=0.01)  # Separate optimizer for bbox_head
    optimizer_id = torch.optim.Adam(model.id_head.parameters(), lr=0.01)  # Separate optimizer for id_head

    if os.path.exists(save_path):
        print(f"🔄 Loading existing model from {save_path}...")
        model.load_state_dict(torch.load(save_path, map_location=device))

    model.to(device)

    for dataset_path in dataset_paths:
        image_folder, gt_file = dataset_path
        print(f"🚀 Training on dataset: {image_folder}")

        dataset = MOTDataset(image_folder, gt_file, transform, max_images=50)
        dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

        for epoch in range(epochs):
            model.train()
            total_loss = 0

            loop = tqdm(dataloader, leave=True)
            for images, bbox_gt, id_gt, _ in loop:
                images, bbox_gt, id_gt = images.to(device), bbox_gt.to(device), id_gt.to(device)

                optimizer_bbox.zero_grad()
                optimizer_id.zero_grad()
                bbox_pred, id_pred = model(images)

                # Debugging ground truth labels and predictions
                print(f"id_gt: {id_gt}")
                print(f"id_pred: {id_pred}")

                loss, cls_loss = compute_loss(bbox_pred, bbox_gt, id_pred, id_gt)
                loss.backward()

                # Debugging gradients and loss values
                print(f"Gradients for bbox_head before step: {model.bbox_head.weight.grad.abs().mean()}")
                print(f"Gradients for id_head before step: {model.id_head.weight.grad.abs().mean()}")
                print(f"cls_loss: {cls_loss.item()}")

                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)  
                optimizer_bbox.step()
                optimizer_id.step()

                # Debugging gradients after step
                print(f"Gradients for bbox_head after step: {model.bbox_head.weight.grad.abs().mean()}")
                print(f"Gradients for id_head after step: {model.id_head.weight.grad.abs().mean()}")

                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            loop.set_description(f"📊 Dataset: {image_folder} - Epoch [{epoch+1}/{epochs}]")
            loop.set_postfix(loss=avg_loss)


            torch.save(model.state_dict(), save_path)
            model_size = os.path.getsize(save_path) / (1024 * 1024)
            print(f"📏 Model size after epoch {epoch+1}: {model_size:.2f} MB")

        print(f"✅ Finished training on {image_folder}\n")

 
datasets = [
    ("train1/train1/dancetrack0002/img1/", "train1/train1/dancetrack0002/gt/gt.txt"),
    ("train1/train1/dancetrack0008/img1/", "train1/train1/dancetrack0008/gt/gt.txt"),
    ]

train_sequentially(model, datasets, epochs=13)

In [8]:
pip install detectron2


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement detectron2 (from versions: none)
ERROR: No matching distribution found for detectron2

[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\dharm\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [9]:
import cv2
import torch
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
import matplotlib.pyplot as plt
import numpy as np

# Setup Detectron2 configuration
cfg = get_cfg()
# Load the LVIS-Res50 model from Detectron2 model zoo
cfg.merge_from_file("https://raw.githubusercontent.com/facebookresearch/detectron2/main/tools/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml")
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/retinanet_R_50_FPN_1x.yaml")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # Set threshold for detection

# Create predictor from configuration
predictor = DefaultPredictor(cfg)

# Load an image (provide the path to your image here)
image = cv2.imread("train1/train1/dancetrack0012/img1/00000001.jpg")
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Make predictions
outputs = predictor(image)

# Visualize results
instances = outputs["instances"]
boxes = instances.pred_boxes.tensor.numpy()
labels = instances.pred_classes.numpy()
scores = instances.scores.numpy()

# Plot image with bounding boxes
plt.figure(figsize=(10, 10))
plt.imshow(image_rgb)

for box, label, score in zip(boxes, labels, scores):
    x1, y1, x2, y2 = box
    plt.gca().add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, color='r', linewidth=2))
    plt.text(x1, y1, f"{label}: {score:.2f}", color='red', fontsize=12, bbox=dict(facecolor='white', alpha=0.5))
plt.show()


ModuleNotFoundError: No module named 'detectron2'

In [1]:
pip install motmetrics

Collecting motmetrics
  Downloading motmetrics-1.4.0-py3-none-any.whl.metadata (20 kB)
Collecting xmltodict>=0.12.0 (from motmetrics)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading motmetrics-1.4.0-py3-none-any.whl (161 kB)
Downloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict, motmetrics
Successfully installed motmetrics-1.4.0 xmltodict-0.14.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
pip install torchmetrics motmetrics pandas scikit-learn opencv-python torch torchvision


Collecting torchmetrics
  Downloading torchmetrics-1.6.3-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.14.1-py3-none-any.whl.metadata (5.6 kB)
Downloading torchmetrics-1.6.3-py3-none-any.whl (931 kB)
   ---------------------------------------- 0.0/931.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/931.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/931.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/931.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/931.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/931.7 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/931.7 kB ? eta -:--:--
   --------------------- ---------------- 524.3/931.7 kB 989.2 kB/s eta 0:00:01
   ---------------------------------------- 931.7/931.7 kB 1.2 MB/s eta 0:00:00
Downloading lightning_utilities-0.14.1-py3-none-any.whl (28


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch
import torchvision
import os
import cv2
import motmetrics as mm
import pandas as pd
import time
from tqdm import tqdm
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from sklearn.metrics import precision_score, recall_score, f1_score

# -----------------------------------------
# 🔹 Load & Inspect Model Checkpoint with Progress Bar
# -----------------------------------------
MODEL_PATH = "diffdet_coco_res50_300boxes.pth"  # Update with actual path
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("🔹 Loading model checkpoint...")
start_time = time.time()

checkpoint = torch.load(MODEL_PATH, map_location=device)
print(f"✅ Checkpoint loaded in {time.time() - start_time:.2f} seconds.")

if "model" in checkpoint:
    print("🔹 Extracting 'model' key from checkpoint")
    checkpoint = checkpoint["model"]  # Extract only model weights

print("🔹 Initializing Faster R-CNN model...")
start_time = time.time()
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
print(f"✅ Model initialized in {time.time() - start_time:.2f} seconds.")

print("🔹 Loading model weights (this may take time)...")
start_time = time.time()
missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
print(f"✅ Model loaded in {time.time() - start_time:.2f} seconds.")
print(f"🔹 Missing keys: {len(missing_keys)}, Unexpected keys: {len(unexpected_keys)}")

model.to(device)
model.eval()
print("✅ Model is ready!")

# -----------------------------------------
# 🔹 Load Image Dataset & Run Inference with Progress Bar
# -----------------------------------------
DATASET_PATH = "train1/train1/dancetrack0001/img1/"  # Update with dataset folder
image_files = sorted([os.path.join(DATASET_PATH, f) for f in os.listdir(DATASET_PATH) if f.endswith(('.jpg', '.png'))])

predictions = []

print("\n🔹 Running inference on images...")
for frame_idx, img_path in tqdm(enumerate(image_files), total=len(image_files), desc="Processing Images"):
    image = cv2.imread(img_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB
    image_tensor = torchvision.transforms.ToTensor()(image).unsqueeze(0).to(device)  # Convert to tensor

    with torch.no_grad():
        output = model(image_tensor)  # Run inference

    for i, bbox in enumerate(output[0]['boxes']):
        bbox = bbox.cpu().numpy()  # Convert tensor to numpy
        obj_id = i  # Assume unique ID for detection
        predictions.append([frame_idx, obj_id, bbox[0], bbox[1], bbox[2], bbox[3]])

pred_df = pd.DataFrame(predictions, columns=['Frame', 'ID', 'X', 'Y', 'Width', 'Height'])
print("✅ Inference complete!")

# -----------------------------------------
# 🔹 Load Ground Truth Data & Compute Tracking Metrics (MOT)
# -----------------------------------------
GT_FILE = "train1/train1/dancetrack0012/gt/gt.txt"  # Update with GT file path
gt_df = pd.read_csv(GT_FILE, header=None, usecols=[0,1,2,3,4,5], 
                    names=['Frame', 'ID', 'X', 'Y', 'Width', 'Height'])

def convert_to_mot_format(df):
    df['X'] += df['Width'] / 2  
    df['Y'] += df['Height'] / 2  
    return df[['Frame', 'ID', 'X', 'Y', 'Width', 'Height']]

gt_df = convert_to_mot_format(gt_df)
pred_df = convert_to_mot_format(pred_df)

acc = mm.MOTAccumulator(auto_id=True)

print("\n🔹 Computing Tracking Metrics...")
for frame in tqdm(sorted(gt_df["Frame"].unique()), desc="Processing Frames"):
    gt_objects = gt_df[gt_df["Frame"] == frame]
    pred_objects = pred_df[pred_df["Frame"] == frame]

    gt_ids = list(gt_objects["ID"])
    pred_ids = list(pred_objects["ID"])

    distances = mm.distances.iou_matrix(gt_objects[['X', 'Y', 'Width', 'Height']].values, 
                                        pred_objects[['X', 'Y', 'Width', 'Height']].values,
                                        max_iou=0.5)

    acc.update(gt_ids, pred_ids, distances)

mh = mm.metrics.create()
metrics = mh.compute(acc, metrics=['num_frames', 'idf1', 'mota', 'motp', 'precision', 'recall', 'num_switches', 'num_fragmentations'], name='DiffDet')

print("\n📌 Tracking Metrics:")
print(metrics)

# -----------------------------------------
# 🔹 Compute Object Detection Metrics with Progress Bar
# -----------------------------------------
print("\n🔹 Computing Object Detection Metrics...")

def convert_to_torchmetrics_format(df):
    results = []
    for frame in tqdm(df["Frame"].unique(), desc="Processing Frames"):
        frame_data = df[df["Frame"] == frame]
        boxes = torch.tensor(frame_data[['X', 'Y', 'Width', 'Height']].values)  # BBox format
        labels = torch.tensor(frame_data['ID'].values)  # Object IDs as class labels
        scores = torch.ones(len(boxes))  # Dummy confidence scores for GT
        results.append({'boxes': boxes, 'labels': labels, 'scores': scores})
    return results

gt_torch = convert_to_torchmetrics_format(gt_df)
pred_torch = convert_to_torchmetrics_format(pred_df)

map_metric = MeanAveragePrecision()
map_metric.update(pred_torch, gt_torch)
map_result = map_metric.compute()

print("\n📌 Object Detection Metrics:")
print(f"mAP: {map_result['map']:.4f}")
print(f"Precision: {precision_score(gt_df['ID'], pred_df['ID'], average='macro'):.4f}")
print(f"Recall: {recall_score(gt_df['ID'], pred_df['ID'], average='macro'):.4f}")
print(f"F1 Score: {f1_score(gt_df['ID'], pred_df['ID'], average='macro'):.4f}")


🔹 Loading model checkpoint...


  checkpoint = torch.load(MODEL_PATH, map_location=device)


✅ Checkpoint loaded in 4.90 seconds.
🔹 Extracting 'model' key from checkpoint
🔹 Initializing Faster R-CNN model...




✅ Model initialized in 2.39 seconds.
🔹 Loading model weights (this may take time)...
✅ Model loaded in 0.01 seconds.
🔹 Missing keys: 295, Unexpected keys: 549
✅ Model is ready!

🔹 Running inference on images...


Processing Images: 100%|██████████| 703/703 [3:38:19<00:00, 18.63s/it]    


✅ Inference complete!

🔹 Computing Tracking Metrics...


Processing Frames: 100%|██████████| 1203/1203 [00:05<00:00, 213.97it/s]



📌 Tracking Metrics:
         num_frames  idf1  mota  motp  precision  recall  num_switches  \
DiffDet        1203   0.0   0.0   NaN        NaN     0.0             0   

         num_fragmentations  
DiffDet                   0  

🔹 Computing Object Detection Metrics...


Processing Frames: 100%|██████████| 1203/1203 [00:03<00:00, 393.70it/s]
Processing Frames: 0it [00:00, ?it/s]


ModuleNotFoundError: `MAP` metric requires that `pycocotools` or `faster-coco-eval` installed. Please install with `pip install pycocotools` or `pip install faster-coco-eval` or `pip install torchmetrics[detection]`.