In [1]:
import cv2
import os
import json
import numpy as np
from PIL import Image
from tqdm import tqdm
from transformers import AutoImageProcessor, ViTModel, ViTForImageClassification
import torch

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [2]:
training_samples_path = "/Users/minhnam/Desktop/playground/AICityChallenge2024_Track2/speed_prediction/datasets/avg_img/WTS/train/labels.json"
training_samples_data = json.load(open(training_samples_path))

speeds = set()
for sample in training_samples_data:
    speeds.add(int(sample["speed"]))
labels = sorted(list(speeds))
labels

[0, 5, 10, 15, 20, 25, 30]

In [3]:
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k", cache_dir="./cache")
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    cache_dir="./cache",
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)},
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from PIL import Image

image = Image.open("/Users/minhnam/Desktop/playground/AICityChallenge2024_Track2/speed_prediction/dataset/images/20230707_12_SN17_T1_vehicle_view/20230707_12_SN17_T1_vehicle_view_event0_0.jpg")

In [18]:
inputs = image_processor(images=[image, image, image], return_tensors="pt")
outputs = model(**inputs, labels=torch.tensor([1, 1, 1]))

In [19]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

num_epochs = 10
for i in range(num_epochs):
    outputs = model(**inputs, labels=torch.tensor([1, 1, 1]))
    loss = outputs.loss
    logits = outputs.logits
    accuracy = (logits.argmax(dim=-1) == torch.tensor([1, 1, 1])).float().mean()
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss.item(), accuracy.item())

0.29285234212875366 1.0
0.4288746416568756 1.0
0.4025709629058838 1.0
0.3236382305622101 1.0
0.25084543228149414 1.0
0.23482809960842133 1.0
0.2173997014760971 1.0
0.20259679853916168 1.0
0.18695811927318573 1.0
0.17503555119037628 1.0


In [4]:
class AvgImgSpeedPredictionDataset(torch.utils.data.Dataset):
    def __init__(self, images_dir, labels_path, transform=None):
        self.image_paths = []
        self.labels = []
        self.transform = transform
        
        # Load labels
        with open(labels_path) as f:
            labels_data = json.load(f)
        
        # Create list of image paths and labels
        for sample in labels_data:
            image_filename = sample["image_name"]
            speed = sample["speed"]
            
            image_path = os.path.join(images_dir, image_filename)
            
            if not os.path.exists(image_path):
                print(f"Image {image_path} does not exist")
                continue
            
            self.image_paths.append(image_path)
            self.labels.append(speed)
        
        # Sanity check
        assert len(self.image_paths) == len(self.labels)
        
        label_set = set(self.labels)
        self.label_to_idx = {label: idx for idx, label in enumerate(label_set)}
        self.idx_to_label = {idx: label for label, idx in self.label_to_idx.items()}
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        image = cv2.imread(self.image_paths[idx])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = self.labels[idx]
        
        if self.transform:
            image = self.transform(image)
        
        return image, torch.tensor(self.label_to_idx[label])

In [7]:
dataset = AvgImgSpeedPredictionDataset(
    images_dir='./datasets/avg_img/WTS/train/images/',
    labels_path='./datasets/avg_img/WTS/train/labels.json',
)
print(len(dataset))
# data loader
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=False,
)

32611


In [10]:
print_every = 100
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for i in range(num_epochs):
    losses = []
    print(f"Epoch {i:02d}/{num_epochs:02d}")
    
    # train
    # use tqdm to show progress
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    for j, batch in pbar:
        images, labels = batch
        inputs = image_processor(images, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)
        
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        accuracy = (logits.argmax(1) == labels).float().mean()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        
        pbar.set_description(f"Loss: {np.mean(losses):.4f}, Accuracy: {accuracy:.4f}")
        
    for j, batch in enumerate(val_loader):
        images, labels = batch
        inputs = image_processor(images, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)
        
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        accuracy = (logits.argmax(1) == labels).float().mean()
        
        losses.append(loss.item())

    print(f'Epoch {i:03d}, Mean Loss = {np.mean(losses):.4f}\n\n')

Epoch 00/01


Loss: 2.0105, Accuracy: 0.0000:   0%|          | 3/16306 [00:02<3:44:58,  1.21it/s]

Epoch 000, Mean Loss = 2.0105





