In [160]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [161]:
import cv2
import torch
import random
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from tqdm import tqdm
from tqdm import trange
from torchvision.ops import box_iou

In [162]:
class TrafficDataset(Dataset):
    def __init__(self, img_dir, label_dir, transform = None, img_size=(512,512), max_samples = None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.img_size = img_size
        self.images = [img for img in os.listdir(img_dir) if img.endswith(('.jpg', '.png', '.jpeg'))]
        
        if max_samples is not None:
            self.images = random.sample(self.images, min(max_samples, len(self.images)))
    
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, index):
        img_name = self.images[index]
        img_path = os.path.join(self.img_dir, img_name)
        label_path = os.path.join(self.label_dir, img_name.replace('.jpg','.txt').replace('.png','.txt'))
        
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        h, w, _ = image.shape
        
        boxes = []
        labels = []
        if os.path.exists(label_path):
            with open(label_path, 'r') as f:
                for line in f.readlines():
                    cls, x_c, y_c, bw, bh = map(float, line.strip().split())
                    x_c *= w
                    y_c *= h
                    bw *= w
                    bh *= h
                    x1 = x_c - bw / 2
                    x2 = x_c + bw / 2
                    y1 = y_c - bh / 2
                    y2 = y_c + bh / 2
                    boxes.append([x1, y1, x2, y2])
                    labels.append(int(cls))
        
        if not boxes:
            return self.__getitem__((index+1) % len(self))
                    
        boxes = torch.tensor(boxes, dtype = torch.float32)
        labels = torch.tensor(labels, dtype = torch.int64)
        image = cv2.resize(image, self.img_size)
        scale_x = self.img_size[0] / w
        scale_y = self.img_size[1] / h
        boxes[:, [0,2]]*=scale_x
        boxes[:, [1,3]]*=scale_y
        image = T.ToTensor()(image)
        
        target = {
            "boxes": boxes,
            "labels": labels
        }
        
        return image, target        
    

In [163]:
train_dataset = TrafficDataset(
    img_dir = r"C:\Users\Rishabh Surana\Desktop\ATMS project\Custom model for Object Detection\Road Image Dataset\trafic_data\train\images", 
    label_dir = r"C:\Users\Rishabh Surana\Desktop\ATMS project\Custom model for Object Detection\Road Image Dataset\trafic_data\train\labels",
    max_samples = 500
)

In [164]:
valid_dataset = TrafficDataset(
    img_dir = r"C:\Users\Rishabh Surana\Desktop\ATMS project\Custom model for Object Detection\Road Image Dataset\trafic_data\valid\images", 
    label_dir = r"C:\Users\Rishabh Surana\Desktop\ATMS project\Custom model for Object Detection\Road Image Dataset\trafic_data\valid\labels",
    max_samples = 300
)

In [165]:
print(f"Number of training images: {len(train_dataset)}")
print(f"Number of validation images: {len(valid_dataset)}")

Number of training images: 500
Number of validation images: 300


In [166]:
from collections import Counter
label_counter = Counter()
for _, target in train_dataset:
    label_counter.update(target['labels'].tolist())
print(label_counter)

Counter({5: 898, 4: 656, 13: 576, 17: 557, 10: 412, 18: 297, 11: 260, 9: 163, 15: 154, 19: 140, 2: 75, 3: 65, 7: 39, 20: 26, 16: 16, 8: 14, 0: 11, 1: 8, 14: 6, 12: 3})


In [167]:
for i in range(5):
    img, tgt = train_dataset[i]
    print(f"Image {i}: shape = {img.shape}, target ={tgt}")

Image 0: shape = torch.Size([3, 512, 512]), target ={'boxes': tensor([[338.4000, 340.6222, 503.2000, 489.9556],
        [275.2000, 282.3111, 334.4000, 403.2000],
        [357.2000, 317.1555, 407.6000, 359.8222],
        [ 68.4000, 261.6889, 176.4000, 392.5334],
        [211.2000, 283.7333, 259.2000, 393.2444],
        [262.0000, 338.4889, 275.6000, 366.9333],
        [199.2000, 313.6000, 213.6000, 369.0667]]), 'labels': tensor([ 5, 11,  4,  4,  4,  9,  4])}
Image 1: shape = torch.Size([3, 512, 512]), target ={'boxes': tensor([[414.0000, 243.2000, 510.8000, 499.2000],
        [200.4000, 175.6445, 374.0000, 398.9333],
        [ 52.8000,  68.2667, 164.8000, 210.4889],
        [ 21.2000,  45.5111,  58.8000, 122.3111],
        [  0.0000,  19.9111,  48.0000,  65.4222],
        [109.2000,  16.3556, 132.4000,  32.0000],
        [333.2000, 379.7333, 434.8000, 510.5778],
        [164.8000, 333.5111, 280.0000, 468.6222],
        [ 74.0000, 216.1778, 142.0000, 304.3556],
        [110.0000, 184.888

In [168]:
def get_model(num_classes):
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    return model

In [169]:
num_classes = 22
model = get_model(num_classes)
device = torch.device("cuda")
model.to(device)



FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [170]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle = True, num_workers=0, collate_fn=lambda x: tuple(zip(*x)))
valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle = False, num_workers=0, collate_fn=lambda x: tuple(zip(*x)))

In [171]:
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)
lr_schedular = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 3, gamma = 0.1)

In [172]:
def train_one_epoch(model, optimizer, data_loader, device, epoch):
    print("\n\n ---Training ---")
    print(f"Started training epoch {epoch+1}")
    model.train()
    total_loss = 0
    
    loop = tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch [{epoch+1}]")
    
    for i, (images, targets) in loop:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        
        loss_value = losses.item()
        total_loss += loss_value
        
        loop.set_postfix({
            "Step":f"{i+1}/{len(data_loader)}",
            "Loss":f"{loss_value:.4f}",
            "AvgLoss":f"{total_loss/(i+1):.4f}"
        })
           
    avg_loss = total_loss / len(data_loader)
    print(f"\n Epoch{epoch+1} completed. Avg Loss: {avg_loss:.4f}\n")

In [173]:
def save_best_model(model, acc, best_acc, path):
    dir_name = os.path.dirname(path)
    if dir_name != "":
        os.makedirs(os.path.dirname(path), exist_ok=True)
    if acc >= best_acc:
        torch.save(model.state_dict(), path)
        print(f"Best model saved at: {path} | Accuracy: {acc:.4f}")
        return acc
    else:
        print(f"Not saved. Current acc ({acc:.4f}) < Best acc ({best_acc:.4f})")
    return best_acc


In [174]:
best_val_accuracy = 0.0
num_epochs = 15

In [175]:
def evaluate(model, data_loader, device, iou_threshold=0.5, score_threshold=0.5):
    print(f"\n\n ---Evaluation @ threshold={score_threshold} ---")
    model.eval()
    total_correct = 0
    total_gt = 0
    total_pred = 0

    from torchvision.ops import box_iou

    with torch.no_grad():
        for images, targets in data_loader:
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            
            outputs = model(images)
            torch.cuda.empty_cache()

            for output in outputs:
                if 'scores' in output:
                    keep = output['scores'] > score_threshold
                    output['boxes'] = output['boxes'][keep]
                    output['labels'] = output['labels'][keep]
                    output['scores'] = output['scores'][keep]

            for pred, target in zip(outputs, targets):
                pred_boxes = pred['boxes']
                pred_labels = pred['labels']
                true_boxes = target['boxes']
                true_labels = target['labels']

                if pred_boxes.numel() == 0 or true_boxes.numel() == 0:
                    total_gt += len(true_labels)
                    total_pred += len(pred_labels)
                    continue

                ious = box_iou(pred_boxes, true_boxes)
                matched_indices = (ious > iou_threshold).nonzero(as_tuple=False)

                matched_gt = set()
                matched_pred = set()

                for pred_idx, gt_idx in matched_indices:
                    if (
                        pred_labels[pred_idx] == true_labels[gt_idx]
                        and gt_idx.item() not in matched_gt
                        and pred_idx.item() not in matched_pred
                    ):
                        total_correct += 1
                        matched_gt.add(gt_idx.item())
                        matched_pred.add(pred_idx.item())

                total_gt += len(true_labels)
                total_pred += len(pred_labels)

    precision = total_correct / total_pred if total_pred else 0
    recall = total_correct / total_gt if total_gt else 0
    f1 = (
        2 * precision * recall / (precision + recall)
        if (precision + recall) > 0
        else 0
    )

    accuracy = total_correct / total_gt if total_gt else 0
    print(f"Eval Summary -> Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f} | Accuracy: {accuracy:.4f}")
    
    return 1 - accuracy, accuracy


In [None]:
if __name__ == "__main__":
    for epoch in trange(num_epochs, desc="Training Epochs"):
        
        try:
            train_one_epoch(model, optimizer, train_loader, device, epoch)
        except Exception as e:
            print(f"Error in training: {e}") 
        lr_schedular.step()
        model.to("cuda")
        torch.cuda.empty_cache()
        val_loss, val_accuracy = evaluate(model, valid_loader, device)
        
        print(f"Epoch {epoch+1} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")
        torch.cuda.empty_cache()
        best_val_accuracy = save_best_model(model, val_accuracy, best_val_accuracy, "CNN_obj_Traffic_detector_4.pth")

In [None]:
num_classes = 22  # include background
device = torch.device("cpu")
model = get_model(num_classes)
model.load_state_dict(torch.load("CNN_obj_Traffic_detector_4.pth", map_location=device))
model.eval()

In [None]:
device = torch.device("cpu")
def load_image(image_path, img_size=(512,512)):
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    orig_image = image.copy()
    image = cv2.resize(image, img_size)
    transform = T.ToTensor()
    return transform(image), orig_image

In [None]:
def rescale_boxes(boxes, from_shape, to_shape):
    scale_x = to_shape[1] / from_shape[1]
    scale_y = to_shape[0] / from_shape[0]
    boxes[:, [0, 2]] *= scale_x
    boxes[:, [1, 3]] *= scale_y
    return boxes

In [None]:
def predict(model, image_tensor, threshold=0.5):
    with torch.no_grad():
        prediction = model([image_tensor.to(device)])[0]

    keep = prediction['scores'] > threshold
    boxes = prediction['boxes'][keep].cpu()
    labels = prediction['labels'][keep].cpu()
    scores = prediction['scores'][keep].cpu()
    
    return boxes, labels, scores

In [None]:
def draw_boxes(image, boxes, labels, scores):
    for box, label, score in zip(boxes, labels, scores):
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(image, f"{label}: {score:.2f}", (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
    return image


In [None]:
import matplotlib.pyplot as plt
image_path = r"C:\Users\Rishabh Surana\Desktop\ATMS project\test_data\test_image\image 4.jpg"
image_tensor, orig_image = load_image(image_path)

boxes, labels, scores = predict(model, image_tensor, threshold=0.5)

rescaled_boxes = rescale_boxes(boxes.clone(), from_shape=(512, 512), to_shape=orig_image.shape[:2])
output_image = draw_boxes(orig_image, rescaled_boxes, labels, scores)


plt.figure(figsize=(8, 6))
plt.imshow(output_image)
plt.axis('off')
plt.title("Detected Objects")
plt.show()

In [178]:
num_classes = 22
print("Validation data size : 500")
for i in range(4):
    model = get_model(num_classes)
    print("\nModel ",i+1," : ")
    m_path = f"C:\\Users\\Rishabh Surana\\Desktop\\ATMS project\\Custom model for Object Detection\\CNN_obj_Traffic_detector_{i+1}.pth"
    model.load_state_dict(torch.load(m_path))

    model.to(device)

    for thresh in [0.4, 0.5, 0.6]:
        evaluate(model, valid_loader, device, score_threshold=thresh)



Validation data size : 500

Model  1  : 


 ---Evaluation @ threshold=0.4 ---
Eval Summary -> Precision: 0.5746 | Recall: 0.5035 | F1: 0.5367 | Accuracy: 0.5035


 ---Evaluation @ threshold=0.5 ---
Eval Summary -> Precision: 0.6614 | Recall: 0.4574 | F1: 0.5408 | Accuracy: 0.4574


 ---Evaluation @ threshold=0.6 ---
Eval Summary -> Precision: 0.7302 | Recall: 0.4075 | F1: 0.5231 | Accuracy: 0.4075

Model  2  : 


 ---Evaluation @ threshold=0.4 ---
Eval Summary -> Precision: 0.5454 | Recall: 0.4950 | F1: 0.5190 | Accuracy: 0.4950


 ---Evaluation @ threshold=0.5 ---
Eval Summary -> Precision: 0.6433 | Recall: 0.4377 | F1: 0.5210 | Accuracy: 0.4377


 ---Evaluation @ threshold=0.6 ---
Eval Summary -> Precision: 0.7404 | Recall: 0.3808 | F1: 0.5029 | Accuracy: 0.3808

Model  3  : 


 ---Evaluation @ threshold=0.4 ---
Eval Summary -> Precision: 0.5722 | Recall: 0.5430 | F1: 0.5572 | Accuracy: 0.5430


 ---Evaluation @ threshold=0.5 ---
Eval Summary -> Precision: 0.6450 | Recall: 0.5062 | F

In [None]:
from deep_sort_realtime.deepsort_tracker import DeepSort