In [None]:
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py", "engine.py")
urllib.request.urlretrieve("https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py", "utils.py")
urllib.request.urlretrieve("https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py", "coco_eval.py")
urllib.request.urlretrieve("https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py", "transforms.py")
urllib.request.urlretrieve("https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py", "coco_utils.py")

In [1]:
import os
import torch

from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import v2 as T
import torch.utils.data

from engine import train_one_epoch, evaluate
import utils


In [14]:
# img = read_image("AugmentedDatasetCombined/Images/GOPR2726-MP4_20231127193915_0307_JPEG_jpg.rf.50e4b30b33e27997fd87758413954a5f.jpg")
# print(img.shape)

In [14]:
# defining the files directory and testing directory
parentDir = "AugmentedDatasetCombined"

files_dir = 'AugmentedDatasetCombined/Images'
# test_dir = '/content/augmented_dataset/Trspeclass.v2i.yolov8/test/images'


class AugmentedDataset(torch.utils.data.Dataset):

    def __init__(self, parentDir, width, height, transforms=None):
        self.transforms = transforms
        self.parentDir = parentDir
        self.height = height
        self.width = width
        self.filesDir = os.path.join(parentDir, "Images")
        
        # sorting the images for consistency
        # To get images, the extension of the filename is checked to be jpg
        self.imgs = [image for image in sorted(os.listdir(self.filesDir))
                        if image[-4:]=='.jpg']


        # classes: 0 index is reserved for background
        self.classes = [" ", 'coconut', 'coniferous', 'date palm', 'deciduous', 'banana']

    def __getitem__(self, idx):

        img_name = self.imgs[idx]
        image_path = os.path.join(self.filesDir, img_name)

        # reading the images 
        img = read_image(image_path)
        
        # labels file
        labelFilename = img_name[:-4] + '.txt'
        labelFilePath = os.path.join(self.parentDir, "Labels", labelFilename)

        boxes = []
        labels = []
        objects = []

        with open(labelFilePath) as f:
            lines = f.readlines()
            for line in lines:
                objects.append(line.split())

        wt = img.shape[1]
        ht = img.shape[2]

        for member in objects:
            labels.append(int(member[0]) + 1)

            # bounding box

            xCentre = float(member[1]) * wt
            yCentre = float(member[2]) * ht
            width = float(member[3]) * wt
            height = float(member[4]) * ht


            xmin = int(xCentre - width//2)
            xmax = int(xCentre + width//2)
            ymin = int(yCentre - height//2)
            ymax = int(yCentre + height//2)


            xmin_corr = (xmin/wt)*self.width
            xmax_corr = (xmax/wt)*self.width
            ymin_corr = (ymin/ht)*self.height
            ymax_corr = (ymax/ht)*self.height

            boxes.append([xmin_corr, ymin_corr, xmax_corr, ymax_corr])

        # convert boxes into a torch.Tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        
        if (boxes.shape[0] == 0):
            # print("error, no box here")
            #! hardcoding box values if no box exists, dont do this.....
            boxes = torch.as_tensor([[0 ,0, 200, 200]])
            #! and adding it as a backgound image, with label 0
            labels = torch.zeros((1,), dtype=torch.int64)
        
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        
        # Wrap sample and targets into torchvision tv_tensors:
        img = tv_tensors.Image(img)

        # suppose all instances are not crowd
        iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)

        labels = torch.as_tensor(labels, dtype=torch.int64)


        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["area"] = area
        target["iscrowd"] = iscrowd
        # image_id
        image_id = torch.tensor([idx])
        target["image_id"] = image_id


        if self.transforms is not None:
            img, target = self.transforms(img, target)



        return img, target

    def __len__(self):
        return len(self.imgs)

In [8]:
# check dataset
dataset = AugmentedDataset(parentDir, 640, 640)
print('length of dataset = ', len(dataset), '\n')

# getting the image and target for a test index.  Feel free to change the index.
img, target = dataset[9]
print(img.shape, '\n',target)

length of dataset =  5248 

torch.Size([3, 640, 640]) 
 {'boxes': tensor([[343.,   3., 623., 211.],
        [418., 297., 638., 533.]]), 'labels': tensor([1, 1]), 'area': tensor([58240., 51920.]), 'iscrowd': tensor([0, 0]), 'image_id': tensor([9])}


In [3]:
# helper function
def get_transform(train):
    transforms = []
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    transforms.append(T.ToDtype(torch.float, scale=True))
    transforms.append(T.ToPureTensor())
    return T.Compose(transforms)

#### Testing forward() method (Optional)

Before iterating over the dataset, it’s good to see what the model expects during training and inference time on sample data.

In [9]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

dataset = AugmentedDataset('AugmentedDatasetCombined', 640, 640, get_transform(train=True))

data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=utils.collate_fn
)

# For Training
images, targets = next(iter(data_loader))
images = list(image for image in images)
targets = [{k: v for k, v in t.items()} for t in targets]
output = model(images, targets)  # Returns losses and detections
print(output)

# For inference
model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
predictions = model(x)  # Returns predictions
print(predictions[0])

{'loss_classifier': tensor(0.5576, grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(0.1261, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.1523, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(0.0201, grad_fn=<DivBackward0>)}
{'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward0>), 'labels': tensor([], dtype=torch.int64), 'scores': tensor([], grad_fn=<IndexBackward0>)}


In [10]:
def getModel(numClasses):
    # load a model pre-trained on COCO
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

    # replace the classifier with a new one, that has
    # num_classes which is user-defined
    num_classes = 6  # 1 class (person) + background
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    
    return model

In [15]:
# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# our dataset has 5 classes, plus the background class
num_classes = 6

# use our dataset and defined transformations
dataset = AugmentedDataset('AugmentedDatasetCombined',640, 640, get_transform(train=True))
dataset_test = AugmentedDataset('AugmentedDatasetCombined',640, 640,  get_transform(train=False))

# split the dataset in train and test set
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-50])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=utils.collate_fn
)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=1,
    shuffle=False,
    collate_fn=utils.collate_fn
)

# get the model using our helper function
model = getModel(num_classes)

# move model to the right device
model.to(device)

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005
)

# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)

# let's train it just for 2 epochs
num_epochs = 2

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

print("That's it!")

Epoch: [0]  [   0/2599]  eta: 0:37:49  lr: 0.000010  loss: 1.7179 (1.7179)  loss_classifier: 1.6304 (1.6304)  loss_box_reg: 0.0753 (0.0753)  loss_objectness: 0.0041 (0.0041)  loss_rpn_box_reg: 0.0080 (0.0080)  time: 0.8732  data: 0.0510  max mem: 3535
Epoch: [0]  [  10/2599]  eta: 0:27:50  lr: 0.000060  loss: 1.7507 (1.9297)  loss_classifier: 1.5224 (1.4915)  loss_box_reg: 0.1220 (0.1762)  loss_objectness: 0.0426 (0.2399)  loss_rpn_box_reg: 0.0094 (0.0220)  time: 0.6451  data: 0.0659  max mem: 3694
Epoch: [0]  [  20/2599]  eta: 0:26:14  lr: 0.000110  loss: 1.4202 (1.4762)  loss_classifier: 1.0706 (1.1131)  loss_box_reg: 0.1392 (0.1743)  loss_objectness: 0.0426 (0.1698)  loss_rpn_box_reg: 0.0097 (0.0191)  time: 0.5973  data: 0.0690  max mem: 3694
Epoch: [0]  [  30/2599]  eta: 0:25:06  lr: 0.000160  loss: 0.5915 (1.2099)  loss_classifier: 0.3423 (0.8379)  loss_box_reg: 0.1392 (0.1806)  loss_objectness: 0.0313 (0.1713)  loss_rpn_box_reg: 0.0097 (0.0201)  time: 0.5542  data: 0.0602  max me

AssertionError: All bounding boxes should have positive height and width. Found invalid box [202.22222900390625, 546.6666259765625, 202.22222900390625, 564.4443969726562] for target at index 0.

In [17]:
torch.save(model.state_dict(), "model")