In [1]:
import os
import numpy as np
import torch
import torchvision.transforms as transforms
import torchvision
import matplotlib.pyplot as plt

from PIL import Image
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.utils import draw_bounding_boxes
from tqdm import tqdm

In [2]:
class MenuDataset(torch.utils.data.Dataset):
    """Object-detection dataset over a YOLO-style export.

    The split directory ``<root>/<path>`` must contain an ``images`` folder
    and a ``labels`` folder. Every image ``<stem>.<ext>`` is paired with the
    label file ``<stem>.txt``, whose rows are ``class cx cy w h`` with the box
    coordinates expressed as fractions of the image width/height.

    Args:
        root: Directory that holds the dataset splits.
        path: Split sub-directory (e.g. ``'train/'``).

    Each item is ``(image, target)`` where ``image`` is the tensor produced by
    ``transforms.ToTensor()`` and ``target`` is a dict with the keys
    ``boxes`` (float32, ``xyxy`` in pixels), ``labels`` (int64, file class id
    plus one so that 0 stays free for the background), ``image_id``, ``area``
    and ``iscrowd``.
    """

    # Lower-case file extensions accepted as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, root, path):
        self.root = root
        self.path = path
        self._transforms = transforms.Compose([transforms.ToTensor()])
        image_dir = os.path.join(root, path, "images")
        # Filter on the real extension rather than on a substring of the name,
        # and sort so that indices are stable between runs.
        self.imgs = sorted(
            name for name in os.listdir(image_dir)
            if os.path.splitext(name)[1].lower() in self.IMAGE_EXTENSIONS
        )
        # Derive every label name from its image instead of sorting the two
        # folders independently: one missing or extra file can then no longer
        # shift all the following image/label pairs out of alignment.
        self.masks = [os.path.splitext(name)[0] + ".txt" for name in self.imgs]

    @staticmethod
    def _read_label_rows(label_path):
        """Return the label file as an (N, 5) array; N is 0 if it is missing or empty."""
        # A missing or empty label file is treated as "no objects in this image".
        if not os.path.isfile(label_path) or os.path.getsize(label_path) == 0:
            return np.zeros((0, 5), dtype=np.float64)
        return np.loadtxt(label_path, ndmin=2)

    @staticmethod
    def _to_pixel_cxcywh(rows, width, height):
        """Split label rows into int64 class ids and pixel-space (cx, cy, w, h) boxes."""
        rows = np.asarray(rows, dtype=np.float64)
        if rows.size == 0:
            rows = rows.reshape(0, 5)
        elif rows.ndim == 1:
            # A file with a single object is read as a flat vector.
            rows = rows[np.newaxis, :]
        scale = np.array([width, height, width, height], dtype=np.float64)
        return rows[:, 0].astype(np.int64), rows[:, 1:] * scale

    def __getitem__(self, idx):
        img_path = os.path.join(self.root, self.path, "images", self.imgs[idx])
        label_path = os.path.join(self.root, self.path, "labels", self.masks[idx])
        # Force three channels so grayscale, palette and RGBA files all yield
        # the same tensor layout.
        img = Image.open(img_path).convert("RGB")
        width, height = img.size

        rows = self._read_label_rows(label_path)
        class_ids, boxes_cxcywh = self._to_pixel_cxcywh(rows, width, height)
        num_objs = len(class_ids)

        # Centre/size boxes -> corner boxes, stored as float32.
        boxes = torchvision.ops.box_convert(
            torch.as_tensor(boxes_cxcywh, dtype=torch.float32), 'cxcywh', 'xyxy')
        # Shift the class ids by one: label 0 is reserved for the background.
        labels = torch.as_tensor(class_ids + 1, dtype=torch.int64)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self._transforms is not None:
            # The transform only touches the image; the target is passed through.
            img = self._transforms(img)

        return img, target

    def __len__(self):
        return len(self.imgs)

In [3]:
# Training split of the YOLO-format menu export.
dataset = MenuDataset(root='me.v3-90-10.yolov5pytorch', path='train/')

In [4]:
# Start from a Faster R-CNN (MobileNetV3-Large FPN) detector pre-trained on COCO.
model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)
# background + item, description, title and price
num_classes = 5
# Replace the pre-trained box head with a fresh one sized for our classes;
# it keeps the input width of the classifier it replaces.
box_head = model.roi_heads.box_predictor
in_features = box_head.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [5]:
# Detection samples differ in image size and in number of boxes, so the
# default collate function (which stacks tensors) cannot batch them. Keep each
# batch as a pair of tuples (images, targets) instead.
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=True,
    collate_fn=lambda batch: tuple(zip(*batch)))
# Pull a single (image, target) sample straight from the dataset.
images,targets = dataset[1]

In [6]:
# The model takes a list of images and a list of targets. Wrap the single
# sample only once so that re-running this cell does not nest the lists.
if not isinstance(images, list):
    images = [images]
if not isinstance(targets, list):
    targets = [targets]
# Training mode: given targets, the model returns a dict of losses.
model.train()
output = model(images,targets)
# For inference: the model returns one dict of detections per input image.
model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]  # random example inputs (not used below)
# No gradients are needed for predictions.
with torch.no_grad():
    predictions = model(images)

[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.


In [7]:
def show(imgs):
    """Plot an image tensor, or a list of them, side by side with the axes hidden."""
    images = imgs if isinstance(imgs, list) else [imgs]
    to_pil = transforms.ToPILImage()
    _, axes = plt.subplots(ncols=len(images), squeeze=False)
    for col, tensor in enumerate(images):
        # Detach from autograd before converting to a PIL image / numpy array.
        pixels = np.asarray(to_pil(tensor.detach()))
        ax = axes[0, col]
        ax.imshow(pixels)
        ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

In [26]:
def main(model, root='me.v3-90-10.yolov5pytorch', num_epochs=10):
    """Fine-tune ``model`` on the train split and plot predictions on the test split.

    Args:
        model: Detection model that returns a dict of losses when called with
            ``(images, targets)`` in training mode and a list of prediction
            dicts (with a ``'boxes'`` entry) when called with ``images`` in
            eval mode.
        root: Dataset directory containing the ``train/`` and ``test/`` splits.
        num_epochs: Number of passes over the training split.

    The model is trained in place. After the last epoch, the test images with
    the predicted boxes drawn on them are displayed with ``show``.
    """
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # the train and test splits live in separate folders of the export
    dataset = MenuDataset(root, 'train/')
    dataset_test = MenuDataset(root, 'test/')

    # move model to the right device
    model.to(device)

    # construct an optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # helper: float image in [0, 1] -> uint8 image, as draw_bounding_boxes expects
    tensor_to_int = transforms.ConvertImageDtype(torch.uint8)
    predictions_imgs = []
    for epoch in range(num_epochs):
        model.train()
        losses_list = []
        # Visit the training samples in a new random order every epoch.
        indices = torch.randperm(len(dataset)).tolist()
        for step, idx in enumerate(tqdm(indices)):
            image, target = dataset[idx]
            # The inputs must live on the same device as the model.
            images = [image.to(device)]
            targets = [{key: value.to(device) for key, value in target.items()}]
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            losses_list.append(losses.item())
            losses.backward()
            optimizer.step()
            if step % 100 == 0:
                # running mean of the loss over the current epoch
                print("Current Loss: ", sum(losses_list)/len(losses_list))
        # update the learning rate
        lr_scheduler.step()
        # run the model on the test dataset; only the last epoch's drawings are kept
        model.eval()
        predictions_imgs = []
        # No gradients are needed for inference; this avoids building a graph
        # for every test image.
        with torch.no_grad():
            for i in tqdm(range(len(dataset_test))):
                image, _ = dataset_test[i]
                outputs = model([image.to(device)])
                # Draw on the CPU copy of the image.
                img = tensor_to_int(image)
                predictions_imgs.append(
                    draw_bounding_boxes(img, outputs[0]['boxes'].cpu(),
                                        colors="yellow", width=6))

    # Nothing to plot when no epoch ran or the test split is empty.
    if predictions_imgs:
        show(predictions_imgs)
    print("That's it!")

In [27]:
# Fine-tune the detector and display its predictions on the test split.
main(model)

  1%|▍                                          | 1/113 [00:03<06:50,  3.67s/it]

Current Loss:  0.5639121501298073


 89%|████████████████████████████████████▋    | 101/113 [05:40<00:40,  3.37s/it]

Current Loss:  1.0501482845256742


100%|█████████████████████████████████████████| 113/113 [06:20<00:00,  3.37s/it]
100%|███████████████████████████████████████████| 12/12 [00:20<00:00,  1.70s/it]
  1%|▍                                          | 1/113 [00:03<06:20,  3.39s/it]

Current Loss:  0.6637884973818375


 89%|████████████████████████████████████▋    | 101/113 [05:40<00:40,  3.39s/it]

Current Loss:  1.0638431913166317


100%|█████████████████████████████████████████| 113/113 [06:20<00:00,  3.37s/it]
100%|███████████████████████████████████████████| 12/12 [00:20<00:00,  1.70s/it]
  1%|▍                                          | 1/113 [00:03<06:25,  3.44s/it]

Current Loss:  0.6958190830093316


 89%|████████████████████████████████████▋    | 101/113 [05:41<00:40,  3.36s/it]

Current Loss:  1.0457621524697427


100%|█████████████████████████████████████████| 113/113 [06:21<00:00,  3.38s/it]
100%|███████████████████████████████████████████| 12/12 [00:20<00:00,  1.74s/it]
  1%|▍                                          | 1/113 [00:03<06:33,  3.51s/it]

Current Loss:  0.6143025369632947


 89%|████████████████████████████████████▋    | 101/113 [05:41<00:40,  3.35s/it]

Current Loss:  1.012292560323083


100%|█████████████████████████████████████████| 113/113 [06:22<00:00,  3.38s/it]
100%|███████████████████████████████████████████| 12/12 [00:20<00:00,  1.69s/it]
  1%|▍                                          | 1/113 [00:03<06:36,  3.54s/it]

Current Loss:  0.5302072858124585


 89%|████████████████████████████████████▋    | 101/113 [05:39<00:40,  3.35s/it]

Current Loss:  0.9590625235276296


100%|█████████████████████████████████████████| 113/113 [06:19<00:00,  3.36s/it]
100%|███████████████████████████████████████████| 12/12 [00:19<00:00,  1.66s/it]
  1%|▍                                          | 1/113 [00:03<06:12,  3.33s/it]

Current Loss:  0.5294565313221474


 89%|████████████████████████████████████▋    | 101/113 [05:38<00:40,  3.35s/it]

Current Loss:  0.9423984353989815


100%|█████████████████████████████████████████| 113/113 [06:17<00:00,  3.34s/it]
100%|███████████████████████████████████████████| 12/12 [00:19<00:00,  1.65s/it]
  1%|▍                                          | 1/113 [00:03<06:13,  3.34s/it]

Current Loss:  0.5461807269014676


 89%|████████████████████████████████████▋    | 101/113 [05:39<00:39,  3.33s/it]

Current Loss:  0.9229582323865171


100%|█████████████████████████████████████████| 113/113 [06:18<00:00,  3.35s/it]
100%|███████████████████████████████████████████| 12/12 [00:19<00:00,  1.66s/it]
  1%|▍                                          | 1/113 [00:03<06:25,  3.44s/it]

Current Loss:  0.4961875679206373


 89%|████████████████████████████████████▋    | 101/113 [05:38<00:40,  3.35s/it]

Current Loss:  0.9223913413171746


100%|█████████████████████████████████████████| 113/113 [06:18<00:00,  3.35s/it]
100%|███████████████████████████████████████████| 12/12 [00:19<00:00,  1.65s/it]
  1%|▍                                          | 1/113 [00:03<06:08,  3.29s/it]

Current Loss:  0.5282086412172132


 89%|████████████████████████████████████▋    | 101/113 [05:36<00:39,  3.30s/it]

Current Loss:  0.9188659718901637


100%|█████████████████████████████████████████| 113/113 [06:16<00:00,  3.33s/it]
100%|███████████████████████████████████████████| 12/12 [00:19<00:00,  1.63s/it]
  1%|▍                                          | 1/113 [00:03<05:56,  3.19s/it]

Current Loss:  0.5331881785668027


 89%|████████████████████████████████████▋    | 101/113 [05:36<00:39,  3.32s/it]

Current Loss:  0.9192756988472177


100%|█████████████████████████████████████████| 113/113 [06:15<00:00,  3.33s/it]
100%|███████████████████████████████████████████| 12/12 [00:19<00:00,  1.63s/it]


That's it!


In [47]:
%matplotlib qt
dataset_test = MenuDataset('me.v3-90-10.yolov5pytorch','test/')
dict_labels={1:'description',2:'item',3:'price',4:'tittle'}
colors_dict={1:"yellow",2:"red",3:"green",4:"blue"}

img,target=dataset_test.__getitem__(8)
model.eval()
result=model([img])
indexes=[item.item() for item in result[0]['scores']>0.5]
tt=transforms.ConvertImageDtype(torch.uint8)
img=tt(img)
        
drawn_boxes = draw_bounding_boxes(img, result[0]['boxes'][indexes],
                    labels=[dict_labels[item.item()] for item in result[0]["labels"][indexes]], 
                    colors=[colors_dict[item.item()] for item in result[0]["labels"][indexes]],width=6)
show(drawn_boxes)

In [44]:
# Confidence scores of every detection in `result` for the first (only) input image.
result[0]['scores']

tensor([0.9152, 0.8686, 0.8670, 0.8524, 0.8392, 0.8374, 0.8340, 0.8325, 0.8238,
        0.8184, 0.8013, 0.8012, 0.8002, 0.7981, 0.7705, 0.7699, 0.7550, 0.7534,
        0.7502, 0.7325, 0.7252, 0.7185, 0.7084, 0.7012, 0.6992, 0.6940, 0.6904,
        0.6884, 0.6851, 0.6844, 0.6627, 0.6523, 0.6490, 0.6411, 0.6395, 0.6276,
        0.6118, 0.5943, 0.5848, 0.5751, 0.5664, 0.5640, 0.5608, 0.5564, 0.5560,
        0.5462, 0.5230, 0.5219, 0.5142, 0.5125, 0.5096, 0.5075, 0.5059, 0.5051,
        0.4847, 0.4829, 0.4733, 0.4637, 0.4583, 0.4562, 0.4548, 0.4543, 0.4537,
        0.4492, 0.4358, 0.4140, 0.4083, 0.3944, 0.3902, 0.3809, 0.3806, 0.3775,
        0.3656, 0.3629, 0.3412, 0.3405, 0.3272, 0.3269, 0.3217, 0.3134, 0.3038,
        0.2951, 0.2944, 0.2717, 0.2680, 0.2614, 0.2528, 0.2510, 0.2395, 0.2335,
        0.2320, 0.2278, 0.2276, 0.2220, 0.2146, 0.2131, 0.2129, 0.1956, 0.1939,
        0.1910], grad_fn=<IndexBackward0>)