In [1]:
import torch
import torchvision
import torch.nn as nn
from torch.utils.data import DataLoader
import cv2
import math
import sys
from tqdm import tqdm
import matplotlib.pyplot as plt
from torchvision.utils import draw_bounding_boxes

In [2]:
if (torch.cuda.is_available()):
    device = torch.device("cuda")
    print(device, torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print(device)
print(torch.__version__)
print(torchvision.__version__)

# device = torch.device("cpu")

cuda NVIDIA GeForce GTX 1080 Ti
1.12.1+cu116
0.13.1+cu116


In [13]:
from dataset import MusicSheetDataSet
import random

def area(box):
    return (box[2] - box[0]) * (box[3] - box[1])

def crop(image, target, region, dataset):
    boxes = []
    labels = []
    for annotation in target:
        # print(annotation)
        orig_box = [x for x in annotation['a_bbox']]
        new_box = [
            max(orig_box[0], region[0]) - region[0],
            max(orig_box[1], region[1]) - region[1],
            min(orig_box[2], region[2]) - region[0],
            min(orig_box[3], region[3]) - region[1],
        ]

        if new_box[0] >= new_box[2] or new_box[1] >= new_box[3] or area(new_box) < area(orig_box) * 0.5:
            continue

        for cat_id in annotation['cat_id']:
            if (cat_id == None):
                continue

            category = dataset.get_category(cat_id)
            if (category['annotation_set'] != 'deepscores'):
                continue

            if (category['name'] in {'stem', 'ledgerLine'}):
                break
            # if (category['name'] in oneset):
            labels.append(int(cat_id))
            boxes.append(new_box)
    
    return (
        torch.tensor(image[region[1] : region[3], region[0] : region[2]]).div(255).unsqueeze(0),
        {
            'boxes': torch.tensor(boxes),
            'labels': torch.tensor(labels),
        }
    )

def transform(images, targets, dataset):
    image_res = []
    target_res = []
    for image, target in zip(images, targets):
        height, width = image.shape

        x = random.randrange(0, width // 2)
        y = random.randrange(0, height // 2)
        region = [x, y, x + width // 2, y + height // 2]

        i, t = crop(image, target, region, dataset)

        if t['boxes'].shape[0] == 0:
            region = [0, 0, width // 2, height // 2]
            i, t = crop(image, target, region, dataset)
        
        image_res.append(i)
        target_res.append(t)
    return image_res, target_res

dataset = MusicSheetDataSet("ds2_dense", "train")

In [44]:
class MusicSymbolDetector:
    def __init__(self):
        self.model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(
            pretrained=True,
            num_classes=137,
            min_size=1024,
            max_size=1024,
            box_detections_per_img=300
        )

        params = [p for p in self.model.parameters() if p.requires_grad]
        self.optimizer = torch.optim.Adam(params)
        self.epoch = 0
        self.loss = 0
    
    def __call__(self, image):
        self.model.to(device)
        self.model.eval()
        # split image into 6 smaller image for accurate small object detection
        height, width = image.shape
        box_size = int(width * 0.55)

        x_starts = [0, width - box_size]
        y_starts = [0, (height - box_size) // 2, height - box_size]

        # inference on each sub image
        results = []
        for x in x_starts:
            for y in y_starts:
                image = torch.tensor(image[y : y + box_size, x : x + box_size]).unsqueeze(0).to(device)
                print(image.shape)
                results.append(self.model([image]))
        
        # combine results, reconstruction
        
        return results
    
    def save(self):
        torch.save({
            "model": self.model.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "epoch": self.epoch,
            "loss": self.loss,
        }, f"fasterrcnn/{self.epoch}")

    def load(self, path = None):
        if path == None:
            path = self
            self = MusicSymbolDetector()
        
        data = torch.load(path)
        self.model.load_state_dict(data['model'])
        self.optimizer.load_state_dict(data['optimizer'])
        current_epoch = data['epoch']
        
        print("loaded model at epoch: {}, loss: {}".format(current_epoch, data['loss']))

        return self
        # move optimizer to cuda
        # for state in self.optimizer.state.values():
        #     for k, v in state.items():
        #         if isinstance(v, torch.Tensor):
        #             state[k] = v.cuda()
    
    def train(self, dataset : MusicSheetDataSet, epochs : int = 1, transform = None):
        self.model.to(device)
        self.model.train()
        torch.cuda.empty_cache()

        data_count = len(dataset)

        for _ in range(epochs):
            all_losses = 0
            all_losses_dict = {}

            dataloader = DataLoader(
                dataset,
                collate_fn=lambda x : zip(*x),
                shuffle = True
            )

            for images, targets in tqdm(dataloader):
                if transform != None:
                    images, targets = transform(images, targets, dataset)
                
                images = [image.to(device) for image in images]
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

                self.optimizer.zero_grad(set_to_none=True)

                loss_dict: dict[str, torch.Tensor] = self.model(images, targets) # the model computes the loss automatically if we pass in targets

                losses: torch.Tensor = sum(loss for loss in loss_dict.values())

                loss_value = losses.item()
                all_losses += loss_value
                
                for k, v in loss_dict.items():
                    if k not in all_losses_dict:
                        all_losses_dict[k] = 0
                    all_losses_dict[k] += v
                
                if not math.isfinite(loss_value):
                    print(f"Loss is {loss_value}, stopping trainig") # train if loss becomes infinity
                    print(loss_dict)
                    sys.exit(1)
                
                losses.backward()
                
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
                self.optimizer.step()
            
            self.epoch += 1
            self.loss = all_losses / data_count
            print("Epoch {:>3}, lr: {:.6f}, loss: {:.6f}, {}".format(
                self.epoch,
                self.optimizer.param_groups[0]['lr'], 
                self.loss,
                ', '.join("{}: {:.6f}".format(k, v / data_count) for k, v in all_losses_dict.items()),
            ))


In [15]:
detector = MusicSymbolDetector()
detector.train(dataset, 10, transform)

100%|██████████| 1362/1362 [08:02<00:00,  2.82it/s] 


Epoch   1, lr: 0.001000, loss: 1.638613, loss_classifier: 0.533824, loss_box_reg: 0.434572, loss_objectness: 0.164745, loss_rpn_box_reg: 0.505472


100%|██████████| 1362/1362 [06:10<00:00,  3.68it/s]


Epoch   2, lr: 0.001000, loss: 1.003286, loss_classifier: 0.230735, loss_box_reg: 0.256892, loss_objectness: 0.109326, loss_rpn_box_reg: 0.406333


100%|██████████| 1362/1362 [05:45<00:00,  3.94it/s]


Epoch   3, lr: 0.001000, loss: 0.831465, loss_classifier: 0.167384, loss_box_reg: 0.204791, loss_objectness: 0.108266, loss_rpn_box_reg: 0.351024


100%|██████████| 1362/1362 [05:40<00:00,  4.00it/s]


Epoch   4, lr: 0.001000, loss: 0.735072, loss_classifier: 0.143315, loss_box_reg: 0.183961, loss_objectness: 0.080195, loss_rpn_box_reg: 0.327601


100%|██████████| 1362/1362 [05:40<00:00,  4.00it/s]


Epoch   5, lr: 0.001000, loss: 0.674543, loss_classifier: 0.122734, loss_box_reg: 0.165570, loss_objectness: 0.075550, loss_rpn_box_reg: 0.310689


100%|██████████| 1362/1362 [05:42<00:00,  3.98it/s]


Epoch   6, lr: 0.001000, loss: 0.628249, loss_classifier: 0.114969, loss_box_reg: 0.155037, loss_objectness: 0.068855, loss_rpn_box_reg: 0.289388


100%|██████████| 1362/1362 [05:45<00:00,  3.94it/s]


Epoch   7, lr: 0.001000, loss: 0.571289, loss_classifier: 0.102222, loss_box_reg: 0.143025, loss_objectness: 0.058526, loss_rpn_box_reg: 0.267516


100%|██████████| 1362/1362 [05:41<00:00,  3.98it/s]


Epoch   8, lr: 0.001000, loss: 0.534824, loss_classifier: 0.089530, loss_box_reg: 0.133623, loss_objectness: 0.055824, loss_rpn_box_reg: 0.255847


100%|██████████| 1362/1362 [05:35<00:00,  4.06it/s]


Epoch   9, lr: 0.001000, loss: 0.517922, loss_classifier: 0.084330, loss_box_reg: 0.128994, loss_objectness: 0.054858, loss_rpn_box_reg: 0.249739


100%|██████████| 1362/1362 [05:40<00:00,  4.00it/s]


Epoch  10, lr: 0.001000, loss: 0.490933, loss_classifier: 0.077152, loss_box_reg: 0.121108, loss_objectness: 0.051761, loss_rpn_box_reg: 0.240913


In [45]:
detector = MusicSymbolDetector.load("fasterrcnn/10")

loaded model at epoch: 10, loss: 0.49093336560018086


In [46]:
# img = cv2.imread("ds2_dense/images/lg-900267602436792595-aug-gutenberg1939--page-4.png", cv2.IMREAD_GRAYSCALE)
img = cv2.imread("sheets/bohemia rhapsody.png", cv2.IMREAD_GRAYSCALE)
img = torch.tensor(img).div(255)

# img, res = dataset[266]
# load = torch.load("fasterrcnn/1")
# model.load_state_dict(load['model'])
# optimizer.load_state_dict(load['optimizer'])
# load("fasterrcnn/1024-1")

res = detector(img)[0]

labels = [dataset.get_category(l.item())['name'] for l in res['labels']]

print(res['labels'].shape)

plt.imshow(draw_bounding_boxes(img.mul(255).type(torch.uint8), res['boxes'], labels).moveaxis(0, 2))
plt.savefig("img2.png", dpi=800)


# print(img.shape)

# device = torch.device("cpu")
# results = HRNetBackbbone().to(device)(img[:, 0:1280, 0:1280].unsqueeze(0))
# results = model.detect(img)

# print(results)

# x = results['0'].sum(1).moveaxis(0, 2).detach().cpu()
# print(x.shape)
# plt.imshow(x)
# plt.savefig("img.png", dpi=800)

# model.model.backbone = HRNetBackbbone()


# TODO: validation, test


  image = torch.tensor(image[y : y + box_size, x : x + box_size]).unsqueeze(0)


torch.Size([1, 1636, 1636])


RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 67108864 bytes.

In [None]:
from dataset import MusicSheetDataSet

max_asp = 0
min_asp = 100

def trans(image, target):
    global max_asp, min_asp

    y, x = image.shape

    asp = y / x
    if asp > max_asp:
        max_asp = asp
    if asp < min_asp:
        min_asp = asp


x = MusicSheetDataSet("ds2_dense", "train", trans)

for i in x:
    pass

print(max_asp, min_asp)

# image, target = dataset[0]
# print(image.shape)
# _, x, y = image.shape


# scaleX = 800 / x
# scaleY = 800 / y

# # image = cv2.resize(image.mul(255).type(torch.uint8).numpy(), dsize=(800, 800))

# # target = model(image.unsqueeze(0).to(device))[0]

# plt.imshow(draw_bounding_boxes(
#     image.mul(255).type(torch.uint8), 
#     torch.concat([torch.tensor([b[0] * scaleX, b[1] * scaleY, b[2] * scaleX, b[3] * scaleY]).unsqueeze(0) for b in target['boxes']]), 
#     [oneset_rev[x.item()] for x in target['labels']]
# ).moveaxis(0, 2))
# plt.savefig("img.png", dpi=800)





1.4148148148148147 1.4138398914518318
