## Imports
https://pytorch.org/vision/main/feature_extraction.html

### Docs

[Mask R-CNN ResNet-50](https://pytorch.org/vision/main/models/generated/torchvision.models.detection.maskrcnn_resnet50_fpn.html#torchvision.models.detection.maskrcnn_resnet50_fpn)

[Faster R-CNN](https://pytorch.org/vision/main/models/generated/torchvision.models.detection.fasterrcnn_resnet50_fpn.html#torchvision.models.detection.fasterrcnn_resnet50_fpn)

[Dataset ImageFolder](https://pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html)

In [1]:
import torch
import torchvision
from torchvision.models import resnet50
from torchvision.models.feature_extraction import get_graph_node_names
from torchvision.models.feature_extraction import create_feature_extractor

from torchvision.models.detection import fasterrcnn_resnet50_fpn as FasterRCNN
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights as Weights
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor


from torchvision.models.detection.backbone_utils import LastLevelMaxPool
from torchvision.ops.feature_pyramid_network import FeaturePyramidNetwork
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset

from torchvision.io import decode_image
from torchvision import tv_tensors

import matplotlib.pyplot as plt
from PIL import Image


### Constants

In [2]:
# Dataset locations, all relative to the extracted archive root.
ROOT = 'CUB_200_2011/CUB_200_2011/'
IMAGES = ROOT + 'images/'
TRAIN_DIR = ROOT + 'train/'
TEST_DIR = ROOT + 'test/'

# Directories that hold the reduced train/test subset.
SAMPLE_TRAIN = ROOT + 'subset/train/'
SAMPLE_TEST = ROOT + 'subset/test/'

# Subset switch (not read by any code visible in this notebook section).
SUBSET = True

# Highest class ID kept when sampling; images of higher classes are left out.
SAMPLE = 3

### Helper Functions

In [3]:
def translate():
    '''
        Returns two maps/dicts. species names --> class ID number and the inverse

        Reads classes.txt, where each line is "[Class_ID] [Species_Name]".
        Blank lines are skipped. Parsing stops at the first line that does
        not match that format, keeping the entries read so far.
    '''
    strToInt = {}
    intToStr = {}
    # The context manager closes the file on every exit path.
    with open("CUB_200_2011/CUB_200_2011/classes.txt") as classes:
        for entry in classes:
            parsed = entry.split()
            if not parsed:
                # A blank line carries no entry and must not end the scan.
                continue
            try:
                class_id = int(parsed[0])
                name = parsed[1]
            except (IndexError, ValueError):
                # Malformed line: stop and keep what was parsed (best effort).
                break
            strToInt[name] = class_id
            intToStr[class_id] = name
    return strToInt, intToStr

In [4]:
def fetchNames():
    '''
        Returns a map from [img_no] --> ([Img_Dir/Img_Name], [class_ID])

        The image name is stored without its last four characters (the
        ".jpg" extension). Blank lines are skipped in both files. Each
        pass stops at the first malformed line, keeping the entries read
        so far. An image that never receives a class label keeps its
        bare path string as the value.
    '''
    data = {}  # Hold all Input/Classification pairs

    # File Data: [Img_No] [Img_Dir/Img_Name]
    with open("CUB_200_2011/CUB_200_2011/images.txt") as imgs:
        for entry in imgs:
            parsed = entry.split()
            if not parsed:
                continue
            try:
                img_no = int(parsed[0])
                name = parsed[1]
            except (IndexError, ValueError):
                break
            data[img_no] = name[0:len(name) - 4]

    # File Data: [Img_No] [Class_ID]
    with open("CUB_200_2011/CUB_200_2011/image_class_labels.txt") as classes:
        for entry in classes:
            parsed = entry.split()
            if not parsed:
                continue
            try:
                img_no = int(parsed[0])
                img = data[img_no]
                class_id = int(parsed[1])
            except (IndexError, KeyError, ValueError):
                # Unknown image number or malformed line: stop (best effort).
                break
            data[img_no] = (img, class_id)
    return data

In [5]:
def trainTestSplit():
    '''
        Split the image IDs into training and testing sets
        based on the recommendation provided

        File format: [Image ID] [0/1]

        Only images whose class ID is at most SAMPLE are kept: the scan
        stops at the first image of a higher class, and also at the first
        malformed line or unknown image ID. Blank lines are skipped.

        Returns two lists of image IDs
    '''
    # A map from [Img_ID] --> ([Img_Path], [Class_ID]); loaded once.
    data = fetchNames()
    train = []
    test = []

    with open("CUB_200_2011/CUB_200_2011/train_test_split.txt") as file:
        for entry in file:
            # Separate the image ID from the boolean
            line = entry.split()
            if not line:
                continue
            try:
                i = int(line[0])
                # Stopping here assumes the file lists images in ascending
                # class order -- TODO confirm against the dataset.
                if data[i][1] > SAMPLE:
                    break
                training = int(line[1]) == 1
            except (IndexError, KeyError, TypeError, ValueError):
                break
            if training:
                train.append(i)
            else:
                test.append(i)
    return train, test

In [6]:
from PIL import Image 
import PIL
import os
def mkTestTrainDir():
    '''
        Copy the sampled train/test images into the subset directories.

        Creates TRAIN_DIR and TEST_DIR, then for every image ID returned by
        trainTestSplit() saves the image from IMAGES into SAMPLE_TRAIN or
        SAMPLE_TEST under its class directory, creating directories as
        needed. Images are re-saved through PIL.
    '''
    train, test = trainTestSplit()
    print(f'Train: {len(train)}. Test: {len(test)}')
    # trainTestSplit() returns bare image IDs, so resolve each one to
    # ([Img_Dir/Img_Name], [class_ID]) before building any path.
    names = fetchNames()

    for ids, top_dir, out_dir in ((train, TRAIN_DIR, SAMPLE_TRAIN),
                                  (test, TEST_DIR, SAMPLE_TEST)):
        os.makedirs(top_dir, exist_ok=True)
        for i in ids:
            path, classID = names[i]
            if classID > SAMPLE:
                break
            d = path.split('/')[0]
            # makedirs also creates the missing 'subset/...' parents and
            # tolerates directories left over from an earlier run.
            os.makedirs(f'{out_dir}{d}', exist_ok=True)
            with Image.open(f'{IMAGES}{path}.jpg') as img:
                img.save(f'{out_dir}{path}.jpg')


# Disabled by default because it writes image files to disk; switch the
# condition to True to (re)build the subset directories.
if (False):
    mkTestTrainDir()

## Read in Bounding Boxes

In [7]:
def boundingBoxes():
    '''
        Parse bounding_boxes.txt into corner-format box tensors.

        File format: [Img_No] [x] [y] [width] [height]

        Lines that cannot be parsed are reported and skipped.

        Returns (boxes, ids, dic):
            boxes: list of 1x4 tensors [[x, y, x + width, y + height]]
            ids:   list of scalar tensors holding the image number,
                   parallel to boxes
            dic:   map from [Img_No] (int) --> that image's 1x4 box tensor
    '''
    dic = {}
    boxes = []
    ids = []
    with open(f'{ROOT}bounding_boxes.txt') as file:
        for l in file:
            line = l.split()
            try:
                key = int(line[0])
                x = float(line[1])
                y = float(line[2])
                width = float(line[3])
                height = float(line[4])
            except (IndexError, ValueError):
                print("Error parsing coordinate values")
                continue
            # Convert (x, y, width, height) to (x1, y1, x2, y2).
            box = torch.tensor([[x, y, x + width, y + height]])
            boxes.append(box)
            # torch.tensor(key) stores the ID as a value; torch.Tensor(key)
            # would allocate an uninitialized tensor with `key` elements.
            ids.append(torch.tensor(key))
            dic[key] = box
    return boxes, ids, dic

## Read in Images


In [8]:
# Transforms
# https://www.kaggle.com/code/sharansmenon/pytorch-cubbirds200-classification
# Augmentation pipeline: random horizontal and vertical flips (p=0.5 each),
# an occasional ColorJitter (p=0.1), then resize to 256 and centre-crop to 224.
# NOTE(review): these transforms act on the image only; if they are applied to
# detection samples the bounding boxes presumably need the same flip/crop -- confirm.
transform = transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.5),
            transforms.RandomApply(torch.nn.ModuleList([transforms.ColorJitter()]), p=0.1),
            transforms.Resize(256),
            transforms.CenterCrop(224),
            # Tensor conversion and normalisation are currently disabled:
            # transforms.ToTensor(),
            # transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])

In [9]:
class BirdImages(Dataset):
    '''
        Dataset yielding one decoded image tensor per image ID.

        root:      directory the relative image paths are joined onto
        ids:       iterable of image IDs; a sorted copy is kept
        classMap:  map from [Img_ID] --> ([Img_Dir/Img_Name], [class_ID])
        boxMap:    map from [Img_ID] --> box tensor (stored, not read here)
        transform: stored as self.transforms; not applied by __getitem__
    '''
    def __init__(self, root, ids, classMap, boxMap, transform=None):
        super().__init__()
        self.root = root
        # Sorted copy: deterministic order without reordering the caller's list.
        self.data = sorted(ids)
        self.classes = classMap
        self.boxes = boxMap
        self.transforms = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        '''
            Returns the image at position idx as a float tensor.
        '''
        imgID = self.data[idx]
        path, _ = self.classes[imgID]

        # Paths in classMap have no extension, so it is re-added here.
        img_path = os.path.join(self.root, f'{path}.jpg')
        # NOTE(review): .float() keeps the decoded pixel values unscaled;
        # torchvision detection models document a 0-1 input range --
        # confirm whether the image should be divided by 255.
        return decode_image(img_path).float()
        
class BirdBoxes(Dataset):
    '''
        Dataset yielding one detection target dict per image ID.

        root:      stored but not read by this class
        ids:       iterable of image IDs; a sorted copy is kept
        classMap:  map from [Img_ID] --> ([Img_Dir/Img_Name], [class_ID])
        boxMap:    map from [Img_ID] --> box coordinates (x1, y1, x2, y2)
        transform: stored as self.transforms; not applied by __getitem__
    '''
    def __init__(self, root, ids, classMap, boxMap, transform=None):
        super().__init__()
        self.root = root
        # Sorted copy: deterministic order without reordering the caller's list.
        self.data = sorted(ids)
        self.classes = classMap
        self.boxes = boxMap
        self.transforms = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        '''
            Returns {'boxes': float tensor [N, 4], 'labels': int64 tensor [N]}
            for the image at position idx, with the class ID repeated once
            per box.
        '''
        imgID = self.data[idx]
        _, classID = self.classes[imgID]

        box = torch.as_tensor(self.boxes[imgID], dtype=torch.float32).reshape(-1, 4)
        # torch.Tensor(classID) would allocate an uninitialized tensor with
        # `classID` elements; fill the label tensor with the value instead.
        labels = torch.full((box.shape[0],), classID, dtype=torch.int64)
        return {'boxes': box, 'labels': labels}

# Load the dataset metadata at module level:
#   classMap: image ID --> (image path, class ID)
#   boxes, ids, dic: parsed bounding boxes (lists plus an ID-keyed map)
#   train, test: image-ID lists for each side of the split
classMap = fetchNames()
boxes, ids, dic = boundingBoxes()
train, test = trainTestSplit()

def getDataset(training):
    '''
        Build a dataset and its DataLoader for one side of the split.

        training: truthy --> BirdImages over the training IDs (SAMPLE_TRAIN)
                  falsy  --> BirdBoxes over the test IDs (SAMPLE_TEST)

        Returns (dataset, loader); the loader uses batch_size=32 and
        shuffle=True.
    '''
    classMap = fetchNames()
    _, _, dic = boundingBoxes()
    train, test = trainTestSplit()
    if training:
        dataset = BirdImages(SAMPLE_TRAIN, train, classMap, dic, transform=transform)
    else:
        # The evaluation branch reads the held-out IDs and the test directory.
        # NOTE(review): BirdBoxes yields target dicts only, no images --
        # confirm this is the intended dataset type for evaluation.
        dataset = BirdBoxes(SAMPLE_TEST, test, classMap, dic, transform=transform)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    return dataset, loader

## Model

[Building a Faster R-CNN](https://medium.com/@fractal.ai/guide-to-build-faster-rcnn-in-pytorch-42d47cb0ecd3)

https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html#defining-your-model

[The example I'm following](https://pytorch.org/vision/main/models/generated/torchvision.models.detection.fasterrcnn_resnet50_fpn.html#torchvision.models.detection.fasterrcnn_resnet50_fpn)

In [None]:
# Load the metadata: image ID --> (path, class ID), the ID-keyed box map,
# and the train/test image-ID lists.
classMap = fetchNames()
boxes, ids, dic = boundingBoxes()
train, test = trainTestSplit()

print(len(train))

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=Weights.DEFAULT)
# For training
trainS, trainL = getDataset(True)
# Both datasets order their samples by sorted image ID, so position i of one
# lines up with position i of the other.
targetS = BirdBoxes(SAMPLE_TRAIN, train, classMap, dic, transform=transform)

print("\n\nLOOPING\n\n")
# The detection model documents its inputs as a list of image tensors and a
# list of target dicts, so materialise both explicitly instead of passing the
# Dataset objects themselves.
images = [trainS[i] for i in range(len(trainS))]
targets = [targetS[i] for i in range(len(targetS))]
print("\n\nTRAINING\n\n")

# NOTE(review): this pushes every training image through a single forward
# pass; consider iterating in batches if memory becomes a problem.
output = model(images, targets)


90


LOOPING




TRAINING




In [None]:
# For inference
model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
# Prediction needs no gradients, so skip building the autograd graph.
with torch.no_grad():
    predictions = model(x)

# optionally, if you want to export the model to ONNX:
torch.onnx.export(model, x, "faster_rcnn.onnx", opset_version=11)

## Articles

[Feature extraction for model inspection](https://pytorch.org/vision/main/feature_extraction.html)

[Feature Pyramid Network](https://pytorch.org/vision/main/generated/torchvision.ops.FeaturePyramidNetwork.html)

[Bounding Box Prediction using PyTorch](https://www.geeksforgeeks.org/bounding-box-prediction-using-pytorch/)

# From a different tutorial

# Build a fresh pretrained detector and size its classification head for this
# dataset.
# NOTE(review): this rebinds `model`, so running it after the training cell
# replaces that model -- confirm the intended cell order.
model = FasterRCNN(weights=Weights.DEFAULT)
num_classes = SAMPLE + 1  # SAMPLE species (currently 3, not all 200) + background

# Replace the box predictor with a new FastRCNNPredictor that has num_classes
# outputs, reusing the input feature count of the existing head.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

