# Multi Object Detection - Model Data Setup

WNixalo - 5/5/2018

---

This notebook's purpose is to build intuition and practice implementing the fastai workflow for multi-object detection: specifically how to load data w/ a `ModelData` object.

**References**: [codealong-pascal-multi](https://github.com/WNoxchi/Aersu/blob/master/GLOC/model_dev/codealong-fastai-dl2-pascal-multi.ipynb) | [fastai pascal-multi](https://github.com/fastai/fastai/blob/master/courses/dl2/pascal-multi.ipynb)


## Imports

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.conv_learner import *

sys.path.insert(1, os.path.join('../'))
from utils import common
from utils import temp_utils
from utils.subfolder_val_idxs import set_val_idxs

from matplotlib import patches, patheffects

In [3]:
# Root data directory; every path below is derived from it.
PATH = Path('../data')
# Data folders under the root (PATH_TRAIN is later used as the image folder for the ModelData object).
PATH_TRAIN     = PATH/'train'
PATH_TRAIN_BBX = PATH/'interstage_train'
# Label CSVs; judging by the names they pair with the two folders above — TODO confirm.
PATH_CSV     = PATH/'labels.csv'
PATH_CSV_BBX = PATH/'interstage_labels.csv'
# NOTE(review): presumably CPU-sized variants of the label CSVs; not referenced in the visible cells.
CPU_PATH_CSV     = PATH/'cpu_labels.csv'
CPU_PATH_CSV_BBX = PATH/'cpu_interstage_labels.csv'

## Multi Class from JSON $\rightarrow$ CSV

How do you create a ModelData object for multi-class classification?

In [23]:
# Peek at the first two raw annotation records.
# NOTE: `trn_j` is loaded by the JSON-reading cell further down; run that cell first.
for o in trn_j['annotations'][:2]:
    print(o)

{'segmentation': [[155, 96, 155, 270, 351, 270, 351, 96]], 'area': 34104, 'iscrowd': 0, 'image_id': 12, 'bbox': [155, 96, 196, 174], 'category_id': 7, 'id': 1, 'ignore': 0}
{'segmentation': [[184, 61, 184, 199, 279, 199, 279, 61]], 'area': 13110, 'iscrowd': 0, 'image_id': 17, 'bbox': [184, 61, 95, 138], 'category_id': 15, 'id': 2, 'ignore': 0}


In [7]:
# Read the annotation JSON. A `with` block closes the file handle as soon as parsing is
# done (the bare `path.open()` left it open until garbage collection).
# NOTE: `path` is the JSON *file*, not a directory.
path = Path('../pascal_train2007.json')
with path.open() as json_file:
    trn_j = json.load(json_file)

def get_trn_anno(annotations=None):
    """Group annotation records by image.

    Args:
        annotations: iterable of annotation dicts, each with the keys 'bbox'
            (``[x, y, width, height]``), 'image_id' and 'category_id'.
            Defaults to ``trn_j['annotations']``.

    Returns:
        A ``collections.defaultdict(list)`` mapping image_id to a list of
        ``(bbox, category_id)`` tuples, where ``bbox`` is a numpy array laid out as
        ``[top, left, bottom, right]`` with inclusive bottom/right coordinates.
    """
    if annotations is None:
        annotations = trn_j['annotations']
    trn_anno = collections.defaultdict(list)
    for o in annotations:
        # Every annotation is kept, including those flagged 'ignore'.
        bb = o['bbox']
        # [x, y, w, h] -> [y, x, y+h-1, x+w-1]: rows first, then columns; the -1 makes
        # the far corner inclusive.
        bb = np.array([bb[1], bb[0], bb[3] + bb[1] - 1, bb[2] + bb[0] - 1])
        trn_anno[o['image_id']].append((bb, o['category_id']))
    return trn_anno

# image_id -> [(bbox, category_id), ...]
trn_anno = get_trn_anno()

# Lookup tables built from the JSON metadata.
cats    = {c['id']: c['name'] for c in trn_j['categories']}      # category id -> name
trn_fns = {im['id']: im['file_name'] for im in trn_j['images']}  # image id -> file name
trn_ids = [im['id'] for im in trn_j['images']]                   # image ids in file order

# Multi-class labels: the *set* of class names present in each image,
# then the same thing as one space-separated string per image.
mc  = [{cats[cat_id] for _, cat_id in trn_anno[img_id]} for img_id in trn_ids]
mcs = [' '.join(map(str, names)) for names in mc]

df = pd.DataFrame(
    {'fn': [trn_fns[img_id] for img_id in trn_ids], 'class': mcs},
    columns=['fn','class'],
)

In [8]:
cats

{1: 'aeroplane',
 2: 'bicycle',
 3: 'bird',
 4: 'boat',
 5: 'bottle',
 6: 'bus',
 7: 'car',
 8: 'cat',
 9: 'chair',
 10: 'cow',
 11: 'diningtable',
 12: 'dog',
 13: 'horse',
 14: 'motorbike',
 15: 'person',
 16: 'pottedplant',
 17: 'sheep',
 18: 'sofa',
 19: 'train',
 20: 'tvmonitor'}

In [9]:
# Show the first ten (image id, file name) pairs.
for item in list(trn_fns.items())[:10]:
    print(item)

(12, '000012.jpg')
(17, '000017.jpg')
(23, '000023.jpg')
(26, '000026.jpg')
(32, '000032.jpg')
(33, '000033.jpg')
(34, '000034.jpg')
(35, '000035.jpg')
(36, '000036.jpg')
(42, '000042.jpg')


In [10]:
trn_ids[:10]

[12, 17, 23, 26, 32, 33, 34, 35, 36, 42]

In [11]:
mc[:10]

[{'car'},
 {'horse', 'person'},
 {'bicycle', 'person'},
 {'car'},
 {'aeroplane', 'person'},
 {'aeroplane'},
 {'train'},
 {'diningtable', 'person'},
 {'dog'},
 {'train'}]

In [12]:
mcs[:10]

['car',
 'person horse',
 'person bicycle',
 'car',
 'aeroplane person',
 'aeroplane',
 'train',
 'diningtable person',
 'dog',
 'train']

In [13]:
df.head()

Unnamed: 0,fn,class
0,000012.jpg,car
1,000017.jpg,person horse
2,000023.jpg,person bicycle
3,000026.jpg,car
4,000032.jpg,aeroplane person


In [None]:
# Build the CSV location from the data root `PATH`. Lowercase `path` is the annotation
# JSON file, so `path/'tmp/mc.csv'` would point inside a file rather than a directory.
MC_CSV = PATH/'tmp/mc.csv'

## Multi Object

How do you create a ModelData object for multi-object detection?

In [24]:
# Class *names* per image, one entry per annotated object (duplicates are kept on purpose).
mc  = [[cats[p[1]] for p in trn_anno[o]] for o in trn_ids]

# Zero-based class indices: id2cat maps index -> name, cat2id maps name -> index.
id2cat = list(cats.values())
cat2id = {v:k for k,v in enumerate(id2cat)}

# Images hold different numbers of objects, so `mcs` is ragged. Fill an object array
# element by element: np.array() on ragged input is rejected by NumPy >= 1.24, and it
# would silently build a 2-D array if every image had the same number of objects.
mcs = np.empty(len(mc), dtype=object)
for i, o in enumerate(mc):
    mcs[i] = np.array([cat2id[p] for p in o])

# Bounding boxes per image, flattened to [top, left, bottom, right, top, left, ...],
# plus the same data as space-separated strings for the CSV.
mbb  = [np.concatenate([p[0] for p in trn_anno[o]]) for o in trn_ids]
mbbs = [' '.join(str(p) for p in o) for o in mbb]

df_mbb = pd.DataFrame({'fn':[trn_fns[o] for o in trn_ids], 'bbox':mbbs}, columns=['fn','bbox'])

Because it's multi *Object* not multi *Class* the number of occurrences of a class in an image is important.

In [26]:
mc[:10]

[['car'],
 ['person', 'horse'],
 ['bicycle', 'bicycle', 'bicycle', 'person', 'person', 'person'],
 ['car'],
 ['aeroplane', 'aeroplane', 'person', 'person'],
 ['aeroplane', 'aeroplane', 'aeroplane'],
 ['train', 'train'],
 ['person', 'person', 'person', 'diningtable'],
 ['dog'],
 ['train', 'train']]

In [56]:
# Show both lookup structures side by side: id2cat is a list (index -> name),
# cat2id is a dict (name -> index).
print(f'type(id2cat): {type(id2cat)}; type(cat2id): {type(cat2id)}')
# ic is a class name from id2cat; ci is a (name, index) pair from cat2id.
for ic,ci in zip(id2cat, cat2id.items()):
    print(f'{ic:<15}{ci[0]+" :":>15} {ci[1]}')

type(id2cat): <class 'list'>; type(cat2id): <class 'dict'>
aeroplane          aeroplane : 0
bicycle              bicycle : 1
bird                    bird : 2
boat                    boat : 3
bottle                bottle : 4
bus                      bus : 5
car                      car : 6
cat                      cat : 7
chair                  chair : 8
cow                      cow : 9
diningtable      diningtable : 10
dog                      dog : 11
horse                  horse : 12
motorbike          motorbike : 13
person                person : 14
pottedplant      pottedplant : 15
sheep                  sheep : 16
sofa                    sofa : 17
train                  train : 18
tvmonitor          tvmonitor : 19


In [67]:
mcs[:10]

array([array([6]), array([14, 12]), array([ 1,  1,  1, 14, 14, 14]), array([6]), array([ 0,  0, 14, 14]),
       array([0, 0, 0]), array([18, 18]), array([14, 14, 14, 10]), array([11]), array([18, 18])],
      dtype=object)

So that's how it's done. `mcs` (multiple classes) contains arrays of class ids. `mbb` (multiple bounding boxes) is the same thing but for coordinates (`mbbs` just changes the format to CSV-compatible strings).

In [68]:
mbb[:10]

[array([ 96, 155, 269, 350]),
 array([ 61, 184, 198, 278,  77,  89, 335, 402]),
 array([229,   8, 499, 244, 219, 229, 499, 333, 177,   1, 499,  89,   0,   1, 368, 116,   1,   2, 461, 242,
          0, 224, 485, 333]),
 array([124,  89, 211, 336]),
 array([ 77, 103, 182, 374,  87, 132, 122, 196, 179, 194, 228, 212, 188,  25, 237,  43]),
 array([106,   8, 262, 498, 199, 420, 225, 481, 187, 324, 222, 410]),
 array([166, 115, 399, 359, 152, 140, 228, 332]),
 array([ 95,   0, 360, 190,  97, 217, 317, 464, 194, 467, 316, 499, 303,   2, 374, 499]),
 array([ 78,  26, 343, 318]),
 array([ 31, 262, 294, 499,  35,   0, 298, 234])]

In [69]:
mbbs[:10]

['96 155 269 350',
 '61 184 198 278 77 89 335 402',
 '229 8 499 244 219 229 499 333 177 1 499 89 0 1 368 116 1 2 461 242 0 224 485 333',
 '124 89 211 336',
 '77 103 182 374 87 132 122 196 179 194 228 212 188 25 237 43',
 '106 8 262 498 199 420 225 481 187 324 222 410',
 '166 115 399 359 152 140 228 332',
 '95 0 360 190 97 217 317 464 194 467 316 499 303 2 374 499',
 '78 26 343 318',
 '31 262 294 499 35 0 298 234']

Moment of truth: what does the fastai DataLoader CSV look like for multiple bounding boxes?

In [70]:
df_mbb.head()

Unnamed: 0,fn,bbox
0,000012.jpg,96 155 269 350
1,000017.jpg,61 184 198 278 77 89 335 402
2,000023.jpg,229 8 499 244 219 229 499 333 177 1 499 89 0 1...
3,000026.jpg,124 89 211 336
4,000032.jpg,77 103 182 374 87 132 122 196 179 194 228 212 ...


### Putting It Together: Multi-Class CSV + Multi-BoundingBox CSV

Finally, the Bounding Box and Class data have to be merged together so the fastai ModelData object can have access to them. This is done by writing a class to concatenate the two datasets (`mbbs` and `mcs`).

A ModelData object is created by loading the `mbbs` CSV. The `mbbs` and `mcs` arrays are then concatenated as a `ConcatLblDataset` object (which is defined for this purpose). This is done by concatenating the actual `mcs` array with the dataset of the ModelData object - which is where the `mbbs` array used to initialize the ModelData object lives.

The basic idea is:

- There are two arrays of output data for the dataset: classes per image (`mcs`) and bounding-boxes per image (`mbbs`).
- A ModelData object is created using one of the data arrays.
- The other data array is then concatenated with a copy of the first array *from* the ModelData object.
- The ModelData object's dataset is updated to be the new concatenation.

Also note the necessary transforms, pointing the ModelData constructor to the correct CSV and data folders, *and* specifying `continuous=True` to run regression on location (bounding box coordinates) data.

In [96]:
# setup (spec paths & put csv where it's expected)
os.makedirs(PATH/'tmp', exist_ok=True)

jpeg_path = PATH_TRAIN
MBB_CSV = PATH/'tmp/mbb.csv'

df_mbb.to_csv(MBB_CSV, index=False)

In [97]:
# The targets are bounding-box pixel coordinates, so they must be transformed together
# with the image (tfm_y=TfmType.COORD) and the image must not be cropped (CropType.NO);
# otherwise the boxes no longer line up with the resized input.
tfms = tfms_from_model(resnet34, sz=224, crop_type=CropType.NO, tfm_y=TfmType.COORD)

In [98]:
# The first argument is the data root directory. Lowercase `path` is the annotation JSON
# file, so the root has to be `PATH`.
md = ImageClassifierData.from_csv(PATH, jpeg_path, MBB_CSV, tfms=tfms, continuous=True)

In [99]:
class ConcatLblDataset(Dataset):
    """Wrap a dataset so each item carries a second label alongside its own.

    Indexing returns ``(x, (y, y2[i]))`` where ``(x, y)`` is item ``i`` of the wrapped
    dataset ``ds``. The wrapped dataset's ``sz`` attribute is mirrored on this object.
    """

    def __init__(self, ds, y2):
        self.ds = ds
        self.y2 = y2
        self.sz = ds.sz

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, i):
        inp, first_lbl = self.ds[i]
        second_lbl = self.y2[i]
        return (inp, (first_lbl, second_lbl))

In [100]:
# `mcs` has one entry per image in `trn_ids`, but `md.trn_ds` holds only the training
# split (from_csv holds out a validation split when no val_idxs are given). Indexing
# `mcs` by dataset position would pair images with other images' classes, so look the
# class arrays up by file name instead.
# NOTE(review): assumes `md.trn_ds` exposes `fnames`, as fastai 0.7 file datasets do — confirm.
fn2mcs = {trn_fns[o]: c for o, c in zip(trn_ids, mcs)}
trn_mcs = [fn2mcs[os.path.basename(str(fn))] for fn in md.trn_ds.fnames]
trn_ds2 = ConcatLblDataset(md.trn_ds, trn_mcs)

In [101]:
# Swap the training dataloader's dataset for the concatenated one, so each item
# yields (x, (bbox, classes)) instead of (x, bbox).
md.trn_dl.dataset = trn_ds2

And then you go from there.

***NOTE***: probably have to specify a validation set during this process. Same steps for the concatenated validation dataset:

```
val_ds2 = ConcatLblDataset(md.val_ds, val_mcs)
md.val_dl.dataset = val_ds2
```

The only functional thing left to do is to define the Neural Net's architecture and loss function. The architecture must have two output heads for bounding-box regression and classification (these require different loss functions). The architecture will also define the granularity of anchor boxes for detection. The loss function will optimize for classification and detection (**NOTE**: *I still have to see exactly how this works*).

## temp - Single-Shot Detector Head

In [None]:
class StdConv(nn.Module):
    """3x3 convolution -> ReLU -> batch norm -> dropout.

    Args:
        nin: number of input channels.
        nout: number of output channels.
        stride: convolution stride (default 2).
        drop: dropout probability (default 0.1).
    """

    def __init__(self, nin, nout, stride=2, drop=0.1):
        super().__init__()
        self.conv = nn.Conv2d(nin, nout, 3, stride=stride, padding=1)
        self.bn = nn.BatchNorm2d(nout)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        activated = F.relu(self.conv(x))
        normed = self.bn(activated)
        return self.drop(normed)
    
def flatten_conv(x,k):
    """Move channels last and reshape a 4-D conv output to (batch, -1, channels // k)."""
    batch, channels, _, _ = x.size()
    channels_last = x.permute(0, 2, 3, 1).contiguous()
    return channels_last.view(batch, -1, channels // k)

class OutConv(nn.Module):
    """Two parallel 3x3 output convolutions: classification and bounding-box regression.

    Args:
        k: multiplier on the output channels of both convolutions (also passed to
           `flatten_conv`).
        nin: number of input channels.
        bias: value the classification convolution's bias is initialised to.

    `forward` returns ``[class_output, bbox_output]``, each run through `flatten_conv`.
    """

    def __init__(self, k, nin, bias):
        super().__init__()
        self.k = k
        # One output per class in `id2cat` plus one extra, times k.
        n_cls_out = (len(id2cat) + 1) * k
        self.oconv1 = nn.Conv2d(nin, n_cls_out, 3, padding=1)
        # Four box coordinates, times k.
        self.oconv2 = nn.Conv2d(nin, 4*k, 3, padding=1)
        self.oconv1.bias.data.zero_().add_(bias)

    def forward(self, x):
        cls_out = flatten_conv(self.oconv1(x), self.k)
        box_out = flatten_conv(self.oconv2(x), self.k)
        return [cls_out, box_out]
In [54]:
# (Stray stub removed: a class header without a body is a SyntaxError. `SSD_Head` is
# defined in full in the following cell.)

In [None]:
class SSD_Head(nn.Module):
    """Single-scale detection head: dropout, two StdConv blocks, then an OutConv.

    Args:
        k: passed through to `OutConv`.
        bias: passed through to `OutConv` as the classification bias initialiser.
    """

    def __init__(self, k, bias):
        super().__init__()
        self.drop = nn.Dropout(0.25)
        # Stride 1 keeps the grid geometry unchanged while adding a layer of computation.
        self.sconv0 = StdConv(512, 256, stride=1)
        # (An additional 256->256 StdConv, `sconv1`, was left disabled by the author.)
        self.sconv2 = StdConv(256, 256)
        self.out = OutConv(k, 256, bias)

    def forward(self, x):
        x = self.drop(F.relu(x))
        for block in (self.sconv0, self.sconv2):
            x = block(x)
        return self.out(x)
    
# Build the custom head, wrap it around the backbone and create the learner.
# NOTE(review): `k` and `f_model` are not defined in any visible cell — define them
# before running this one.
head_reg4 = SSD_Head(k, -3.)
models = ConvnetBuilder(f_model, 0, 0, 0, custom_head=head_reg4)
learn  = ConvLearner(md, models)
# The original cell ended in a dangling `learn.` (a SyntaxError); completed to match the
# multi-head setup cell, which sets the optimizer the same way.
learn.opt_fn = optim.Adam

In [None]:
class SSD_MultiHead(nn.Module):
    """Multi-scale detection head: three stride-2 StdConv blocks, each followed by its
    own OutConv, with the per-scale outputs concatenated along dim 1.

    Args:
        k: passed through to each `OutConv`.
        bias: passed through to each `OutConv` as the classification bias initialiser.
        drop: dropout probability for the input dropout and every StdConv block.
            Previously read from an undefined global `drop`, which raised NameError on
            construction.
            NOTE(review): 0.4 is the value recalled from the referenced fastai
            pascal-multi notebook — confirm.

    `forward` returns ``[class_outputs, bbox_outputs]``.
    """

    def __init__(self, k, bias, drop=0.4):
        super().__init__()
        self.drop = nn.Dropout(drop)
        self.sconv1 = StdConv(512, 256, drop=drop) # stride 2 conv halves grid size
        self.sconv2 = StdConv(256, 256, drop=drop)
        self.sconv3 = StdConv(256, 256, drop=drop)
        self.out1 = OutConv(k, 256, bias)
        self.out2 = OutConv(k, 256, bias)
        self.out3 = OutConv(k, 256, bias)

    def forward(self, x):
        x = self.drop(F.relu(x))
        x = self.sconv1(x) # 4x4 anchor size
        o1c,o1b = self.out1(x) # grab outputs (classes, boxes) at this scale
        x = self.sconv2(x) # 2x2
        o2c,o2b = self.out2(x)
        x = self.sconv3(x) # 1x1
        o3c,o3b = self.out3(x)
        return [torch.cat([o1c,o2c,o3c], dim=1),
                torch.cat([o1b,o2b,o3b], dim=1)]

# Build the multi-scale head, wrap it around the backbone, create the learner and
# select Adam as its optimizer.
# NOTE(review): `k` and `f_model` are not defined in any visible cell — confirm they are
# set before this cell runs.
head_reg4 = SSD_MultiHead(k, -4)
models = ConvnetBuilder(f_model, 0, 0, 0, custom_head=head_reg4)
learn = ConvLearner(md, models)
learn.opt_fn = optim.Adam