# Fast.ai DataLoader Issue on MNIST

**ISSUE**: A broadcasting error occurs when the fastai DataLoader retrieves a minibatch of MNIST data.

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from pathlib import Path
import os
import struct   # for IDX conversion
import gzip     # for IDX conversion
from urllib.request import urlretrieve # for IDX conversion

from fastai.conv_learner import * # if you want to use fastai Learner

In [3]:
# Data directory: used as `root` for the torchvision MNIST datasets and
# scanned for the gzipped IDX files further down.
PATH = Path('data/mnist')

bs = 64  # batch size shared by the PyTorch and fastai loaders
sz = 28  # image size passed to fastai's tfms_from_stats (the arrays below are 28x28)

## PyTorch DataLoader

In [4]:
# The batches produced below are single-channel ([bs, 1, 28, 28]), so Normalize
# gets exactly one mean and one std value rather than a 3-channel triple.
transform = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor(),
     torchvision.transforms.Normalize((0.5,), (0.5,))])

# see: https://gist.github.com/kevinzakka/d33bf8d6c7f06a9d8c76d97a7879f5cb
# frm: https://github.com/pytorch/pytorch/issues/1106
# Training and validation both read the training split; they are kept apart
# only by the disjoint index samplers created below.
trainset = torchvision.datasets.MNIST(root=PATH, train=True, download=True,
                                   transform=transform)
validset = torchvision.datasets.MNIST(root=PATH, train=True, download=True,
                                   transform=transform)
# The test loader must read the held-out test split (train=False), not the
# training images a third time.
testset  = torchvision.datasets.MNIST(root=PATH, train=False, download=True,
                                   transform=transform)

# Hold out a fraction p_val of the (shuffled) training indices for validation.
p_val = 0.15
n_val = int(p_val * len(trainset))
idxs  = np.arange(len(trainset))
np.random.shuffle(idxs)
train_idxs, valid_idxs = idxs[n_val:], idxs[:n_val]

train_sampler = torch.utils.data.sampler.SubsetRandomSampler(train_idxs)
# SequentialSampler only uses the *length* of what it is given and yields
# 0..n_val-1, which would ignore valid_idxs and overlap the training subset.
# SubsetRandomSampler draws exactly the held-out indices.
valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(valid_idxs)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=bs,
                                          sampler=train_sampler, num_workers=2)
validloader = torch.utils.data.DataLoader(validset, batch_size=bs,
                                          sampler=valid_sampler, num_workers=2)
testloader  = torch.utils.data.DataLoader(testset, batch_size=bs, num_workers=2)

## Fast.AI ModelData Object

In [5]:
def read_IDX(fname):
    """Load a gzip-compressed IDX file into a numpy array.

    see: https://gist.github.com/tylerneylon/ce60e8a06e7506ac45788443f7269e40

    Header layout as parsed here: a 16-bit zero field, one type-code byte,
    one byte holding the number of dimensions, then one big-endian uint32
    size per dimension. Everything after the header is the element data.

    Args:
        fname: path of the gzipped IDX file (anything ``gzip.open`` accepts).

    Returns:
        np.ndarray with the shape declared in the header and the dtype
        selected by the header's type code (uint8 for code 0x08). The array
        is created with ``np.frombuffer`` and is therefore read-only.

    Raises:
        ValueError: if the header is truncated, the leading field is not
            zero, the type code is unknown, or the payload does not match
            the declared shape.
    """
    # IDX type code -> numpy dtype; multi-byte values are stored big-endian.
    idx_dtypes = {
        0x08: '>u1', 0x09: '>i1', 0x0B: '>i2',
        0x0C: '>i4', 0x0D: '>f4', 0x0E: '>f8',
    }
    with gzip.open(fname) as f:
        header = f.read(4)
        if len(header) != 4:
            raise ValueError('truncated IDX header in %s' % (fname,))
        zero, data_type, dims = struct.unpack('>HBB', header)
        if zero != 0:
            raise ValueError('%s is not an IDX file (bad magic number)' % (fname,))
        if data_type not in idx_dtypes:
            raise ValueError('unsupported IDX type code 0x%02X in %s' % (data_type, fname))
        dim_bytes = f.read(4 * dims)
        if len(dim_bytes) != 4 * dims:
            raise ValueError('truncated IDX dimension table in %s' % (fname,))
        shape = struct.unpack('>%dI' % dims, dim_bytes)
        return np.frombuffer(f.read(), dtype=np.dtype(idx_dtypes[data_type])).reshape(shape)

In [6]:
# Every directory entry whose name contains 'ubyte.gz' is a candidate archive.
fnames = list(filter(lambda name: 'ubyte.gz' in name, os.listdir(PATH))) # could just use glob

# Position inside fnames of the first training-image / training-label archive.
trn_x_idx = [pos for pos, name in enumerate(fnames) if 'train-imag' in name][0]
trn_y_idx = [pos for pos, name in enumerate(fnames) if 'train-lab' in name][0]

# Decode both IDX archives completely into in-memory ndarrays.
train_x_array = read_IDX(PATH / fnames[trn_x_idx])
train_y_array = read_IDX(PATH / fnames[trn_y_idx])

# Split with the same shuffled indices the PyTorch samplers use. The validation
# rows are sliced first because train_*_array is rebound to the training rows next.
valid_x_array = train_x_array[valid_idxs]
valid_y_array = train_y_array[valid_idxs]
train_x_array = train_x_array[train_idxs]
train_y_array = train_y_array[train_idxs]

In [7]:
# Build fastai transforms that normalize with `inception_stats` at image size `sz`.
# NOTE(review): the ValueError reproduced below pairs a (28,28) sample with a
# (3,) operand; presumably these stats are per-channel values meant for
# 3-channel images. Confirm against fastai/transforms.py.
tfms = tfms_from_stats(inception_stats, sz=sz)

# Wrap the in-memory train/validation arrays in a fastai model-data object;
# no test set is supplied (test=None).
model_data = ImageClassifierData.from_arrays(PATH, 
    (train_x_array, train_y_array), (valid_x_array, valid_y_array),
    bs=bs, tfms=tfms, num_workers=2, test=None)

## Error Replication:

### 1.

In [8]:
learner = ConvLearner.pretrained(resnet18, model_data)

In [9]:
learner.lr_find()

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

  0%|          | 0/797 [00:00<?, ?it/s]


ValueError: operands could not be broadcast together with shapes (28,28) (3,) 

### 2. 
Narrowing in on the exact cause. The error occurs when the fastai DataLoader attempts to retrieve the next minibatch of data.

In [12]:
x,y = next(iter(model_data.trn_dl))

ValueError: operands could not be broadcast together with shapes (28,28) (3,) 

### 3.

Does this happen with the PyTorch DataLoader? Why not?

In [29]:
# x,y = next(iter(model_data.trn_dl))
x,y = next(iter(trainloader))

In [30]:
x.shape

torch.Size([64, 1, 28, 28])

What is the fastai DataLoader doing differently from the PyTorch DataLoader?

In [31]:
model_data.trn_dl

<fastai.dataloader.DataLoader at 0x13c73ce48>

`model_data`'s `.trn_dl` is created by `ImageData`'s `.get_dl` method during initialization. `ImageData.get_dl(ds, shuf)` creates `ImageData.trn_dl` out of `(trn_ds, True)` whose `trn_ds` object is deconstructed from the `datasets` argument passed into the `ImageData` constructor. When building via `ImageClassifierData.from_arrays` this is created via:
```
datasets = cls.get_ds(ArraysIndexDataset, trn, val, tfms, test=test)
```
(where `cls.get_ds` is the static method `get_ds` of `ImageData`).

---

Perhaps they're using different iterator methods?

[PyTorch DataLoader Iterator](https://pytorch.org/docs/master/_modules/torch/utils/data/dataloader.html?highlight=_DataLoaderIter#DataLoader):
```
class DataLoader(object):
    ...
    ...
    def __iter__(self):
        return _DataLoaderIter(self)
```
$\longrightarrow$
```
class _DataLoaderIter(object):
    r"""Iterates once over the DataLoader's dataset, as specified by the sampler"""
    
    def __init__(self, loader):
        self.dataset = loader.dataset
        self.collate_fn = ...
        ...
    ...
```

[FastAI DataLoader Iterator](https://github.com/fastai/fastai/blob/master/fastai/dataloader.py#L80):
```
class DataLoader(object):
    ...
    ...
    def __iter__(self):
        if self.num_workers==0:
            for batch in map(self.get_batch, iter(self.batch_sampler)):
                yield get_tensor(batch, self.pin_memory, self.half)
        else:
            with ThreadPoolExecutor(max_workers=self.num_workers) as e:
                # avoid py3.6 issue where queue is infinite and can result in memory exhaustion
                for c in chunk_iter(iter(self.batch_sampler), self.num_workers*10):
                    for batch in e.map(self.get_batch, c):
                        yield get_tensor(batch, self.pin_memory, self.half)
```

So fastai is defining its dataloader's iterator right there inside the class. I wonder if `yield get_tensor(batch, self.pin_memory, self.half)` is where my issue is coming up, specifically the interaction between `get_tensor` and `batch`. Now... `batch` is created by mapping `DataLoader.get_batch` over `iter(DataLoader.batch_sampler)`. For the training set, `batch_sampler` is `BatchSampler(RandomSampler(dataset), batch_size, drop_last)`, where `batch_size` is defined in `ImageData.get_dl(..)` and is the batch size passed into the original constructor (i.e. 64 here).

So it makes sense that batch size isn't playing a role. It's the missing channel dimension that's causing problems.

---

I wanted to dive further before coming to this –– but I think a simple fix is to just use a `np.expand_dims` on the datasets before passing them into `ImageClassifierData.from_arrays`.. the DataLoader will be able to broadcast (1,28,28) to (3,28,28), though I wonder how that'll affect the actual model. Diving further into PyTorch's dataloader iterator is getting a little too off topic right now (this notebook is just meant to be a mini-diversion to fix a bug for the MNIST baseline test).

## Testing Solution 1 (Failed)

Let's see if adding a channel dimension fixes things:

In [44]:
# Every directory entry whose name contains 'ubyte.gz' is a candidate archive.
fnames = list(filter(lambda name: 'ubyte.gz' in name, os.listdir(PATH))) # could just use glob

# Position inside fnames of the first training-image / training-label archive.
trn_x_idx = [pos for pos, name in enumerate(fnames) if 'train-imag' in name][0]
trn_y_idx = [pos for pos, name in enumerate(fnames) if 'train-lab' in name][0]

# Reload both IDX archives from disk so this experiment starts from the raw arrays.
train_x_array = read_IDX(PATH / fnames[trn_x_idx])
train_y_array = read_IDX(PATH / fnames[trn_y_idx])

In [45]:
train_x_array.shape

(60000, 28, 28)

In [46]:
train_x_array.nbytes / 2**20 # size in MB

44.86083984375

In [48]:
# Insert a length-1 channel axis right after the sample axis (channel-first);
# for this array it gives the same shape as np.reshape(train_x_array, (60000, 1, 28, 28)).
# NOTE: the name is rebound, so re-running this cell adds yet another axis.
train_x_array = np.expand_dims(train_x_array, 1)
train_x_array.shape

(60000, 1, 28, 28)

In [49]:
train_x_array[0].shape

(1, 28, 28)

In [50]:
train_x_array.nbytes / 2**20 # size in MB

44.86083984375

In [51]:
# Reuse the index split that was drawn for the PyTorch samplers.
# Validation rows are sliced first: train_*_array is rebound to the training rows next.
valid_x_array = train_x_array[valid_idxs]
valid_y_array = train_y_array[valid_idxs]
train_x_array = train_x_array[train_idxs]
train_y_array = train_y_array[train_idxs]

In [52]:
# Same transform/model-data setup as the original reproduction, now fed
# channel-first samples of shape (1, 28, 28).
# NOTE(review): the error below reports (28,28,28); presumably the image
# pipeline reads each sample as height x width x channels (1 x 28 x 28) and
# resizes it to sz. Confirm against fastai/transforms.py.
tfms = tfms_from_stats(inception_stats, sz=sz)

model_data = ImageClassifierData.from_arrays(PATH, 
    (train_x_array, train_y_array), (valid_x_array, valid_y_array),
    bs=bs, tfms=tfms, num_workers=2, test=None)

In [53]:
x,y = next(iter(model_data.trn_dl))

ValueError: operands could not be broadcast together with shapes (28,28,28) (3,) 

That did not make things better.

Okay.. where is `s` coming from – or where is the `(3,)` coming from? Or... was there some dimension reshuffling going on? I know PyTorch uses the opposite channel ordering to a lot of other places... maybe that automatic reordering is screwing things up into a `(28,28,28)` Tensor?

## Testing Solution 2 (Failed)

Maybe I should've ordered channel-last instead of channel-first?

In [54]:
# Every directory entry whose name contains 'ubyte.gz' is a candidate archive.
fnames = list(filter(lambda name: 'ubyte.gz' in name, os.listdir(PATH))) # could just use glob

# Position inside fnames of the first training-image / training-label archive.
trn_x_idx = [pos for pos, name in enumerate(fnames) if 'train-imag' in name][0]
trn_y_idx = [pos for pos, name in enumerate(fnames) if 'train-lab' in name][0]

# Reload both IDX archives from disk so this experiment starts from the raw arrays.
train_x_array = read_IDX(PATH / fnames[trn_x_idx])
train_y_array = read_IDX(PATH / fnames[trn_y_idx])

In [55]:
train_x_array.shape

(60000, 28, 28)

In [56]:
# Append a length-1 channel axis as the last axis (channel-last), giving
# (60000, 28, 28, 1), not the channel-first layout tried in Solution 1.
# NOTE: the name is rebound, so re-running this cell adds yet another axis.
train_x_array = np.expand_dims(train_x_array, -1)
train_x_array.shape

(60000, 28, 28, 1)

In [57]:
train_x_array[0].shape

(28, 28, 1)

In [58]:
# Reuse the index split that was drawn for the PyTorch samplers.
# Validation rows are sliced first: train_*_array is rebound to the training rows next.
valid_x_array = train_x_array[valid_idxs]
valid_y_array = train_y_array[valid_idxs]
train_x_array = train_x_array[train_idxs]
train_y_array = train_y_array[train_idxs]

In [59]:
# Same transform/model-data setup again, now fed channel-last samples of
# shape (28, 28, 1).
# NOTE(review): the error below reports (28,28) again; presumably the
# trailing singleton axis is dropped somewhere in the image pipeline before
# normalization. Confirm against fastai/transforms.py.
tfms = tfms_from_stats(inception_stats, sz=sz)

model_data = ImageClassifierData.from_arrays(PATH, 
    (train_x_array, train_y_array), (valid_x_array, valid_y_array),
    bs=bs, tfms=tfms, num_workers=2, test=None)

In [60]:
x,y = next(iter(model_data.trn_dl))

ValueError: operands could not be broadcast together with shapes (28,28) (3,) 

## Further Debugging

Nope. And the shapes are the same as if I never added a dimension. Time for some debug traces. 

Aha, in `class ChannelOrder()` in `fastai/transforms.py`:

> changes image array shape from (h, w, 3) to (3, h, w).
> tfm_y decides the transformation done to the y element.

Going back to the original shape (`(28,28)`):

In [8]:
# Every directory entry whose name contains 'ubyte.gz' is a candidate archive.
fnames = list(filter(lambda name: 'ubyte.gz' in name, os.listdir(PATH))) # could just use glob

# Position inside fnames of the first training-image / training-label archive.
trn_x_idx = [pos for pos, name in enumerate(fnames) if 'train-imag' in name][0]
trn_y_idx = [pos for pos, name in enumerate(fnames) if 'train-lab' in name][0]

# Reload both IDX archives from disk to get back to the untouched (28, 28) samples.
train_x_array = read_IDX(PATH / fnames[trn_x_idx])
train_y_array = read_IDX(PATH / fnames[trn_y_idx])

In [9]:
train_x_array.shape

(60000, 28, 28)

In [10]:
train_x_array[0].shape

(28, 28)

In [11]:
# Reuse the index split that was drawn for the PyTorch samplers.
# Validation rows are sliced first: train_*_array is rebound to the training rows next.
valid_x_array = train_x_array[valid_idxs]
valid_y_array = train_y_array[valid_idxs]
train_x_array = train_x_array[train_idxs]
train_y_array = train_y_array[train_idxs]

In [12]:
# Rebuild the transforms and model-data object on the original (28, 28)
# samples, so the failure can be traced with an unmodified input.
tfms = tfms_from_stats(inception_stats, sz=sz)

model_data = ImageClassifierData.from_arrays(PATH, 
    (train_x_array, train_y_array), (valid_x_array, valid_y_array),
    bs=bs, tfms=tfms, num_workers=2, test=None)

In [13]:
model_data.bs

64

In [24]:
x,y = next(iter(model_data.trn_dl))

ValueError: operands could not be broadcast together with shapes (28,28) (3,) 

The issue is happening when the Fast AI dataloader is attempting to Normalize the data!

So let's see how PyTorch normalizes its data, and what might be going wrong here that shouldn't be: 

In [15]:
# `Error` is not a defined name, so this line fails with NameError (see output).
# NOTE(review): presumably a deliberate hard stop so "Run All" halts here; confirm.
raise Error

NameError: name 'Error' is not defined