## Pytorch implementations of data-loading modules 

Goal is to allow batched data-loading and calculation of normalization statistics in a image-format agnostic way. 

In [15]:
import os
import cv2
import numpy as np
from skimage import io
from skimage.external import tifffile

In [3]:
PATH = "datasets/yeast_v2/"

folders = os.listdir(PATH)

In [7]:
for folder in folders:
    fnames, lbls, all_lbls = read_dirs(PATH, folder)

In [29]:
len(fnames)

280

In [10]:
print(fnames[0:2])
print(lbls[0:2])
print(all_lbls[0:2])

['train\\WT\\WT_WP_E1_S0_F1_I2_C2_A0.tif', 'train\\WT\\WT_WP_E1_S0_F1_I2_C3_A0.tif']
['WT', 'WT']
['WT', 'mfb1KO']


In [18]:
image_1 = open_image(PATH + fnames[0]); print(image_1.shape)

(200, 200, 2)


In [109]:
yeast_train = yeast_dataset('train')
bs = 280

big = np.zeros((bs,200,200,2))

for i in range(len(yeast_train)):
    sample = yeast_train[i]

    big[i] = sample
#     print(sample.shape)
    
    if i == bs:
        break
        
batch_mean = np.mean(big, axis=(0,1,2)); print(batch_mean.shape); print(batch_mean)
batch_std = np.std(big, axis=(0,1,2)); print(batch_std.shape); print(batch_std)

(2,)
[0.02111004 0.00453067]
(2,)
[0.0015035  0.00147647]


In [73]:
print(big.shape)

(4, 200, 200, 2)


In [85]:
a = np.mean(big[0], axis = (0,1)); print(a)
b = np.mean(big[1], axis = (0,1)); print(b)
c = np.mean(big[2], axis = (0,1)); print(c)
d = np.mean(big[2], axis = (0,1)); print(d)

print(np.mean())

[0.02146161 0.00443499]
[0.02137806 0.00460488]
[0.02138503 0.00499342]
[0.02138503 0.00499342]


In [104]:
## exploring axis arguments in np.sum, or generally in ndarrays. 

x = np.array([[[[0,1],
               [2,3]],
              [[0,1],
               [2,3]]],
              [[[4,5],
               [6,7]],
              [[4,5],
               [6,7]]]
             ])
               
print(x.shape)
np.sum(x, axis=(0,1,2)) #seems to be the correct way to treat an ndarray while preserving the last axis.


(2, 2, 2, 2)


array([24, 32])

In [87]:
batch_mean = np.mean(big, axis=(0,1,2)); print(batch_mean.shape); print(batch_mean)

(2,)
[0.02135302 0.00471475]


In [113]:
yeast_train = yeast_dataset('train')

dataloader = torch.utils.data.DataLoader(yeast_train, batch_size=140, shuffle=False)

pop_mean = []
pop_std0 = []
pop_std1 = []
for i, data in enumerate(dataloader, 0):
    # shape (batch_size, height, width channels)
    numpy_image = data.numpy()

    # shape (channels,)
    batch_mean = np.mean(numpy_image, axis=(0, 1, 2))
    batch_std0 = np.std(numpy_image, axis=(0, 1, 2))
    #     batch_std1 = np.std(numpy_image, axis=(0,1,2), ddof=1)

    pop_mean.append(batch_mean)
    pop_std0.append(batch_std0)
#     pop_std1.append(batch_std1)

# shape (num_iterations, channels) -> (mean across 0th axis) -> shape (channels,)
pop_mean = np.array(pop_mean).mean(axis=0)
pop_std0 = np.array(pop_std0).mean(axis=0)
# pop_std1 = np.array(pop_std1).mean(axis=0)


In [120]:
print(pop_mean)
print(pop_std0)

[0.02171638 0.00451   ]
[0.0016155  0.00146062]


In [43]:
class yeast_dataset(Dataset):

    def __init__(self, folder, transform=None):
        """
        Args:
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.folder = folder
        fnames, lbls, all_lbls = read_dirs(PATH, folder)  
        self.fnames = fnames
        self.transform = transform

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, idx):
        img_name = str(PATH + self.fnames[idx])
        sample = open_image(img_name)

        if self.transform:
            sample = self.transform(sample)

        return sample

In [5]:
def read_dirs(path, folder):
    '''
    Fetches name of all files in path in long form, and labels associated by extrapolation of directory names. 
    '''
    lbls, fnames, all_lbls = [], [], []
    full_path = os.path.join(path, folder)
    for lbl in sorted(os.listdir(full_path)):
        if lbl not in ('.ipynb_checkpoints','.DS_Store'):
            all_lbls.append(lbl)
            for fname in os.listdir(os.path.join(full_path, lbl)):
                if fname not in ('.DS_Store'):
                    fnames.append(os.path.join(folder, lbl, fname))
                    lbls.append(lbl)
    return fnames, lbls, all_lbls

In [13]:
def open_image(fn):
    """ Opens an image using OpenCV given the file path.

    Arguments:
        fn: the file path of the image

    Returns:
        The image as numpy.array
"""
    

    flags = cv2.IMREAD_UNCHANGED+cv2.IMREAD_ANYDEPTH+cv2.IMREAD_ANYCOLOR
    if not os.path.exists(fn):
        raise OSError('No such file or directory: {}'.format(fn))
    elif os.path.isdir(fn):
        raise OSError('Is a directory: {}'.format(fn))
    else:
        try:
            if str(fn).lower().endswith(('.tif', '.tiff', '.tifff')):
                im = tifffile.imread(str(fn)).astype(np.float32)/65535 #(!) for 16-bit images
                im = np.moveaxis(im, 0, -1)
            else:
                im = cv2.imread(str(fn), flags).astype(np.float32)/255 
                im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            if im is None: raise OSError(f'File not recognized by io.imread: {fn}') 
            return im
        except Exception as e:
            raise OSError('Error handling image at: {}'.format(fn)) from e

In [28]:
fnames, lbls, all_lbls = read_dirs(PATH, 'train')

images = open_image(PATH + fnames[0])

for ims in fnames:
    images = np.concatenate(images, open_image(PATH + ims))
    break
    

type(images)

TypeError: only integer scalar arrays can be converted to a scalar index