In [6]:
import os
import numpy as np
import cv2
import pandas as pd
from tifffile import TiffFile
from tqdm.notebook import tqdm
from torch.utils.data import Dataset

In [7]:
# Not read by any active code in the visible cells.
PIXEL_SIZE = 4
# CSV loaded into `df_masks`; its `id`, `rle` and `pixel_size` columns are used.
MASKS = '../input/hubmap-organ-segmentation/train.csv'
# Directory holding the source images, opened as `<id>.tiff`.
DATA = '../input/hubmap-organ-segmentation/train_images'
# Destination directories for the image and mask PNGs written by the export loop.
# NOTE(review): nothing in the visible cells creates these directories —
# confirm they exist and are writable before running.
OUT_TRAIN = '../input/hubmap-2022-for-Train3/train'
OUT_MASKS = '../input/hubmap-2022-for-Train3/masks'

In [8]:
# functions to convert encoding to mask and mask to encoding
def enc2mask(encs, shape):
    """Decode a run-length encoding into a 2-D label mask.

    Parameters
    ----------
    encs : iterable
        Candidate encodings. A usable entry is a string of space-separated
        ``start length`` pairs, with 1-based starts into the flattened mask.
        Float NaN entries (Python ``float`` or any NumPy floating type) mean
        "no annotation" and are skipped.
    shape : tuple of int
        Two-element shape the flat buffer is reshaped to *before* the final
        transpose, so the returned array has shape ``(shape[1], shape[0])``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array holding 0 for background and ``1 + m`` for the pixels
        of the entry found at position ``m`` of ``encs``.

    Notes
    -----
    Only the first non-NaN entry is decoded; anything after it is ignored.
    """
    img = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for m, enc in enumerate(encs):
        # ``np.nan`` is a plain Python float, so checking for ``np.float64``
        # alone lets it fall through to ``.split()`` and crash. ``float``
        # also covers ``np.float64`` (a subclass); ``np.floating`` covers the
        # remaining NumPy float widths.
        if isinstance(enc, (float, np.floating)) and np.isnan(enc):
            continue
        tokens = enc.split()
        # Tokens alternate start, length; zip drops a dangling odd token.
        for start_tok, length_tok in zip(tokens[0::2], tokens[1::2]):
            start = int(start_tok) - 1  # RLE starts are 1-based
            length = int(length_tok)
            img[start:start + length] = 1 + m
        # Stop after the first decodable entry; later entries are never read.
        break
    return img.reshape(shape).T

def mask2enc(mask, n=1):
    """Run-length encode each label ``1..n`` of a 2-D mask.

    The mask is transposed and flattened before encoding. For every label the
    result holds either a string of space-separated ``start length`` pairs
    (1-based starts) or ``np.nan`` when the label does not occur.

    Parameters
    ----------
    mask : numpy.ndarray
        Label mask to encode.
    n : int, optional
        Highest label value to encode; labels ``1`` through ``n`` are emitted
        in order.

    Returns
    -------
    list
        One entry per label: an RLE string or ``np.nan``.
    """
    flat = mask.T.flatten()
    encodings = []
    for label in range(1, n + 1):
        hits = (flat == label).astype(np.int8)
        if hits.sum() == 0:
            # Label absent from the mask.
            encodings.append(np.nan)
            continue
        # Pad with zeros so runs touching either end still yield an edge.
        padded = np.concatenate([[0], hits, [0]])
        edges = np.where(padded[1:] != padded[:-1])[0] + 1
        # Odd positions hold run ends; turn them into run lengths.
        edges[1::2] -= edges[::2]
        encodings.append(' '.join(map(str, edges)))
    return encodings

# Load the annotations CSV, keep the RLE string and pixel size, index by image id.
df_masks = (
    pd.read_csv(MASKS)
    .loc[:, ['id', 'rle', 'pixel_size']]
    .set_index('id')
)
df_masks.head()

Unnamed: 0_level_0,rle,pixel_size
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10044,1459676 77 1462675 82 1465674 87 1468673 92 14...,0.4
10274,715707 2 718705 8 721703 11 724701 18 727692 3...,0.4
10392,1228631 20 1231629 24 1234624 40 1237623 47 12...,0.4
10488,3446519 15 3449517 17 3452514 20 3455510 24 34...,0.4
10610,478925 68 481909 87 484893 105 487863 154 4908...,0.4


In [9]:
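# One of the new images cannot be loaded into 16GB RAM.
# NOTE: this comment originally suggested using rasterio to load the image
# part by part through a dataset; the class below instead reads each whole
# TIFF into memory with tifffile.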

class HuBMAPDataset(Dataset):
    """Single-item dataset serving one downscaled TIFF image and its mask.

    The whole image ``<DATA>/<idx>.tiff`` is read into memory on
    construction. When encodings are supplied they are decoded with
    ``enc2mask`` into a mask matching the first two image dimensions.

    Parameters
    ----------
    idx : int or str
        Image id; used to build the TIFF file name.
    df : iterable, optional
        Encodings handed to ``enc2mask`` (callers in this notebook pass the
        annotation row for the image). ``None`` means no mask is available.
    new_size : tuple of int, optional
        Target size passed to ``cv2.resize`` as ``dsize``. Defaults to
        ``(512, 512)``.

    Attributes
    ----------
    data : numpy.ndarray
        Full-resolution image as returned by ``TiffFile.asarray()``.
        Assumed to be laid out (height, width, channels) — TODO confirm.
    shape : tuple
        Shape of ``data``.
    mask : numpy.ndarray or None
        Decoded full-resolution mask, or ``None`` when ``df`` is ``None``.
    new_size : tuple
        Output size of the resized image and mask.
    """

    def __init__(self, idx, df=None, new_size=(512, 512)):
        with TiffFile(os.path.join(DATA, str(idx) + '.tiff')) as tif:
            self.data = tif.asarray()
        self.shape = self.data.shape
        # Test the `df` argument itself. The previous check read a name that
        # is not a parameter, so it only worked while a same-named global
        # happened to exist and raised NameError on a fresh kernel.
        if df is not None:
            # enc2mask transposes its result, hence the swapped dimensions.
            self.mask = enc2mask(df, (self.shape[1], self.shape[0]))
        else:
            self.mask = None
        self.new_size = tuple(new_size)

    def __len__(self):
        """Return 1: the dataset wraps exactly one image."""
        return 1

    def __getitem__(self, idx):
        """Return ``(image, mask, idx)`` resized to ``new_size``.

        ``mask`` is ``None`` when the dataset was built without encodings.
        ``idx`` is returned unchanged and does not select anything.
        """
        img = cv2.resize(self.data, self.new_size,
                         interpolation=cv2.INTER_AREA)
        mask = None
        if self.mask is not None:
            # Nearest-neighbour keeps label values intact when downscaling.
            mask = cv2.resize(self.mask, self.new_size,
                              interpolation=cv2.INTER_NEAREST)
        return img, mask, idx

In [10]:
# Export every annotated image and its mask as PNGs and accumulate the
# per-channel statistics of the resized images.
# x_tot / x2_tot collect, per image, the channel means of the scaled pixels
# and of their squares.
x_tot,x2_tot = [],[]
for index, encs in tqdm(df_masks.iterrows(),total=len(df_masks)):
    # `index` is the image id; `encs` is that image's row of `df_masks`.
    ds = HuBMAPDataset(index,df=encs)

    # The dataset holds a single item; the third value echoes the index asked for.
    im,m,idx = ds[0]
    # NOTE(review): idx is the 0 passed just above, so this guard looks like
    # it can never trigger — confirm whether it is still needed.
    if idx < 0: continue
        
    # Scale to [0, 1]; reshape(-1,3) assumes a 3-channel image — TODO confirm.
    x_tot.append((im/255.0).reshape(-1,3).mean(0))
    x2_tot.append(((im/255.0)**2).reshape(-1,3).mean(0))
    
    # Write the resized image and mask as PNGs named after the image id.
    train_filename = OUT_TRAIN + f'/{index}.png'
    mask_filename = OUT_MASKS + f'/{index}.png'

    # NOTE(review): the return values of cv2.imwrite are ignored, so a failed
    # write would go unnoticed here — confirm the files actually exist.
    # NOTE(review): cv2.imwrite interprets 3-channel data as BGR; presumably
    # the TIFF data is RGB — verify the channel order of the saved images.
    cv2.imwrite(train_filename, im)
    cv2.imwrite(mask_filename, m)
        
# Image stats: mean of the per-image channel means, and the standard
# deviation derived as sqrt(E[x^2] - E[x]^2).
img_avr =  np.array(x_tot).mean(0)
img_std =  np.sqrt(np.array(x2_tot).mean(0) - img_avr**2)
print('mean:',img_avr, ', std:', img_std)

  0%|          | 0/351 [00:00<?, ?it/s]

mean: [0.82829411 0.80269686 0.8205803 ] , std: [0.15668971 0.18531429 0.17469466]
