In [1]:
# %matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
import skimage.external.tifffile as tiff

from common import Statistics, dataset_source
from resources.conv_learner import *
from resources.plots import *
from pprint import pprint
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Root directory of the dataset, relative to the notebook's working directory.
# Keep the trailing slash: other cells build file paths by plain string
# concatenation (PATH + 'HPA_labels.csv', f"{PATH}submissions/...").
PATH = "datasets/Kaggle_HPA_2018/"
# pathlib view of the same directory
data_path = Path(PATH)

In [4]:
# Number of label classes. The label tensors produced by the data loader have
# 28 columns and the label table in this notebook lists ids 0-27, so the
# correct value is 28 (it was 27).
NUM_CLASSES = 28
# mini-batch size used by the data loaders
bs = 64
# target size handed to the transforms; the model summary reports inputs of
# shape [-1, 4, 224, 224], i.e. [channels, height, width]
sz = [4,224,224]

In [5]:
# setting up a list of a random 20% of images in train as the validation set

lbl_csv = PATH + 'HPA_labels.csv'
# Count the lines of the label file minus one (presumably the header row).
# The context manager closes the file handle, which the previous
# len(list(open(...))) form left open.
with open(lbl_csv) as lbl_file:
    n = sum(1 for _ in lbl_file) - 1
val_idxs = get_cv_idxs(n)

In [6]:
def get_data(path, lbl_csv, val_idxs, sz, bs, aug_tfms):
    """Build the image data object used for training, validation and test.

    Args:
        path: dataset root directory, handed to ImageClassifierData.from_csv.
        lbl_csv: path of the label CSV file.
        val_idxs: indices of the samples held out for validation.
        sz: target size handed to tfms_with_IntNorm.
        bs: mini-batch size.
        aug_tfms: list of augmentation transforms.

    Returns:
        The object returned by ImageClassifierData.from_csv, reading '.tiff'
        images from the 'train_all' and 'test_all' folders with balance=True.
    """
    tfms = tfms_with_IntNorm(sz, aug_tfms=aug_tfms, crop_type=CropType.CENTER)
    # Use the `path` argument; the previous version ignored it and silently
    # read the notebook-level PATH instead.
    return ImageClassifierData.from_csv(
        path, 'train_all', lbl_csv,
        val_idxs=val_idxs,
        test_name='test_all',
        tfms=tfms,
        bs=bs,
        suffix='.tiff',
        balance=True,
    )

In [7]:
# define augmentations
# RandomDihedral is the only augmentation handed to get_data
augs = [RandomDihedral()]

# initialize data object
# `data` is read by most later cells (data.trn_dl, data.trn_ds, data.classes, data.test_ds)
data = get_data(PATH, lbl_csv, val_idxs, sz, bs, aug_tfms = augs)

weighted


### inspect data_loaders

In [16]:
# Pull one mini-batch (inputs x, label matrix y) from the training loader.
# NOTE: `y` is rebound by the predict_with_targs cells further down, so re-run
# this cell before using the image-inspection cells.
x, y = next(iter(data.trn_dl))
# x_test, y_test = next(iter(data.test_dl))

In [17]:
y.sum(dim=0)


 22
  2
  4
  1
  4
  4
  6
  2
  6
  3
  3
  3
  6
  0
  4
  3
  2
  1
 13
  2
  5
  5
  5
  8
  5
  8
  2
  4
[torch.cuda.FloatTensor of size 28 (GPU 0)]

In [None]:
# loading epochs to inspect class-balance

In [8]:
def load_epoch(dl=None, n_batches=10):
    """Collect the first batches of a data loader into numpy arrays.

    Args:
        dl: data loader yielding (x, y) batches. Defaults to ``data.trn_dl``,
            resolved at call time so that a re-created ``data`` object is
            picked up (a default bound in the signature would keep pointing
            at the loader that existed when this cell was run).
        n_batches: maximum number of batches to read. Defaults to 10, the
            previously hard-coded value; pass ``len(dl)`` for a full epoch.

    Returns:
        Tuple ``(xs, ys)``: inputs stacked with ``np.vstack`` and targets
        joined with ``np.concatenate``, both along the first axis.
    """
    if dl is None:
        dl = data.trn_dl

    xs = []
    ys = []

    # zip stops as soon as either side is exhausted, so a loader with fewer
    # than n_batches batches no longer raises StopIteration; range is listed
    # first so no extra batch is drawn once the limit is reached.
    for _, (x_, y_) in zip(range(n_batches), dl):
        xs.append(to_np(x_))
        ys.append(to_np(y_))

    return np.vstack(xs), np.concatenate(ys)


In [9]:
xs, ys = load_epoch()

In [10]:
# analyze results
# column-wise sum of the label matrix: number of positive labels per class
# over the batches collected by load_epoch
lbl_dist = ys.sum(axis=0)
print(lbl_dist)

[216.  28.  21.  26.  36.  21.  30.  29.  41.  22.  35.  27.  57.  28.  51.  27.  56.  34. 100.  26.  27.
  28.  29.  64.  33.  36.  27.  29.]


In [None]:
# weights calculation WIP
# NOTE: rebinds `ys` (previously the output of load_epoch) to the label matrix
# of the training dataset.

ys = data.trn_ds.y
n_labels = len(data.classes)

# share each label would have if all labels were equally frequent
cut = 1 / n_labels
# observed share of each label among all positive labels
perc = ys.sum(axis=0) / ys.sum()

# per-label weight: ratio of the uniform share to the observed share
weights_per_label = [cut / perc[k] for k in range(n_labels)]
# per-image weights: label row multiplied element-wise by the label weights
w_matrix = [ys[k] * weights_per_label for k in range(len(ys))]

# smallest non-zero weight and largest weight of every image
min_weights_per_im = [np.min(w[np.nonzero(w)]) for w in w_matrix]
weights_per_im = [np.max(w) for w in w_matrix]

In [None]:
min_weights_per_im[:10]

### inspecting loaded images and labels

In [None]:
def to_label(y):
    """Return the list of positions in `y` whose value equals 1."""
    return [pos for pos, flag in enumerate(y) if flag == 1]

In [None]:
# inspect train images
plt.style.use('seaborn-white')

idx = 2

# image `idx` of the current batch and the indices of its positive labels
im = to_np(x)[idx]
lbl = to_label(to_np(y)[idx])
print(lbl)

# one panel per channel, side by side
fig, axes = plt.subplots(1,4, figsize=(16,10))
for channel, panel in enumerate(axes.flat):
    panel.imshow(im[channel])

In [None]:
# inspect test images
plt.style.use('seaborn-white')

idx = 3

# Fetch a test batch in this cell. `x_test` was not defined anywhere else
# (its assignment in the data-loader cell is commented out), so this cell
# raised NameError on a fresh kernel.
x_test, y_test = next(iter(data.test_dl))

im = to_np(x_test)[idx]

# no label printout here, unlike the train-image cell

# one panel per channel, side by side
fig, axes = plt.subplots(1,4, figsize=(16,10))
for channel, panel in enumerate(axes.flat):
    panel.imshow(im[channel])

# training

In [11]:
torch.cuda.is_available()

True

In [12]:
# base-model

# resnet18_c is not defined in the visible cells; presumably it is provided by
# the star-imports from `resources` — confirm
arch = resnet18_c
# pretrained=False and ps=0 are passed through to ConvLearner.pretrained
# (presumably: random initial weights and no dropout — confirm); Adam is the optimizer
learn = ConvLearner.pretrained(arch, data, ps=0, opt_fn=optim.Adam, pretrained=False)

In [13]:
# additional parameters

wd=1e-5 # weight-decay/L2 regularization, passed to learn.fit as `wds`
# metrics reported next to the losses; accuracy_thresh(0.5) appears as
# '<lambda>' in the training log
learn.metrics = [accuracy_thresh(0.5),f1_macro]

In [14]:
learn.unfreeze()

In [15]:
learn.summary()

OrderedDict([('Conv2d-1',
              OrderedDict([('input_shape', [-1, 4, 224, 224]),
                           ('output_shape', [-1, 64, 112, 112]),
                           ('trainable', True),
                           ('nb_params', 12544)])),
             ('BatchNorm2d-2',
              OrderedDict([('input_shape', [-1, 64, 112, 112]),
                           ('output_shape', [-1, 64, 112, 112]),
                           ('trainable', True),
                           ('nb_params', 128)])),
             ('ReLU-3',
              OrderedDict([('input_shape', [-1, 64, 112, 112]),
                           ('output_shape', [-1, 64, 112, 112]),
                           ('nb_params', 0)])),
             ('MaxPool2d-4',
              OrderedDict([('input_shape', [-1, 64, 112, 112]),
                           ('output_shape', [-1, 64, 56, 56]),
                           ('nb_params', 0)])),
             ('Conv2d-5',
              OrderedDict([('input_shape', [-1, 64, 56, 5

In [None]:
learn.lr_find()

In [None]:
learn.sched.plot()

In [None]:
# training loops:

In [18]:
# Train for a single epoch with the weight decay defined in `wd`
# (the recorded run took about 10 minutes of wall time).
%time learn.fit(1e-2, 1, cycle_len=1, wds=wd, use_wd_sched=True)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

EPOCH 0 ---------------------------------------- STEP 0                                                                                                                                                                                      
epoch      trn_loss   val_loss   <lambda>   f1_macro   
    0      0.178617   0.166772   0.946929   0.035186  

Wall time: 10min 1s


[array([0.16677]), 0.9469285928286039, 0.035186028369155006]

In [None]:
learn.save('first_model_1epoch')

In [None]:
# Predictions and targets from predict_with_targs() with default arguments
# (presumably the validation set — confirm).
# NOTE: rebinds `y`, which the train-image inspection cell also reads.
log_preds, y = learn.predict_with_targs()

## test-predictions

In [None]:
# Predictions for the 'test' split. Rebinds `log_preds` and `y`; the submission
# cell reads `log_preds`, so this cell must be the last prediction cell run.
log_preds, y =  learn.predict_with_targs('test')

In [None]:
# creating submission file
thresh = 0.2

# NOTE: `log_preds` must hold the output of learn.predict_with_targs('test');
# the default predict_with_targs() cell binds the same name.

clss = np.arange(0, len(data.classes)) # get class indices

# One space-separated string of class ids per test image.
# The argmax fallback is applied BEFORE the numpy string array is created:
# numpy string arrays have a fixed width, so writing e.g. 12 into an array
# whose longest entry has one character would silently store '1'.
rows = []
for p in log_preds:
    picked = clss[np.where(p > thresh)]
    if picked.size == 0:
        # ensure that there are no empty cells: in case no value > thresh, fill in with argmax()
        picked = [p.argmax()]
    rows.append(' '.join(str(int(c)) for c in picked))
res = np.array(rows) # generating output

# getting image Ids
fnames = np.array([os.path.basename(im).split('.')[0] for im in data.test_ds.fnames])

# creating submission file
# make sure the target folder exists, otherwise to_csv fails
os.makedirs(f"{PATH}submissions", exist_ok=True)
sub_df = pd.DataFrame(res, index=fnames, columns=['Predicted'])
sub_df.to_csv(f"{PATH}submissions/Test_submission_thresh_02.csv", index_label='Id')

## other stuff

In [None]:
# Lookup table: integer class id -> name of the cell location it stands for.

cell_location_label = {
    0: 'Nucleoplasm',
    1: 'Nuclear membrane',
    2: 'Nucleoli',
    3: 'Nucleoli fibrillar center',
    4: 'Nuclear speckles',
    5: 'Nuclear bodies',
    6: 'Endoplasmic reticulum',
    7: 'Golgi apparatus',
    8: 'Peroxisomes',
    9: 'Endosomes',
    10: 'Lysosomes',
    11: 'Intermediate filaments',
    12: 'Actin filaments',
    13: 'Focal adhesion sites',
    14: 'Microtubules',
    15: 'Microtubule ends',
    16: 'Cytokinetic bridge',
    17: 'Mitotic spindle',
    18: 'Microtubule organizing center',
    19: 'Centrosome',
    20: 'Lipid droplets',
    21: 'Plasma membrane',
    22: 'Cell junctions',
    23: 'Mitochondria',
    24: 'Aggresome',
    25: 'Cytosol',
    26: 'Cytoplasmic bodies',
    27: 'Rods & rings',
}