In [1]:
# %matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
import skimage.external.tifffile as tiff

from common import Statistics, dataset_source
from resources.conv_learner import *
from resources.plots import *
from pprint import pprint
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
PATH = "datasets/Kaggle_HPA_2018/"
data_path = Path(PATH)

In [4]:
NUM_CLASSES = 28
bs = 64
sz = [3,224,224]

In [5]:
# Hold out a random 10% of the training images as the validation set
# (val_pct in the get_cv_idxs call below controls the split size).

lbl_csv = PATH + 'HPA_labels.csv'
# Number of labelled rows = number of lines in the CSV minus one
# (presumably the header row). The context manager closes the file handle,
# which a bare open() call would leave open.
with open(lbl_csv) as lbl_file:
    n = sum(1 for _ in lbl_file) - 1
val_idxs = get_cv_idxs(n, val_pct=0.1)

In [6]:
def get_data(path, lbl_csv, val_idxs, sz, bs, aug_tfms):
    """Build the ImageClassifierData object used for training and inference.

    Args:
        path: dataset root directory handed to ImageClassifierData.from_csv.
        lbl_csv: path of the CSV file holding the training labels.
        val_idxs: indices of the rows reserved for validation.
        sz: image size specification forwarded to tfms_with_IntNorm.
        bs: batch size.
        aug_tfms: list of augmentation transforms.

    Returns:
        The data object created by ImageClassifierData.from_csv, reading
        training images from 'full_train_BGR' and test images from
        'full_test_BGR' ('.tiff' files).
    """
    tfms = tfms_with_IntNorm(sz, aug_tfms=aug_tfms, crop_type=CropType.CENTER)
    # Honour the `path` argument instead of silently reading the global PATH.
    data = ImageClassifierData.from_csv(path, 'full_train_BGR', lbl_csv, val_idxs=val_idxs,
                                        test_name='full_test_BGR', tfms=tfms, bs=bs,
                                        suffix='.tiff', balance=True)

    return data

In [7]:
# define augmentations: RandomDihedral() is the only augmentation in the list
augs = [RandomDihedral()]

# initialize data object from the label CSV, validation indices, image size
# and batch size defined in the cells above; later cells read its loaders
# (data.trn_dl, data.test_dl) and datasets (data.trn_ds, data.test_ds)
data = get_data(PATH, lbl_csv, val_idxs, sz, bs, aug_tfms = augs)

weighted


### inspect data loaders

In [None]:
x, y = next(iter(data.trn_dl))
# x_test, y_test = next(iter(data.test_dl))

In [None]:
x.shape

In [None]:
# loading epochs to inspect class-balance

In [None]:
def load_epoch(dl=None, n_batches=10):
    """Collect batches from a data loader into two NumPy arrays.

    Args:
        dl: iterable yielding ``(x, y)`` batches. When omitted, ``data.trn_dl``
            is looked up at call time, so a rebuilt ``data`` object is picked
            up (a default bound in the signature would stay stale).
        n_batches: maximum number of batches to read. Defaults to 10, the
            value that used to be hard-coded; pass ``None`` to read the
            whole loader.

    Returns:
        Tuple ``(xs, ys)``: the inputs stacked with ``np.vstack`` and the
        targets joined with ``np.concatenate``.

    Raises:
        ValueError: if the loader yields no batch at all.
    """
    if dl is None:
        dl = data.trn_dl

    batch = iter(dl)

    xs = []
    ys = []

    taken = 0
    while n_batches is None or taken < n_batches:
        try:
            x_, y_ = next(batch)
        except StopIteration:
            # Loader has fewer batches than requested: keep what we have.
            break
        xs.append(to_np(x_))
        ys.append(to_np(y_))
        taken += 1

    if not xs:
        raise ValueError('data loader yielded no batches')

    return np.vstack(xs), np.concatenate(ys)


In [None]:
xs, ys = load_epoch()

In [None]:
# analyze results
lbl_dist = ys.sum(axis=0)
print(lbl_dist)

In [None]:
# weights calculation WIP

# NOTE: this rebinds `ys` (until here the targets returned by load_epoch())
# to the label matrix of the full training set.
# Assumes ys is a 2-D (images x classes) 0/1 NumPy array -- TODO confirm.
ys = data.trn_ds.y

# Share each class would have if all labels were spread evenly.
cut = 1/len(data.classes)
# Observed share of all positive labels that falls on each class.
perc = (ys.sum(axis=0) / ys.sum())

# Per-class weight = even share / observed share. A class without any
# positive label gets weight 0 instead of triggering a division by zero.
weights_per_label = np.divide(cut, perc, out=np.zeros(len(perc)), where=perc > 0)
# Row i carries the weight of every label present on image i, 0 elsewhere
# (one broadcast multiplication instead of a Python loop over the rows).
w_matrix = ys * weights_per_label

# Smallest and largest label weight per image. Zeros are masked out for the
# minimum; an image without any label yields inf rather than raising on an
# empty selection.
min_weights_per_im = np.where(w_matrix != 0, w_matrix, np.inf).min(axis=1)
weights_per_im = w_matrix.max(axis=1)

In [None]:
min_weights_per_im[:10]

### inspecting loaded images and labels

In [None]:
def to_label(y):
    """Return the positions of `y` whose value equals 1, as a list of ints."""
    hits = []
    for pos, flag in enumerate(y):
        if flag == 1:
            hits.append(pos)
    return hits

In [None]:
# inspect train images
plt.style.use('seaborn-white')

# position inside the batch (x, y) fetched from data.trn_dl
idx = 2

# im[ch] is handed to imshow below, so im is indexed channel-first
im = to_np(x)[idx]

# class indices set to 1 for this image
lbl = to_label(to_np(y)[idx])
print(lbl)

# One panel per channel. The panel count follows the image instead of a
# hard-coded 3, and the loop no longer rebinds the name of the axes array.
fig, axes = plt.subplots(1, len(im), figsize=(16,10), squeeze=False)
for ch, ax in enumerate(axes.flat):
    ax.imshow(im[ch])
    ax.set_title('channel %d' % ch)

In [None]:
# inspect test images
plt.style.use('seaborn-white')

# position inside the test batch
idx = 3

# x_test is not defined anywhere else in the notebook (its assignment in the
# data-loader cell is commented out), so fetch a test batch here.
x_test, y_test = next(iter(data.test_dl))
im = to_np(x_test)[idx]

# lbl = to_label(to_np(y)[idx])
# print(lbl)

# One panel per channel actually present in the image; a hard-coded panel
# count raises IndexError as soon as it exceeds the number of channels.
fig, axes = plt.subplots(1, len(im), figsize=(16,10), squeeze=False)
for ch, ax in enumerate(axes.flat):
    ax.imshow(im[ch])
    ax.set_title('channel %d' % ch)

# training

In [None]:
torch.cuda.is_available()

In [8]:
# base-model

# Architecture handed to the learner; swap the two lines to try resnet50.
arch = resnet18
# arch = resnet50
# Learner built on `data` with pretrained weights and Adam as optimizer.
# ps=0 presumably disables dropout in the classifier head -- TODO confirm.
learn = ConvLearner.pretrained(arch, data, ps=0, opt_fn=optim.Adam, pretrained=True)

In [9]:
# additional parameters

# weight-decay/L2 regularization; only takes effect where it is passed to
# learn.fit(..., wds=wd) in the training cells
wd=1e-5
# Metrics reported per epoch. accuracy_thresh(0.5) shows up under the column
# name "<lambda>" in the training log, next to f1_macro.
learn.metrics = [accuracy_thresh(0.5),f1_macro]

In [10]:
learn.unfreeze()

In [None]:
learn.summary()

In [None]:
learn.lr_find()

In [None]:
learn.sched.plot()

In [None]:
# training loops:

In [None]:
%time learn.fit(5e-3, 1, cycle_len=1, wds=wd, use_wd_sched=True)

In [13]:
lr = (5e-4, 1e-3, 5e-3)

In [14]:
%time learn.fit(lr, 2, cycle_len=8, wds=wd, use_wd_sched=True, use_clr=(20,8, 0.95, 0.85))

HBox(children=(IntProgress(value=0, description='Epoch', max=16), HTML(value='')))

EPOCH 0 ---------------------------------------- STEP 0                                                                                                                                                                                      
epoch      trn_loss   val_loss   <lambda>   f1_macro   
    0      0.137455   0.1615     0.946871   0.127783  
EPOCH 1 ---------------------------------------- STEP 1                                                                                                                                                                                      
    1      0.10834    0.157976   0.949055   0.22694   
EPOCH 2 ---------------------------------------- STEP 2                                                                                                                                                                                      
    2      0.092836   0.134314   0.952389   0.255329  
EPOCH 3 ---------------------------------------- STEP 3          

[array([0.09773]), 0.9675042538501689, 0.4371290804418025]

In [None]:
%time learn.fit(lr, 1, cycle_len=8, wds=wd, use_wd_sched=True, use_clr=(20,8, 0.95, 0.85))

In [15]:
learn.save('ResNet18_pre_64_10perVal_v1')


In [34]:
learn.load('ResNet18_pre_64_10perVal_v1')

In [None]:
log_preds, y = learn.predict_with_targs()

## test-predictions

In [16]:
log_preds, y =  learn.TTA(is_test=True)

                                                                                                                                                                                                                                             

In [55]:
def opt_th(preds, targs, start=0.2, end=0.6, step=0.05):
    """Scan decision thresholds and report the one with the best macro F1.

    Candidate thresholds are np.arange(start, end, step); each one is scored
    with f1_macro(preds, targs, thresh=..., kind='macro').

    Returns:
        (best_threshold, scores) where scores lists the F1 value of every
        candidate in scan order.
    """
    candidates = np.arange(start, end, step)
    scores = []
    for cand in candidates:
        scores.append(f1_macro(preds, targs, thresh=cand, kind='macro'))
    best = np.argmax(scores)
    return candidates[best], scores

In [64]:
# get train predictions

# Predictions and targets for the 'trn' split; presumably NumPy arrays,
# since both are passed to torch.from_numpy below.
preds_trn, targs_trn =  learn.predict_with_targs('trn')

# Tensor versions for opt_th(), which forwards them to f1_macro.
preds_trn_torch = torch.from_numpy(preds_trn)
targs_trn_torch = torch.from_numpy(targs_trn)

# Cell output: (best threshold, F1 score of every scanned threshold).
# These scores are computed on training data, not on held-out data.
opt_th(preds_trn_torch, targs_trn_torch)

trn


(0.39999999999999997,
 [0.92619235324298,
  0.9328112334721564,
  0.9371767136037829,
  0.939984370193041,
  0.9413976617363594,
  0.9413670204891537,
  0.9408816526273823,
  0.9393772059857021])

In [None]:
print(preds_val_torch[0])
print(preds_trn_torch[0])

In [57]:
# get val predictions

# Predictions and targets for the 'val' split; presumably NumPy arrays,
# since both are passed to torch.from_numpy below.
preds_val, targs_val =  learn.predict_with_targs('val')

# Tensor versions for opt_th(), which forwards them to f1_macro.
preds_val_torch = torch.from_numpy(preds_val)
targs_val_torch = torch.from_numpy(targs_val)

# Cell output: (best threshold, F1 score of every scanned threshold),
# this time measured on the validation split.
opt_th(preds_val_torch, targs_val_torch)

(0.3,
 [0.5891571227741271,
  0.5968790636677838,
  0.6024754308320616,
  0.6010317740546677,
  0.5988708536620362,
  0.5940680327869428,
  0.5936215038296789,
  0.591517903171673])

In [None]:
# get test predictions

preds_test, targs_test =  learn.predict_with_targs('test')

In [None]:
# NOTE: load_epoch is also defined in an earlier cell of this notebook; this
# later definition is the one in effect once this cell has run.
def load_epoch(dl=None, n_batches=10):
    """Collect batches from a data loader into two NumPy arrays.

    Args:
        dl: iterable yielding ``(x, y)`` batches. When omitted, ``data.trn_dl``
            is looked up at call time, so a rebuilt ``data`` object is picked
            up (a default bound in the signature would stay stale).
        n_batches: maximum number of batches to read. Defaults to 10, the
            value that used to be hard-coded; pass ``None`` to read the
            whole loader.

    Returns:
        Tuple ``(xs, ys)``: the inputs stacked with ``np.vstack`` and the
        targets joined with ``np.concatenate``.

    Raises:
        ValueError: if the loader yields no batch at all.
    """
    if dl is None:
        dl = data.trn_dl

    batch = iter(dl)

    xs = []
    ys = []

    taken = 0
    while n_batches is None or taken < n_batches:
        try:
            x_, y_ = next(batch)
        except StopIteration:
            # Loader has fewer batches than requested: keep what we have.
            break
        xs.append(to_np(x_))
        ys.append(to_np(y_))
        taken += 1

    if not xs:
        raise ValueError('data loader yielded no batches')

    return np.vstack(xs), np.concatenate(ys)

In [68]:
data.test_dl.sampler

<torch.utils.data.sampler.SequentialSampler at 0x227ceb6cb70>

In [79]:
test_names = data.test_ds.fnames
np.array(test_names)[:10]

array(['full_test_BGR\\00008af0-bad0-11e8-b2b8-ac1f6b6435d0.tiff',
       'full_test_BGR\\0000a892-bacf-11e8-b2b8-ac1f6b6435d0.tiff',
       'full_test_BGR\\0006faa6-bac7-11e8-b2b7-ac1f6b6435d0.tiff',
       'full_test_BGR\\0008baca-bad7-11e8-b2b9-ac1f6b6435d0.tiff',
       'full_test_BGR\\000cce7e-bad4-11e8-b2b8-ac1f6b6435d0.tiff',
       'full_test_BGR\\00109f6a-bac8-11e8-b2b7-ac1f6b6435d0.tiff',
       'full_test_BGR\\001765de-bacd-11e8-b2b8-ac1f6b6435d0.tiff',
       'full_test_BGR\\0018641a-bac9-11e8-b2b8-ac1f6b6435d0.tiff',
       'full_test_BGR\\00200f22-bad7-11e8-b2b9-ac1f6b6435d0.tiff',
       'full_test_BGR\\0026f154-bac6-11e8-b2b7-ac1f6b6435d0.tiff'], dtype='<U55')

In [80]:
test_im_ids = {fname: idx for idx, fname in enumerate(test_names)}

In [98]:
np.array(list(test_im_ids.values()))[:,None].shape

(11702, 1)

In [48]:
def create_submission(preds_file, output_name, th = 0.3, TTA=False, fnames=None):
    """Threshold per-class predictions and write them to a submission CSV.

    Args:
        preds_file: array of per-class scores, one row per test image. With
            ``TTA=True`` it carries an extra leading axis that is averaged
            away first.
        output_name: path of the CSV file to write.
        th: a class is predicted when its score is strictly greater than th.
        TTA: average ``preds_file`` over axis 0 before thresholding.
        fnames: image file names, in the same order as the prediction rows.
            Defaults to ``data.test_ds.fnames``.

    The CSV has an 'Id' column (file base name up to the first '.') and a
    'Predicted' column holding the space-separated predicted class indices.
    An image with no score above ``th`` gets its single highest-scoring class.
    """
    if TTA:
        preds = preds_file.mean(axis=0)
        print('TTA:',preds.shape)
    else:
        preds = preds_file

    if fnames is None:
        fnames = data.test_ds.fnames

    # Rows are built as ordinary Python strings. Writing the argmax fallback
    # into a fixed-width NumPy string array can truncate a two-digit class
    # index (11 -> '1') when every other entry is a single character.
    res = []
    for p in preds:
        picked = np.flatnonzero(p > th)  # column index == class index
        if picked.size == 0:
            # never leave a cell empty: fall back to the best-scoring class
            picked = np.array([p.argmax()])
        res.append(' '.join(str(int(c)) for c in picked))

    # image Ids: file name without directory and extension
    ids = [os.path.basename(im).split('.')[0] for im in fnames]

    # creating submission file
    sub_df = pd.DataFrame(res, index=ids, columns=['Predicted'])
    sub_df.to_csv(output_name, index_label='Id')

In [49]:
# Output location of the submission CSV, below the dataset root.
# NOTE(review): the 'submissions' folder presumably has to exist already.
submission_name = PATH + 'submissions/Res18_pre_0.42_t-03.csv'

# log_preds comes from learn.TTA(is_test=True); TTA=True makes
# create_submission average it over its first axis before applying th=0.3.
create_submission(log_preds, submission_name, th=0.3, TTA=True)

TTA: (11702, 28)


## other stuff

In [None]:
# List of label identities:
# maps each integer class index (0-27) to the name of the cell location it
# stands for; one entry per class.

cell_location_label = {
0:  'Nucleoplasm',
1:  'Nuclear membrane',
2:  'Nucleoli',   
3:  'Nucleoli fibrillar center',
4:  'Nuclear speckles',
5:  'Nuclear bodies',
6:  'Endoplasmic reticulum',   
7:  'Golgi apparatus',
8:  'Peroxisomes',
9:  'Endosomes',
10:  'Lysosomes',
11:  'Intermediate filaments',
12:  'Actin filaments',
13:  'Focal adhesion sites',   
14:  'Microtubules',
15:  'Microtubule ends',  
16:  'Cytokinetic bridge',   
17:  'Mitotic spindle',
18:  'Microtubule organizing center',  
19:  'Centrosome',
20:  'Lipid droplets',
21:  'Plasma membrane',   
22:  'Cell junctions', 
23:  'Mitochondria',
24:  'Aggresome',
25:  'Cytosol',
26:  'Cytoplasmic bodies',   
27:  'Rods & rings' }

In [None]:
# from sklearn.metrics import f1_score

# thresholds = np.linspace(0, 1, 1000)
# score = 0.0
# test_threshold=0.5*np.ones(28)
# best_threshold=np.zeros(28)
# best_val = np.zeros(28)
# for i in range(28):
#     for threshold in thresholds:
#         test_threshold[i] = threshold
#         max_val = np.max(preds_y)
#         val_predict = (preds_y > test_threshold)
#         score = f1_score(valid_y > 0.5, val_predict, average='macro')
#         if score > best_val[i]:
#             best_threshold[i] = threshold
#             best_val[i] = score
#     print("Threshold[%d] %0.6f, F1: %0.6f" % (i,best_threshold[i],best_val[i]))
#     test_threshold[i] = best_threshold[i]
# print("Best threshold: ")
# print(best_threshold)
# print("Best f1:")
# print(best_val)