In [1]:
# %matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
import skimage.external.tifffile as tiff

from common import Statistics, dataset_source
from resources.conv_learner import *
from resources.plots import *
from pprint import pprint
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
path = "datasets/Hirano3D/"
# data_path = Path(PATH)

In [4]:
# Lookup from identifier strings (presumably sample / cell-line names — confirm)
# to an integer class index 0-3. 'NM17-002' and 'Ctrl-AK' share index 0 and the
# two 'GUK1-*' entries share index 1.
# NOTE(review): `_clss` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
_clss = {'NM17-002': 0,
         'Ctrl-AK': 0,
         'GUK1-D': 1, 
         'GUK1-R': 1,
         'MFN2-1': 2,
         'VCP-1': 3}

#### CV generation via sklearn, Multilabel implementation by trent-b, or FastAi

Neither library seems strictly necessary: the native fastai function get_cv_idxs() could be used instead.<br>
However, get_cv_idxs() does NOT shuffle, so the sklearn-based stratified split defined below is what is currently used.

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

def get_label_stratified_CV_idxs(csv_path):
    """Draw one label-stratified 80/20 train/validation split from a label CSV.

    The first CSV column is handed to the splitter as the samples and all
    remaining columns as the stratification target. The split is seeded
    (random_state=0). The value counts of the second column are printed for
    both partitions.

    Args:
        csv_path: path of the CSV file read with ``pd.read_csv``.

    Returns:
        (trn_idxs, val_idxs): row-index arrays of the train and validation part.
    """
    table = pd.read_csv(csv_path).values
    samples, targets = table[:, 0], table[:, 1:]

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
    # n_splits=1 -> the generator yields exactly one (train, validation) pair
    trn_idxs, val_idxs = next(splitter.split(samples, targets))

    first_label_col = table[:, 1]
    trn_counts = pd.Series(first_label_col[trn_idxs]).value_counts()
    val_counts = pd.Series(first_label_col[val_idxs]).value_counts()
    print(f"""Train label-distribution:\n{trn_counts}""")
    print(f"""Val label-distribution:\n{val_counts}""")

    return trn_idxs, val_idxs


from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

def get_label_stratified_CV_idxs_multi(csv_path, random_state=0):
    """Draw one multilabel-stratified 80/20 train/validation split from a label CSV.

    Uses the iterative-stratification library
    (https://github.com/trent-b/iterative-stratification). The per-column sums
    of the label array are printed for both partitions.

    Args:
        csv_path: label CSV handed to fastai's ``csv_source``.
        random_state: seed for the shuffle split. Defaults to 0 so the split is
            reproducible across kernel restarts, matching
            ``get_label_stratified_CV_idxs``; pass ``None`` for an unseeded split.

    Returns:
        (trn_idxs, val_idxs): index arrays of the train and validation part.
    """
    # FastAi csv_source expects a folder-name string to be passed as first arg... -> 'dummy'
    # The third return value is not needed here.
    # assumes y is a 2-D per-sample label array (it is summed over axis 0 below) — TODO confirm
    X, y, _unused_labels = csv_source('dummy', csv_path)

    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2,
                                            random_state=random_state)

    # n_splits=1 -> the generator yields exactly one (train, validation) pair
    trn_idxs, val_idxs = next(msss.split(X, y))

    trn_count = np.sum(y[trn_idxs], axis=0)
    val_count = np.sum(y[val_idxs], axis=0)

    print(f"""Train label-distribution:\n"""
          f"""{trn_count}""")
    print(f"""Val label-distribution:\n"""
          f"""{val_count}""")

    return trn_idxs, val_idxs

In [6]:
### used atm...
# Build the label-CSV location from `path` (which ends in '/') and draw the
# train/validation row indices that the data-loader cells below consume.
csv_path = path + '16bit_multi_folder_Hirano3D_v1.csv'
trn_idxs, val_idxs = get_label_stratified_CV_idxs(csv_path)

Train label-distribution:
0    216
1    163
2    161
3    124
dtype: int64
Val label-distribution:
0    54
1    41
2    40
3    31
dtype: int64


In [7]:
# csv_path = path + 'multi_folder_Hirano3D_02.csv'
# trn_idxs, val_idxs = get_label_stratified_CV_idxs(csv_path)

In [8]:
# setting up a list of a random 20% of images in train as the validation set

# lbl_csv = path + 'multi_folder_Hirano3D_02.csv'
# n = len(list(open(lbl_csv))) -1
# val_idxs = get_cv_idxs(n, val_pct=0.2)


# # Count labels
# # _all_labels = pd.read_csv(PATH + 'multi_folder_Hirano3D_02.csv')
# # print(_all_labels.Targets.value_counts())

# _arr = _all_labels.values
# _val_labels = pd.DataFrame(_arr[val_idxs], columns=['Id','Targets'])
# print(_val_labels.Targets.value_counts())

### Define Dataloader:

In [9]:
def get_data(path, lbl_csv, val_idxs, sz, bs, aug_tfms):
    """Build the ``ImageClassifierData`` object for the '.tif' images listed in a label CSV.

    Args:
        path: dataset root directory handed to ``ImageClassifierData.from_csv``.
        lbl_csv: label CSV file.
        val_idxs: row indices used as the validation set.
        sz: image size passed to ``tfms_with_IntNorm``.
        bs: batch size.
        aug_tfms: augmentation transforms passed to ``tfms_with_IntNorm``.

    Returns:
        The data object created by ``ImageClassifierData.from_csv`` (images are
        read from the 'data' sub-folder; the test folder name is fixed).
    """
    tfms = tfms_with_IntNorm(sz, aug_tfms=aug_tfms, crop_type=CropType.CENTER)
    # Use the `path` argument. The original read the notebook-level `PATH`
    # instead, which ignored the parameter and only worked once a later cell
    # had defined `PATH`.
    data = ImageClassifierData.from_csv(path, 'data', lbl_csv,
                                        val_idxs=val_idxs,
                                        test_name='data/16bit_Hirano3D_MaxP_GRFB_test_512',
                                        tfms=tfms, bs=bs, suffix='.tif',
                                        balance=True)
    return data

In [10]:
PATH = "datasets/Hirano3D"
# data_path = Path(PATH)

In [11]:
# Basic run configuration; `bs` and `sz` are passed to get_data() below.
# NOTE(review): NUM_CLASSES is not read by any other cell visible in this notebook.
NUM_CLASSES = 4
bs = 64          # batch size
sz = [4,64,64]   # presumably [channels, height, width] — TODO confirm against tfms_with_IntNorm

In [12]:
# define augmentations
augs = [RandomDihedral()]

# csv:
# Built from lowercase `path` (ends in '/'), whereas get_data() below receives
# `PATH` (no trailing '/'); both name the same dataset folder.
lbl_csv = path + '16bit_multi_folder_Hirano3D_v1.csv'

# initialize data object
# Uses the validation indices produced by the stratified split above.
data = get_data(PATH, lbl_csv, val_idxs, sz, bs, aug_tfms = augs)

Calculating weights...
one-hot encoding single-labels...
Weights calculated successfully!
Using WeightedRandomSampler


### inspect data_loaders

In [None]:
x, y = next(iter(data.fix_dl))
# x_test, y_test = next(iter(data.test_dl))

In [None]:
x[0].shape

In [None]:
# print transformations
# plt.style.use('seaborn-white')

# Show channel `c` of sample `idx` from four separately drawn batches of data.aug_dl.
c = 0
idx = 0
fig, axes = plt.subplots(1, 4, figsize=(16, 10))
for panel in axes.flat:
    # A fresh iterator per panel, so every panel draws a new batch.
    # The batch is kept in its own name: the original reused `x, y` (clobbering
    # the data.fix_dl batch that later inspection cells index) and reused `ax`
    # as the loop variable (shadowing the axes array).
    x_aug, _y_aug = next(iter(data.aug_dl))
    im = to_np(x_aug)[idx]
#     panel.imshow(np.sum(im, axis = 0))
    panel.imshow(im[c])

In [None]:
from PIL import Image

In [None]:
# Manual check of scaling and per-channel normalisation on one raw test image.
# The path is assembled with os.path.join: the original literal mixed '\\' and
# '/' separators and only resolved on Windows.
im_raw = tiff.imread(os.path.join('datasets', 'Hirano3D', 'test_out', 'fCMT2_02_e3_017.tif'))
# NOTE(review): dividing by 255 assumes an 8-bit value range, while the dataset
# file names mention 16bit — confirm the intended scale factor.
im_scaled = (im_raw/255)

# move the leading (channel) axis to the end -> channels-last layout
im_sc_rot = np.moveaxis(im_scaled, 0,-1)

# per-channel statistics over the two leading (spatial) axes
m = np.array(np.mean(im_sc_rot, axis=(0,1)), dtype=np.float32)
s = np.array(np.std(im_sc_rot, axis=(0,1)), dtype=np.float32)

im_scaled_normed = (im_sc_rot - m) / s
# After moveaxis the channel is the LAST axis, so channel 0 is [..., 0].
# The original indexed [0], which selects the first image row across all
# channels and is not comparable with im_raw[0] / im_scaled[0] below.
m_n = np.mean(im_scaled_normed[..., 0])
s_n = np.std(im_scaled_normed[..., 0])

print(np.mean(im_raw[0]))
print(np.mean(im_scaled[0]))
print(np.max(im_scaled_normed[..., 0]))
print(m)

print(m_n, s_n)

im_raw.shape

In [None]:
tiff.imshow(im_raw[0])

In [None]:
# inspect images
plt.style.use('seaborn-white')
ch = 0
idx = 0

# im = to_np(x)[idx]
# Paths are assembled with os.path.join: the original literals used '\\'
# (partly mixed with '/') and only resolved on Windows.
im_rawHPA = tiff.imread(os.path.join('datasets', 'Kaggle_HPA_2018', 'HPAv18_BGR_test', '1183_51_A11_2.tiff'))
im_raw = tiff.imread(os.path.join('datasets', 'Hirano3D', 'test_out', 'fCMT2_02_e3_017.tif'))
# im_scaled = (im_raw/255)


# left: first plane of the HPA reference image, right: plane `ch` of the Hirano3D image
fig, ax = plt.subplots(1,2, figsize=(16,10))
ax[0].imshow(im_rawHPA[0])
ax[1].imshow(im_raw[ch])

In [None]:
ch = 1

print(np.max(im_raw[ch]))
print(np.max(im_rawHPA[ch]))

In [None]:
print(im_rawHPA[ch])

In [None]:
plt.imshow(to_np(x)[5][2])

In [None]:
# loading epoch for manual inspection...

In [None]:
def load_epoch(dl=None, n_batches=10):
    """Collect the first batches of a data loader into two NumPy arrays.

    Args:
        dl: iterable yielding ``(x, y)`` batches. Defaults to ``data.trn_dl``,
            looked up at call time so the current data object is used, not the
            one that existed when this cell was executed.
        n_batches: maximum number of batches to read (default 10, as before);
            ``None`` reads the whole loader.

    Returns:
        (xs, ys): ``np.vstack`` of all input batches and ``np.concatenate`` of
        all target batches.

    Raises:
        ValueError: if the loader yields no batch at all (nothing to stack).
    """
    from itertools import islice  # local import keeps this cell self-contained

    if dl is None:
        dl = data.trn_dl

    xs = []
    ys = []

    # islice stops quietly when the loader has fewer than `n_batches` batches;
    # the original called next() a fixed 10 times and raised StopIteration.
    for x_, y_ in islice(dl, n_batches):
        xs.append(to_np(x_))
        ys.append(to_np(y_))

    return np.vstack(xs), np.concatenate(ys)


### inspecting loaded images and labels

In [None]:
def to_label(y):
    """Return the list of positions in `y` whose value equals 1."""
    return [pos for pos, flag in enumerate(y) if flag == 1]

In [None]:
# inspect train images
plt.style.use('seaborn-white')

idx = 2

im = to_np(x)[idx]

lbl = to_np(y)[idx]
# lbl = to_label(to_np(y)[idx])
print(lbl)

# One panel per channel of the selected sample. The axes array keeps its own
# name; the original reused `ax` as the loop variable and shadowed it.
fig, axes = plt.subplots(1, 4, figsize=(16, 10))
for ch_i, panel in enumerate(axes.flat):
    panel.imshow(im[ch_i])
    panel.set_title(f'channel {ch_i}')

In [None]:
# inspect test images
plt.style.use('seaborn-white')

idx = 3

# Fetch a test batch here: the only earlier assignment of `x_test` is commented
# out, so on a fresh kernel this cell used to fail with NameError.
x_test = next(iter(data.test_dl))[0]
im = to_np(x_test)[idx]

# lbl = to_label(to_np(y)[idx])
# print(lbl)

# One panel per channel of the selected sample; the axes array is not shadowed.
fig, axes = plt.subplots(1, 4, figsize=(16, 10))
for ch_i, panel in enumerate(axes.flat):
    panel.imshow(im[ch_i])
    panel.set_title(f'channel {ch_i}')

# training

In [None]:
torch.cuda.is_available()

In [None]:
GLOBAL_STEP = 0

In [13]:
# base-model

arch = dn121_c
# arch = resnet50
learn = ConvLearner.pretrained(arch, data, ps=0, opt_fn=optim.Adam, pretrained=False)

In [14]:
# additional parameters

wd=1e-5 # weight-decay/L2 regularization 
# learn.metrics = [accuracy, f1_micro]

In [15]:
learn.unfreeze()

In [16]:
%time learn.fit(1e-3, 2, cycle_len=1, wds=wd, use_wd_sched=True)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

EPOCH 0 ---------------------------------------- STEP 0                                                                                                                                                                                      
epoch      trn_loss   val_loss   accuracy   
    0      1.509949   1.437895   0.204819  
EPOCH 1 ---------------------------------------- STEP 1                                                                                                                                                                                      
    1      1.381894   2.319068   0.204819  

creating log-files...
log-files saved to: datasets/Hirano3D
Wall time: 13.8 s


[array([2.31907]), 0.20481927746749787]

In [17]:
%time learn.fit(1e-3, 2, cycle_len=1, wds=wd, use_wd_sched=True)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

EPOCH 0 ---------------------------------------- STEP 2                                                                                                                                                                                      
epoch      trn_loss   val_loss   accuracy   
    0      1.19584    2.765112   0.313253  
EPOCH 1 ---------------------------------------- STEP 3                                                                                                                                                                                      
    1      1.134962   2.367119   0.204819  

appending existing log-files...
log-files saved to: datasets/Hirano3D
Wall time: 13.2 s


[array([2.36712]), 0.20481927746749787]

In [None]:
learn.summary()

In [None]:
learn.lr_find()

In [None]:
learn.sched.plot()

In [None]:
# training loops:

In [None]:
learn.sched.__dict__.keys()

In [None]:


# Collect the scheduler's per-iteration training loss and learning rate.
_rows = [[it, learn.sched.losses[idx], learn.sched.lrs[idx]]
         for idx, it in enumerate(learn.sched.iterations)]
_per_iter = pd.DataFrame(_rows, columns=['Iteration', 'trn_loss', 'lr'])

# Start a new log when the scheduler flag says so (original condition kept) or
# when no log exists yet; otherwise extend the existing one.
# Fixes relative to the original cell:
#   * the DataFrame was built from the undefined name `per_iter`;
#   * `per_iter_df.append(...)` ran before `per_iter_df` existed;
#   * the appended frame was discarded (append/concat return a new frame).
# NOTE(review): the meaning of `learn.sched.glob_step` is not visible here —
# confirm that a truthy value should reset the log.
if learn.sched.glob_step or 'per_iter_df' not in globals():
    per_iter_df = _per_iter
    print('created')
else:
    per_iter_df = pd.concat([per_iter_df, _per_iter], ignore_index=True)
    print('appended')

In [None]:
learn.sched.plot_lr()

In [None]:
%time learn.fit(1e-2, 2, cycle_len=1, wds=wd, use_wd_sched=True)

In [None]:
learn.sched.Glob_lrs

In [None]:
learn.sched.plot_loss()

In [None]:
%time learn.fit(1e-4, 8, cycle_len=1, wds=wd, use_wd_sched=True)

In [None]:
%time learn.fit(1e-5, 4, cycle_len=8, wds=wd, use_wd_sched=True, use_clr=(20,8, 0.95, 0.85))

In [None]:
%time learn.fit(1e-4, 2, cycle_len=16, cycle_mult=2, wds=wd, use_wd_sched=True, use_clr=(20,8, 0.95, 0.85))

In [None]:
%time learn.fit(2e-5, 1, cycle_len=8, cycle_mult=2, wds=wd, use_wd_sched=True)

In [None]:
lr = (5e-4, 1e-3, 5e-3)

In [None]:
# Rebuild the data object with a larger image size and swap it into the learner.
# NOTE(review): 516 may be a typo for 512 — the test folder name and the saved
# model name in this notebook both say 512. Confirm.
sz = [4,516, 516]
learn.set_data(get_data(PATH, lbl_csv, val_idxs, sz, bs, aug_tfms = augs))

In [None]:
%time learn.fit(1e-3, 1, cycle_len=8, cycle_mult=2, wds=wd, use_wd_sched=True)

In [None]:
%time learn.fit(1e-3, 2, cycle_len=8, wds=wd, use_wd_sched=True)

In [None]:
learn.save('Hirano3D_v1_ResNet18_512_72')

In [None]:
learn.load('Hirano3D_v1_ResNet18_224_70')

## test-predictions

In [None]:
log_preds, y = learn.predict_with_targs()
preds = np.argmax(log_preds, axis=1)
print(preds)

In [None]:
log_preds, y =  learn.TTA()

In [None]:
# Plot confusion matrix 
plt.style.use('seaborn-white')
# average over the leading axis of the TTA output, then take the top class per sample
log_preds_mean = np.mean(log_preds, axis=0)
preds = np.argmax(log_preds_mean, axis=1)
# Targets first, predictions second (sklearn's confusion_matrix(y_true, y_pred)).
# The original passed (preds, y), which transposes the matrix.
# NOTE(review): assumes `confusion_matrix` is sklearn's, pulled in by the star imports — confirm.
cm = confusion_matrix(y, preds)
plot_confusion_matrix(cm, data.classes)

In [None]:
log_preds, y =  learn.TTA(is_test=True)
log_preds_mean = np.mean(log_preds, axis=0)
preds = np.argmax(log_preds_mean, axis=1)
print(preds)

In [None]:
# function to be called by register_forward_hook

def get_embeddings(layer_name):
    """Record the inputs of one model layer while predicting on 'trn', 'val' and 'test'.

    A forward hook is attached to the layer registered under `layer_name` in
    ``learn.models.model._modules``. For each data split the first positional
    input of every forward call through that layer is stored.

    Args:
        layer_name: key of the layer in ``learn.models.model._modules`` (e.g. '14').

    Returns:
        dict keyed by 'trn', 'val' and 'test'. Each value is a list
        ``[preds, y, activations]``, where ``preds, y`` come from
        ``learn.predict_with_targs(split)`` and ``activations`` is the
        ``np.vstack`` of the recorded layer inputs.

    Raises:
        KeyError: if the model has no layer registered under `layer_name`.
    """
    layer = learn.models.model._modules.get(layer_name)
    if layer is None:
        # the original failed later with an opaque AttributeError on None
        raise KeyError(f"model has no layer named {layer_name!r}")

    captured = []

    def _capture_input(module, inp, outp):
        # forward-hook callback: keep the first positional input of the layer
        captured.append(inp[0])

    embeddings = {}
    hook = layer.register_forward_hook(_capture_input)
    try:
        for split in ['trn', 'val', 'test']:
            captured.clear()  # start each split with an empty buffer
            preds, y = learn.predict_with_targs(split)

            # populating dict, consisting of [0]: preds, [1]: y, [2]: activations[layer]
            embeddings[split] = [preds, y, np.vstack(to_np(captured))]
    finally:
        # always detach the hook, even when prediction fails, so it does not
        # keep recording during later forward passes
        hook.remove()

    return embeddings

In [None]:
embeddings = get_embeddings('14')

In [None]:
# unpacking embeddings
# every entry is [preds, y, activations]; keep the targets and the activations
y_trn, embs_trn = embeddings['trn'][1], embeddings['trn'][2]
y_val, embs_val = embeddings['val'][1], embeddings['val'][2]
y_test, embs_test = embeddings['test'][1], embeddings['test'][2]

# one shape per line: activations first, then targets
print(*(arr.shape for arr in (embs_trn, embs_val, embs_test)), sep='\n')
print(*(arr.shape for arr in (y_trn, y_val, y_test)), sep='\n')



In [None]:
import umap

In [None]:
# Fit UMAP on the training activations only, then project the validation and
# test activations with that fitted model.
# NOTE(review): no random_state is passed, so the layout presumably changes
# between runs — confirm whether a fixed seed is wanted.
UMAP_trn_embedding = umap.UMAP(n_neighbors=10,
                      min_dist=0.3,
                      metric='correlation').fit(embs_trn)

UMAP_trn = UMAP_trn_embedding.embedding_
UMAP_val = UMAP_trn_embedding.transform(embs_val)
UMAP_test = UMAP_trn_embedding.transform(embs_test)

In [None]:
# data.test_ds.fnames

In [None]:
# Replaces the `y_test` unpacked from `embeddings` with hand-made group labels:
# 60 entries, the first 30 set to 1 and the remaining 30 left at 0.
# NOTE(review): the sizes 60 / 30 are hard-coded — presumably tied to the order
# of data.test_ds.fnames; confirm.
y_test = np.zeros(60)
y_test[:30] = y_test[:30] +1 
y_test

In [None]:
# plotting UMAP projections: training set per class (left) and the two
# hand-labelled test groups on top of training class 0 (right)

fig, axarr = plt.subplots(1, 2, figsize=(20, 8))

compA = 0
compB = 1

# iterate over the classes known to the data object instead of a hard-coded 4
for i in range(len(data.classes)):
    trn_UMAP_cls = UMAP_trn[y_trn == i]

    axarr[0].scatter(trn_UMAP_cls[:,compA], trn_UMAP_cls[:,compB], label = data.classes[i], s = 5)

    if i == 0:
        # class 0 is repeated on the right panel as a reference for the test points
        axarr[1].scatter(trn_UMAP_cls[:,compA], trn_UMAP_cls[:,compB], label = data.classes[i], s = 5)

for i in [0,1]:
    test_UMAP_cls = UMAP_test[y_test == i]

    # labelled so the two test groups show up in the legend
    axarr[1].scatter(test_UMAP_cls[:,compA], test_UMAP_cls[:,compB],
                     label = f'test (y_test == {i})', s = 5)

# legend and axis limits are loop-invariant: set them once per panel
for panel, panel_title in zip(axarr, ('train', 'test vs. train class 0')):
    panel.set_title(panel_title)
    panel.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    panel.set_xlim(-10,5)
    panel.set_ylim(-5,8)



In [None]:
def opt_th(preds, targs, start=0.2, end=0.6, step=0.05):
    """Scan candidate thresholds and pick the one with the highest ``f1_macro`` score.

    Candidates are ``np.arange(start, end, step)``.

    Returns:
        (best_threshold, scores): the candidate with the highest score (the
        first one on ties) and the list of scores for all candidates.
    """
    candidates = np.arange(start, end, step)
    scores = []
    for th in candidates:
        scores.append(f1_macro(preds, targs, thresh=th, kind='macro'))
    best_pos = np.argmax(scores)
    return candidates[best_pos], scores

In [None]:
# get train predictions

preds_trn, targs_trn =  learn.predict_with_targs('trn')

preds_trn_torch = torch.from_numpy(preds_trn)
targs_trn_torch = torch.from_numpy(targs_trn)

opt_th(preds_trn_torch, targs_trn_torch)

In [None]:
# NOTE: `preds_val_torch` is only assigned in the "get val predictions" cell
# further down; on a fresh kernel run that cell first, otherwise this line
# raises NameError.
print(preds_val_torch[0])
print(preds_trn_torch[0])

In [None]:
# get val predictions

preds_val, targs_val =  learn.predict_with_targs('val')

preds_val_torch = torch.from_numpy(preds_val)
targs_val_torch = torch.from_numpy(targs_val)

opt_th(preds_val_torch, targs_val_torch)

In [None]:
# get test predictions

preds_test, targs_test =  learn.predict_with_targs('test')

In [None]:
targs_test[:50]

In [None]:
def load_epoch(dl=None, n_batches=10):
    """Collect the first batches of a data loader into two NumPy arrays.

    Args:
        dl: iterable yielding ``(x, y)`` batches. Defaults to ``data.trn_dl``,
            looked up at call time so the current data object is used, not the
            one that existed when this cell was executed.
        n_batches: maximum number of batches to read (default 10, as before);
            ``None`` reads the whole loader.

    Returns:
        (xs, ys): ``np.vstack`` of all input batches and ``np.concatenate`` of
        all target batches.

    Raises:
        ValueError: if the loader yields no batch at all (nothing to stack).
    """
    from itertools import islice  # local import keeps this cell self-contained

    if dl is None:
        dl = data.trn_dl

    xs = []
    ys = []

    # islice stops quietly when the loader has fewer than `n_batches` batches;
    # the original called next() a fixed 10 times and raised StopIteration.
    for x_, y_ in islice(dl, n_batches):
        xs.append(to_np(x_))
        ys.append(to_np(y_))

    return np.vstack(xs), np.concatenate(ys)

In [None]:
data.test_dl.sampler

In [None]:
test_names = data.test_ds.fnames
test_names[:10]

In [None]:
def create_submission(preds_file, output_name, th = 0.3, TTA=False):
    """Write a submission CSV listing the predicted class indices per test image.

    Args:
        preds_file: per-class scores with one row per test image. With
            ``TTA=True`` it carries an extra leading axis that is averaged away.
        output_name: path of the CSV file to write.
        th: a class is listed when its score is strictly greater than `th`.
        TTA: whether `preds_file` must first be averaged over its leading axis.

    The CSV has an 'Id' index column (base name of each file in
    ``data.test_ds.fnames``, cut at the first '.') and a 'Predicted' column
    holding space-separated class indices. Rows where no score exceeds `th`
    fall back to the arg-max class.
    """
    if TTA:
        preds = preds_file.mean(axis=0)
        print('TTA:',preds.shape)
    else:
        preds = preds_file

    # Build plain Python strings. The original wrote the arg-max fallback into
    # a fixed-width NumPy string array, which truncates the value when it needs
    # more characters than the widest existing entry (e.g. 12 -> '1' when all
    # rows were empty).
    res = []
    for p in preds:
        picked = np.where(p > th)[0]
        if len(picked) == 0:
            # ensure that there are no empty cells: fall back to the top class
            picked = [p.argmax()]
        res.append(' '.join(str(int(k)) for k in picked))

    # getting image Ids
    fnames = np.array([os.path.basename(im).split('.')[0] for im in data.test_ds.fnames])

    # creating submission file
    sub_df = pd.DataFrame(res, index=fnames, columns=['Predicted'])
    sub_df.to_csv(output_name, index_label='Id')

In [None]:
# PATH has no trailing separator, so plain concatenation produced
# 'datasets/Hirano3Dsubmissions/...'; join the path components instead.
submission_name = os.path.join(PATH, 'submissions', 'Res18_pre_0.42_t-03.csv')

create_submission(log_preds, submission_name, th=0.3, TTA=True)

## other stuff

In [None]:
# List of label identities:
# Maps the integer label ids 0-27 to names of cellular locations.
# NOTE(review): this 28-entry table is not referenced by any other cell visible
# in this notebook, whose models use 4 classes — presumably left over from the
# Kaggle HPA experiments; confirm before relying on it.

cell_location_label = {
0:  'Nucleoplasm',
1:  'Nuclear membrane',
2:  'Nucleoli',   
3:  'Nucleoli fibrillar center',
4:  'Nuclear speckles',
5:  'Nuclear bodies',
6:  'Endoplasmic reticulum',   
7:  'Golgi apparatus',
8:  'Peroxisomes',
9:  'Endosomes',
10:  'Lysosomes',
11:  'Intermediate filaments',
12:  'Actin filaments',
13:  'Focal adhesion sites',   
14:  'Microtubules',
15:  'Microtubule ends',  
16:  'Cytokinetic bridge',   
17:  'Mitotic spindle',
18:  'Microtubule organizing center',  
19:  'Centrosome',
20:  'Lipid droplets',
21:  'Plasma membrane',   
22:  'Cell junctions', 
23:  'Mitochondria',
24:  'Aggresome',
25:  'Cytosol',
26:  'Cytoplasmic bodies',   
27:  'Rods & rings' }

In [None]:
# from sklearn.metrics import f1_score

# thresholds = np.linspace(0, 1, 1000)
# score = 0.0
# test_threshold=0.5*np.ones(28)
# best_threshold=np.zeros(28)
# best_val = np.zeros(28)
# for i in range(28):
#     for threshold in thresholds:
#         test_threshold[i] = threshold
#         max_val = np.max(preds_y)
#         val_predict = (preds_y > test_threshold)
#         score = f1_score(valid_y > 0.5, val_predict, average='macro')
#         if score > best_val[i]:
#             best_threshold[i] = threshold
#             best_val[i] = score
#     print("Threshold[%d] %0.6f, F1: %0.6f" % (i,best_threshold[i],best_val[i]))
#     test_threshold[i] = best_threshold[i]
# print("Best threshold: ")
# print(best_threshold)
# print("Best f1:")
# print(best_val)