In [3]:
%reload_ext autoreload
%autoreload 2
import os, sys
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="4"  # specify which GPU(s) to be used
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
import pandas as pd
import time
import matplotlib.pyplot as plt
import pickle as pkl

In [4]:
# --- Experiment configuration -------------------------------------------
fold = 1                                  # fold id compared against the `split` column
portion = 60                              # only used to build the run tag below
mode = f"curriculum_easy_{portion}idx"    # run tag embedded in every file name
percent = 0.1                             # base fraction for the selection budgets

df = pd.read_csv(f"./Train_Data_{mode}_{fold}.csv")

# Training rows are those whose `split` value differs from the current fold.
total_train_num = int((df["split"] != fold).sum())

# --- Selection budgets ---------------------------------------------------
# Expert budget: `percent` of the training rows.
include_expert_num = int(percent * total_train_num)
# Exclusion budget: `percent` applied twice (percent**2 of the training rows).
# NOTE(review): confirm the squared fraction is intended and not a typo.
exclude_num = int(percent * total_train_num * percent)
# Auto-include budget: `percent` of the training rows plus the excluded count.
include_auto_num = int(percent * total_train_num + exclude_num)

print(f"exclude {exclude_num} images, add {include_auto_num} images automatically, require {include_expert_num} to be annotated.")

exclude 74 images, add 822 images automatically, require 748 to be annotated.


In [5]:
# Indices stored for this run/fold, kept as a set (duplicates collapse).
# NOTE(review): presumably the currently labeled training pool — confirm.
pool_idx_path = f"../Idxs/{mode}_{fold}.npy"
idx_in_pool = set(np.load(pool_idx_path))

In [6]:
# Sanity check: number of pool indices next to the number of training rows.
(len(idx_in_pool), total_train_num)

(7405, 7486)

### Exclude noisy samples that are hard and medium examples

In [7]:
# Load the per-sample ranking produced by the training step.
# NOTE(review): presumably ordered by per-sample training loss (per the file
# name) — confirm the sort direction against the script that writes it.
train_rank_path = f"../Step4_Training/runs/train_ranked_sample_loss_{fold}_{mode}.npy"
ranked_stats = np.load(train_rank_path)

# Column 0 is used as row labels into `df`; the resulting frame follows the
# order of the ranking file.
train_rank_labels = ranked_stats[:, 0].astype(int)
df_temp = df.loc[train_rank_labels]
df_temp.head(20)

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,tile_number,split,tile_pixel,tile_blueratio,image_idx,k_means_grp,curicullum_cluster_grp
7102,fc8bbe93363ee983d99c949f41ea180f,karolinska,5,5+4,37,0.0,"80,79,77,72,75,68,68,61,59,67,69,54,57,62,65,6...","28,28,27,27,27,25,26,26,25,26,25,25,25,25,25,2...",7102,5,1
2273,67f5929b81aef7c91f8501e93cb04193,karolinska,2,3+4,40,0.0,"98,88,90,86,90,85,85,86,79,78,85,77,78,80,69,7...","34,32,29,32,30,29,28,32,30,30,28,28,27,31,29,2...",2273,1,0
5620,1dfa9e0e91f817fbd98ae93fc20d6255,karolinska,5,5+5,58,3.0,"110,111,116,111,108,103,99,96,99,99,98,97,98,9...","47,50,44,36,37,34,33,41,32,32,39,33,33,32,33,3...",5620,5,0
8133,d34c7fe759808771f0b2dc65ad0df03c,radboud,4,4+4,10,0.0,1111098989868475667142,46384138323228292823,8133,1,0
2192,b90fb52788bc4fc6be21377c096008c3,karolinska,4,4+4,45,0.0,"70,73,67,74,71,70,69,70,70,71,64,69,65,64,58,6...","27,26,26,26,26,25,26,26,26,26,25,26,25,25,24,2...",2192,1,0
7155,d4dcb1cd15053d17de19d2d63ddcb173,radboud,4,3+5,56,2.0,"110,104,98,101,89,95,86,100,89,99,92,86,81,87,...","41,38,35,37,34,36,34,37,35,34,39,33,31,32,33,3...",7155,1,0
993,7234d2bf2df91f65e94e9ecc4ec5827d,radboud,3,4+3,41,2.0,"98,102,88,84,88,97,91,72,87,91,101,92,99,88,75...","35,42,33,31,32,37,30,30,37,36,36,35,35,31,29,3...",993,10,1
3301,fc07f0ce6d2cdff9d65591e94fc2130e,karolinska,5,4+5,27,3.0,"87,82,87,74,79,74,69,80,77,71,66,60,63,54,54,5...","30,33,30,28,29,28,27,28,27,26,25,25,25,24,24,2...",3301,5,1
5444,9079ffff621fc4e738b65b66587bfa47,radboud,5,4+5,46,0.0,"106,104,86,97,105,88,91,107,100,87,93,88,82,95...","42,38,34,33,33,33,33,38,39,33,33,34,30,32,37,3...",5444,10,2
3373,2db286ff4d3398fa6538678a6452497f,karolinska,1,3+3,55,2.0,"127,122,120,117,117,112,110,112,107,112,107,10...","46,47,42,45,43,42,40,37,38,38,40,38,38,42,35,3...",3373,6,0


In [8]:
# Candidate samples for exclusion: keep every ranked row whose curriculum
# cluster group is not 3. The mask is computed on `df_temp` itself, so it is
# aligned row-for-row with the frame being filtered instead of relying on
# pandas aligning a mask built from `df` by index label.
# NOTE(review): the section heading mentions "hard and medium" examples while
# this filter only drops group 3 — confirm which cluster ids are intended.
df_temp_ = df_temp.loc[df_temp["curicullum_cluster_grp"] != 3]
exclude_id = df_temp_.image_idx.tolist()

# Having exactly `exclude_num` candidates is already sufficient, because the
# saved selection keeps only the first `exclude_num` ids — hence `>=`.
if len(exclude_id) >= exclude_num:
    print("exclude example is enough!")
else:
    print("hard example is not enough to exclude!")

exclude example is enough!


### Include samples from: (1) automatic prediction; (2) expert annotation

In [9]:
# Load the per-sample ranking of the unlabeled samples.
# NOTE(review): presumably ordered by prediction entropy (per the file name)
# — confirm the sort direction against the script that writes it.
# This rebinds `ranked_stats` and `df_temp` from the exclusion step above.
unlabel_rank_path = f"../Step4_Training/runs/unlabel_ranked_sample_entropy_{fold}_{mode}.npy"
ranked_stats = np.load(unlabel_rank_path)

# Column 0 is used as row labels into `df`; row order follows the ranking file.
unlabel_rank_labels = ranked_stats[:, 0].astype(int)
df_temp = df.loc[unlabel_rank_labels]
df_temp.head(20)

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,tile_number,split,tile_pixel,tile_blueratio,image_idx,k_means_grp,curicullum_cluster_grp
5534,e4858d718f97a3ff5dab42484f5b1bd7,radboud,5,4+5,15,0.0,736481786069617265776835394528,333132302829252829282721222220,5534,2,2
900,1e1862f8927813cacf7391cc55848ba4,radboud,5,5+4,34,0.0,"66,86,63,65,91,79,80,86,78,76,67,80,57,56,58,5...","33,34,33,30,33,32,32,32,30,29,26,30,25,24,26,2...",900,5,1
8892,50416fa0e1642f024c10c25c90ec5bff,karolinska,4,4+4,44,0.0,"76,69,77,64,66,66,70,66,73,70,72,64,58,67,68,6...","26,26,28,25,25,26,25,25,27,26,26,25,24,25,25,2...",8892,1,0
1311,934f3bcb20dfdd3e184f53df0383b9e0,radboud,4,3+5,16,2.0,"101,92,93,102,107,110,99,105,95,103,91,102,88,...",47424142444637383637343834333125,1311,1,0
7818,82e968eacc5a5abf71fa6cefb97c0910,karolinska,5,5+5,23,0.0,"88,82,81,78,78,77,75,76,69,70,73,61,66,68,66,6...","31,28,30,29,26,30,29,27,26,29,25,24,25,25,26,2...",7818,5,1
813,9aea86aa9b34cebb8bbc0db42d6c7559,karolinska,5,5+5,45,0.0,"106,106,105,105,105,100,97,99,96,96,90,94,95,9...","37,39,38,37,35,34,33,34,32,32,32,31,32,32,33,3...",813,1,1
4260,083ab9e2c95fb0ea2b999c592fb41653,radboud,2,3+4,43,3.0,"111,107,114,103,101,100,110,99,107,113,105,108...","44,34,39,40,37,37,41,31,41,41,42,37,35,32,37,3...",4260,4,1
6430,86ff52ec50c7e0b2b3549e9bb1524770,karolinska,4,3+5,40,0.0,"106,103,99,92,93,92,92,90,85,83,87,84,81,81,77...","32,34,33,33,32,33,32,33,33,30,32,31,31,31,28,3...",6430,1,0
3252,cc66d0b4ad6375c6bed929d10c98a716,karolinska,4,4+4,37,0.0,"102,99,95,94,91,93,83,84,81,80,79,82,76,76,75,...","38,35,33,33,32,33,32,31,31,30,30,30,29,27,28,2...",3252,1,0
4774,0c7d6e0e07621b5582117e9977327b4c,radboud,3,4+3,22,2.0,"71,77,68,73,64,74,70,68,64,62,70,56,59,55,59,4...","29,30,27,29,27,28,28,27,26,26,28,25,26,24,25,2...",4774,1,1


In [10]:
# Candidates for automatic inclusion: drop cluster group 2 (the "hard" group).
# The mask is computed on `df_temp` itself, so it is aligned row-for-row with
# the frame being filtered instead of relying on pandas aligning a mask built
# from `df` by index label.
df_temp_ = df_temp.loc[df_temp["curicullum_cluster_grp"] != 2]  ## no hard group

# Reverse the ranking so the rows at the END of the ranked file come first.
# NOTE(review): presumably this puts the lowest-entropy (most confident)
# samples first — confirm against the ranking's sort direction.
include_id_auto = df_temp_.image_idx.tolist()[::-1]

# Having exactly `include_auto_num` candidates is already sufficient, because
# the saved selection keeps only the first `include_auto_num` ids — hence `>=`.
if len(include_id_auto) >= include_auto_num:
    print("include auto easy example is enough!")
else:
    print("include auto easy is not enough!")

include auto easy is not enough!


In [11]:
# Samples sent for expert annotation: the first `include_expert_num` rows of
# the ranked unlabeled frame, in ranking order (no cluster-group filter).
# NOTE(review): this list is not made disjoint from `include_id_auto`, which is
# drawn from the same frame — the same id can land in both; confirm intended.
include_id_expert = df_temp["image_idx"].iloc[:include_expert_num].tolist()

In [12]:
# Sizes of the two inclusion lists: (expert, automatic).
(len(include_id_expert), len(include_id_auto))

(81, 57)

### Save the selected IDs

In [13]:
# Assemble the three id lists, truncating the exclusion and auto-inclusion
# lists to their budgets (the expert list was already truncated when built).
selected_idx = {}
selected_idx["exclude_noisy"] = exclude_id[:exclude_num]
selected_idx["include_auto"] = include_id_auto[:include_auto_num]
selected_idx["include_expert"] = include_id_expert

# np.save stores a dict as a pickled object array; reading it back requires
# np.load(path, allow_pickle=True).item().
selection_path = f"selected_idx_{mode}_{fold}.npy"
np.save(selection_path, selected_idx)