In [29]:
from umap import UMAP
import h5py
import numpy as np
import torch
import pickle
from IPython import display
from time import time
import pandas as pd
import json

In [3]:
# Output directory: the pickled embedder is read from here and the HDF5 dataset is written here
save_stuff_path = "/home/oiangu/Hippocampus/Conical_Refraction_Polarimeter/OUTPUT/LIBRARIES_OF_THEORETICAL_D/Basler_like_R0_300x_w0_300x_Z_50x_64bit/SIMULATIONS/UMAP_Regressor/"

# Image directories; each ground-truth JSON sits inside its own image directory
images_dir_path_train = "/home/oiangu/Desktop/Conical_Refraction_Polarimeter/OUTPUT/NOISY/TRAIN/"
GT_file_path_train = images_dir_path_train + "GROUND_TRUTHS.json"
images_dir_path_test = "/home/oiangu/Desktop/Conical_Refraction_Polarimeter/OUTPUT/NOISY/TEST/"
GT_file_path_test = images_dir_path_test + "GROUND_TRUTHS.json"

# Embedder parameters (they are all encoded in the embedder file name below)
num_images = 3200
umap_dims = 10
min_dist = 0.1
n_neighbours = 700
metric = 'hamming'

# Full path of the pickled embedder, assembled piece by piece from the parameters above
embedder_path = (
    f"{save_stuff_path}UMAP_EMBEDDER"
    f"_num_images_{num_images}"
    f"_umap_dims_{umap_dims}"
    f"_min_dist_{min_dist}"
    f"_n_neighbours_{n_neighbours}"
    f"_metric_{metric}.sav"
)


In [30]:
from torch.utils.data import Dataset
from torchvision.io import read_image
from torch.utils.data import DataLoader

class EmbeddingImageDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs read from PNG files.

    The ground-truth JSON is loaded into a DataFrame. Column 0 is used as the
    image identifier and column 1 as the ``phiCR`` value; both appear in the
    image file name, and column 1 is also returned as the label.

    Args:
        GT_file_path: path of the JSON file with the ground truths.
        images_dir_path: directory containing the ``IM_<id>_phiCR_<phiCR>.png`` files.
        embedder: object kept as ``self.embedder``; it is stored only, the
            dataset itself never calls it.
    """

    def __init__(self, GT_file_path, images_dir_path, embedder):
        # Read through a context manager so the file handle is closed right away
        # instead of being left for the garbage collector.
        with open(GT_file_path) as gt_file:
            self.df_GTs = pd.DataFrame.from_dict(json.load(gt_file))
        self.images_dir_path = images_dir_path
        self.len_data = len(self.df_GTs)
        self.embedder = embedder

    def __len__(self):
        """Return the number of ground-truth rows (one per image)."""
        return self.len_data

    def __getitem__(self, idx):
        """Return the image tensor and its label (column 1 as a float) for row ``idx``."""
        # Each cell is read individually on purpose: pulling the whole row out as a
        # Series would upcast the identifier and change how it is formatted in the name.
        img_path = f"{self.images_dir_path}/IM_{self.df_GTs.iloc[idx,0]}_phiCR_{self.df_GTs.iloc[idx,1]}.png"
        image = read_image(img_path) #[1, 2X+1, 2X+1] torch tensor
        label = float(self.df_GTs.iloc[idx, 1])
        return image, label
    
# Load the already-fitted UMAP embedder BEFORE building the dataset that receives it, so this
# also works on a fresh kernel executed top to bottom (previously `umap_embedder` was only
# defined in a later cell).
# The embedder's parameters (umap_dims, min_dist, n_neighbours, metric) are encoded in
# `embedder_path`; building a throw-away UMAP(...) instance first is not needed because it
# would be replaced by the unpickled object straight away.
# Metric names noted by the author: euclidean, canberra, cosine, manhattan, braycurtis, mahalanobis, hamming
# NOTE: pickle.load can execute arbitrary code -- only load embedder files you created yourself.
with open(embedder_path, 'rb') as embedder_file:
    umap_embedder = pickle.load(embedder_file)

training_data = EmbeddingImageDataset(GT_file_path_train, images_dir_path_train, umap_embedder)

In [37]:
# Build an HDF5 file of UMAP embeddings: every batch draws `nims_per_batch` random training
# images, embeds them with the fitted embedder and stores [embedding..., label] rows as one
# HDF5 dataset named after the running batch number.

# Images are expected to be (2*X+1) x (2*X+1) pixels, i.e. (2*X+1)**2 values once flattened
X=302
nims_per_batch=40
number_of_batches=1650*3
# NOTE(review): the seed is fixed, so re-running this cell to append to an existing file draws
# the same image indices as the previous run -- confirm that this is intended.
np.random.seed(666)
output_info_every=1

beg=time()

# Buffers reused by every batch: flattened images and [embedding dims..., label] rows
images = np.zeros((nims_per_batch, (2*X+1)**2), dtype=np.float32)
embedding_and_labels = np.zeros((nims_per_batch, umap_embedder.embedding_.shape[-1]+1), dtype=np.float32)

# 'a': append if the file exists, create it if not. The context manager guarantees the file is
# closed even when the loop is interrupted (e.g. KeyboardInterrupt) or raises.
with h5py.File(f"{save_stuff_path}/Dataset_Embedding.h5", 'a') as h5f:
    # Continue numbering after whatever is already stored in the file
    prev_batches = len(h5f)

    for batch in range(number_of_batches):
        # Passing the integer draws from arange(len(training_data)) without rebuilding an array
        # from a range object on every iteration; the drawn indices are the same.
        random_indices = np.random.choice(len(training_data), nims_per_batch, replace=False)
        for i, idx in enumerate(random_indices):
            image, label = training_data[idx]
            images[i] = image.flatten()
            embedding_and_labels[i, -1] = label  # last column holds the label
        embedding_and_labels[:,:-1] = umap_embedder.transform(images)
        ID=f"{batch+prev_batches}"
        h5f.create_dataset(ID, data=embedding_and_labels, compression="lzf", shuffle=True)

        if batch%output_info_every==0:
            h5f.flush()
            display.clear_output(wait=True)
            elapsed=time()-beg
            # `batch` is zero-based and this batch has just been written
            done = batch+1
            percent = 100*done/number_of_batches
            filled = int(percent)
            # Split elapsed seconds into h / min / s; the seconds part is the remainder modulo 60
            hours, remainder = divmod(elapsed, 3600)
            minutes, seconds = divmod(remainder, 60)
            print("["+'#'*filled+' '*(100-filled)+f"] {percent:3.4}% \n\nSimulated: {done}/{number_of_batches}\nElapsed time: {hours} h {minutes} min {seconds:2.4} s")

[####################                                                                                ] 20.0% 

Simulated: 2/10
Elapsed time: 0.0 h 2.0 min 37.16 s


KeyboardInterrupt: 