### Restart and Run All Function to do the whole pipeline!

In [1]:
from IPython.display import HTML, Javascript

def restart_run_all():
    """Restart the kernel from the browser and schedule a full re-run of the notebook.

    The function displays an HTML ``<script>`` element. The script calls
    ``IPython.notebook.kernel.restart()`` and, through ``setTimeout`` with a
    1000 ms delay, ``IPython.notebook.execute_all_cells()``. It therefore only
    has an effect in a front end that exposes the ``IPython.notebook``
    JavaScript object.
    """
    restart_script = HTML(
        '''
            <script>
                code_show = false;
                IPython.notebook.kernel.restart();
                setTimeout(function(){
                        IPython.notebook.execute_all_cells();
                    }, 1000)
                
            </script>
        '''
    )
    display(restart_script)
#restart_run_all()

# (A) Determine Adequate Embedding Space Dimensionality
Using MDS (to see when the manifold's torsion starts), and using KNN to see when the locality of same-class samples can be preserved well; UMAP is also used for this.

In [2]:
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.manifold as manifold
import os
import pandas as pd
import json
import torch
import json
import umap

from torch.utils.data import Dataset
from torchvision.io import read_image
from torch.utils.data import DataLoader

class ImageDataset(Dataset):
    """Map-style dataset of PNG images indexed by a ground-truth JSON table.

    The JSON file is loaded into a DataFrame (``df_GTs``). For row ``idx`` the
    value in column 0 and the value in column 1 are used to build the image
    file name ``IM_<col0>_phiCR_<col1>.png`` inside ``images_dir_path``;
    column 1 is also returned, as a float32 tensor, as the label.

    Parameters
    ----------
    GT_file_path : str
        Path of the JSON file holding the ground truths.
    images_dir_path : str
        Directory that contains the PNG images.
    """

    def __init__(self, GT_file_path, images_dir_path):
        # Use a context manager so the file handle is closed as soon as the
        # JSON has been parsed (the bare open() used to leak it).
        with open(GT_file_path) as gt_file:
            self.df_GTs = pd.DataFrame.from_dict(json.load(gt_file))
        self.images_dir_path = images_dir_path
        self.len_data = len(self.df_GTs)

    def __len__(self):
        """Return the number of rows in the ground-truth table."""
        return self.len_data

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``.

        ``image`` is whatever ``read_image`` returns for the PNG of that row;
        ``label`` is a float32 tensor of shape ``[1]`` built from column 1.
        """
        image_id = self.df_GTs.iloc[idx, 0]
        phi_cr = self.df_GTs.iloc[idx, 1]
        img_path = f"{self.images_dir_path}/IM_{image_id}_phiCR_{phi_cr}.png"
        image = read_image(img_path)  # [1, 2X+1, 2X+1] torch tensor
        label = torch.tensor([float(phi_cr)], dtype=torch.float32)  # [1] torch tensor of float32
        return image, label
    
# Root directories shared by the dataset path constants below. Every image
# directory holds its own GROUND_TRUTHS.json, so each ground-truth path is
# derived from the matching directory instead of repeating the long literal.
_noisy_root = "/home/oiangu/Desktop/Conical_Refraction_Polarimeter/OUTPUT/NOISY"
_non_noisy_root = (
    "/home/oiangu/Hippocampus/Conical_Refraction_Polarimeter/OUTPUT/"
    "LIBRARIES_OF_THEORETICAL_D/Basler_like_R0_300x_w0_300x_Z_50x_64bit/"
    "IMAGE_LIBRARY/NON_NOISY"
)

# Noisy train set
images_dir_path_train_noisy = f"{_noisy_root}/TRAIN"
GT_file_path_train_noisy = f"{images_dir_path_train_noisy}/GROUND_TRUTHS.json"

# Non-noisy train set
# NOTE(review): this resolves to the NON_NOISY/TEST directory, i.e. exactly the
# same files as the non-noisy test set below, so train and test samples can
# overlap when use_noisy is False. Kept unchanged - confirm whether a
# NON_NOISY/TRAIN library was intended.
images_dir_path_train_non_noisy = f"{_non_noisy_root}/TEST"
GT_file_path_train_non_noisy = f"{images_dir_path_train_non_noisy}/GROUND_TRUTHS.json"

# Noisy test set
images_dir_path_test_noisy = f"{_noisy_root}/TEST"
GT_file_path_test_noisy = f"{images_dir_path_test_noisy}/GROUND_TRUTHS.json"

# Non-noisy test set
images_dir_path_test_non_noisy = f"{_non_noisy_root}/TEST"
GT_file_path_test_non_noisy = f"{images_dir_path_test_non_noisy}/GROUND_TRUTHS.json"

# Which image sets to load: True -> noisy paths, False -> non-noisy paths.
use_noisy = False

# Number of images randomly drawn from the training / test datasets.
num_images = 2000
num_images_test = 700  # for the UMAP and NNE curve

# Decimals kept when the angle labels are discretised into integer classes.
num_decimals = 2
# Seed for NumPy's global RNG and for the embedders' random_state.
random_seed = 666
# Worker count handed to the estimators that accept n_jobs.
n_jobs = 10
# Tag used in output file names, plot titles and the state file name.
exp_name = "Non_Noisy_Dataset"

# Embedding dimensionalities that every stage sweeps over.
emb_dims = [1, 2, 3, 4, 5, 6, 7, 8, 10, 15, 20]  # [1,2,3,4,5,7,10,13,16,20]

# Output directory for figures and the metrics JSON; created if missing.
save_stuff_path = f"/home/oiangu/Hippocampus/Conical_Refraction_Polarimeter/Embedders/Finding_Dimensionality_of_Manifold/{exp_name}/"
os.makedirs(save_stuff_path, exist_ok=True)

In [3]:
# Choose the (ground-truth file, image directory) pair for each split
# according to `use_noisy`, then build the training dataset followed by the
# test dataset.
if use_noisy:
    _train_source = (GT_file_path_train_noisy, images_dir_path_train_noisy)
    _test_source = (GT_file_path_test_noisy, images_dir_path_test_noisy)
else:
    _train_source = (GT_file_path_train_non_noisy, images_dir_path_train_non_noisy)
    _test_source = (GT_file_path_test_non_noisy, images_dir_path_test_non_noisy)

training_data = ImageDataset(*_train_source)
test_data = ImageDataset(*_test_source)

Get the training data

In [4]:
# Draw a reproducible random subset (without replacement) of the training set.
np.random.seed(random_seed)
random_indices = np.random.choice(range(len(training_data)), num_images, replace=False)

# Size of axis 1 of the first image; each image is flattened into X21**2
# features below, i.e. the images are assumed to be square.
X21 = training_data[0][0].shape[1]
X = np.zeros((num_images, X21**2), dtype=np.float32)
y = np.zeros(num_images, dtype=np.float64)

for j, idx in enumerate(random_indices):
    im, lab = training_data[idx]
    X[j, :] = im[0].flatten()  # channel 0 only, flattened to one row
    # `lab` is a one-element tensor; extract the Python float explicitly
    # rather than relying on implicit tensor -> scalar coercion.
    y[j] = lab.item()

# Discretise the labels (shifted by pi) into integer classes with
# `num_decimals` decimals of resolution. The scaled values are rounded to the
# nearest integer before the cast: astype(int) truncates, and a product such as
# 1.15 * 100 == 114.99999999999999 would otherwise land in class 114 instead of 115.
y_categoric = np.rint(np.around(y + np.pi, num_decimals) * 10**num_decimals).astype(int)

Testing data for KNN tests

In [5]:
# Draw the test subset. The global NumPy RNG is not re-seeded here, so the
# draw continues the random stream left by the cell that sampled the training
# indices; the feature width reuses X21 from that same cell.
random_indices_test = np.random.choice(range(len(test_data)), num_images_test, replace=False)
X_test = np.zeros((num_images_test, X21**2), dtype=np.float32)
y_test = np.zeros(num_images_test, dtype=np.float64)

for j, idx in enumerate(random_indices_test):
    im, lab = test_data[idx]
    X_test[j, :] = im[0].flatten()  # channel 0 only, flattened to one row
    # `lab` is a one-element tensor; extract the Python float explicitly
    # rather than relying on implicit tensor -> scalar coercion.
    y_test[j] = lab.item()

Get state of pipeline:

In [6]:
# The state file holds the index of the last stage that completed (it is
# written at the end of a successful run), so this run handles the next one.
try:
    with open(f"state_{exp_name}.txt", "r") as state_file:
        current_state = int(state_file.read()) + 1
except (OSError, ValueError):
    # No state file yet, unreadable file, or non-integer content:
    # start the pipeline from its first stage.
    current_state = 0

Start metric saver:

In [7]:
# Reload the metrics gathered by earlier runs so that each stage adds to the
# same dictionary; fall back to an empty dictionary when nothing usable exists.
try:
    with open(f"{save_stuff_path}/Metrics_{exp_name}.json", "r") as metrics_file:
        metrics = json.load(metrics_file)['Metrics']
except (OSError, ValueError, KeyError, TypeError):
    # Missing/unreadable file, invalid JSON, or JSON without a 'Metrics' entry.
    metrics = {}

### Run Dimensionality Determination:
- If state 0 -> do MDS # y not used
- If state 1 -> do LLE # y not used
- If state 2 -> do UMAP # y used in continuous version
- If state 3 -> do NCA # y used in categorical version for training of embedder

In [8]:
def _plot_metric_curves(curves, title, ylabel, file_name):
    """Plot metric-vs-dimension curves, save the figure as PNG and show it.

    Parameters
    ----------
    curves : dict
        Maps a legend label to a list of values, one per entry of `emb_dims`.
        A legend is drawn only when more than one curve is given.
    title, ylabel : str
        Figure title and y-axis label.
    file_name : str
        Name of the PNG file written inside `save_stuff_path`.
    """
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    for label, values in curves.items():
        ax.plot(emb_dims, values, 'o-', label=label)
    if len(curves) > 1:
        ax.legend()
    ax.set_title(title)
    ax.set_xlabel("Embedding space dimensions")
    ax.set_ylabel(ylabel)
    ax.grid(True)
    fig.savefig(f"{save_stuff_path}/{file_name}")
    plt.show()
    plt.close(fig)  # release the figure once it has been saved and shown


def _knn_test_error(train_embedding, test_embedding, knn_args):
    """Return 1 - score of a KNN regressor fitted on the embedded training set.

    The regressor is built from `knn_args`, fitted on `train_embedding` with
    the training labels `y`, and scored on `test_embedding` against `y_test`.
    """
    knn = sk.neighbors.KNeighborsRegressor(**knn_args)
    knn.fit(train_embedding, y)
    return float(1 - knn.score(test_embedding, y_test))


# Exactly one stage runs per kernel session, selected by `current_state`.
if current_state == 0:
    # Stage 0 - MDS: record the final stress for every embedding dimension.
    args = {'metric': True, 'n_init': 4, 'max_iter': 40, 'dissimilarity': 'euclidean'}
    metrics['MDS'] = []

    for dim in emb_dims:
        embedder = manifold.MDS(n_components=dim, metric=args['metric'], n_init=args['n_init'],
                                max_iter=args['max_iter'], n_jobs=n_jobs, random_state=random_seed,
                                dissimilarity=args['dissimilarity'])
        embedder.fit(X)
        metrics['MDS'].append(float(embedder.stress_))
        print(f"Embedder of dim {dim} done!")

    _plot_metric_curves({'MDS': metrics['MDS']},
                        "MDS Stress by Embedding Space Dimension\n" + exp_name,
                        "MDS Stress",
                        f"MDS_stress_{exp_name}.png")

elif current_state == 1:
    # Stage 1 - LLE: record the reconstruction error per method and dimension.
    # NOTE(review): this branch used to be guarded by `current_state == 1000`,
    # which made state 1 fall through to the ValueError below and stopped the
    # pipeline right after MDS. Restored to 1 to match the documented states;
    # confirm that LLE was not being skipped on purpose.
    args = {'exp': 'LLE_standard', "method": "standard", "n_neighbors": 50, "emb_dims": emb_dims, 'max_iter': 50}
    # Methods: standard, hessian, ltsa, modified (modified_tol)
    methods = ["standard", "ltsa", "hessian"]

    for method in methods:
        args['method'] = method
        metrics["LLE_" + method] = []
        for dim in emb_dims:
            n_neighbors = args['n_neighbors']
            if method == 'hessian':
                # Hessian LLE needs n_neighbors > dim * (dim + 3) / 2. Compute
                # the bound for the dimension being fitted (the previous code
                # read a stale `dim` left over from an earlier loop) and never
                # go below the default neighbourhood size.
                n_neighbors = max(n_neighbors, dim * (dim + 3) // 2 + 2)
            embedder = sk.manifold.LocallyLinearEmbedding(method=method, n_neighbors=n_neighbors,
                                                          n_components=dim, max_iter=args['max_iter'],
                                                          random_state=random_seed, n_jobs=n_jobs)
            embedder.fit(X)
            metrics["LLE_" + method].append(float(embedder.reconstruction_error_))
            print(f"Done dim {dim}!")
        print(f"Done method {method}")
        _plot_metric_curves({method: metrics["LLE_" + method]},
                            f"LLE {method} Reconstruction Error\n by Embedding Space Dimension\n" + exp_name,
                            "LLE reconstruction error",
                            f"LLE_{method}.png")

    # All methods together in one figure.
    _plot_metric_curves({method: metrics["LLE_" + method] for method in methods},
                        "LLE Reconstruction Error\n by Embedding Space Dimension\n" + exp_name,
                        "LLE reconstruction error",
                        f"LLE_{exp_name}.png")

elif current_state == 2:
    # Stage 2 - UMAP fitted with the continuous labels, evaluated through the
    # error of a KNN regressor on the embedded test set.
    args_sk_KNN = {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 20, 'p': 2,
                   'metric': 'minkowski', 'n_jobs': n_jobs}

    args = {'exp': 'UMAP', 'emb_dims': emb_dims, 'min_dist': 0.2, 'n_neighbors': 50, 'metric': 'euclidean',
            'n_epochs': 40, 'target_metric': 'l2'}
    # Metrics: euclidean, canberra, cosine, manhattan, braycurtis, mahalanobis, hamming
    metrics["UMAP"] = []
    for dim in emb_dims:
        embedder = umap.UMAP(n_components=dim, min_dist=args['min_dist'], n_epochs=args['n_epochs'],
                             n_neighbors=args['n_neighbors'], metric=args['metric'],
                             random_state=random_seed, n_jobs=n_jobs,
                             target_metric=args['target_metric'])
        embedder = embedder.fit(X, y)
        print(f"Dimension {dim} trained")
        metrics["UMAP"].append(_knn_test_error(embedder.embedding_, embedder.transform(X_test), args_sk_KNN))
        print(f"KNN ran")

    _plot_metric_curves({'UMAP': metrics['UMAP']},
                        "UMAP Embedding 1-KNN Regression Score\n" + exp_name,
                        "1-KNN score on test set",
                        f"UMAP_score_knn_{exp_name}.png")

elif current_state == 3:
    # Stage 3 - NCA trained on the categorical labels, evaluated through the
    # error of a KNN regressor (continuous labels) on the embedded test set.
    args_sk_KNN = {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2,
                   'metric': 'minkowski', 'n_jobs': n_jobs}

    args = {'exp': 'NCA', 'emb_dims': emb_dims, 'init': 'auto', 'max_iter': 40, }
    # init: 'auto', 'pca', 'lda', 'identity', 'random'
    metrics["NCA"] = []
    for dim in emb_dims:
        embedder = sk.neighbors.NeighborhoodComponentsAnalysis(n_components=dim, init=args['init'],
                                                               max_iter=args['max_iter'],
                                                               random_state=random_seed)
        X_emb = embedder.fit_transform(X, y_categoric)
        print(f"Dimension {dim} done!")
        metrics["NCA"].append(_knn_test_error(X_emb, embedder.transform(X_test), args_sk_KNN))
        print("KNN done!")

    _plot_metric_curves({'NCA': metrics['NCA']},
                        "NCA Embedding 1-KNN Regression Score\n" + exp_name,
                        "1-KNN score on test set",
                        f"NCA_score_knn_{exp_name}.png")

else:
    # No stage left: raising stops "Run All" before the state file is updated
    # and before the kernel is restarted again, which ends the restart loop.
    raise ValueError(f"No pipeline stage is defined for state {current_state}")

ValueError: 

### Save gathered metrics

In [None]:
# Serialise first: if the metrics cannot be encoded, the error is raised
# before the existing JSON file is opened for writing (and truncated).
metrics_payload = json.dumps({'Emb_dims': emb_dims, 'Metrics': metrics})

# The JSON file where the output is stored; the context manager closes it
# even if the write fails.
with open(f"{save_stuff_path}/Metrics_{exp_name}.json", "w") as out_metrics:
    out_metrics.write(metrics_payload)

### If everything is fine until here, then update the state

In [None]:
# Record the stage that has just completed; the next run reads this value and
# continues with the following stage. The context manager guarantees the file
# is flushed and closed even if the write raises.
with open(f"state_{exp_name}.txt", "w") as state_file:
    state_file.write(str(current_state))

## Restart kernel and re-run all

In [None]:
# Restart the kernel and re-run every cell; on the next pass the state file is
# read again, so the following pipeline stage is the one that gets executed.
restart_run_all()

In [None]:
# To look at the noisy vs. non-noisy manifold: otherwise the noisy samples can act as Einstein-Rosen bridges between parts of the manifold.
# For building embedding spaces for KNN or FC, or for using it as a simulated metric (Euclidean-like), I am no longer sure...
# I suppose that if the embedding looks at the GTs then use noisy; if not (it only looks at the topology, so to speak) then better non-noisy to avoid bridges?