# labeled_dataset

### Imports

In [None]:
import os
import cv2
import keras
import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import os
from tqdm.contrib.concurrent import process_map
from matplotlib import pyplot as plt
from tensorflow.keras.applications import VGG16, VGG19, ResNet50, InceptionV3, DenseNet121, MobileNetV2
from tensorflow.keras.layers import Dense, Flatten, BatchNormalization
from tensorflow.keras.models import Model
import shutil

### Definitions

In [None]:
# Feature extractor and clustering method whose clusters are being labelled.
# OUTPUT_PATH interpolates both names, so they must exist before it is built.
# The values match the output directory name recorded in this notebook's saved
# output ("AutoEncoder_agglomerative_v0"); change them when labelling another
# run. MODEL is also read by load_image to decide on grayscale loading.
MODEL = "AutoEncoder"
METHOD = "agglomerative"

# NOTE(review): MODEL_PATH is not read by any visible cell; the clustering is
# loaded from 'kmeans_model.pkl' instead. Confirm which path is intended.
MODEL_PATH = "../../models/clustering_model.pkl"
# Folder holding the cropped cell images to be labelled.
IMAGE_PATH = "/content/drive/My Drive/Investigacion/UTN/GIAR/Dataset/cropped_cells_original"
# Only the first two entries are used in this notebook (as the resize target).
INPUT_SHAPE = (128, 128, 3)
# Root of the generated train/validation dataset; it is deleted and rebuilt
# by the dataset generation cell.
OUTPUT_PATH = f"/content/drive/My Drive/Investigacion/UTN/GIAR/Dataset/clustering/{MODEL}_{METHOD}_v0"

### Functions

In [None]:
def get_relative_file_paths(folder_path):
    """
    Collect the path of every file found under a folder, recursively.

    Each returned path is the walked directory joined with the file name, so
    it starts with ``folder_path`` exactly as given (it is relative only when
    ``folder_path`` itself is relative).

    Args:
        folder_path (str): The folder to walk.

    Returns:
        list: One path per file, in ``os.walk`` traversal order.
    """
    return [
        os.path.join(current_dir, file_name)
        for current_dir, _subdirs, file_names in os.walk(folder_path)
        for file_name in file_names
    ]

def load_image(x):
    """
    Read one image file with OpenCV.

    The image is read as grayscale when the global ``MODEL`` equals
    "AutoEncoder"; otherwise OpenCV's default read mode is used.

    Args:
        x (str): Path of the image file.

    Returns:
        Whatever ``cv2.imread`` returns for that path.
    """
    # Pass the grayscale flag only for the auto-encoder; otherwise call
    # imread with the path alone so its default flag applies.
    read_flags = (cv2.IMREAD_GRAYSCALE,) if MODEL == "AutoEncoder" else ()
    return cv2.imread(x, *read_flags)

#### Get cluster representatives

In [None]:
# Later cells index image_paths / resized_images with the positions of the
# clustering labels, so this order presumably has to match the order used
# when the clustering was fitted — TODO confirm.
image_paths = get_relative_file_paths(IMAGE_PATH)
image_paths.sort()

# Read every file in parallel worker processes (with a progress bar).
images = process_map(
    load_image, image_paths, total=len(image_paths), max_workers=16, chunksize=32
)

# Bring every image to the first two entries of INPUT_SHAPE and stack them
# into a single array.
target_size = INPUT_SHAPE[:2]
resized_images = np.array([cv2.resize(img, target_size) for img in images])

In [None]:
# NOTE(review): joblib.load unpickles the file, so only load models you trust.
# The path here differs from MODEL_PATH defined above — confirm which one is
# intended.
clustering_loaded = joblib.load('kmeans_model.pkl')
seleccted_class = clustering_loaded.labels_

# Negative labels are skipped when collecting representatives, so they are
# not counted as clusters. Sizing by the largest label (rather than by the
# number of distinct labels) keeps every label a valid index into rep_images.
assigned_labels = seleccted_class[seleccted_class >= 0]
NUM_CLUSTERS = int(assigned_labels.max()) + 1 if assigned_labels.size else 0
GRID_SIZE = 8
NUM_SHOW = GRID_SIZE*GRID_SIZE

# Keep the first NUM_SHOW images of every cluster. A single pass over the
# labels is enough; repeating the pass would append the same images again to
# any cluster that has fewer than NUM_SHOW members.
rep_images = [[] for _ in range(NUM_CLUSTERS)]
for idx, label in enumerate(seleccted_class):
    if label < 0 or len(rep_images[label]) >= NUM_SHOW:
        continue
    rep_images[label].append(resized_images[idx])

In [None]:
# Draw one figure per cluster: a GRID_SIZE x GRID_SIZE grid with the
# representative images collected for that cluster.
for rep_idx, rep_list in enumerate(rep_images):

    plt.figure()
    # Subplot positions are 1-based.
    for n, image in enumerate(rep_list, start=1):
        # Colour images read with cv2.imread are in BGR channel order, while
        # imshow interprets 3-channel arrays as RGB; convert so colours are
        # displayed correctly. Single-channel images are shown unchanged.
        if image.ndim == 3 and image.shape[2] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        plt.subplot(GRID_SIZE, GRID_SIZE, n)
        plt.imshow(image)
        plt.axis(False)
    plt.suptitle(f"Cluster {rep_idx}")
    plt.tight_layout()

    plt.show()

### Dataset generation

In [None]:
# Hand-picked cluster ids. Images whose cluster is in CELL_CLUSTERS are copied
# to the "cells" folder and those in NOT_CLUSTERS to the "not" folder; images
# from any other cluster are left out of the generated dataset.
CELL_CLUSTERS = [2, 5, 13]
NOT_CLUSTERS = [1, 6, 7, 14]

In [None]:
# Fraction of the realizations of each base that goes to the training split.
SPLIT = 0.7

# Group the distinct realization ids by base name. File names are split on
# "_": the first field is the base and the second is the realization.
realization_samples = dict()
for file in os.listdir(IMAGE_PATH):
    base, realiz, *_ = file.split("_")
    realization_samples.setdefault(base, set()).add(realiz)


# Split per base at realization level: the dataset generation cell sends a
# file to "train" when its realization is listed in train_samples[base] and
# to "validation" otherwise.
train_samples = dict()
validation_samples = dict()
for key, realizations in realization_samples.items():
    # Sorted so the candidate order does not depend on set iteration order.
    candidates = sorted(realizations)
    images_here = len(candidates)

    train_images_here = int(np.floor(images_here*SPLIT))
    this_train_sample = np.random.choice(candidates, train_images_here, replace=False)
    # A plain set gives exact, constant-time membership tests.
    chosen = set(this_train_sample.tolist())
    train_samples[key] = this_train_sample
    validation_samples[key] = [a for a in candidates if a not in chosen]

print("Train:")
for key, chosen_realizations in train_samples.items():
    print(key, len(chosen_realizations))
print("Validation:")
for key, held_out_realizations in validation_samples.items():
    print(key, len(held_out_realizations))

In [None]:
# Maximum number of files copied into each <split>/<class> folder.
max_class = 3500

# Random Shuffle: the same permutation is applied to labels and paths so they
# stay aligned.
indices = np.arange(seleccted_class.shape[0])
np.random.shuffle(indices)
seleccted_class_shuffle = seleccted_class[indices]
image_paths_shuffle = np.array(image_paths)[indices]

# Start from an empty output tree. WARNING: this deletes any dataset
# previously generated at OUTPUT_PATH.
if os.path.exists(OUTPUT_PATH):
    shutil.rmtree(OUTPUT_PATH)

# Track the names copied into each destination folder in memory. The folders
# start empty, so the size of each set equals the number of files in the
# folder, without listing the directory again for every candidate image.
copied_names = {}
for split in ("train", "validation"):
    for folder in ("not", "cells"):
        os.makedirs(os.path.join(OUTPUT_PATH, split, folder))
        copied_names[(split, folder)] = set()

for idx, cluster in enumerate(seleccted_class_shuffle):
    # Negative labels are not assigned to any cluster.
    if cluster < 0:
        continue

    if cluster in CELL_CLUSTERS:
        folder = "cells"
    elif cluster in NOT_CLUSTERS:
        folder = "not"
    else:
        # Cluster was not picked for either class.
        continue

    file = image_paths_shuffle[idx]
    img_name = os.path.basename(file)
    base, realiz, *_ = img_name.split("_")

    # The split is decided per realization, so every file of one realization
    # ends up in the same split.
    split = "train" if realiz in train_samples[base] else "validation"

    names_here = copied_names[(split, folder)]
    if len(names_here) >= max_class:
        continue

    shutil.copyfile(file, os.path.join(OUTPUT_PATH, split, folder, img_name))
    names_here.add(img_name)

In [None]:
# Show the last path component of the dataset folder that was just generated.
print(os.path.split(OUTPUT_PATH)[1])

AutoEncoder_agglomerative_v0


In [None]:
# Report how many files ended up in each folder, in the order
# train/not, train/cells, validation/not, validation/cells.
for split_name in ("train", "validation"):
    for class_name in ("not", "cells"):
        class_dir = os.path.join(OUTPUT_PATH, split_name, class_name)
        print(len(os.listdir(class_dir)))

3500
3500
3500
2714
