In [162]:
import os
import cv2
import keras
import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

#### Functions

In [163]:
def get_relative_file_paths(folder_path):
    
    """
    Gets a list of relative paths to all files within a given folder.

    Args:
        folder_path (str): The path to the folder.

    Returns:
        list: A list of relative file paths.
    """

    file_paths = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            file_paths.append(file_path)
    return file_paths

def train_kmeans(n_clusters, features, output_dir):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(features)
    joblib.dump(kmeans, output_dir)


#### Paths

In [14]:

IMAGE_DIR = "./output/cropped_cells_full"
TEST_DIR = "./output/cropped_cells"
KMEANS_HISTOGRAM = 'models/kmeans_histogram.pkl'
KMEANS_ENCODER = 'models/kmeans_encoder.pkl'
ENCODER_PATH = "./models/encoder.keras"

#### Load the train images

In [15]:
image_paths = sorted(get_relative_file_paths(IMAGE_DIR))
print(len(image_paths))
images = [cv2.imread(file, cv2.IMREAD_GRAYSCALE) for file in image_paths]

53642


#### Load the test images

In [86]:
test_paths = sorted(get_relative_file_paths(TEST_DIR))
print(len(test_paths))
test_images = [cv2.imread(file, cv2.IMREAD_GRAYSCALE) for file in test_paths]

116


### Use color histogram as features

In [152]:
# Load images and extract features (e.g., color histograms)
features = [cv2.calcHist(img, [0], None, [8], [0, 256]).flatten() for img in images]
features_array = np.array(features)#.reshape(-1, 1)
features_array = features_array/255
features_array.shape


(53642, 8)

In [158]:
train_kmeans(n_clusters=2, features=features_array, output_dir=KMEANS_HISTOGRAM)

array([1, 1, 1, ..., 0, 1, 0], dtype=int32)

#### Test color histogram 

In [166]:
img_n = 12
loaded_kmeans = joblib.load(KMEANS_HISTOGRAM)

feature = cv2.calcHist(test_images[img_n], [0], None, [8], [0, 256]).flatten()
feature = np.array(feature).reshape(1, -1)
feature = feature/255
labels = loaded_kmeans.predict(feature)

print(f"Image: {test_paths[img_n]} - Prediction: {labels[0]} - shape{feature.shape}")


Image: ./output/cropped_cells_full/001_00001_2.png - Prediction: 1 - shape(1, 8)


### Use encoder embeddings as features

In [27]:
#Transform input images for encoder input
images = [cv2.resize(image, (128, 128)) for image in images]
images = np.array(images)
images = images/255
images = [np.expand_dims(image, axis=(0, -1)) for image in images]

In [28]:
# Extract features from encoder
encoder = keras.saving.load_model(ENCODER_PATH)
predictions = []
for image in images:
    predictions.append(encoder.predict(image, verbose=0))

features = np.array(predictions)
features_array = features.reshape(features.shape[0], features.shape[2]) # Create a 2D array of features since features is 3D

#### Test encoder kmeans

In [None]:
img_n = 1
loaded_kmeans = joblib.load(KMEANS_HISTOGRAM)
encoder = keras.saving.load_model(ENCODER_PATH)

img = np.expand_dims(test_images[img_n], axis=(0, -1))
feature = loaded_kmeans.predict(encoder.predict(img, verbose=0))[0]
labels = loaded_kmeans.predict(feature)

print(f"Image: {test_paths[img_n]} - Prediction: {labels[0]}")

### Create datasets

In [170]:
import shutil

In [178]:
if not os.path.exists('./output/noise'):
    os.makedirs('./output/noise')

if not os.path.exists('./output/cells'):
    os.makedirs('./output/cells')

#### Get info from centroid distances

In [169]:
kmeans_loaded = joblib.load(KMEANS_HISTOGRAM)
centroids = kmeans_loaded.cluster_centers_
distances = []
for idx, feature in enumerate(features_array):
    distance_to_noise_centroid = np.linalg.norm(feature - centroids[0])
    distance_to_cell_centroid = np.linalg.norm(feature - centroids[1])

    distances.append([image_paths[idx], distance_to_noise_centroid, distance_to_cell_centroid])

df = pd.DataFrame(distances, columns=['Image', 'distance_to_noise_centroid', 'distance_to_cell_centroid'])
df.to_csv('./distances.csv')

print(f"Distances to noise centroid: ")
print(f"Min:  {df['distance_to_noise_centroid'].min()}")
print(f"Max:  {df['distance_to_noise_centroid'].max()}")
print(f"Mean: {df['distance_to_noise_centroid'].mean()}")
print()
print(f"Distances to cell centroid: ")
print(f"Min:  {df['distance_to_cell_centroid'].min()}")
print(f"Max:  {df['distance_to_cell_centroid'].max()}")
print(f"Mean: {df['distance_to_cell_centroid'].mean()}")

Distances to noise centroid: 
Min:  0.014769027940928936
Max:  0.8673118948936462
Mean: 0.4468424320220947

Distances to cell centroid: 
Min:  0.018689196556806564
Max:  0.8502339720726013
Mean: 0.36665523052215576


#### Create noise/cell dataset from predictions

In [19]:
for image in image_paths:
    img = cv2.imread(image)
    img_name = os.path.basename(image)
    prediction = cv2.calcHist([img], [0], None, [8], [0, 256]).flatten()
    prediction = np.array(prediction).reshape(1, -1)
    prediction = prediction/255
    cell = True if kmeans_loaded.predict(prediction)[0] == 1 else False
    folder = "cells" if cell else "noise"
    shutil.copyfile(image, f"./output/{folder}/{img_name}")

#### Create new noise/cell datasets choosing by centroid distance

In [179]:
cell_threshold  = 0.36
noise_threshold = 0.16
for distance in distances:
    img_name = os.path.basename(distance[0])

    #distance[2] is the distance to the cell centroid and distance[1] to the noise centroid
    cell = True if distance[2] <= cell_threshold and distance[1] >= noise_threshold else False

    folder = "cells" if cell else "noise"
    shutil.copyfile(distance[0], f"./output/{folder}/{img_name}")
