## imports

In [None]:
import pickle
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import cv2
import numpy as np

# Part 1

## load dataset

In [None]:
def load_dataset(image_pickle_file_path='images.pkl', label_pickle_file_path='label.pkl'):
    """Load the pickled images and labels from disk.

    Args:
        image_pickle_file_path: Path of the pickle file holding the images.
            Defaults to ``'images.pkl'`` in the current working directory.
        label_pickle_file_path: Path of the pickle file holding the labels.
            Defaults to ``'label.pkl'`` in the current working directory.

    Returns:
        An ``(images, labels)`` tuple holding the two unpickled objects
        exactly as they were stored.

    Raises:
        OSError: If either file cannot be opened.
    """
    # SECURITY: pickle.load can run arbitrary code while deserialising, so
    # only point this function at files that come from a trusted source.
    with open(image_pickle_file_path, 'rb') as file:
        images = pickle.load(file)

    with open(label_pickle_file_path, 'rb') as file:
        labels = pickle.load(file)

    return images, labels

In [None]:
# Number of images randomly sampled from the dataset for this experiment.
n_images = 30

In [None]:
# Load the full dataset and draw `n_images` distinct samples from it.
images, labels = load_dataset()
labels = np.array(labels)
# Sample from however many images were actually loaded instead of assuming a
# fixed dataset size, so a smaller dataset cannot be indexed out of range and
# a larger one is not partly ignored.
random_indices = np.random.choice(len(images), size=n_images, replace=False)
# Index both containers with the same indices so images and labels stay paired.
# NOTE(review): indexing with an index array requires `images` to be a NumPy
# array - confirm that is what the pickle holds.
random_values = images[random_indices]
random_labels = labels[random_indices]
from dataclasses import dataclass, field


@dataclass
class Images:
    """Container for one image together with the data derived from it.

    Every attribute is a dataclass field, so each instance owns its own
    lists; previously the two lists were plain class attributes and were
    therefore shared by all instances.
    """

    # Pixel data of the image; None until assigned.
    image: np.ndarray = None
    # Feature lists start empty and are created per instance.
    original_features: list = field(default_factory=list)
    features: list = field(default_factory=list)
    # Clustering result for the image; None until assigned.
    clusters: object = None

## Process images
- Extract features
- Normalize features
- Make the features a single-dimension vector

In [None]:
def extract_features(image):
    """Describe every pixel of `image` by its HSV colour and its position.

    The image is converted with ``cv2.COLOR_BGR2HSV`` and its pixels are
    visited row by row.

    Args:
        image: Image array accepted by ``cv2.cvtColor`` for a BGR -> HSV
            conversion.

    Returns:
        A list with one ``(h, s, v, x, y)`` tuple per pixel, where ``x`` is
        the row index and ``y`` the column index of the pixel.
    """
    img_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    n_rows, n_cols = img_hsv.shape[0], img_hsv.shape[1]

    # Unpacking the pixel yields its three channel values in h, s, v order.
    return [
        (*img_hsv[row, col], row, col)
        for row in range(n_rows)
        for col in range(n_cols)
    ]

In [None]:
def normalize_features(features):
    """Rescale the feature columns with a freshly fitted ``MinMaxScaler``.

    Args:
        features: Sequence of equally long feature rows.

    Returns:
        The scaled rows as a nested list. The scaler is created with its
        default settings and fitted on `features` itself.
    """
    return MinMaxScaler().fit_transform(np.array(features)).tolist()

In [None]:
# Per-image pixel features: new_features[k] belongs to random_values[k].
new_features = [extract_features(img) for img in random_values]

In [None]:
# Normalize each image's features on their own (one scaler fit per image),
# replacing the entries of the existing list.
new_features[:] = [normalize_features(image_features) for image_features in new_features]

In [None]:
def set_priority(arr, color, pos):
    """Weight colour against position by repeating feature columns.

    Args:
        arr: 2-D array whose first three columns are the colour features
            and whose remaining columns are the position features.
        color: How many times the first three columns are repeated.
        pos: How many times the remaining columns are repeated.

    Returns:
        A 2-D array with the repeated colour columns followed by the
        repeated position columns.
    """
    repeated_blocks = (
        np.tile(arr[:, :3], (1, color)),
        np.tile(arr[:, 3:], (1, pos)),
    )
    return np.concatenate(repeated_blocks, axis=1)

In [None]:
# Repeat the colour columns 4 times and the position columns 2 times for
# every image before clustering.
new_features_freq = [
    set_priority(np.array(image_features), 4, 2).tolist()
    for image_features in new_features
]

## clustering

### k-means

In [None]:
# Cluster the pixels of every image separately into k_number groups. The
# same estimator object is refitted from scratch by each fit_predict call.
k_number = 15
kmeans = KMeans(n_clusters=k_number)
new_clusters = [kmeans.fit_predict(pixel_vectors) for pixel_vectors in new_features_freq]

In [None]:
# Turn each flat label vector back into a (rows, cols) label map, using the
# second and third dimensions of the image stack as the image size.
new_clusters[:] = [
    flat_labels.reshape(random_values.shape[1], random_values.shape[2])
    for flat_labels in new_clusters
]

## extracting cluster features for each image separately

In [None]:
def find_matching_triads(array1, array2):
    """Group the rows of `array2` by the value found at the same index in `array1`.

    Args:
        array1: 1-D array of group values, one per row of `array2`.
        array2: 2-D array whose rows are to be grouped.

    Returns:
        A list with one entry per distinct value of `array1`, in the order
        given by ``np.unique``; each entry is the list of `array2` rows
        whose index carries that value.
    """
    return [
        array2[np.where(array1 == group_value)[0], :].tolist()
        for group_value in np.unique(array1)
    ]

### mean color for each cluster in each image

In [None]:
def calculate_mean(data):
    """Return the column-wise mean of `data` as a list of integers.

    The means are computed along the first axis and then converted with
    ``astype(int)``, which discards the fractional part.
    """
    column_means = np.array(data).mean(axis=0)
    return column_means.astype(int).tolist()

In [None]:
def mean_clusters_each_image(new_cls, random_values):
    """Compute the mean pixel value of every cluster of every image.

    Args:
        new_cls: Sequence of 2-D label maps, one per image.
        random_values: 4-D image stack; image ``i`` is ``random_values[i]``
            and the last dimension holds the channel values.

    Returns:
        A list with one entry per image; each entry is the list of integer
        mean values (see ``calculate_mean``) of that image's clusters, in
        the order produced by ``find_matching_triads``.
    """
    all_images_mean_values = []

    for image_index, label_map in enumerate(new_cls):
        # Flatten labels and pixels the same way so index k refers to the
        # same pixel in both.
        flat_labels = label_map.reshape(label_map.shape[0] * label_map.shape[1])
        flat_pixels = random_values[image_index].reshape(
            random_values.shape[1] * random_values.shape[2], random_values.shape[3])

        pixel_groups = find_matching_triads(flat_labels, flat_pixels)
        all_images_mean_values.append([calculate_mean(group) for group in pixel_groups])

    return all_images_mean_values

In [None]:
# mean_each_cluster_each_image = mean_clusters_each_image(new_clusters, random_values)

## display clustering

In [None]:
def display_clustering(image_clusters, each_cluster_color, image_index):
    """Show an image next to a colour-coded view of its cluster labels.

    Args:
        image_clusters: 2-D integer array holding one cluster label per
            pixel.
        each_cluster_color: Sequence of colour triplets indexed by cluster
            label. They are handled as RGB: the rendered picture is
            converted with ``cv2.COLOR_RGB2BGR`` before it is shown.
        image_index: Index of the original image inside the module-level
            ``random_values``.

    Opens two OpenCV windows, waits for a key press and then closes all
    windows.
    """
    # Look every label up in the palette in one vectorised step instead of
    # assigning the colours pixel by pixel in Python.
    palette = np.asarray(each_cluster_color, dtype=np.uint8)
    rgb_image = palette[image_clusters]

    cv2.imshow('org', random_values[image_index])
    cv2.imshow('Image with Corresponding RGB Values', cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR))

    cv2.waitKey(0)
    cv2.destroyAllWindows()

In [None]:
# Display palette: one colour triplet per cluster label (15 entries). The
# display functions convert the rendered picture with cv2.COLOR_RGB2BGR, so
# the triplets are given in RGB order.
each_cluster_color = [
    [255, 0, 0],    # Red
    [0, 255, 0],    # Green
    [0, 0, 255],    # Blue
    [255, 255, 0],  # Yellow
    [255, 0, 255],  # Magenta
    [0, 255, 255],  # Cyan
    [128, 0, 0],    # Maroon
    [0, 128, 0],    # Green (Dark)
    [0, 0, 128],    # Navy
    [128, 128, 128], # Gray
    [255, 128, 0],  # Orange
    [128, 0, 128],  # Purple
    [0, 128, 128],  # Teal
    [128, 128, 0],  # Olive
    [192, 192, 192]  # Silver
]

In [None]:
# for i, img_clusters in enumerate(new_clusters[:2]):
#     display_clustering(img_clusters, each_cluster_color, i)

# Part 2

## create clusters feature vectors 

In [None]:
def extract_cluster_features(image_number, image_clusters, arr_new_features, k_number):
    """Summarise every cluster of one image as a 10-value feature vector.

    Args:
        image_number: Index of the image inside `arr_new_features`.
        image_clusters: 1-D sequence with the cluster label of every pixel
            of that image.
        arr_new_features: Array indexed as ``[image, pixel, feature]`` whose
            feature columns are ``h, s, v, x, y``.
        k_number: Number of clusters; labels must lie in ``range(k_number)``.

    Returns:
        Array of shape ``(k_number, 10)``. Row ``c`` describes cluster ``c``
        as ``[min h, s, v, max h, s, v, mean h, s, v, x extent / y extent]``.
        The last value is ``0.0`` when the cluster has no extent along y, so
        the result never contains ``inf`` or ``nan`` from a division by zero.

    Raises:
        IndexError: If a label is not smaller than `k_number`.
        ValueError: If a cluster has no pixels at all.
    """
    image_clusters = np.asarray(image_clusters)
    image_features = np.asarray(arr_new_features[image_number])

    if image_clusters.size and image_clusters.max() >= k_number:
        raise IndexError(
            f"cluster label {image_clusters.max()} is out of range for k_number={k_number}")

    clusters_features = []
    for cluster_id in range(k_number):
        # Boolean masking keeps the pixels in their original order.
        members = image_features[image_clusters == cluster_id]
        if members.shape[0] == 0:
            raise ValueError(
                f"cluster {cluster_id} of image {image_number} contains no pixels")

        min_value = members.min(axis=0)
        max_value = members.max(axis=0)
        mean_value = members.mean(axis=0)

        x_field = max_value[3] - min_value[3]
        y_field = max_value[4] - min_value[4]
        # A cluster lying on a single y value has no defined aspect ratio;
        # report 0.0 rather than dividing by zero.
        shape_field = np.array([x_field / y_field if y_field != 0 else 0.0])

        clusters_features.append(
            np.concatenate((min_value[:3], max_value[:3], mean_value[:3], shape_field), axis=None))

    return np.array(clusters_features)

In [None]:
# Build the per-cluster feature matrix of every image. The label maps are
# flattened so that label k lines up with pixel k of the feature array.
arr_new_features = np.array(new_features)
all_images_clusters_features = [
    extract_cluster_features(
        image_num,
        image_clusters.reshape(image_clusters.shape[0] * image_clusters.shape[1]),
        arr_new_features,
        k_number,
    )
    for image_num, image_clusters in enumerate(new_clusters)
]

## set priority for clusters features

In [None]:
# Repetition counts for set_priority_clusters_vectors; in this file they are
# only referenced from the commented-out weighting cell below.
color_pri = 5
pos_pri = 6

In [None]:
def set_priority_clusters_vectors(arr, color, pos):
    """Weight the parts of cluster feature vectors by repeating columns.

    Args:
        arr: 2-D array of cluster feature vectors.
        color: How many times the first nine columns are repeated.
        pos: How many times the columns from index nine onwards are
            repeated.

    Returns:
        A 2-D array with the repeated first nine columns followed by the
        repeated remaining columns.
    """
    repeated_blocks = (
        np.tile(arr[:, :9], (1, color)),
        np.tile(arr[:, 9:], (1, pos)),
    )
    return np.concatenate(repeated_blocks, axis=1)

In [None]:
# all_images_clusters_features_freq = []
# for nf in all_images_clusters_features:
#     all_images_clusters_features_freq.\
#             append(set_priority_clusters_vectors(np.array(nf), color_pri, pos_pri).tolist())

In [None]:
# all_images_clusters_features_freq

## clustering clusters

### k-means

In [None]:
# k_number_2 = 100
# kmeans_2 = KMeans(n_clusters=k_number_2)
# arr_all_images_clusters_features_freq = np.array(all_images_clusters_features_freq)
# arr_all_images_clusters_features_freq = arr_all_images_clusters_features_freq.reshape(\
#                                             arr_all_images_clusters_features_freq.shape[0] *\
#                                             arr_all_images_clusters_features_freq.shape[1],
#                                             arr_all_images_clusters_features_freq.shape[2])

# clusters2 = kmeans_2.fit_predict(arr_all_images_clusters_features_freq)

In [None]:
# clusters2_arr = clusters2.reshape(np.array(all_images_clusters_features_freq).shape[0], np.array(all_images_clusters_features_freq).shape[1])
# clusters2_arr

In [None]:
def count_numbers(lst, k_number_2):
    """Build a histogram of cluster labels.

    Args:
        lst: Iterable of integer labels.
        k_number_2: Number of bins; valid labels are ``0 .. k_number_2 - 1``.

    Returns:
        A list of `k_number_2` integers where entry ``i`` is the number of
        times label ``i`` occurs in `lst`. Labels outside the valid range
        are ignored.
    """
    count_list = [0] * k_number_2

    for num in lst:
        # Bin index equals the label itself, so every one of the
        # k_number_2 bins can be reached and label 0 is counted too.
        if 0 <= num < k_number_2:
            count_list[num] += 1

    return count_list

In [None]:
# histogram = np.empty((n_images, k_number_2))
# for i, img_cls in enumerate(clusters2_arr):
#     histogram[i] = count_numbers(img_cls, k_number_2)

# histogram

## classify

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def classify(datapoints, labels):
    """Train a random forest on an 80/20 split and report its predictions.

    Args:
        datapoints: Sequence of feature vectors, one per sample.
        labels: Sequence of class labels aligned with `datapoints`.

    Returns:
        A list ``[accuracy, true_lables, y_pred, y_prob]``:

        * ``accuracy`` - accuracy on the held-out test samples.
        * ``true_lables`` - the labels of all samples, in the same order as
          `datapoints`.
        * ``y_pred`` - predicted labels of the test samples only.
        * ``y_prob`` - class-probability rows for all samples (train and
          test); row ``i`` belongs to ``datapoints[i]``.

        ``true_lables`` and ``y_prob`` used to be returned in the shuffled
        test-then-train order, so a caller indexing them by sample number
        read the values of a different sample.
    """
    test_size = 0.2
    true_lables = np.array(labels)
    sample_indices = np.arange(len(true_lables))

    # Splitting the index vector together with the data records which
    # original sample ended up in which part of the split.
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        datapoints, true_lables, sample_indices,
        test_size=test_size, random_state=42)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Scatter the probability rows back to the original sample positions.
    y_prob = np.empty((len(true_lables), len(clf.classes_)))
    y_prob[idx_test] = clf.predict_proba(X_test)
    y_prob[idx_train] = clf.predict_proba(X_train)

    return [accuracy, true_lables, y_pred, y_prob]

# Phase 2

## create images feature vectors

### create images feature vectors without removing clusters

In [None]:
def calculate_mean_imgs_clstrs_features():
    """Build one feature vector per image by averaging its cluster features.

    Reads the module-level ``all_images_clusters_features``.

    Returns:
        A list holding, for every image, the column-wise mean of that
        image's cluster feature array.
    """
    return [np.mean(cluster_rows, axis=0) for cluster_rows in all_images_clusters_features]

In [None]:
# histograms = calculate_mean_imgs_clstrs_features()

### create images feature vectors with removing clusters

In [None]:
# Working copies used by the cluster-removal experiments below. Note that the
# first assignment binds the same list object, it does not copy it.
all_images_clusters_features_bak = all_images_clusters_features
k_number_bak = k_number

In [None]:
def apply_back_ups():
    """Reset the module-level working copies to the full clustering result.

    Rebinds ``all_images_clusters_features_bak`` to
    ``all_images_clusters_features`` and ``k_number_bak`` to ``k_number``.
    """
    # Without the global declaration the assignments would only create
    # locals and the function would have no effect.
    global all_images_clusters_features_bak, k_number_bak
    all_images_clusters_features_bak = all_images_clusters_features
    k_number_bak = k_number

In [None]:
def calculate_mean_imgs_clstrs_except_index(index_, img_num, remove):
    """Average each image's cluster features, optionally skipping one cluster.

    Reads the module-level ``all_images_clusters_features_bak``.

    Args:
        index_: Row of the cluster to leave out.
        img_num: Index of the image the cluster is left out of.
        remove: When false, nothing is left out and `index_` and `img_num`
            have no effect.

    Returns:
        A list with the column-wise mean of every image's cluster feature
        rows; for image `img_num` row `index_` is excluded when `remove` is
        true.
    """
    histograms = []
    for position, cluster_rows in enumerate(all_images_clusters_features_bak):
        if remove and position == img_num:
            # Join the rows before and after the excluded one.
            cluster_rows = np.concatenate((cluster_rows[:index_], cluster_rows[index_ + 1:]))
        histograms.append(np.mean(cluster_rows, axis=0))

    return histograms

In [None]:
def cal_classify_results(img_num, remove):
    """Run the classifier once per cluster position of an image.

    For every cluster position in ``range(k_number_bak)`` the image feature
    vectors are rebuilt (leaving that cluster out of image `img_num` when
    `remove` is true) and passed to ``classify`` with ``random_labels``.

    Returns:
        A list with one ``classify`` result per cluster position.
    """
    return [
        classify(
            calculate_mean_imgs_clstrs_except_index(index_=clstr_num, img_num=img_num, remove=remove),
            random_labels,
        )
        for clstr_num in range(k_number_bak)
    ]

In [None]:
# classify_results = cal_classify_results(1, remove=True)

## gather the labels probability of each image

In [None]:
def cal_probs(classify_results):
    """Regroup the probability rows of several classify results by row index.

    Args:
        classify_results: Non-empty sequence of results whose element at
            index 3 is a sequence of probability rows.

    Returns:
        A list with one entry per probability row of the first result;
        entry ``i`` collects row ``i`` of every result, in result order.
    """
    row_count = len(classify_results[0][3])
    probs = [[] for _ in range(row_count)]

    for single_result in classify_results:
        probability_rows = single_result[3]
        for row_index, row in enumerate(probability_rows):
            probs[row_index].append(row)

    return probs

In [None]:
# probs = cal_probs(classify_results)

## check the label probabilities of each image

In [None]:
def show_probs(probs, classify_results):
    """Print every probability table followed by a classification summary.

    For each index ``i`` this prints ``probs[i]`` as an array, then the
    accuracy, true labels and predicted labels stored in
    ``classify_results[i]``, then a separator line.
    """
    # NOTE(review): when `probs` comes from cal_probs it has one entry per
    # probability row, while `classify_results` has one entry per
    # classification run, yet both are indexed with the same counter -
    # confirm the two are really meant to line up.
    separator = "---------------------------------"
    for position, table in enumerate(probs):
        print(np.array(table))
        result = classify_results[position]
        print(f"acc={result[0]}, true labels={result[1]}, predicted labels={result[2]}")
        print(separator)

In [None]:
# show_probs(probs, classify_results)

## determine clusters importance

In [None]:
# One dict per image mapping cluster id -> importance (None until measured).
# The ids come from k_number instead of a literal 15, so the table follows
# the clustering configuration.
default_images_clusters_importance = [
    {cluster_id: None for cluster_id in range(k_number)}
    for _ in range(n_images)
]

In [None]:
def check_clusters_importance(default_images_clusters_importance):
    """Measure how removing each cluster changes the true-class probability.

    For every image, the probability the classifier gives to the image's
    true label is compared between a baseline run (no cluster removed) and
    runs in which one cluster of that image is left out.

    Args:
        default_images_clusters_importance: List with one dict per image
            whose keys are cluster ids. It is only read; neither the list
            nor its dicts are modified.

    Returns:
        A new list with exactly one dict per image. Each dict maps cluster
        id -> (probability after removing that cluster - baseline
        probability) and is ordered ascending by that value, so its first
        key is the least important cluster. Previously the results were
        appended to the caller's list, which doubled its length and left
        the unsorted input dicts in front.
    """
    # With remove=False no cluster is left out of any image, so the baseline
    # does not depend on the image; compute it once instead of per image.
    baseline_results = cal_classify_results(0, remove=False)
    baseline_probs = cal_probs(baseline_results)

    images_clusters_importance = []
    for img in range(n_images):
        # Copy so the caller's dict keeps its contents.
        effect_of_cluster_removal = dict(default_images_clusters_importance[img])
        # Removal run i leaves out feature row i. Assuming the keys are the
        # clusters still present, row i belongs to the i-th smallest key.
        clusters = sorted(effect_of_cluster_removal)

        probs_remove = cal_probs(cal_classify_results(img, remove=True))

        true_label = baseline_results[0][1][img]
        # NOTE(review): the label is used directly as a probability column
        # index, which presumably requires labels 0..n_classes-1 - confirm.
        class_probability = baseline_probs[img][0][true_label]

        for i, cls in enumerate(clusters):
            class_probability_after_remove = probs_remove[img][i][true_label]
            effect_of_cluster_removal[cls] = class_probability_after_remove - class_probability

        # Ascending order: the first element has the least importance.
        importance_of_clusters = dict(
            sorted(effect_of_cluster_removal.items(), key=lambda item: item[1]))
        images_clusters_importance.append(importance_of_clusters)

    return images_clusters_importance
            

## remove least important cluster from each image

In [None]:
def remove_least_important_clusters(images_clusters_importance):
    """Drop the least important remaining cluster from every image.

    Reads the module-level ``all_images_clusters_features_bak``.

    Args:
        images_clusters_importance: List with one dict per image mapping
            cluster id -> importance, ordered so that the first key is the
            least important cluster. Each dict is updated in place: the key
            of the removed cluster is deleted, so the dict keeps describing
            exactly the clusters that are left.

    Returns:
        A new list holding, per image, its cluster feature array without
        the row of the removed cluster.

    Raises:
        IndexError: If one of the dicts is empty.
    """
    modified_all_images_clusters_features = []
    for i, img_clstr in enumerate(images_clusters_importance):
        least_important_clstr = list(img_clstr)[0]
        # Feature rows are kept in ascending cluster-id order, so once some
        # clusters are gone the row position differs from the id itself.
        row_position = sorted(img_clstr).index(least_important_clstr)
        modified_all_images_clusters_features.append(
            np.delete(all_images_clusters_features_bak[i], row_position, axis=0))
        del img_clstr[least_important_clstr]

    return modified_all_images_clusters_features

In [None]:
# images_clusters_importance = check_clusters_importance()
# images_clusters_importance

In [None]:
# all_images_clusters_features_bak = remove_least_important_clusters(images_clusters_importance)
# k_number_bak -= 1
# all_images_clusters_features_bak

## remove clusters based on importance

In [None]:
# Intended to reset the working copies before the removal rounds start.
apply_back_ups()

In [None]:
# Run k_number - 1 removal rounds. Statement order matters: each round
# measures cluster importance, feeds the result back in as the next round's
# input, drops one cluster per image and then lowers the working cluster count.
for i in range(k_number - 1):
    print(i, "----------------------------------------------------")
    images_clusters_importance = check_clusters_importance(default_images_clusters_importance)
    # The result of this round becomes the input of the next one.
    default_images_clusters_importance = images_clusters_importance
    print(np.array(images_clusters_importance[0]))
    all_images_clusters_features_bak = remove_least_important_clusters(images_clusters_importance)
    print(np.array(all_images_clusters_features_bak[0]))
    # One cluster fewer per image from here on.
    k_number_bak -= 1
    print("-------------------------------------------------------")

In [None]:
images_clusters_importance

In [None]:
import numpy as np
import cv2

def display_selected_clusters(image_clusters, each_cluster_color, selected_clusters, image_index):
    """Show an image next to a view in which only chosen clusters are coloured.

    Args:
        image_clusters: 2-D integer array holding one cluster label per
            pixel.
        each_cluster_color: Sequence of colour triplets indexed by cluster
            label. They are handled as RGB: the rendered picture is
            converted with ``cv2.COLOR_RGB2BGR`` before it is shown.
        selected_clusters: Collection of cluster labels to colour; pixels
            of every other cluster stay black.
        image_index: Index of the original image inside the module-level
            ``random_values``.

    Opens two OpenCV windows, waits for a key press and then closes all
    windows.
    """
    palette = np.asarray(each_cluster_color, dtype=np.uint8)
    rgb_image = np.zeros((image_clusters.shape[0], image_clusters.shape[1], 3), dtype=np.uint8)

    # Colour all selected pixels in one vectorised step. list() makes sets
    # and other collections safe to hand to np.isin.
    selected_mask = np.isin(image_clusters, list(selected_clusters))
    rgb_image[selected_mask] = palette[image_clusters[selected_mask]]

    cv2.imshow('org', random_values[image_index])
    cv2.imshow('Image with Selected Clusters', cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR))

    cv2.waitKey(0)
    cv2.destroyAllWindows()


In [None]:
selected_clusters = [5]  # Specify the clusters you want to display
# Show the selected clusters of the sampled image at index 1 next to the original.
display_selected_clusters(new_clusters[1], each_cluster_color, selected_clusters, 1)