In [None]:
# from google.colab import drive
# drive.mount('/content/drive')  # use this if you are using google colab

In [108]:
from sklearn.cluster import KMeans
import pickle, os
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
def load_dataset(name_file):

    desired_directory = '.' # Replace with your desired directory path

    file_path = os.path.join(desired_directory, name_file) 

    with open(file_path, 'rb') as f:
        data_dict = pickle.load(f)

    return data_dict


In [3]:
loaders_dict = load_dataset(f"dataset-flowers102-features.pkl")
x_train = loaders_dict["x_train"]
x_test = loaders_dict["x_test"]
y_train = loaders_dict["y_train"]
y_test = loaders_dict["y_test"]
print(f"x_train:{x_train.shape}, y_train:{y_train.shape}")
print(f"x_test:{x_test.shape}, y_test:{y_test.shape}")


x_train:(4094, 512), y_train:(4094,)
x_test:(4095, 512), y_test:(4095,)


## clustering images

In [86]:
k_number = 50
kmeans = KMeans(n_clusters=k_number)
clusters = kmeans.fit_predict(x_train)
centroids = kmeans.cluster_centers_

  super()._check_params_vs_input(X, default_n_init=10)


In [87]:
# centroid_distances = [distance.euclidean(x_test[0], centroid) for centroid in centroids]
# centroid_distances

## find nearest clusters neighbors

### find nearest clusters and their indices

In [88]:
def find_nearest_clusters_neighbors(x_train, x_test, k):
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(x_train, range(len(x_train)))
    nearest_indices = knn_classifier.kneighbors([x_test], n_neighbors=k, return_distance=False)[0]
    nearest_neighbors = x_train[nearest_indices]
  
    return nearest_indices, nearest_neighbors

In [138]:
n_nearest_neighbors = 3
x_train_clusters = centroids
nearest_clusters_indices, nearest_clusters_neighbors = find_nearest_clusters_neighbors(x_train_clusters, 
                                                                                       x_test[1], 
                                                                                       k=n_nearest_neighbors)

In [139]:
print(f"clusters Indices:")
print(nearest_clusters_indices)
print(f"nearest neighbor clusters centroids:")
print(nearest_clusters_neighbors)

clusters Indices:
[48  9  5]
nearest neighbor clusters centroids:
[[0.39521152 0.28513736 0.7226784  ... 1.7530988  1.626411   1.7072948 ]
 [1.2427926  0.72997695 0.2645297  ... 1.3031917  0.29992753 0.38164878]
 [1.4638481  1.3813342  0.58729076 ... 1.1703478  1.6293299  0.9211251 ]]


### get the data of nearest clusters

In [140]:
def gather_clusters_data(k_number, clusters):
    clusters_data = {i: [] for i in range(k_number)}
    clusters_data_labels = {i: [] for i in range(k_number)}
    for i, label in enumerate(clusters):
        clusters_data[label].append(x_train[i])
        clusters_data_labels[label].append(y_train[i])
    return clusters_data, clusters_data_labels

In [141]:
clusters_data, clusters_data_labels = gather_clusters_data(k_number, clusters)

In [142]:
def get_nearest_clusters_data(nearest_clusters_indices, clusters_data, clusters_data_labels):
    # Access data points in the cluster with centroid index 'nearest_clusters_indices'
    data_in_nearest_clusters = []
    labels_in_nearest_clusters = [] 
    for index in nearest_clusters_indices:
        data_in_nearest_clusters.extend(clusters_data[index])
        labels_in_nearest_clusters.extend(clusters_data_labels[index])
    return data_in_nearest_clusters, labels_in_nearest_clusters

In [143]:
data_in_nearest_clusters, labels_in_nearest_clusters = get_nearest_clusters_data(nearest_clusters_indices, 
                                                                                 clusters_data, 
                                                                                 clusters_data_labels)

## classify using data in nearest clusters

In [144]:
def classify_knn(x_train, y_train, x_test, y_test, k):
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(x_train, y_train)
    y_pred = knn_classifier.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return y_pred, accuracy

In [145]:
k = 10
x_train_data = np.array(data_in_nearest_clusters)
y_train_data = np.array(labels_in_nearest_clusters)
predictions, accuracy = classify_knn(x_train_data, y_train_data, [x_test[1]], [y_test[1]], k=k)
print(f"Predicted labels: {predictions}")
print(f"Accuracy: {accuracy * 100:.2f}%")

Predicted labels: [91]
Accuracy: 100.00%
