In [None]:
# from google.colab import drive
# drive.mount('/content/drive')  # use this if you are using google colab

In [198]:
from sklearn.cluster import KMeans
import pickle, os
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [199]:
def load_dataset(name_file):

    desired_directory = '.' # Replace with your desired directory path

    file_path = os.path.join(desired_directory, name_file) 

    with open(file_path, 'rb') as f:
        data_dict = pickle.load(f)

    return data_dict


In [200]:
loaders_dict = load_dataset(f"dataset-flowers102-features.pkl")
x_train = loaders_dict["x_train"]
x_test = loaders_dict["x_test"]
y_train = loaders_dict["y_train"]
y_test = loaders_dict["y_test"]
print(f"x_train:{x_train.shape}, y_train:{y_train.shape}")
print(f"x_test:{x_test.shape}, y_test:{y_test.shape}")


x_train:(4094, 512), y_train:(4094,)
x_test:(4095, 512), y_test:(4095,)


## clustering images

In [201]:
k_number = 50
kmeans = KMeans(n_clusters=k_number)
clusters = kmeans.fit_predict(x_train)
centroids = kmeans.cluster_centers_

  super()._check_params_vs_input(X, default_n_init=10)


In [202]:
# centroid_distances = [distance.euclidean(x_test[0], centroid) for centroid in centroids]
# centroid_distances

## find nearest clusters neighbors

### find nearest clusters and their indices

In [203]:
def find_nearest_clusters_neighbors(x_train, x_test, k):
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(x_train, range(len(x_train)))
    nearest_indices = knn_classifier.kneighbors(x_test, n_neighbors=k, return_distance=False)
    # nearest_neighbors = x_train[nearest_indices]
    nearest_neighbors = [x_train[indices] for indices in nearest_indices]

    return nearest_indices, nearest_neighbors

In [204]:
n_nearest_neighbors = 3
x_train_clusters = centroids
nearest_clusters_indices, nearest_clusters_neighbors = find_nearest_clusters_neighbors(x_train_clusters, 
                                                                                       x_test, 
                                                                                       k=n_nearest_neighbors)

In [205]:
print(f"clusters Indices:")
print(nearest_clusters_indices)
print(f"nearest neighbor clusters centroids:")
print(nearest_clusters_neighbors)

clusters Indices:
[[ 0 11 23]
 [26 38 16]
 [22 28  8]
 ...
 [ 3  0 35]
 [21  0 35]
 [41 38  5]]
nearest neighbor clusters centroids:


[array([[0.6148209 , 0.64063656, 1.0724483 , ..., 1.6442416 , 0.2187941 ,
        2.1226964 ],
       [0.11966872, 0.87763184, 0.84185386, ..., 1.3357865 , 1.619072  ,
        1.2549186 ],
       [0.30489254, 1.0125575 , 2.350429  , ..., 1.00089   , 0.3960762 ,
        1.0579458 ]], dtype=float32), array([[0.5336711 , 0.48241735, 0.6204545 , ..., 1.905231  , 1.4772571 ,
        1.9315851 ],
       [1.3324612 , 0.7095132 , 0.34643227, ..., 1.2523346 , 0.33857   ,
        0.37111884],
       [0.5449337 , 0.95993173, 0.6490431 , ..., 1.134692  , 1.7412119 ,
        0.96180654]], dtype=float32), array([[0.5614548 , 0.42259133, 0.78233373, ..., 0.74009526, 1.0604414 ,
        3.182969  ],
       [1.5603476 , 2.1440182 , 0.73919666, ..., 1.1661571 , 0.7107098 ,
        1.9442952 ],
       [1.2752569 , 1.6435878 , 0.9971185 , ..., 0.5689803 , 1.8854063 ,
        0.80480033]], dtype=float32), array([[1.4580972 , 0.76294553, 1.5185149 , ..., 0.3870284 , 0.32269388,
        0.66222656],
       [

### get the data of nearest clusters

In [206]:
def gather_clusters_data(k_number, clusters):
    clusters_data = {i: [] for i in range(k_number)}
    clusters_data_labels = {i: [] for i in range(k_number)}
    for i, label in enumerate(clusters):
        clusters_data[label].append(x_train[i])
        clusters_data_labels[label].append(y_train[i])
    return clusters_data, clusters_data_labels

In [207]:
clusters_data, clusters_data_labels = gather_clusters_data(k_number, clusters)

In [208]:
def get_nearest_clusters_data(nearest_clusters_indices, clusters_data, clusters_data_labels):
    # Access data points in the cluster with centroid index 'nearest_clusters_indices'
    data_in_nearest_clusters = []
    labels_in_nearest_clusters = []
    for index in nearest_clusters_indices:
        data_in_nearest_clusters.extend(clusters_data[index])
        labels_in_nearest_clusters.extend(clusters_data_labels[index])
    return data_in_nearest_clusters, labels_in_nearest_clusters

In [237]:
data_in_nearest_clusters, labels_in_nearest_clusters = get_nearest_clusters_data(nearest_clusters_indices[8], 
                                                                                 clusters_data, 
                                                                                 clusters_data_labels)

## classify using data in nearest clusters

In [238]:
def classify_knn(x_train, y_train, x_test, y_test, k):
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(x_train, y_train)
    y_pred = knn_classifier.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return y_pred, accuracy

In [239]:
k = 10
x_train_data = np.array(data_in_nearest_clusters)
y_train_data = np.array(labels_in_nearest_clusters)
predictions, accuracy = classify_knn(x_train_data, y_train_data, [x_test[1]], [y_test[1]], k=k)
print(f"Predicted labels: {predictions}")
print(f"Accuracy: {accuracy * 100:.2f}%")

Predicted labels: [93]
Accuracy: 0.00%
