In [56]:

import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import scipy.stats as stats
from sklearn.model_selection import train_test_split



In [25]:
dataset_small_labeled = np.load("../dataset/small_train_features_resnet50.npz")
dataset_unlabeled = np.load("../dataset/train_unlabeled_features_resnet50_completed.npz")

X_small_labeled = dataset_small_labeled['X']
y_small_labeled = dataset_small_labeled['y']
X_unlabeled = dataset_unlabeled['X']
y_unlabeled = dataset_unlabeled['y']

In [26]:
X_unlabeled.shape

(113455, 2048)

In [27]:
extracted_indeces = np.random.choice(X_unlabeled.shape[0], size=int(X_unlabeled.shape[0] * 0.10), replace=False)
print(extracted_indeces)

[78726 38252  9319 ... 93055 19462   557]


In [28]:
X_unlabeled = X_unlabeled[extracted_indeces, :]
y_unlabeled = y_unlabeled[extracted_indeces]

In [29]:
print(X_unlabeled.shape)

(11345, 2048)


In [44]:

N_clusters = 251

kmeans = KMeans(n_clusters=N_clusters, random_state=42)
kmeans_groups = kmeans.fit_predict(X_unlabeled)

score = silhouette_score(X_unlabeled, kmeans_groups)
print(f"Silhouette Score: {score:.3f}")



Silhouette Score: 0.012


In [42]:


def image_retrieval_using_knn(n_neighbors, dataset, queryset):
    pipeline = Pipeline([
        ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))  
    ])

    pipeline.fit(dataset[0], dataset[1])
    predictions = pipeline.predict(queryset)

    return predictions
    

In [43]:
candidate_labels = image_retrieval_using_knn(5, (X_small_labeled, y_small_labeled), X_unlabeled)

In [49]:
groups_to_index = [[] for i in range(N_clusters)]

for example_index, group in enumerate(kmeans_groups):
    groups_to_index[group].append(example_index)

for examples_indexes in groups_to_index:
    votes = [candidate_labels[i] for i in examples_indexes]
    final_label = stats.mode(votes).mode
    for index in examples_indexes:
        y_unlabeled[index] = final_label



In [53]:
for i in y_unlabeled:
    print(i)

191
178
1
34
69
63
51
59
34
26
1
35
68
191
245
24
134
223
9
60
156
34
249
64
90
18
12
11
4
43
227
123
23
17
11
11
205
230
77
42
124
22
42
51
22
29
4
3
42
204
0
12
51
41
188
6
6
203
150
17
42
83
55
81
13
203
43
223
23
25
4
4
191
11
52
122
23
6
43
11
75
7
23
18
9
11
12
21
96
6
17
41
168
1
14
35
7
25
57
17
35
64
84
178
17
77
17
18
168
5
75
203
144
86
105
205
183
205
198
10
24
144
6
117
168
6
157
64
51
11
3
0
42
11
223
11
0
52
76
39
55
0
26
19
39
42
98
7
63
101
205
6
22
42
3
29
3
19
12
88
76
41
35
157
99
1
81
208
1
44
5
164
195
12
17
168
51
68
35
8
3
29
7
183
16
3
7
29
144
26
40
13
75
205
18
223
117
84
11
1
85
230
24
1
188
109
77
52
22
51
24
23
188
188
165
250
57
195
156
27
19
13
23
34
10
57
8
157
24
26
7
34
220
223
7
51
204
7
11
3
16
24
157
111
21
1
1
203
22
42
23
77
6
35
56
13
42
44
223
250
90
63
13
105
71
13
3
99
203
75
134
13
26
5
117
157
123
16
249
24
96
223
10
11
63
24
60
111
5
51
17
12
68
4
121
12
29
123
7
42
35
230
22
188
63
77
51
174
4
164
25
4
223
1
156
75
30
71
117
23
36
75
60
2

In [67]:
dataset_extendex_10 = np.load("../dataset/features_extended_10.npz")


In [68]:


X_train, X_test, y_train, y_test = train_test_split(X_small_labeled, y_small_labeled, test_size=int((.2 * (X_unlabeled.shape[0] + X_small_labeled.shape[0]) )/ 251)/20, stratify=y_small_labeled, random_state=42)

X_train = np.concatenate((X_train, X_unlabeled), axis=0)  # Combine rows
y_train = np.concatenate((y_train, y_unlabeled), axis=0)  # Combine


In [69]:
np.array_equal(X_test, dataset_extendex_10['X_val'])

True

In [71]:
np.savez('../dataset/features_extended_10_retrieval_kmeans.npz', X_train=X_train, X_val=X_test, y_train=y_train, y_val = y_test)