In [16]:

import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import scipy.stats as stats
from sklearn.model_selection import train_test_split



In [17]:
# Open the two .npz archives and pull out their 'X' / 'y' arrays.
# NOTE(review): the file names suggest pre-extracted ResNet-50 features — confirm.
dataset_small_labeled = np.load("../dataset/small_train_features_resnet50.npz")
dataset_unlabeled = np.load("../dataset/train_unlabeled_features_resnet50_completed.npz")

X_small_labeled, y_small_labeled = (dataset_small_labeled[key] for key in ('X', 'y'))
X_unlabeled, y_unlabeled = (dataset_unlabeled[key] for key in ('X', 'y'))

In [18]:
# Display the shape of the full unlabeled feature matrix before sub-sampling.
np.shape(X_unlabeled)

(113455, 2048)

In [19]:
# Draw 10% of the row indices without replacement.
# A dedicated, seeded generator is used instead of the global unseeded
# np.random state so that the same subset is selected on every
# Restart & Run All (the seed matches the random_state used elsewhere here).
rng = np.random.default_rng(42)
extracted_indeces = rng.choice(
    X_unlabeled.shape[0],
    size=int(X_unlabeled.shape[0] * 0.10),
    replace=False,
)
print(extracted_indeces)

[25190 73049 93945 ... 10303 96554 92386]


In [20]:
# Keep only the sampled rows of the features and of the matching 'y' array.
# NOTE: this rebinds X_unlabeled / y_unlabeled, so the cell is not idempotent —
# re-run the loading cell before executing it a second time.
X_unlabeled, y_unlabeled = (
    X_unlabeled[extracted_indeces],
    y_unlabeled[extracted_indeces],
)

In [21]:
# Sanity check: shape of the feature matrix after sub-sampling.
print(np.shape(X_unlabeled))

(11345, 2048)


In [22]:

# Number of k-means clusters fitted on the sub-sampled unlabeled features.
N_clusters = 1024

# Fit k-means with a fixed seed and read the per-sample cluster assignment
# from the fitted estimator.
kmeans = KMeans(n_clusters=N_clusters, random_state=42)
kmeans.fit(X_unlabeled)
kmeans_groups = kmeans.labels_

# Silhouette coefficient of the resulting partition, computed on the same data.
score = silhouette_score(X_unlabeled, kmeans_groups)
print(f"Silhouette Score: {score:.3f}")



Silhouette Score: 0.007


In [23]:


def image_retrieval_using_knn(n_neighbors, dataset, queryset):
    """Label every query sample with a k-nearest-neighbours classifier.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbours passed to ``KNeighborsClassifier``.
    dataset : sequence
        Indexable pair where ``dataset[0]`` holds the reference samples and
        ``dataset[1]`` their labels; the classifier is fitted on these.
    queryset : array-like
        Samples to label.

    Returns
    -------
    predictions
        The value returned by the fitted classifier's ``predict(queryset)``.
    """
    reference_features, reference_labels = dataset[0], dataset[1]

    # A single-step Pipeline adds nothing over the estimator itself, so the
    # classifier is fitted and queried directly.
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(reference_features, reference_labels)

    return classifier.predict(queryset)
    

In [24]:
# Per-sample candidate label for the unlabeled subset: 5-NN vote against the
# small labeled set.
candidate_labels = image_retrieval_using_knn(
    n_neighbors=5,
    dataset=(X_small_labeled, y_small_labeled),
    queryset=X_unlabeled,
)

In [25]:
# Give every sample the most frequent k-NN candidate label of its k-means
# cluster, writing the result into y_unlabeled in place.

# groups_to_index[c] lists the positions of the samples assigned to cluster c.
groups_to_index = [[] for _ in range(N_clusters)]
for example_index, group in enumerate(kmeans_groups):
    groups_to_index[group].append(example_index)

candidate_label_array = np.asarray(candidate_labels)

for examples_indexes in groups_to_index:
    # A cluster with no members has nothing to vote on and nothing to update.
    if not examples_indexes:
        continue

    votes = candidate_label_array[examples_indexes]

    # np.unique returns the distinct labels sorted ascending, and argmax picks
    # the first maximum, so ties resolve to the smallest label. The result is
    # always a scalar, independent of the installed SciPy version.
    distinct_labels, label_counts = np.unique(votes, return_counts=True)
    final_label = distinct_labels[np.argmax(label_counts)]

    # One fancy-indexed assignment per cluster instead of a Python loop.
    y_unlabeled[examples_indexes] = final_label



In [26]:
# Print every propagated label, one per line.
for position in range(len(y_unlabeled)):
    print(y_unlabeled[position])

17
157
205
111
6
6
7
4
63
38
238
23
35
23
60
165
52
191
46
5
10
71
29
32
32
18
16
29
17
77
223
42
76
1
17
98
8
42
11
34
52
1
40
7
247
83
198
0
48
10
12
9
52
6
182
46
40
35
16
12
3
109
40
206
14
88
6
6
29
111
42
12
41
115
249
60
198
22
7
4
250
51
34
182
21
109
21
124
157
57
12
28
124
23
23
26
47
23
86
87
19
29
22
5
223
11
75
14
41
17
157
186
8
75
7
25
37
17
239
3
69
205
105
34
86
72
47
8
9
6
30
8
11
21
86
33
69
12
30
88
13
11
34
23
12
56
88
76
51
1
16
3
29
22
157
5
11
6
25
7
156
20
105
17
19
15
11
106
8
83
39
7
17
28
97
11
60
49
22
2
12
8
29
17
9
15
219
5
51
6
7
36
182
157
8
1
7
225
8
10
23
29
86
29
115
173
1
205
5
58
8
5
28
76
11
183
183
6
10
60
233
28
76
23
43
211
12
6
60
17
1
20
30
23
75
9
83
182
13
16
21
11
49
49
250
223
60
97
9
7
51
34
69
48
21
46
10
2
3
97
2
21
3
32
11
12
5
112
245
22
52
153
10
64
31
57
11
97
19
18
35
14
5
121
7
11
49
39
112
30
12
47
1
157
4
223
60
3
68
3
4
51
6
60
16
76
11
45
22
6
49
36
5
26
6
12
17
106
3
4
9
69
12
32
0
33
33
1
6
7
7
8
23
66
88
11
42
6
5
86
14
8


In [27]:
# Open a previously saved archive; a later cell compares its 'X_val' array
# with the validation split produced in this notebook.
dataset_extendex_10 = np.load(file="../dataset/features_extended_10.npz")


In [28]:


# Size of the validation split, expressed as a fraction of the labeled set.
# NOTE(review): 251 and 20 look like the class count and the labeled samples
# per class — confirm and promote them to named constants.
n_pool = X_unlabeled.shape[0] + X_small_labeled.shape[0]
val_fraction = int((.2 * n_pool) / 251) / 20

# Stratified, seeded split of the labeled samples only.
X_train, X_test, y_train, y_test = train_test_split(
    X_small_labeled,
    y_small_labeled,
    test_size=val_fraction,
    stratify=y_small_labeled,
    random_state=42,
)

# Append the pseudo-labeled samples to the training portion (row-wise).
X_train = np.concatenate([X_train, X_unlabeled])
y_train = np.concatenate([y_train, y_unlabeled])


In [29]:
# Check that the validation features match the ones stored in the reference archive.
np.array_equal(dataset_extendex_10['X_val'], X_test)

True

In [30]:
# Persist the extended training set together with the validation split.
np.savez(
    '../dataset/features_extended_10_retrieval_kmeans.npz',
    X_train=X_train,
    X_val=X_test,
    y_train=y_train,
    y_val=y_test,
)