# **Retrieval with k-NN**

In [1]:
import sys

# Make the project's helper modules importable. A forward slash is used
# because it is accepted on Windows as well as POSIX systems, whereas a
# backslash is only a path separator on Windows.
sys.path.insert(0, "../Scripts")

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# File name shared by the feature array and its matching label array.
name = "mobilenet_v3_small.npy"

# Both arrays live under ../Features, in sibling directories.
features_dir = "../Features/features/"
labels_dir = "../Features/labels/"

path_features = os.path.join(features_dir, name)
path_labels = os.path.join(labels_dir, name)

## **Load dataset**

In [4]:
# Load the stored feature array and its label array from the paths
# configured above.
query_features, query_labels = (
    np.load(array_path) for array_path in (path_features, path_labels)
)

### **PCA study**

In [5]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the loaded features, then replace them with their
# standardised version.
# NOTE: `query_features` is overwritten here, so re-running this cell without
# re-running the load cell refits `scaler` on already-standardised data.
scaler = StandardScaler()
scaler.fit(query_features)
query_features = scaler.transform(query_features)

# Fit a PCA with default settings on the standardised features.
pca = PCA()
pca.fit(query_features)

In [10]:
# Total explained-variance ratio carried by the first 4000 components.
pca.explained_variance_ratio_[:4000].sum()

0.94343424

In [12]:
# Number of leading principal components to keep.
n_components = 4000

# Project the standardised features with the fitted PCA and keep only the
# first `n_components` columns of the projection.
pca_data = pca.transform(query_features)[:, :n_components]

### **Load Unlabelled Data**

In [13]:
# The CSV is read without a header row, so the column names are assigned
# explicitly after loading.
train_unlabelled = pd.read_csv(
    "../Dataset/train_unlabeled.csv", header=None
).set_axis(["Image", "Label"], axis="columns")
train_unlabelled.head()

Unnamed: 0,Image,Label
0,train_059329.jpg,-1
1,train_059330.jpg,-1
2,train_059331.jpg,-1
3,train_059332.jpg,-1
4,train_059333.jpg,-1


In [14]:
# Split the unlabelled table into consecutive shards of at most `chunk_size`
# rows and write each shard to its own header-less CSV file.
chunk_size = 20_000
n_rows = train_unlabelled.shape[0]

# Ceiling division, so an exact multiple of `chunk_size` does not produce a
# trailing empty shard. At least one shard is always written so that
# `train_unlabeled_0.csv` exists even for an empty table.
n_chunks = max(1, -(-n_rows // chunk_size))

for i in range(n_chunks):
    shard = train_unlabelled.iloc[i * chunk_size : (i + 1) * chunk_size]
    shard_path = f"../Dataset/train_unlabeled_{i}.csv"
    # The index is not written, so there is no need to reset it first.
    shard.to_csv(shard_path, index=False, header=False)
    print(shard_path)

../Dataset/train_unlabeled_0.csv
../Dataset/train_unlabeled_1.csv
../Dataset/train_unlabeled_2.csv
../Dataset/train_unlabeled_3.csv
../Dataset/train_unlabeled_4.csv
../Dataset/train_unlabeled_5.csv


In [15]:
from ImageDatastore import ImageDatastore
from torch.utils.data import DataLoader
from torchvision import transforms

In [16]:
# Number of images per batch handed to the data loader.
batch_size = 512

# Per-channel statistics used by the normalisation step.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Preprocessing pipeline: resize to 232x232 with bilinear interpolation,
# then normalise each channel.
resize_step = transforms.Resize(
    (232, 232), interpolation=transforms.InterpolationMode.BILINEAR
)
normalize_step = transforms.Normalize(mean=channel_mean, std=channel_std)
transform = transforms.Compose([resize_step, normalize_step])

In [17]:
# Build the dataset for the first shard ("train_unlabeled_0") and wrap it in
# a loader that iterates in the stored order (no shuffling).
# NOTE(review): this rebinds `train_unlabelled`, which earlier held the
# DataFrame read from the CSV; the cells that use that DataFrame must be run
# before this one.
train_unlabelled = ImageDatastore("train_unlabeled_0", transform=transform)
train_loader = DataLoader(
    dataset=train_unlabelled,
    batch_size=batch_size,
    shuffle=False,
)

In [18]:
from NeuralFeatureExtractor import MobileNetFeatureExtractor

In [None]:
# Instantiate the extractor and run it over every batch from the loader.
# `compute_features` returns a pair that is unpacked into features and labels.
feature_extractor = MobileNetFeatureExtractor()
extraction_result = feature_extractor.compute_features(train_loader)
features, labels = extraction_result

In [None]:
# Bring the newly extracted features into the same space as `pca_data`:
# standardise with the already-fitted scaler, project with the already-fitted
# PCA, and keep the same leading `n_components` columns that were kept for
# the query features, so both matrices have the same width.
# NOTE: `features` is overwritten, so this cell must only be run once per
# extraction.
features_scaled = scaler.transform(features)
features = pca.transform(features_scaled)[:, :n_components]