In [1]:
import argparse
import h5py
import numpy as np
import pandas as pd
from sklearn import preprocessing
import os
import time
from pathlib import Path
from urllib.request import urlretrieve
import logging
from li.Baseline import Baseline
from li.LearnedIndex import LearnedIndex
from li.utils import save_as_pickle
from li.model import data_X_to_torch


In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
np.random.seed(2023)

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s'
)
LOG = logging.getLogger(__name__)


In [4]:
kind='pca32v2'
size='100K'
key='pca32'

In [5]:
data = np.array(h5py.File(os.path.join("data", kind, size, "dataset.h5"), "r")[key])
queries = np.array(h5py.File(os.path.join("data", kind, size, "query.h5"), "r")[key])

In [6]:
kind_search = 'clip768v2'
key_search = 'emb'
data_search = np.array(
    h5py.File(os.path.join("data", kind_search, size, "dataset.h5"), "r")[key_search]
)

In [7]:
queries_search = np.array(
    h5py.File(os.path.join("data", kind_search, size, "query.h5"), "r")[key_search]
)

In [8]:
import torch
from torch import nn
import torch.nn.functional as nnf
import numpy as np
from li.Logger import Logger
from typing import Tuple
import torch.utils.data

torch.manual_seed(2023)
np.random.seed(2023)

class Model(nn.Module):
    def __init__(self, input_dim=768, output_dim=1000, model_type=None):
        super().__init__()
        if model_type == 'MLP':
            self.layers = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 128),
                torch.nn.ReLU(),
                torch.nn.Linear(128, output_dim)
            )
        if model_type == 'MLP-2':
            self.layers = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 64),
                torch.nn.ReLU(),
                torch.nn.Linear(64, output_dim)
            )
        if model_type == 'MLP-3':
            self.layers = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 256),
                torch.nn.ReLU(),
                torch.nn.Linear(256, output_dim)
            )
        if model_type == 'MLP-4':
            self.layers = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 512),
                torch.nn.ReLU(),
                torch.nn.Linear(512, output_dim)
            )
        if model_type == 'MLP-5':
            self.layers = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 256),
                torch.nn.ReLU(),
                torch.nn.Linear(256, 128),
                torch.nn.ReLU(),
                torch.nn.Linear(128, output_dim)
            )
        if model_type == 'MLP-6':
            self.layers = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 32),
                torch.nn.ReLU(),
                torch.nn.Linear(32, output_dim)
            )
        if model_type == 'MLP-7':
            self.layers = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 16),
                torch.nn.ReLU(),
                torch.nn.Linear(16, output_dim)
            )
        if model_type == 'MLP-8':
            self.layers = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 8),
                torch.nn.ReLU(),
                torch.nn.Linear(8, output_dim)
            )
        if model_type == 'MLP-9':
            self.layers = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 8),
                torch.nn.ReLU(),
                torch.nn.Linear(input_dim, 16),
                torch.nn.ReLU(),
                torch.nn.Linear(16, output_dim)
            )
        self.n_output_neurons = output_dim

    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
        outputs = self.layers(x)
        return outputs


def data_X_to_torch(data) -> torch.FloatTensor:
    """ Creates torch training data."""
    data_X = torch.from_numpy(np.array(data).astype(np.float32))
    return data_X


def data_to_torch(data, labels) -> Tuple[torch.FloatTensor, torch.LongTensor]:
    """ Creates torch training data and labels."""
    data_X = data_X_to_torch(data)
    data_y = torch.as_tensor(torch.from_numpy(labels), dtype=torch.long)
    return data_X, data_y


def get_device() -> torch.device:
    """ Gets the `device` to be used by torch.
    This arugment is needed to operate with the PyTorch model instance.

    Returns
    ------
    torch.device
        Device
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda:0' if use_cuda else 'cpu')
    torch.backends.cudnn.benchmark = True
    return device


class NeuralNetwork(Logger):
    """ The neural network class corresponding to every inner node.

    Parameters
    ----------
    input_dim : int
        The input dimension.
    output_dim : int
        The output dimension.
    loss : torch.nn, optional
        The loss function, the default is torch.nn.CrossEntropyLoss.
    lr : float, optional
        The learning rate, the default is 0.001.
    model_type : str, optional
        The model type, the default is 'MLP'.
    class_weight : torch.FloatTensor, optional
        The class weights, the default is None.
    """
    def __init__(
        self,
        input_dim,
        output_dim,
        loss=torch.nn.CrossEntropyLoss,
        lr=0.1,
        model_type='MLP',
        class_weight=None
    ):
        self.device = get_device()
        self.model = Model(input_dim, output_dim, model_type=model_type).to(self.device)
        if not isinstance(class_weight, type(None)):
            self.loss = loss(weight=class_weight.to(self.device))
        else:
            self.loss = loss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def train(
        self,
        data_X: torch.FloatTensor,
        data_y: torch.LongTensor,
        epochs=500,
        logger=None
    ):
        step = epochs // 10
        losses = []
        if logger:
            logger.info(f'Epochs: {epochs}, step: {step}')
        for ep in range(epochs):
            pred_y = self.model(data_X.to(self.device))
            curr_loss = self.loss(pred_y, data_y.to(self.device))
            if ep % step == 0 and ep != 0:
                if logger:
                    logger.info(f'Epoch {ep} | Loss {curr_loss.item()}')
            losses.append(curr_loss.item())

            self.model.zero_grad()
            curr_loss.backward()

            self.optimizer.step()
        return losses

    def train_batch(
        self,
        dataset,
        epochs=5,
        logger=None
    ):
        step = epochs // 10
        step = step if step > 0 else 1
        losses = []
        if logger:
            logger.info(f'Epochs: {epochs}, step: {step}')
        for ep in range(epochs):
            for data_X, data_y in iter(dataset):
                pred_y = self.model(data_X.to(self.device))
                curr_loss = self.loss(pred_y, data_y.to(self.device))

            if ep % step == 0 and ep != 0:
                if logger:
                    logger.info(f'Epoch {ep} | Loss {curr_loss.item():.5f}')
            losses.append(curr_loss.item())

            self.model.zero_grad()
            curr_loss.backward()

            self.optimizer.step()
        return losses

    def predict(self, data_X: torch.FloatTensor):
        """ Collects predictions for multiple data points (used in structure building)."""
        self.model = self.model.to(self.device)
        self.model.eval()

        all_outputs = torch.tensor([], device=self.device)
        with torch.no_grad():
            outputs = self.model(data_X.to(self.device))
            all_outputs = torch.cat((all_outputs, outputs), 0)

        _, y_pred = torch.max(all_outputs, 1)
        return y_pred.cpu().numpy()

    def predict_proba(self, data_X: torch.FloatTensor):
        """ Collects predictions for a single data point (used in query predictions)."""
        self.model = self.model.to(self.device)
        self.model.eval()

        with torch.no_grad():
            outputs = self.model(data_X.to(self.device))

        if outputs.dim() == 1:
            dim = 0
        else:
            dim = 1
        prob = nnf.softmax(outputs, dim=dim)
        probs, classes = prob.topk(prob.shape[1])

        return probs.cpu().numpy(), classes.cpu().numpy()


class LIDataset(torch.utils.data.Dataset):
    def __init__(self, dataset_x, dataset_y):
        self.dataset_x, self.dataset_y = data_to_torch(dataset_x, dataset_y)

    def __len__(self):
        return self.dataset_x.shape[0]

    def __getitem__(self, idx):
        return self.dataset_x[idx-1], self.dataset_y[idx-1]


In [9]:
import numpy as np
from li.Logger import Logger
from li.utils import pairwise_cosine
import time
import torch
import torch.utils.data
import faiss
from tqdm import tqdm
import numpy as np
torch.manual_seed(2023)
np.random.seed(2023)

class LearnedIndex(Logger):

    def __init__(self):
        self.pq = []
        self.model = None

    def search(
        self,
        data_navigation,
        queries_navigation,
        data_search,
        queries_search,
        pred_categories,
        n_buckets=1,
        k=10
    ):
        """ Search for k nearest neighbors for each query in queries.

        Parameters
        ----------
        queries : np.array
            Queries to search for.
        data : np.array
            Data to search in.
        n_buckets : int
            Number of most similar buckets to search in.
        k : int
            Number of nearest neighbors to search for.

        Returns
        -------
        dists : np.array
            Array of shape (queries.shape[0], k) with distances to nearest neighbors for each query.
        anns : np.array
            Array of shape (queries.shape[0], k) with nearest neighbors for each query.
        time : float
            Time it took to search.
        """
        assert self.model is not None, 'Model is not trained, call `build` first.'
        s = time.time()
        _, pred_proba_categories = self.model.predict_proba(
            data_X_to_torch(queries_navigation)
        )
        t_inference = time.time() - s
        anns_final = None
        dists_final = None
        # sorts the predictions of a bucket for each query, ordered by lowest probability
        data_navigation['category'] = pred_categories

        # iterates over the predicted buckets starting from the most similar (index -1)
        t_all_buckets = 0
        t_all_pairwise = 0
        for bucket in range(n_buckets):
            dists, anns, t_all, t_pairwise = self.search_single(
                data_navigation,
                data_search,
                queries_search,
                pred_proba_categories[:, bucket]
            )
            t_all_buckets += t_all
            t_all_pairwise += t_pairwise
            if anns_final is None:
                anns_final = anns
                dists_final = dists
            else:
                # stacks the results from the previous sorted anns and dists
                # *_final arrays now have shape (queries.shape[0], k*2)
                anns_final = np.hstack((anns_final, anns))
                dists_final = np.hstack((dists_final, dists))
                # gets the sorted indices of the stacked dists
                idx_sorted = dists_final.argsort(kind='stable', axis=1)[:, :k]
                # indexes the final arrays with the sorted indices
                # *_final arrays now have shape (queries.shape[0], k)
                idx = np.ogrid[tuple(map(slice, dists_final.shape))]
                idx[1] = idx_sorted
                dists_final = dists_final[tuple(idx)]
                anns_final = anns_final[tuple(idx)]

                assert anns_final.shape == dists_final.shape == (queries_search.shape[0], k)

        return dists_final, anns_final, time.time() - s, t_inference, t_all_buckets, t_all_pairwise

    def search_single(
        self,
        data_navigation,
        data_search,
        queries_search,
        pred_categories,
        k=10
    ):
        """ Search for k nearest neighbors for each query in queries.

        Parameters
        ----------
        queries : np.array
            Queries to search for.
        data : np.array
            Data to search in.
        k : int
            Number of nearest neighbors to search for.

        Returns
        -------
        anns : np.array
            Array of shape (queries.shape[0], k) with nearest neighbors for each query.
        final_dists_k : np.array
            Array of shape (queries.shape[0], k) with distances to nearest neighbors for each query.
        time : float
            Time it took to search.
        """
        s_all = time.time()
        nns = np.zeros((queries_search.shape[0], k), dtype=np.uint32)
        dists = np.zeros((queries_search.shape[0], k), dtype=np.float32)

        if 'category' in data_search.columns:
            data_search = data_search.drop('category', axis=1, errors='ignore')

        t_pairwise = 0
        for cat, g in tqdm(data_navigation.groupby('category')):
            cat_idxs = np.where(pred_categories == cat)[0]
            bucket_obj_indexes = g.index
            if bucket_obj_indexes.shape[0] != 0 and cat_idxs.shape[0] != 0:
                try:
                    s = time.time()
                    seq_search_dists = pairwise_cosine(
                        queries_search[cat_idxs],
                        data_search.loc[bucket_obj_indexes]
                    )
                    t_pairwise += time.time() - s
                    ann_relative = seq_search_dists.argsort()[
                        :, :k if k < seq_search_dists.shape[1] else seq_search_dists.shape[1]
                    ]
                    nns[cat_idxs] = np.array(bucket_obj_indexes)[ann_relative]
                    dists[cat_idxs] = np.take_along_axis(seq_search_dists, ann_relative, axis=1)
                except ValueError as e:
                    print(e)
                    print('cat_idxs', cat_idxs.shape)
                    print('bucket_obj_indexes', bucket_obj_indexes.shape)
                    print('seq_search_dists', seq_search_dists.shape)
                    print('ann_relative', ann_relative.shape)

        t_all = time.time() - s_all
        return dists, nns, t_all, t_pairwise

    def build(self, data, n_categories=100, epochs=100, lr=0.1, model_type='MLP'):
        """ Build the index.

        Parameters
        ----------
        data : np.array
            Data to build the index on.

        Returns
        -------
        time : float
            Time it took to build the index.
        """
        s = time.time()
        # ---- cluster the data into categories ---- #
        _, labels = self.cluster(data, n_categories)

        # ---- train a neural network ---- #
        dataset = LIDataset(data, labels)
        train_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=256,
            sampler=torch.utils.data.SubsetRandomSampler(
                data.index.values.tolist()
            )
        )
        nn = NeuralNetwork(
            input_dim=data.shape[1],
            output_dim=n_categories,
            lr=lr,
            model_type=model_type
        )
        nn.train_batch(train_loader, epochs=epochs, logger=self.logger)
        # ---- collect predictions ---- #
        self.model = nn
        return nn.predict(data_X_to_torch(data)), time.time() - s

    def cluster(
        self,
        data,
        n_clusters,
        n_redo=10,
        spherical=True,
        int_centroids=True,

    ):
        if data.shape[0] < 2:
            return None, np.zeros_like(data.shape[0])

        if data.shape[0] < n_clusters:
            n_clusters = data.shape[0] // 5
            if n_clusters < 2:
                n_clusters = 2

        kmeans = faiss.Kmeans(
            d=np.array(data).shape[1],
            k=n_clusters,
            verbose=True,
            #nredo=n_redo,
            #spherical=spherical,
            #int_centroids=int_centroids,
            #update_index=False,
            seed=2023
        )
        X = np.array(data).astype(np.float32)
        kmeans.train(X)

        return kmeans, kmeans.index.search(X, 1)[1].T[0]


[2023-07-13 15:57:11,655][INFO ][faiss.loader] Loading faiss with AVX2 support.
[2023-07-13 15:57:11,661][INFO ][faiss.loader] Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'",)
[2023-07-13 15:57:11,665][INFO ][faiss.loader] Loading faiss.
[2023-07-13 15:57:11,751][INFO ][faiss.loader] Successfully loaded faiss.


In [10]:
data = pd.DataFrame(data)
data.index += 1

data_search = pd.DataFrame(data_search)
data_search.index += 1

In [None]:
li = LearnedIndex()
pred_categories, build_t = li.build(
    data,
    n_categories=2000,
    epochs=20,
    lr=0.1
)

[2023-07-13 15:57:28,622][INFO ][__main__.LearnedInde] Epochs: 20, step: 2
[2023-07-13 15:57:38,796][INFO ][__main__.LearnedInde] Epoch 2 | Loss 7.68513
[2023-07-13 15:57:45,527][INFO ][__main__.LearnedInde] Epoch 4 | Loss 7.65066
[2023-07-13 15:57:52,321][INFO ][__main__.LearnedInde] Epoch 6 | Loss 7.62184
[2023-07-13 15:57:59,025][INFO ][__main__.LearnedInde] Epoch 8 | Loss 7.54541
[2023-07-13 15:58:05,797][INFO ][__main__.LearnedInde] Epoch 10 | Loss 7.51368
[2023-07-13 15:58:12,500][INFO ][__main__.LearnedInde] Epoch 12 | Loss 7.52358
[2023-07-13 15:58:19,216][INFO ][__main__.LearnedInde] Epoch 14 | Loss 7.57553


In [None]:
data['category'] = pred_categories

In [None]:
data.category.value_counts()

In [14]:
k=10

In [15]:
nns = np.zeros((queries_search.shape[0], k), dtype=np.uint32)
dists = np.zeros((queries_search.shape[0], k), dtype=np.float32)

In [16]:
_, pred_proba_categories = li.model.predict_proba(
    data_X_to_torch(queries)
)

In [17]:
bucket = 0

In [18]:
pred_proba_categories

array([[ 791, 1878, 1144, ...,  542, 1020, 1010],
       [ 791, 1913,  708, ..., 1535,  434, 1547],
       [ 276,  911, 1864, ...,  823, 1636,  190],
       ...,
       [1665,  349, 1418, ..., 1563,  542, 1229],
       [ 791, 1913, 1428, ..., 1020, 1955, 1547],
       [1144,  791, 1878, ..., 1229,  542, 1547]])

In [19]:
for cat, g in tqdm(data.groupby('category')):
    cat_idxs = np.where(pred_proba_categories[:, bucket] == cat)[0]
    bucket_obj_indexes = g.index
    print(queries_search[cat_idxs].shape)
    print(data_search.loc[bucket_obj_indexes].shape)
    break
    seq_search_dists = pairwise_cosine(
        queries_search[cat_idxs],
        data_search.loc[bucket_obj_indexes]
    )
    break

  0%|          | 0/97 [00:00<?, ?it/s]

(322, 768)
(3095, 768)





In [25]:
dists.max(axis=1)

array([0.36959553, 0.32019585, 0.17210594, ..., 0.254975  , 0.36058566,
       0.37660939], dtype=float32)

In [24]:
dists

array([[0.306153  , 0.31319278, 0.3316943 , ..., 0.36904657, 0.36959553,
        0.36959553],
       [0.23895504, 0.2626422 , 0.27826455, ..., 0.3162143 , 0.31905118,
        0.32019585],
       [0.13460062, 0.14457192, 0.15563862, ..., 0.16689926, 0.16764964,
        0.17210594],
       ...,
       [0.2241371 , 0.2337276 , 0.23440455, ..., 0.25322142, 0.25394517,
        0.254975  ],
       [0.32610038, 0.33764818, 0.34348887, ..., 0.35277185, 0.35619137,
        0.36058566],
       [0.27490523, 0.3412929 , 0.35591373, ..., 0.3703666 , 0.37131587,
        0.37660939]], dtype=float32)

In [23]:
max(dists)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [20]:
for cat, g in tqdm(data.groupby('category')):
    cat_idxs = np.where(pred_proba_categories[:, bucket] == cat)[0]
    bucket_obj_indexes = g.index
    if bucket_obj_indexes.shape[0] != 0 and cat_idxs.shape[0] != 0:
        #print(cat, cat_idxs, cat_idxs.shape)
        #break
        seq_search_dists = pairwise_cosine(
            queries_search[cat_idxs],
            data_search.loc[bucket_obj_indexes]
        )
        #print(seq_search_dists.shape)
        ann_relative = seq_search_dists.argsort()[
            :, :k if k < seq_search_dists.shape[1] else seq_search_dists.shape[1]
        ]
        #print('ann_relative: ', ann_relative.shape)
        #print('np.array(bucket_obj_indexes): ', np.array(bucket_obj_indexes), np.array(bucket_obj_indexes).shape)
        #print('ann_relative: ', ann_relative, ann_relative.shape)
        pad_needed = (k - np.array(bucket_obj_indexes).shape[0]) // 2 + 1
        #print('pad_needed: ', pad_needed)
        #print('np.array(bucket_obj_indexes) padded: ', np.pad(np.array(bucket_obj_indexes), pad_needed, 'edge')[:k], np.pad(np.array(bucket_obj_indexes), pad_needed,'edge').shape)
        if bucket_obj_indexes.shape[0] < k:
            #print('here')
            pad_needed = (k - bucket_obj_indexes.shape[0]) // 2 + 1
            bucket_obj_indexes = np.pad(np.array(bucket_obj_indexes), pad_needed, 'edge')[:k]
            ann_relative = np.pad(ann_relative[0], pad_needed, 'edge')[:k].reshape(1, -1)
            seq_search_dists = np.pad(seq_search_dists[0], pad_needed, 'edge')[:k].reshape(1, -1)
            #print('seq_search_dists: ', seq_search_dists)
            u, i = np.unique(seq_search_dists, return_index=True)
            duplicates_i = np.setdiff1d(np.arange(k), i)
            # assign a large number such that the duplicated value gets replaced
            seq_search_dists[0][duplicates_i] = 10_000
            #print('seq_search_dists: ', seq_search_dists)
            #print('bucket_obj_indexes: ', bucket_obj_indexes.shape)
            #print('ann_relative: ', ann_relative, ann_relative.shape)
            #print('seq_search_dists ', seq_search_dists, seq_search_dists.shape)

        nns[cat_idxs] = np.array(bucket_obj_indexes)[ann_relative]
        #print('nns[cat_idxs]: ', nns[cat_idxs].shape)
        dists[cat_idxs] = np.take_along_axis(seq_search_dists, ann_relative, axis=1)

100%|██████████| 97/97 [00:09<00:00, 10.40it/s]


In [34]:
cat_idxs.shape

(0,)

In [38]:
from sklearn.metrics.pairwise import cosine_similarity
for cat, g in tqdm(data.groupby('category')):
    bucket = 1
    print(cat, g.shape)
    cat_idxs = np.where(pred_proba_categories[:, bucket] == cat)[0]
    bucket_obj_indexes = g.index
    print(queries_search[cat_idxs].shape)
    x = queries_search[cat_idxs]
    y = data_search.loc[bucket_obj_indexes]
    pure_arr = 1-cosine_similarity(x, y)
    break

  0%|          | 0/97 [00:00<?, ?it/s]

37 (3095, 33)
(176, 768)





##### Objective: beat the time of:

In [74]:
%%time
ann_relative = pure_arr.argsort(kind='quicksort')[
    :, :10
]

CPU times: user 38 ms, sys: 76 µs, total: 38.1 ms
Wall time: 38.1 ms


In [187]:
%%time
max_dists = dists.max(axis=1)[cat_idxs]

CPU times: user 751 µs, sys: 0 ns, total: 751 µs
Wall time: 765 µs


In [188]:
%time res = np.repeat(dists.max(axis=1)[cat_idxs, np.newaxis], 3095, 1)

CPU times: user 2.61 ms, sys: 0 ns, total: 2.61 ms
Wall time: 2.61 ms


In [196]:
%%time
target_dists = np.where(pure_arr < res)

CPU times: user 2.35 ms, sys: 0 ns, total: 2.35 ms
Wall time: 2.36 ms


In [197]:
%%time
max_idx = np.unique(target_dists[0], return_counts=True)[1].max()
max_idx

CPU times: user 436 µs, sys: 30 µs, total: 466 µs
Wall time: 473 µs


214

In [198]:
target_dists

(array([  2,   2,   2, ..., 174, 174, 175]),
 array([ 190,  755, 1872, ..., 2538, 2995, 2569]))

In [200]:
%time sample_arr = np.full(shape=(pure_arr.shape[0], max_idx), fill_value=10_000, dtype=np.float)

CPU times: user 156 µs, sys: 11 µs, total: 167 µs
Wall time: 175 µs


In [201]:
sample_arr.shape

(176, 214)

In [233]:
%time to_be_added = pure_arr[target_dists[0], target_dists[1]]

CPU times: user 164 µs, sys: 12 µs, total: 176 µs
Wall time: 180 µs


In [217]:
%time index_df = pd.DataFrame(target_dists[0])

CPU times: user 178 µs, sys: 13 µs, total: 191 µs
Wall time: 197 µs


In [218]:
%time index_df['new_arr_idx'] = index_df.groupby(0).cumcount()

CPU times: user 2.59 ms, sys: 187 µs, total: 2.78 ms
Wall time: 2.9 ms


In [234]:
%time sample_arr[index_df.values[:, 0], index_df.values[:, 1]] = to_be_added

CPU times: user 179 µs, sys: 14 µs, total: 193 µs
Wall time: 197 µs


In [236]:
sample_arr.shape

(176, 214)

In [237]:
%%time
ann_relative_2 = sample_arr.argsort(kind='quicksort')[
    :, :10
]

CPU times: user 923 µs, sys: 0 ns, total: 923 µs
Wall time: 930 µs


In [238]:
ann_relative_2

array([[  0, 136, 137, ..., 142, 143, 144],
       [  0, 136, 137, ..., 142, 143, 144],
       [  1,   0,   2, ..., 140, 141, 142],
       ...,
       [  0, 136, 137, ..., 142, 143, 144],
       [ 15,   3,   1, ...,  14,   7,   4],
       [  0, 136, 137, ..., 142, 143, 144]])

In [242]:
dists_res = np.take_along_axis(sample_arr, ann_relative_2, axis=1)

In [243]:
dists_res

array([[1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [4.91069871e-01, 4.99491491e-01, 5.04809216e-01, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       ...,
       [1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [8.61833045e-02, 1.38970976e-01, 1.43215593e-01, ...,
        2.43290125e-01, 2.47679775e-01, 2.50551390e-01],
       [2.80074201e-01, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04]])

In [241]:
dists[cat_idxs]

array([[0.16662204, 0.26132104, 0.28283674, ..., 0.3248234 , 0.32667616,
        0.3311572 ],
       [0.4429067 , 0.4474052 , 0.45352405, ..., 0.4677359 , 0.46902606,
        0.46920323],
       [0.48189315, 0.4886164 , 0.48993993, ..., 0.5114319 , 0.51531136,
        0.5171687 ],
       ...,
       [0.30622727, 0.33433402, 0.33730066, ..., 0.361187  , 0.36834347,
        0.3712152 ],
       [0.08808586, 0.20122772, 0.21688594, ..., 0.2642632 , 0.2695809 ,
        0.3098314 ],
       [0.2283353 , 0.25617176, 0.27532452, ..., 0.29867372, 0.30868042,
        0.3158128 ]], dtype=float32)

In [244]:
dists[cat_idxs]

array([[0.16662204, 0.26132104, 0.28283674, ..., 0.3248234 , 0.32667616,
        0.3311572 ],
       [0.4429067 , 0.4474052 , 0.45352405, ..., 0.4677359 , 0.46902606,
        0.46920323],
       [0.48189315, 0.4886164 , 0.48993993, ..., 0.5114319 , 0.51531136,
        0.5171687 ],
       ...,
       [0.30622727, 0.33433402, 0.33730066, ..., 0.361187  , 0.36834347,
        0.3712152 ],
       [0.08808586, 0.20122772, 0.21688594, ..., 0.2642632 , 0.2695809 ,
        0.3098314 ],
       [0.2283353 , 0.25617176, 0.27532452, ..., 0.29867372, 0.30868042,
        0.3158128 ]], dtype=float32)

In [246]:
dists_res

array([[1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [4.91069871e-01, 4.99491491e-01, 5.04809216e-01, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       ...,
       [1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [8.61833045e-02, 1.38970976e-01, 1.43215593e-01, ...,
        2.43290125e-01, 2.47679775e-01, 2.50551390e-01],
       [2.80074201e-01, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04]])

In [235]:
sample_arr

array([[1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [4.99491491e-01, 4.91069871e-01, 5.04809216e-01, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       ...,
       [1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [3.05168964e-01, 1.43215593e-01, 2.17929883e-01, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [2.80074201e-01, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04]])

In [253]:
dists_final = np.hstack((dists[cat_idxs], dists_res))
dists_final.shape

(176, 20)

In [254]:
dists_final

array([[1.66622043e-01, 2.61321038e-01, 2.82836735e-01, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [4.42906708e-01, 4.47405189e-01, 4.53524053e-01, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [4.81893152e-01, 4.88616407e-01, 4.89939928e-01, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       ...,
       [3.06227267e-01, 3.34334016e-01, 3.37300658e-01, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [8.80858600e-02, 2.01227725e-01, 2.16885939e-01, ...,
        2.43290125e-01, 2.47679775e-01, 2.50551390e-01],
       [2.28335306e-01, 2.56171763e-01, 2.75324523e-01, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04]])

In [255]:
dists_final.argsort(kind='stable', axis=1)[:, :k]

array([[ 0,  1,  2, ...,  7,  8,  9],
       [ 0,  1,  2, ...,  7,  8,  9],
       [ 0,  1,  2, ...,  5, 12, 13],
       ...,
       [ 0,  1,  2, ...,  7,  8,  9],
       [10,  0, 11, ..., 15,  2, 16],
       [ 0,  1,  2, ...,  6,  7,  8]])

In [None]:
    anns_final = np.hstack((anns_final, anns))
    dists_final = np.hstack((dists_final, dists))
    # gets the sorted indices of the stacked dists
    idx_sorted = dists_final.argsort(kind='stable', axis=1)[:, :k]

In [222]:
index_df.values.max()

213

In [212]:
target_dists[0]

array([  2,   2,   2, ..., 174, 174, 175])

In [226]:
index_df.values[:, 0].max(), index_df.values[:, 1].max()

(175, 213)

In [228]:
sample_arr.shape

(176, 214)

In [203]:
%time target_dists[1][target_dists[0] == 2]

CPU times: user 133 µs, sys: 9 µs, total: 142 µs
Wall time: 146 µs


array([ 190,  755, 1872, 2899])

In [186]:
# return_index=True, return_inverse=True
np.unique(np.argwhere(pure_arr < res)[:, 0], return_inverse=True)

(array([  2,   3,   4,   7,   8,   9,  10,  11,  13,  14,  16,  18,  19,
         20,  21,  23,  24,  25,  26,  27,  28,  29,  31,  32,  33,  35,
         37,  38,  40,  41,  43,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  55,  56,  57,  58,  59,  60,  61,  63,  64,  65,  66,  67,
         68,  69,  70,  71,  72,  74,  75,  77,  78,  80,  81,  83,  84,
         85,  86,  88,  89,  91,  92,  93,  94,  95,  97,  98,  99, 100,
        101, 102, 103, 104, 105, 106, 109, 110, 114, 115, 117, 118, 119,
        120, 121, 122, 123, 124, 125, 127, 128, 130, 131, 132, 133, 135,
        136, 137, 138, 139, 140, 141, 142, 145, 147, 148, 149, 151, 153,
        154, 155, 156, 159, 160, 161, 162, 163, 165, 168, 169, 172, 174,
        175]),
 array([  0,   0,   0, ..., 129, 129, 130]))

In [183]:
%time max_arr = np.unique(np.argwhere(pure_arr < res)[:, 0], return_counts=True)[1].max()

CPU times: user 2.43 ms, sys: 0 ns, total: 2.43 ms
Wall time: 2.44 ms


In [None]:
sample_arr

In [181]:
%time res2_w = np.where(pure_arr < res)

CPU times: user 2.32 ms, sys: 0 ns, total: 2.32 ms
Wall time: 6.9 ms


In [180]:
%time res2 = np.argwhere(pure_arr < res)

CPU times: user 3.99 ms, sys: 0 ns, total: 3.99 ms
Wall time: 4 ms


In [182]:
res2_w

(array([  2,   2,   2, ..., 174, 174, 175]),
 array([ 190,  755, 1872, ..., 2538, 2995, 2569]))

In [145]:
res2

array([[   2,  190],
       [   2,  755],
       [   2, 1872],
       ...,
       [ 174, 2538],
       [ 174, 2995],
       [ 175, 2569]])

In [None]:
res2

In [179]:
res2[:, 1]

array([ 190,  755, 1872, ..., 2538, 2995, 2569])

In [147]:
sample_arr[2, 0] = pure_arr[res2]

In [146]:
pure_arr[2, 190]

0.4994914911364663

In [149]:
pure_arr[res2].shape

IndexError: index 190 is out of bounds for axis 0 with size 176

In [148]:
sample_arr

array([[1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [4.99491491e-01, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       ...,
       [1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04],
       [1.00000000e+04, 1.00000000e+04, 1.00000000e+04, ...,
        1.00000000e+04, 1.00000000e+04, 1.00000000e+04]])

In [137]:
res2[res2 == 2]

array([2, 2, 2, 2, 2, 2])

In [138]:
sample_arr[res2[:, 0]] = res2[:, 1]

ValueError: shape mismatch: value array of shape (2379,) could not be broadcast to indexing result of shape (2379,214)

In [None]:
sample_arr

In [None]:
dists = np.zeros((queries_search.shape[0], k), dtype=np.float32)

In [126]:
np.argwhere(pure_arr < res)

array([[   2,  190],
       [   2,  755],
       [   2, 1872],
       ...,
       [ 174, 2538],
       [ 174, 2995],
       [ 175, 2569]])

In [128]:
pure_arr[2, 190]

0.4994914911364663

In [123]:
np.where(pure_arr < res)[0][:100]

array([ 2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,
        4,  4,  4,  4,  4,  7,  7,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10,
       10, 10, 11, 11, 11, 11, 11, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14,
       14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16])

In [124]:
np.where(pure_arr < res)[1][:100]

array([ 190,  755, 1872, 2899,  118,  386,  659,  775, 1064, 1070, 1582,
       2979,   36,  127,  178,  363,  502,  603,  850,  942, 1095, 2433,
       1142, 1237,  724,  752, 2683,   71,  498, 1967, 2055, 2505, 2930,
        276, 1820, 2939,  567,  627, 1121, 2369, 2930,  497, 1327, 1354,
       1537, 1735, 2349, 2705,   86,  222,  258,  356,  450,  487,  511,
        578,  667,  675,  969,  977, 1326, 1490, 1546, 1660, 2075, 2081,
       2204, 2409, 2492, 2556, 2907, 2921, 2939, 2964, 3020,   20,   59,
         63,   68,   99,  101,  139,  162,  209,  229,  304,  305,  320,
        337,  341,  374,  387,  410,  452,  456,  478,  509,  520,  525,
        536])

In [125]:
pure_arr[np.where(pure_arr < res)[0][:100], np.where(pure_arr < res)[1][:100]]

array([0.49949149, 0.49106987, 0.50480922, 0.50491835, 0.21970675,
       0.21747319, 0.20134505, 0.18773172, 0.22785204, 0.20411416,
       0.18883318, 0.22479822, 0.2812132 , 0.4159223 , 0.26609447,
       0.2619391 , 0.41588517, 0.40184208, 0.39325575, 0.24603255,
       0.35324137, 0.41466428, 0.36104663, 0.43005669, 0.33777408,
       0.31274958, 0.32374122, 0.39482597, 0.39811506, 0.39745085,
       0.39433688, 0.39809865, 0.38307477, 0.33959513, 0.34356986,
       0.30563943, 0.41887541, 0.40099864, 0.42117212, 0.40196987,
       0.40889696, 0.33866255, 0.32742303, 0.34803717, 0.31747813,
       0.34207842, 0.34846552, 0.34598291, 0.44701626, 0.43784671,
       0.36496563, 0.43637098, 0.48160728, 0.38970633, 0.48853607,
       0.49631695, 0.46954897, 0.49961672, 0.39478519, 0.43095072,
       0.5074329 , 0.35632667, 0.38003825, 0.4954067 , 0.41376934,
       0.47545652, 0.44329446, 0.40261632, 0.46516595, 0.34723944,
       0.44903738, 0.50536929, 0.42128971, 0.48607873, 0.47917

In [122]:
pure_arr[np.where(pure_arr < res)].shape

(2379,)

In [105]:
(np.array(dists.max(axis=1)[cat_idxs],)*5).T.shape

(176,)

In [99]:
np.tile(dists.max(axis=1)[cat_idxs], (176, 2)).shape

(176, 352)

In [90]:
pure_arr[1][pure_arr[1] < np.hstack(dists.max(axis=1)[cat_idxs][1], )

array([], dtype=float64)

In [85]:
np.where(pure_arr < 0.79674435)[0].shape

(536398,)

In [None]:
pure_arr[np.where(pure_arr < 0.79674435)]

In [83]:
pure_arr[np.where(pure_arr < 0.79674435)].reshape(pure_arr.shape[0], -1)

ValueError: cannot reshape array of size 536398 into shape (176,newaxis)

In [39]:
pure_arr.shape

(176, 3095)

In [50]:
pure_arr[:2]

array([[0.58516339, 0.57479123, 0.59408487, ..., 0.83530635, 0.72327197,
        0.59838322],
       [0.72275381, 0.60222068, 0.58674307, ..., 0.77942052, 0.62807737,
        0.57211713]])

In [67]:
d = dists.max(axis=1)

In [68]:
d[d < 10_000].max()

0.79674435

In [66]:
np.sort(dists.max(axis=1))

array([-4.440892e-16, -4.440892e-16, -4.440892e-16, ...,  1.000000e+04,
        1.000000e+04,  1.000000e+04], dtype=float32)

In [51]:
dists.max(axis=1)[cat_idxs[:2]]

array([0.3311572 , 0.46920323], dtype=float32)

In [57]:
pure_arr[:2][0]

array([0.58516339, 0.57479123, 0.59408487, ..., 0.83530635, 0.72327197,
       0.59838322])

In [73]:
np.argwhere(pure_arr < d[d < 10_000].max())

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [ 175, 3092],
       [ 175, 3093],
       [ 175, 3094]])

In [72]:
pure_arr[np.argwhere(pure_arr < d[d < 10_000].max())].shape

IndexError: index 176 is out of bounds for axis 0 with size 176

In [71]:
pure_arr.shape

(176, 3095)

In [53]:
mask = pure_arr[:2] > dists.max(axis=1)[cat_idxs[:2]]

ValueError: operands could not be broadcast together with shapes (2,3095) (2,) 

In [49]:
pure_arr[:2] < dists.max(axis=1)[cat_idxs[:2]]

ValueError: operands could not be broadcast together with shapes (2,3095) (2,) 

In [45]:
pure_arr[pure_arr < dists.max(axis=1)[cat_idxs]].shape

ValueError: operands could not be broadcast together with shapes (176,3095) (176,) 

In [None]:
dists, anns, t_all, t_pairwise = self.search_single(
    data_navigation,
    data_search,
    queries_search,
    pred_proba_categories[:, bucket]
)