In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
import h5py
import pandas as pd
data_path = 'data/clip768v2/10M/dataset.h5'

In [None]:
%time f = h5py.File(data_path, 'r')

CPU times: user 0 ns, sys: 218 ms, total: 218 ms
Wall time: 261 ms


In [62]:
%time f2 = h5py.File('data/pca32v2/10M/query.h5', 'r')
[k for k in f2.keys()]

CPU times: user 2.41 ms, sys: 0 ns, total: 2.41 ms
Wall time: 48.7 ms


['pca32']

## Select random pivots, create labels based on a subset

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time

def pairwise_cosine(x, y):
    return 1-cosine_similarity(x, y)

In [12]:
rng = np.random.default_rng(seed=2023)
pivot_indexes = rng.choice(range(10_000_000), size=(10_000), replace=False)
pivot_indexes.shape, pivot_indexes[:10]

((10000,),
 array([5167873, 9776151, 3750648, 9583857, 5416365, 2652464, 1107702,
         421219, 1594416, 5593595]))

In [13]:
pivot_indexes = np.sort(pivot_indexes)
%time pivot_data = f['emb'][pivot_indexes, :]

CPU times: user 2.24 s, sys: 95.9 ms, total: 2.34 s
Wall time: 2.42 s


In [10]:
import numpy as np
rng = np.random.default_rng(12345)
print(rng)
rfloat = rng.random()
rfloat

Generator(PCG64)


0.22733602246716966

In [11]:
rng = np.random.default_rng(12345+1)
print(rng)
rfloat = rng.random()
rfloat

Generator(PCG64)


0.4734724218871649

In [15]:
%%time
for i, (pivot, pivot_index) in enumerate(zip(pivot_data, pivot_indexes)):
    s = time.time()
    # beware of setting a correct seed, otherwise choice will return the same thing
    rng = np.random.default_rng(seed=2023+(i+1))
    print(rng)
    random_10k = rng.choice(range(10_000_000), size=(10_000), replace=False)
    
    # automatically sorts random_10k -> necessary for indexing in `f`
    random_10k = np.setdiff1d(random_10k, pivot_indexes)
    print(f'Size of random_10k after filtering out conflict objects: {random_10k.shape}')
    loaded_data = f['emb'][random_10k, :]
    if i == 0:
        assert not np.all(pivot == loaded_data[0]), 'Loaded data is the same as pivot, did you set the random seed properly?'
    print(f'Loading data took: {time.time()-s}')
    s = time.time()
    dists = pairwise_cosine([pivot], loaded_data)
    print(f'Computing pairwise took: {time.time()-s}')
    break

Generator(PCG64)
Size of random_10k after filtering out conflict objects: (9993,)
Loading data took: 67.47568726539612
Computing pairwise took: 0.06532931327819824
CPU times: user 3.56 s, sys: 1.14 s, total: 4.7 s
Wall time: 1min 7s


In [18]:
# Loading (+ computing dists) for 1 pivot took 70s, how long would it take to do the same for 10k (in hours)?
10_000 * 70 / 60 / 60

194.44444444444443

In [20]:
# still slow
%%time
for i in range(10):
    rng = np.random.default_rng(seed=2023+(i+1))
    random_10k = rng.choice(range(10_000_000), size=(1_000), replace=False)
    loaded_data = f['emb'][np.sort(random_10k), :]

CPU times: user 12.1 s, sys: 4.17 s, total: 16.3 s
Wall time: 1min 27s


### Torch?

In [21]:
import torch

In [58]:
import torch
from torch.utils.data import Dataset, DataLoader, Sampler, BatchSampler
import os
import h5py

class OxpetDataset(Dataset):
    """Dataset to load data from the Oxford pet dataset .h5 files

    :param dir_path: path to directory containing data (e.g. train, test or val)
    :type dir_path: str
    :param tasks: list of tasks to load data for, must be in 'class', 'seg' or 'bb'
    :type tasks: list
    :param transform: transformation to apply to the images
    :type transform: list
    :param target_transforms: transforms to add to each target, of the form {task:[transforms]}
    :type target_transforms: dict
    :param shuffle: parameter to enable shuffling within batch after loading
    :type shuffle: bool
    :param max_size: maximum size of data to draw from (useful for debugging purposes)
    :type max_size: int
    """
    task_to_file = {
        'class': data_path,
    }
    def __init__(self, dir_path, shuffle=True, max_size=None):

        super(OxpetDataset, self).__init__()

        self.dir_path = dir_path
        self.inputs = self._load_h5_file_with_data()

        self.shuffle = shuffle
        self.max_size = max_size

    def __getitem__(self, index):

        inputs = self.inputs['data'][index]
        return (inputs, )

    def __len__(self):
        return self.max_size if self.max_size else self.inputs['data'].shape[0]

    def _load_h5_file_with_data(self):
        path = os.path.join(self.dir_path)
        file = h5py.File(path)
        key = list(file.keys())[0]
        data = file[key]
        return dict(file=file, data=data)

    def _permute_tf_to_torch(self, tensor):
        """Function to load PIL images in correct format required by PyTorch
        This extends the capabiliy of torchvision.transforms.ToTensor to 4D arrays
        """
        return tensor.permute([0, 3, 2, 1])

    def _from_numpy(self, tensor):
        return torch.from_numpy(tensor).float()


class RandomBatchSampler(Sampler):
    """Sampling class to create random sequential batches from a given dataset
    E.g. if data is [1,2,3,4] with bs=2. Then first batch, [[1,2], [3,4]] then shuffle batches -> [[3,4],[1,2]]
    This is useful for cases when you are interested in 'weak shuffling'

    :param dataset: dataset you want to batch
    :type dataset: torch.utils.data.Dataset
    :param batch_size: batch size
    :type batch_size: int
    :returns: generator object of shuffled batch indices
    """
    def __init__(self, dataset, batch_size):
        self.batch_size = batch_size
        self.dataset_length = len(dataset)
        self.n_batches = self.dataset_length / self.batch_size
        self.batch_ids = torch.randperm(int(self.n_batches))

    def __len__(self):
        return self.batch_size

    def __iter__(self):
        for id in self.batch_ids:
            idx = torch.arange(id * self.batch_size, (id + 1) * self.batch_size)
            for index in idx:
                yield int(index)
        if int(self.n_batches) < self.n_batches:
            idx = torch.arange(int(self.n_batches) * self.batch_size, self.dataset_length)
            for index in idx:
                yield int(index)


def normal_loader(dataset, batch_size=32, drop_last=False, shuffle=True):
    """Implements a normal loading scheme
    This scheme indexes the dataset one index at a time. It is slow because the .h5 causes a bottleneck that
    scales linearly with the number of calls made to it. However, this allows strong shuffling to be used.

    :param dataset: dataset
    :type dataset: torch.utils.data.Dataset
    :param batch_size: batch_size
    :type batch_size: int
    :param drop_last: bool to determine if last batch dropped if not full size
    :type drop_last: bool
    :returns: batched dataset
    :rtype: torch.utils.data.DataLoader
    """
    return DataLoader(dataset, batch_size=batch_size, drop_last=drop_last, shuffle=shuffle)

def fast_loader(dataset, batch_size=32, drop_last=False, transforms=None):
    """Implements fast loading by taking advantage of .h5 dataset
    The .h5 dataset has a speed bottleneck that scales (roughly) linearly with the number
    of calls made to it. This is because when queries are made to it, a search is made to find
    the data item at that index. However, once the start index has been found, taking the next items
    does not require any more significant computation. So indexing data[start_index: start_index+batch_size]
    is almost the same as just data[start_index]. The fast loading scheme takes advantage of this. However,
    because the goal is NOT to load the entirety of the data in memory at once, weak shuffling is used instead of
    strong shuffling.

    :param dataset: a dataset that loads data from .h5 files
    :type dataset: torch.utils.data.Dataset
    :param batch_size: size of data to batch
    :type batch_size: int
    :param drop_last: flag to indicate if last batch will be dropped (if size < batch_size)
    :type drop_last: bool
    :returns: dataloading that queries from data using shuffled batches
    :rtype: torch.utils.data.DataLoader
    """
    return DataLoader(
        dataset, batch_size=batch_size,  # must be disabled when using samplers
        sampler=BatchSampler(RandomBatchSampler(dataset, batch_size), batch_size=batch_size, drop_last=drop_last)
    )

In [51]:
dataset = OxpetDataset(data_path)
len(dataset)

10120191

In [52]:
loader = normal_loader(dataset)
it = iter(loader)

In [53]:
len(loader)

316256

In [54]:
%time next(it)

CPU times: user 416 ms, sys: 107 ms, total: 523 ms
Wall time: 991 ms


[tensor([[ 4.7485e-02,  1.0368e-02, -4.3144e-03,  ...,  1.5228e-02,
           2.3098e-03,  1.0536e-02],
         [ 5.7098e-02, -6.4735e-03, -4.8399e-05,  ...,  1.2787e-02,
          -3.7048e-02,  4.7333e-02],
         [-1.9913e-02, -7.9422e-03,  3.1052e-03,  ...,  1.0750e-02,
          -2.0813e-02,  9.8495e-03],
         ...,
         [ 1.8387e-02,  1.0597e-02,  6.0768e-03,  ..., -4.3716e-03,
           2.9251e-02,  1.2230e-02],
         [-2.4689e-02,  2.4719e-02,  1.5900e-02,  ...,  3.9711e-03,
           1.5305e-02, -6.0501e-03],
         [ 6.7383e-02, -1.0902e-02,  2.1458e-03,  ..., -3.0640e-02,
          -2.5444e-03,  1.5688e-04]], dtype=torch.float16)]

In [60]:
loader = fast_loader(dataset, batch_size=10_000)
it = iter(loader)
%time data = next(it)

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 10000 and 191 in dimension 1 at /pytorch/aten/src/TH/generic/THTensor.cpp:711

In [61]:
data.shape

NameError: name 'data' is not defined

In [None]:
%time next(it)

In [16]:
np.argmin(dists[0]), np.argmax(dists[0])

(3930, 4)

In [92]:
np.argmax(dists[0])

6259

In [54]:
loaded_data.shape

(9933, 768)

In [63]:
random_10k

array([    745,    2569,    5565, ..., 9997836, 9999556, 9999874])

In [64]:
pivot_index

6693269

In [72]:
pairwise_cosine([pivot], [loaded_data[0]])

array([[2.22044605e-16]])

In [74]:
loaded_data[:2, :]

array([[ 0.03105 ,  0.04623 ,  0.006065, ...,  0.00693 ,  0.006313,
         0.0137  ],
       [ 0.0464  ,  0.02809 ,  0.01153 , ..., -0.02838 ,  0.02496 ,
        -0.012794]], dtype=float16)

In [71]:
pairwise_cosine([pivot], loaded_data[:2, :])

array([[-4.44089210e-16,  5.81326135e-01]])

In [70]:
pairwise_cosine([pivot], loaded_data[:3, :])

array([[-4.44089210e-16,  5.81326135e-01,  4.57118379e-01]])

In [None]:
dists = pairwise_cosine([pivot], loaded_data)

In [62]:
dists[0]

array([-4.44089210e-16,  5.81326135e-01,  4.57118379e-01, ...,
        5.09698219e-01,  6.22019395e-01,  5.73977351e-01])

In [78]:
dists[0][np.argmax(dists[0])]

0.9990791981225113

In [79]:
random_10k[0]

745

In [80]:
pivot_index

6693269

In [82]:
r = f['emb'][[random_10k[0], pivot_index], :]

In [84]:
r

array([[ 0.03105 ,  0.04623 ,  0.006065, ...,  0.00693 ,  0.006313,
         0.0137  ],
       [ 0.02454 ,  0.0679  ,  0.01191 , ..., -0.01563 , -0.01168 ,
        -0.0253  ]], dtype=float16)

In [83]:
pairwise_cosine([r[0]], [r[1]])

array([[0.38861764]])

In [None]:
for chunk in data_iter:
    print(chunk.shape)
    break

In [None]:
f

In [None]:
import pandas as pd
%time data = pd.DataFrame(list(f['emb']))

In [1]:
data.head(2)

NameError: name 'data' is not defined

In [5]:
import numpy as np

In [None]:
q_path = 'data/clip768v2/10M/query.h5'
f = h5py.File(q_path, 'r')
queries = np.array(f['emb'])

In [None]:
gt_path = '../data/groundtruth-10M.h5'
f = h5py.File(gt_path, 'r')
gt_knns = pd.DataFrame(list(f['knns']))

## 1. baseline: brute-force approach
- 0.5-0.6s per query

In [17]:
from tqdm import tqdm

In [24]:
queries

array([[ 0.00142498,  0.02577562, -0.02312023, ..., -0.00711157,
        -0.00148698,  0.01573396],
       [ 0.0099068 ,  0.06058476, -0.00822135, ...,  0.0515245 ,
         0.03816291, -0.02117111],
       [ 0.02220821,  0.01497834, -0.00682948, ..., -0.02011856,
        -0.01174472, -0.01334628],
       ...,
       [ 0.03751928,  0.04933374,  0.03388642, ...,  0.01226475,
         0.01598157,  0.03229894],
       [ 0.00230957,  0.02555594, -0.02645612, ...,  0.00753328,
        -0.03265058, -0.02793607],
       [-0.00378157,  0.01177591,  0.01089825, ...,  0.05198799,
        -0.00759748, -0.01337096]], dtype=float32)

In [32]:
anns.shape

(10000, 100000)

In [39]:
nns = anns.argsort()[:k]
nns.shape

(10, 100000)

In [None]:
nns = nns + 1

In [None]:
nns

In [42]:
%time dists = np.sort(anns)[:k]

CPU times: user 1min 15s, sys: 8.32 s, total: 1min 23s
Wall time: 1min 27s


In [43]:
dists.shape

(10, 100000)

In [44]:
dists[0]

array([0.27284035, 0.30615299, 0.31319278, ..., 1.07019817, 1.09353056,
       1.11341086])

In [40]:
dists = anns[nns]

MemoryError: Unable to allocate 745. GiB for an array with shape (10, 100000, 100000) and data type float64

In [33]:
k = 10

In [36]:
anns[0]

array([0.27284035, 0.30615299, 0.31319278, ..., 1.07019817, 1.09353056,
       1.11341086])

In [35]:
dists = anns.sort()[:k]

TypeError: 'NoneType' object is not subscriptable

In [None]:
dists.shape

In [None]:
anns.shape

In [None]:
dists = anns.sort()[:k]

In [30]:
anns[0]

79171

In [15]:
%%time

recalls = []

def evaluate_recall(anns, knns, k=10):
    n_hits = knns.intersection(anns).shape[0]
    return n_hits / k

def search_bruteforce(query_idx, k=10):
    query = queries[query_idx]
    anns = pairwise_cosine(queries, data)[0].argsort()[:k]
    return anns

anns = search_bruteforce(0)

CPU times: user 349 ms, sys: 683 ms, total: 1.03 s
Wall time: 1.08 s


In [16]:
anns

array([79171, 15734, 22336,   230, 74172, 41078, 38158, 71848, 69014,
       92810])

In [17]:
gt_knns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,79172,15735,22337,231,74173,41079,38159,71849,69015,92811,...,45784,5492,35205,21690,16711,95028,19961,1708,68450,20333
1,14347,82848,79302,85923,6016,67067,29567,54566,34591,11620,...,79591,11637,80269,71577,45876,3581,48943,78719,78261,24809
2,51954,87690,3444,5491,65271,92521,58848,72008,63515,68040,...,7561,36433,88320,26411,63747,9646,19546,5171,97307,54264
3,92326,41249,719,70712,62434,90089,44995,12118,86491,25939,...,60939,73206,82366,97741,59386,56940,57245,19146,81212,26278
4,14561,69890,23198,5241,68802,92516,87332,45813,98006,98421,...,98075,99085,75659,20061,94633,84659,62323,37606,18919,79713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,84803,99470,25522,18080,82624,77158,74610,87484,4949,91291,...,85213,29377,67528,95900,46351,32017,70546,76284,19790,74703
9996,98249,64946,7681,21787,20204,35606,3623,37419,762,93695,...,68525,11361,72298,17904,86048,74668,17754,41172,79999,31679
9997,39741,13601,2327,68421,15206,87376,14895,40424,77120,63448,...,96120,71419,2195,55978,21082,4504,14819,25930,52271,18391
9998,14313,98071,34615,94650,84991,10239,61708,72572,53644,60539,...,62230,35776,15108,85067,93696,34297,61442,70148,12706,96325


### Simple training approach -- 1 Logistic regression
brute-force time to beat: 0.5s/query

In [9]:
import logging
logging.basicConfig(level=logging.INFO)
logging.info('Initialized logger')

INFO:root:Initialized logger


In [10]:
def create_category(data, random_state_offset):
    main_datapoint = data.sample(1, random_state=2023+random_state_offset)
    #logging.debug(f'using object: {main_datapoint.index[0]}')
    distances = pairwise_cosine(main_datapoint.values, data)
    idxs = distances[0].argsort()[1:1001]
    return (main_datapoint.index[0], np.vstack([idxs, distances[0][idxs]]))


In [14]:
import numpy as np
from tqdm import tqdm

def get_train_labels(data, n_categories=100):
    categories = []
    main_objs = []
    for i in tqdm(range(n_categories)):
        obj_id, cat = create_category(data, random_state_offset=i)
        main_objs.append(obj_id)
        categories.append(cat)

    df_all = pd.DataFrame(np.empty(0, dtype=np.uint32))
    for cat_id, c in enumerate(categories):
        df_ = pd.DataFrame(c.T)
        df_[2] = cat_id
        df_all = pd.concat([df_all, df_])

    df_all = df_all.rename(
        columns={
            0: 'object_id', 1: 'dist', 2: 'category_id'
        }
    ).sort_values(
        'dist', ascending=True
    ).drop_duplicates(
        'object_id', keep='first'
    )
    return df_all

%time labels_df = get_train_labels(data)

100%|██████████| 100/100 [00:53<00:00,  1.85it/s]


CPU times: user 38 s, sys: 14.7 s, total: 52.7 s
Wall time: 54.3 s


### Time to create training labels for 100 categories (100k data): ~ 1min

In [15]:
from sklearn.linear_model import LogisticRegression

def train_index(data, labels_df):
    X = data.loc[labels_df.object_id.astype(int)]
    y = labels_df.category_id.astype(int)
    model = LogisticRegression(random_state=2023, max_iter=500).fit(X, y)
    return model

%time model = train_index(data, labels_df)

CPU times: user 1min 16s, sys: 287 ms, total: 1min 16s
Wall time: 1min 17s


### Time to train: ~70s for increased n_iters (TODO: Hyp. search)

## Evaluate search with stop cond: single bucket

In [16]:
%time data_categories = pd.DataFrame(model.predict(data), index=data.index, columns=['category'])

CPU times: user 675 ms, sys: 12.4 ms, total: 687 ms
Wall time: 696 ms


In [78]:
%time predicted_categories = np.argmax(model.predict_proba(queries), axis=1)

CPU times: user 104 ms, sys: 0 ns, total: 104 ms
Wall time: 107 ms


In [19]:
predicted_categories

array([54, 26, 32, ..., 70,  0, 31])

In [82]:
%time predicted_categories = np.argsort(model.predict_proba(queries), axis=1)

CPU times: user 86.8 ms, sys: 307 µs, total: 87.1 ms
Wall time: 181 ms


In [59]:
predicted_categories.shape

(10000, 100)

In [60]:
predicted_categories[0]

array([82,  7, 52, 78, 28, 43, 13, 86, 91, 87, 93, 72, 15, 65, 39, 24, 34,
       35, 70, 83, 23, 17, 56,  5,  8,  3, 20,  9, 71, 85, 67, 41, 55,  1,
       18, 79, 32, 11, 96, 94, 33, 29, 37, 42, 26, 74, 27, 14, 44, 98, 81,
       66, 36, 45,  6, 47, 88, 63, 68, 31, 21, 46, 99, 76, 30, 25,  2, 69,
       50, 58, 80, 49, 51,  0, 97, 40, 48, 10, 53, 90, 19, 61, 59, 16, 77,
       89, 73,  4, 62, 84, 38, 12, 92, 22, 95, 75, 57, 64, 60, 54])

In [40]:
predicted_categories.shape

(10000,)

In [26]:
%%time
for cat in np.unique(predicted_categories):
    print(cat)
    bucket_obj_indexes = data_categories.query('category == @cat').index
    seq_search_dists = pairwise_cosine(
        queries[np.where(predicted_categories == cat)],
        data.loc[bucket_obj_indexes]
    )
    break

0
CPU times: user 69.4 ms, sys: 4.17 ms, total: 73.6 ms
Wall time: 73.7 ms


In [32]:
gt_knns.iloc[-2]

0     14313
1     98071
2     34615
3     94650
4     84991
      ...  
95    34297
96    61442
97    70148
98    12706
99    96325
Name: 9998, Length: 100, dtype: int64

In [73]:
for cat in np.unique(predicted_categories[:, -1]):
    cat_idxs = np.where(predicted_categories[:, -1] == cat)[0]
    bucket_obj_indexes = data_categories.query('category == @cat').index
    seq_search_dists = pairwise_cosine(queries[cat_idxs], data.loc[bucket_obj_indexes])
    ann_relative = seq_search_dists.argsort()[:, :k]
    anns[cat_idxs] = np.array(bucket_obj_indexes)[ann_relative] + 1
    dists[cat_idxs] = np.take_along_axis(seq_search_dists, ann_relative, axis=1)
    break

In [80]:
anns

array([[79172, 22337,   231, ..., 69015, 92811, 99973],
       [14347, 82848, 79302, ..., 54566, 34591, 11620],
       [51954, 87690,  3444, ..., 72008, 63515, 34745],
       ...,
       [39741, 13601,  2327, ..., 63448, 95475, 71610],
       [94650, 61708, 60539, ..., 54719, 15887, 59625],
       [ 2453, 83964, 66428, ..., 31684, 98652, 41742]], dtype=uint32)

In [93]:
def search_single_test(predicted_categories):
    anns = np.zeros((queries.shape[0], k), dtype=np.uint32)
    dists = np.zeros((queries.shape[0], k), dtype=np.float32)
    for cat in np.unique(predicted_categories):
        cat_idxs = np.where(predicted_categories == cat)[0]
        bucket_obj_indexes = data_categories.query('category == @cat').index
        seq_search_dists = pairwise_cosine(
            queries[cat_idxs],
            data.loc[bucket_obj_indexes]
        )
        ann_relative = seq_search_dists.argsort()[:, :k]
        anns[cat_idxs] = np.array(bucket_obj_indexes)[ann_relative] + 1
        dists[cat_idxs] = np.take_along_axis(seq_search_dists, ann_relative, axis=1)
    return anns, dists

In [202]:
%time anns, dists = search_single_test(predicted_categories[:, -1])

CPU times: user 2.39 s, sys: 10.5 ms, total: 2.4 s
Wall time: 2.46 s


In [203]:
%time anns2, dists2 = search_single_test(predicted_categories[:, -2])

CPU times: user 2.23 s, sys: 4.61 ms, total: 2.23 s
Wall time: 2.4 s


In [204]:
%%time
dists_all = np.hstack((dists, dists2))
anns_all = np.hstack((anns, anns2))

CPU times: user 474 µs, sys: 46 µs, total: 520 µs
Wall time: 525 µs


In [195]:
dists_all.shape

(10000, 20)

In [212]:
%time a = dists_all.argsort(kind='stable', axis=1)[:, :k]

CPU times: user 1.72 ms, sys: 170 µs, total: 1.89 ms
Wall time: 1.89 ms


In [213]:
%time idx = np.ogrid[tuple(map(slice, dists_all.shape))]

CPU times: user 274 µs, sys: 0 ns, total: 274 µs
Wall time: 280 µs


In [214]:
idx

[array([[   0],
        [   1],
        [   2],
        ...,
        [9997],
        [9998],
        [9999]]),
 array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
         16, 17, 18, 19]])]

In [215]:
idx[1] = a

In [216]:
%time dists_all[tuple(idx)]

CPU times: user 1.97 ms, sys: 196 µs, total: 2.16 ms
Wall time: 2.18 ms


array([[0.27284035, 0.306153  , 0.31319278, ..., 0.33513942, 0.3403605 ,
        0.3453628 ],
       [0.19747424, 0.21135962, 0.22881095, ..., 0.27025187, 0.2768707 ,
        0.27826455],
       [0.13460062, 0.14457192, 0.15563862, ..., 0.16689926, 0.16764964,
        0.17210594],
       ...,
       [0.21224909, 0.2241371 , 0.232968  , ..., 0.24021728, 0.2431261 ,
        0.2443094 ],
       [0.32610038, 0.33764818, 0.34348887, ..., 0.35277185, 0.35619137,
        0.36058566],
       [0.20044547, 0.29376504, 0.30769375, ..., 0.33250993, 0.3403026 ,
        0.34105274]], dtype=float32)

In [217]:
dists_all[tuple(idx)].shape

(10000, 10)

In [None]:
9998 -- {0.34511334, 0.35073513, 0.3679119, 0.36058566, 0.36601555,
         0.37538353, 0.37619492, 0.37755504, 0.3786396, 0.38301477} 
- {14313, 60539, 98071, 61708, 53644, 34615, 94650, 10239, 72572, 84991}


In [218]:
%time anns_all[tuple(idx)]

CPU times: user 1.16 ms, sys: 116 µs, total: 1.28 ms
Wall time: 1.28 ms


array([[79172, 15735, 22337, ..., 71849, 69015, 92811],
       [14347, 82848, 79302, ..., 54566, 34591, 11620],
       [51954, 87690,  3444, ..., 72008, 63515, 68040],
       ...,
       [39741, 13601,  2327, ..., 63448, 95475, 71610],
       [14313, 98071, 34615, ..., 72572, 53644, 60539],
       [24525, 46778,  2453, ..., 77816, 31450, 80968]], dtype=uint32)

In [219]:
anns_all[tuple(idx)].shape

(10000, 10)

In [220]:
anns_all[tuple(idx)][-2]

array([14313, 98071, 34615, 94650, 84991, 10239, 61708, 72572, 53644,
       60539], dtype=uint32)

In [193]:
dists_all

array([[0.27284035, 0.31319278, 0.319519  , ..., 0.4095242 , 0.412036  ,
        0.41297266],
       [0.19747424, 0.21135962, 0.22881095, ..., 0.3407105 , 0.3536085 ,
        0.37176004],
       [0.13460062, 0.14457192, 0.15563862, ..., 0.21025163, 0.21637371,
        0.22164102],
       ...,
       [0.21224909, 0.2241371 , 0.232968  , ..., 0.39077035, 0.39085406,
        0.39089644],
       [0.34511334, 0.35073513, 0.36058566, ..., 0.36065924, 0.36065924,
        0.36065924],
       [0.30769375, 0.32123458, 0.32372558, ..., 0.3425054 , 0.3425054 ,
        0.3425054 ]], dtype=float32)

In [145]:
dists_all[:, a].shape

(10000, 10000, 10)

In [156]:
dists_all[0][a[0]]

array([0.27284035, 0.306153  , 0.31319278, 0.319519  , 0.32419464,
       0.3316943 , 0.332923  , 0.33513942, 0.3403605 , 0.3453628 ],
      dtype=float32)

In [161]:
dists_all[:2]

array([[0.27284035, 0.31319278, 0.319519  , 0.32419464, 0.3316943 ,
        0.332923  , 0.33513942, 0.3403605 , 0.3453628 , 0.34960502,
        0.306153  , 0.38185748, 0.39277422, 0.3947172 , 0.39499146,
        0.39705977, 0.39887363, 0.4095242 , 0.412036  , 0.41297266],
       [0.19747424, 0.21135962, 0.22881095, 0.23895504, 0.25260547,
        0.25981018, 0.2626422 , 0.27025187, 0.2768707 , 0.27826455,
        0.299805  , 0.30436933, 0.3079004 , 0.3285277 , 0.3316455 ,
        0.33722755, 0.33748493, 0.3407105 , 0.3536085 , 0.37176004]],
      dtype=float32)

In [164]:
dists_all[:2][:, np.array([0])]

array([[0.27284035],
       [0.19747424]], dtype=float32)

In [221]:
n_buckets = 2

In [242]:
def search_single(queries, data, data_categories, predicted_categories, k=10):
    nns = np.zeros((queries.shape[0], k), dtype=np.uint32)
    dists = np.zeros((queries.shape[0], k), dtype=np.float32)
    for cat in np.unique(predicted_categories):
        cat_idxs = np.where(predicted_categories == cat)[0]
        bucket_obj_indexes = data_categories.query('category == @cat').index
        seq_search_dists = pairwise_cosine(queries[cat_idxs], data.loc[bucket_obj_indexes])
        ann_relative = seq_search_dists.argsort()[:, :k]
        nns[cat_idxs] = np.array(bucket_obj_indexes)[ann_relative] + 1
        dists[cat_idxs] = np.take_along_axis(seq_search_dists, ann_relative, axis=1)
    return dists, nns

In [227]:
data_categories = pd.DataFrame(
    model.predict(data),
    index=data.index,
    columns=['category']
)
anns_final = None
dists_final = None
predicted_categories = np.argsort(model.predict_proba(queries), axis=1)
predicted_categories

array([[82,  7, 52, ..., 64, 60, 54],
       [90, 17,  8, ..., 53, 63, 26],
       [36, 20, 80, ..., 87, 24, 32],
       ...,
       [93, 31,  5, ..., 11, 27, 70],
       [43, 11, 17, ..., 31, 95,  0],
       [33, 15, 66, ..., 36, 95, 31]])

In [248]:
anns_final = None
dists_final = None
for bucket in range(n_buckets):
    dists, anns = search_single(
        queries,
        data,
        data_categories,
        predicted_categories[:, -(bucket+1)]
    )
    if anns_final is None:
        anns_final = anns
        dists_final = dists
        print(f'anns_final it 1: {anns_final[-2]}')
    else:
        anns_final = np.hstack((anns_final, anns))
        dists_final = np.hstack((dists_final, dists))

        idx = np.ogrid[tuple(map(slice, dists_final.shape))]
        idx_sorted = dists_final.argsort(kind='stable', axis=1)[:, :k]
        idx[1] = idx_sorted
        dists_final = dists_final[tuple(idx)]
        anns_final = anns_final[tuple(idx)]
        print(f'anns_final it 2: {anns_final[-2]}')
        assert anns_final.shape == dists_final.shape == (queries.shape[0], k)

anns_final it 1: [94650 61708 60539   368 34688  1840 87459 54719 15887 59625]
anns_final it 2: [14313 98071 34615 94650 84991 10239 61708 72572 53644 60539]


In [247]:
anns[-2]

array([94650, 61708, 60539,   368, 34688,  1840, 87459, 54719, 15887,
       59625], dtype=uint32)

In [249]:
anns_final[-2]

array([14313, 98071, 34615, 94650, 84991, 10239, 61708, 72572, 53644,
       60539], dtype=uint32)

In [226]:
anns_all[tuple(idx)][-2]

array([14313, 98071, 34615, 94650, 84991, 10239, 61708, 72572, 53644,
       60539], dtype=uint32)

In [166]:
a[0], a[1]

(array([ 0, 10,  1,  2,  3,  4,  5,  6,  7,  8]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

In [167]:
dists_all[:2][:, a[0]]

array([[0.27284035, 0.306153  , 0.31319278, 0.319519  , 0.32419464,
        0.3316943 , 0.332923  , 0.33513942, 0.3403605 , 0.3453628 ],
       [0.19747424, 0.299805  , 0.21135962, 0.22881095, 0.23895504,
        0.25260547, 0.25981018, 0.2626422 , 0.27025187, 0.2768707 ]],
      dtype=float32)

In [169]:
df = pd.DataFrame(dists_all)

In [170]:
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.27284,0.313193,0.319519,0.324195,0.331694,0.332923,0.335139,0.34036,0.345363,0.349605,0.306153,0.381857,0.392774,0.394717,0.394991,0.39706,0.398874,0.409524,0.412036,0.412973
1,0.197474,0.21136,0.228811,0.238955,0.252605,0.25981,0.262642,0.270252,0.276871,0.278265,0.299805,0.304369,0.3079,0.328528,0.331645,0.337228,0.337485,0.34071,0.353608,0.37176
2,0.134601,0.144572,0.155639,0.157445,0.158151,0.161204,0.163761,0.166899,0.16765,0.178188,0.172106,0.175907,0.176241,0.182629,0.185003,0.189098,0.200092,0.210252,0.216374,0.221641


In [173]:
df.sort_values(by='index')

KeyError: 'index'

In [152]:
dists_all[:2].shape

(2, 20)

In [151]:
a[:2].shape

(2, 10)

In [None]:
dists_all[:, a]

In [141]:
%time dists_final = dists_all[a]

CPU times: user 2.64 ms, sys: 228 µs, total: 2.87 ms
Wall time: 2.88 ms


In [142]:
dists_final.shape

(10000, 10, 20)

In [109]:
a.shape

(10000, 10)

In [117]:
a

array([[ 0, 10,  1, ..., 18,  9, 19],
       [ 0, 10,  1, ..., 18,  9, 19],
       [ 0, 10,  1, ..., 18,  9, 19],
       ...,
       [ 0, 10,  1, ..., 18,  9, 19],
       [ 0, 10,  1, ..., 18,  9, 19],
       [ 0, 10,  1, ..., 18,  9, 19]])

In [112]:
dists_all.shape

(10000, 20)

In [83]:
%%time
anns_all = []
dists_all = []
for predicted_cat in [predicted_categories[:, -1], predicted_categories[:, -2]]:
    anns = np.zeros((queries.shape[0], k), dtype=np.uint32)
    dists = np.zeros((queries.shape[0], k), dtype=np.float32)
    for cat in np.unique(predicted_cat):
        cat_idxs = np.where(predicted_cat == cat)[0]
        bucket_obj_indexes = data_categories.query('category == @cat').index
        seq_search_dists = pairwise_cosine(
            queries[cat_idxs],
            data.loc[bucket_obj_indexes]
        )
        ann_relative = seq_search_dists.argsort()[:, :k]
        anns[cat_idxs] = np.array(bucket_obj_indexes)[ann_relative] + 1
        dists[cat_idxs] = np.take_along_axis(seq_search_dists, ann_relative, axis=1)
    anns_all.append(anns)
    dists_all.append(dists)

CPU times: user 4.73 s, sys: 0 ns, total: 4.73 s
Wall time: 9.74 s


In [84]:
anns_all[0]

array([[79172, 22337,   231, ..., 69015, 92811, 99973],
       [14347, 82848, 79302, ..., 54566, 34591, 11620],
       [51954, 87690,  3444, ..., 72008, 63515, 34745],
       ...,
       [39741, 13601,  2327, ..., 63448, 95475, 71610],
       [94650, 61708, 60539, ..., 54719, 15887, 59625],
       [ 2453, 83964, 66428, ..., 31684, 98652, 41742]], dtype=uint32)

In [85]:
anns_all[1]

array([[15735, 20970, 15425, ..., 29033, 83588,  6911],
       [84845, 78536, 45013, ..., 77697, 80630, 69875],
       [68040, 95368, 80177, ...,  9314, 29894, 58073],
       ...,
       [50020, 50979, 72511, ..., 89497, 24958, 91729],
       [14313, 98071, 34615, ..., 61491, 79170, 81986],
       [24525, 46778, 80968, ..., 43029, 87329, 42695]], dtype=uint32)

In [86]:
dists_all[0]

array([[0.27284035, 0.31319278, 0.319519  , ..., 0.3403605 , 0.3453628 ,
        0.34960502],
       [0.19747424, 0.21135962, 0.22881095, ..., 0.27025187, 0.2768707 ,
        0.27826455],
       [0.13460062, 0.14457192, 0.15563862, ..., 0.16689926, 0.16764964,
        0.17818761],
       ...,
       [0.21224909, 0.2241371 , 0.232968  , ..., 0.24021728, 0.2431261 ,
        0.2443094 ],
       [0.34511334, 0.35073513, 0.36058566, ..., 0.37755504, 0.3786396 ,
        0.38301477],
       [0.30769375, 0.32123458, 0.32372558, ..., 0.34151778, 0.3454288 ,
        0.3474231 ]], dtype=float32)

In [87]:
dists_all[1]

array([[0.306153  , 0.38185748, 0.39277422, ..., 0.4095242 , 0.412036  ,
        0.41297266],
       [0.299805  , 0.30436933, 0.3079004 , ..., 0.3407105 , 0.3536085 ,
        0.37176004],
       [0.17210594, 0.17590745, 0.17624122, ..., 0.21025163, 0.21637371,
        0.22164102],
       ...,
       [0.3210537 , 0.35754788, 0.37685978, ..., 0.39077035, 0.39085406,
        0.39089644],
       [0.32610038, 0.33764818, 0.34348887, ..., 0.36065924, 0.36065924,
        0.36065924],
       [0.20044547, 0.29376504, 0.34105274, ..., 0.3425054 , 0.3425054 ,
        0.3425054 ]], dtype=float32)

In [36]:
data_categories.shape

(100000, 1)

In [37]:
data_categories.head(2)

Unnamed: 0,category
0,1
1,85


In [45]:
data_categories.iloc[84991-1]

category    95
Name: 84990, dtype: int64

In [28]:
seq_search_dists

array([[0.53772484, 0.59553683, 0.5918337 , ..., 0.62560013, 0.63789878,
        0.5541769 ],
       [0.41973521, 0.51362575, 0.72756897, ..., 0.64051346, 0.68542026,
        0.49951072],
       [0.54615992, 0.60047767, 0.57002357, ..., 0.6087055 , 0.6968379 ,
        0.60658357],
       ...,
       [0.59972851, 0.61609018, 0.73336688, ..., 0.63054562, 0.66535502,
        0.53874219],
       [0.63197782, 0.69779297, 0.69259975, ..., 0.66826415, 0.74303128,
        0.64715563],
       [0.46578994, 0.57214831, 0.61152174, ..., 0.56146973, 0.61727093,
        0.41274009]])

In [41]:
anns = np.zeros((10_000, 10))

In [45]:
k = 10

In [50]:
cat_idxs[0]

array([  26,  147,  199,  223,  301,  332,  475,  485,  539,  551,  617,
        619,  683,  779,  810,  829,  863,  891,  957,  977,  998, 1024,
       1056, 1144, 1162, 1383, 1507, 1531, 1563, 1570, 1646, 1698, 1741,
       1759, 1766, 1779, 1794, 1823, 1949, 1981, 2048, 2166, 2222, 2292,
       2309, 2317, 2364, 2371, 2384, 2426, 2443, 2461, 2588, 2589, 2656,
       2659, 2716, 2772, 2908, 2926, 2970, 2971, 2979, 3015, 3016, 3054,
       3120, 3319, 3327, 3336, 3345, 3395, 3398, 3483, 3485, 3490, 3509,
       3541, 3562, 3566, 3595, 3649, 3729, 3774, 3803, 3940, 3977, 4049,
       4063, 4073, 4132, 4233, 4248, 4300, 4442, 4459, 4542, 4553, 4598,
       4606, 4718, 4746, 4760, 4763, 4781, 4841, 4873, 4891, 4960, 4962,
       5006, 5017, 5029, 5032, 5071, 5084, 5212, 5294, 5323, 5361, 5373,
       5428, 5509, 5538, 5542, 5599, 5760, 5812, 5938, 6108, 6188, 6195,
       6349, 6351, 6371, 6569, 6587, 6589, 6615, 6671, 6785, 6919, 6920,
       6925, 6929, 6938, 6947, 6952, 7001, 7002, 70

In [62]:
ann.shape

(236, 10)

In [67]:
seq_search_dists.shape

(236, 2202)

In [72]:
seq_search_dists[ann]

IndexError: index 1776 is out of bounds for axis 0 with size 236

In [70]:
ann.shape

(236, 10)

In [76]:
np.take_along_axis(seq_search_dists, ann, axis=1).shape

(236, 10)

In [None]:
%%time
data_categories = pd.DataFrame(
    model.predict(data),
    index=data.index,
    columns=['category']
)
predicted_categories = np.argmax(model.predict_proba(queries), axis=1)
anns = np.zeros((queries.shape[0], k), dtype=np.uint32)
dists = np.zeros((queries.shape[0], k), dtype=np.float32)
for cat in np.unique(predicted_categories):
    cat_idxs = np.where(predicted_categories == cat)[0]
    bucket_obj_indexes = data_categories.query('category == @cat').index
    seq_search_dists = pairwise_cosine(queries[cat_idxs], data.loc[bucket_obj_indexes])
    ann = seq_search_dists.argsort()[:, :k]
    anns[cat_idxs] = ann + 1
    dists[cat_idxs] = np.take_along_axis(seq_search_dists, ann, axis=1)

In [None]:
anns

In [42]:
anns[np.array([0, 2, 3])]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
for pred_cat

In [37]:
predicted_categories[np.where(predicted_categories == 0)]

(array([  26,  147,  199,  223,  301,  332,  475,  485,  539,  551,  617,
         619,  683,  779,  810,  829,  863,  891,  957,  977,  998, 1024,
        1056, 1144, 1162, 1383, 1507, 1531, 1563, 1570, 1646, 1698, 1741,
        1759, 1766, 1779, 1794, 1823, 1949, 1981, 2048, 2166, 2222, 2292,
        2309, 2317, 2364, 2371, 2384, 2426, 2443, 2461, 2588, 2589, 2656,
        2659, 2716, 2772, 2908, 2926, 2970, 2971, 2979, 3015, 3016, 3054,
        3120, 3319, 3327, 3336, 3345, 3395, 3398, 3483, 3485, 3490, 3509,
        3541, 3562, 3566, 3595, 3649, 3729, 3774, 3803, 3940, 3977, 4049,
        4063, 4073, 4132, 4233, 4248, 4300, 4442, 4459, 4542, 4553, 4598,
        4606, 4718, 4746, 4760, 4763, 4781, 4841, 4873, 4891, 4960, 4962,
        5006, 5017, 5029, 5032, 5071, 5084, 5212, 5294, 5323, 5361, 5373,
        5428, 5509, 5538, 5542, 5599, 5760, 5812, 5938, 6108, 6188, 6195,
        6349, 6351, 6371, 6569, 6587, 6589, 6615, 6671, 6785, 6919, 6920,
        6925, 6929, 6938, 6947, 6952, 

In [36]:
categories, indexnp.unique(predicted_categories, return_index=True)#.shape

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
 array([  26,  185,   57,   87,  359,   48,  353,  315,  200,  282,   25,
          36,  721,   82,   77,  183,  369,  227,  172,  197,   16,  271,
         263,  198,  739,  157,    1,   15,  104, 1171,  226,   20,    2,
         106,   76,   35,   56,  243,   23,   32,   17,   31,   84,  382,
           6,   44,   83,   13,   96,  135,   22,   27,    5,  338,    0,
         204,   41,   51,    3,   89,   95,   37,  160,  133,   98,   42,
         360,  176,   65,    8,    7,   29,  308,  320,  114,  131,   12,
          40,  298,   59,   19, 

In [31]:
#predicted_categories = np.argmax(model.predict_proba(queries), axis=1)
bucket_obj_indexes = data_categories.query('category ==(@predicted_categories)').index

In [32]:
bucket_obj_indexes

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            99990, 99991, 99992, 99993, 99994, 99995, 99996, 99997, 99998,
            99999],
           dtype='int64', length=100000)

In [None]:
%%time

recalls = []

# stop cond: single bucket
# time to evaluate 10k queries:
# mean recall: 0.5
def search_single_bucket_all_queries(queries, k=10):
    predicted_category = np.argmax(model.predict_proba(queries))
    bucket_obj_indexes = data_categories.query('category == @predicted_category').index
    bucket_df = data.loc[bucket_obj_indexes]
    final_dists = pairwise_cosine(query, bucket_df)[0]
    final_gts = final_dists.argsort()[:k]
    anns = bucket_df.iloc[final_gts].index + 1
    return evaluate_recall(anns, pd.Index(gt_knns.iloc[query_idx, :10].values))

for q_idx in tqdm(range(queries.shape[0])):
    recalls.append(search_single_bucket(q_idx))

In [21]:
%%time

recalls = []

# stop cond: single bucket
# time to evaluate 10k queries:
# mean recall: 0.5
def search_single_bucket(query_idx, k=10):
    query = queries.iloc[[query_idx]]
    predicted_category = np.argmax(model.predict_proba(query))
    bucket_obj_indexes = data_categories.query('category == @predicted_category').index
    bucket_df = data.loc[bucket_obj_indexes]
    final_dists = pairwise_cosine(query, bucket_df)[0]
    final_gts = final_dists.argsort()[:k]
    anns = bucket_df.iloc[final_gts].index + 1
    return evaluate_recall(anns, pd.Index(gt_knns.iloc[query_idx, :10].values))

for q_idx in tqdm(range(queries.shape[0])):
    recalls.append(search_single_bucket(q_idx))

100%|██████████| 10000/10000 [03:47<00:00, 44.01it/s]

CPU times: user 3min 40s, sys: 361 ms, total: 3min 40s
Wall time: 3min 47s





In [None]:
## Time: 0.0137s/query

In [22]:
np.mean(recalls)

0.47175

### Search stop cond: n buckets
#### n = 5

In [29]:
%%time

recalls_n = []

# stop cond: single bucket
# time to evaluate 10k queries:
# mean recall: 0.5
def search_n_buckets(query_idx, n, k=10):
    query = queries.iloc[[query_idx]]
    predicted_categories = model.predict_proba(query)[0].argsort()[::-1][:n]
    bucket_df = pd.DataFrame([])
    for predicted_category in predicted_categories:
        bucket_obj_indexes = data_categories.query('category == @predicted_category').index
        bucket_df = pd.concat([bucket_df, data.loc[bucket_obj_indexes]])
    final_dists = pairwise_cosine(query, bucket_df)[0]
    final_gts = final_dists.argsort()[:k]
    anns = bucket_df.iloc[final_gts].index + 1
    return evaluate_recall(anns, pd.Index(gt_knns.iloc[query_idx, :10].values))

for q_idx in tqdm(range(queries.shape[0])):
    recalls_n.append(search_n_buckets(q_idx, 5))

100%|██████████| 10000/10000 [17:58<00:00,  9.27it/s]

CPU times: user 17min 28s, sys: 2.29 s, total: 17min 30s
Wall time: 17min 58s





In [30]:
np.mean(recalls_n)

0.81451

In [None]:
# time: 0.108/query

In [None]:
%%time

recalls_n2 = []
for q_idx in tqdm(range(queries.shape[0])):
    recalls_n2.append(search_n_buckets(q_idx, 10))
np.mean(recalls_n2)

 55%|█████▌    | 5523/10000 [22:09<16:49,  4.43it/s]

In [None]:
# time: 0.108/query

In [64]:
k = 10

In [66]:
anns = np.zeros((queries.shape[0], k), dtype=np.uint32)
dists = np.zeros((queries.shape[0], k), dtype=np.float32)
data_categories = pd.DataFrame(
    model.predict(data),
    index=data.index,
    columns=['category']
)

In [67]:
predicted_categories = np.argsort(model.predict_proba(queries), axis=1)
predicted_categories.shape

(10000, 100)

In [71]:
predicted_categories[:, -1]

array([54, 26, 32, ..., 70,  0, 31])

In [72]:
np.unique(predicted_categories[:, -1])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])