In [19]:
# Load (or reload) IPython's autoreload extension; mode 2 re-imports all
# modules before each cell executes, so edits to imported project code are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [20]:
import h5py
import pandas as pd
import pickle
from tqdm import tqdm
from li.utils import pairwise_cosine
import time
import logging
import numpy as np
import os
from scipy import sparse


In [21]:
# Notebook-wide logging at INFO level. Each record is rendered as
# [timestamp][level, padded/truncated to 5 chars][logger name, at most 20 chars] message
logging.basicConfig(
    format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s',
    level=logging.INFO,
)

# Logger used by the cells in this notebook.
LOG = logging.getLogger(__name__)

def increase_max_recursion_limit():
    """Raise the process stack size limit and Python's recursion limit.

    Sets the soft ``RLIMIT_STACK`` limit to ``2**29`` bytes (512 MiB), clamped
    to the current hard limit, and sets the interpreter recursion limit to
    ``10**6``.

    The hard limit is left untouched: an unprivileged process is not allowed
    to raise it, so requesting an unlimited hard limit makes
    ``resource.setrlimit`` raise ``ValueError`` whenever the current hard
    limit is finite.

    Source: https://stackoverflow.com/a/16248113
    """
    import sys
    import resource

    _, hard_limit = resource.getrlimit(resource.RLIMIT_STACK)
    soft_limit = 2**29
    if hard_limit != resource.RLIM_INFINITY:
        # The soft limit may never exceed the hard limit.
        soft_limit = min(soft_limit, hard_limit)
    resource.setrlimit(resource.RLIMIT_STACK, (soft_limit, hard_limit))
    sys.setrecursionlimit(10**6)


In [22]:
size = '100K'


def load_h5_array(path, key):
    """Read one 2-D dataset from an HDF5 file into memory, then close the file.

    Parameters
    ----------
    path : str
        Location of the HDF5 file.
    key : str
        Name of the dataset inside the file.

    Returns
    -------
    The dataset contents as returned by slicing with ``[:, :]``. The slice
    forces the read while the file is still open, so the result stays usable
    after the handle is closed.
    """
    with h5py.File(path, 'r') as h5_file:
        return h5_file[key][:, :]


def to_one_based_frame(array):
    """Wrap ``array`` in a DataFrame whose index starts at 1 instead of 0.

    NOTE(review): presumably needed because the ground-truth neighbour ids
    are 1-based — confirm against the dataset description.
    """
    frame = pd.DataFrame(array)
    frame.index += 1
    return frame


# Directories holding the two representations of the same collection.
pca32_dir = f'data/pca32v2/{size}'
clip768_dir = f'data/clip768v2/{size}'

LOG.info('Loading pca32 data')
loaded_data = load_h5_array(f'{pca32_dir}/dataset.h5', 'pca32')
data = to_one_based_frame(loaded_data)

LOG.info('Loading queries')
# Same queries in both representations: pca32 and clip768 ('emb').
loaded_queries = load_h5_array(f'{pca32_dir}/query.h5', 'pca32')
loaded_queries_seq = load_h5_array(f'{clip768_dir}/query.h5', 'emb')

LOG.info('Loading clip data')
loaded_clip_data = to_one_based_frame(
    load_h5_array(f'{clip768_dir}/dataset.h5', 'emb')
)

LOG.info('Loading GT')
loaded_gt = load_h5_array(f'data/groundtruth-{size}.h5', 'knns')


[2023-07-06 14:27:28,065][INFO ][__main__] Loading pca32 data
[2023-07-06 14:27:28,273][INFO ][__main__] Loading queries
[2023-07-06 14:27:28,723][INFO ][__main__] Loading clip data
[2023-07-06 14:27:30,391][INFO ][__main__] Loading GT


In [23]:
from li.BaseLMI import cluster_kmeans_faiss
from sklearn import preprocessing

In [28]:
# Cluster the pca32 vectors into 100 groups and time the call.
# `result` is used below as the per-object training label.
%time kmeans, result = cluster_kmeans_faiss(data, n_clusters=100)

CPU times: user 333 ms, sys: 2.33 ms, total: 335 ms
Wall time: 358 ms


In [30]:
# Distinct values in `result` and how many objects carry each of them.
np.unique(result, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
 array([ 800, 1055, 1009,  962,  808, 1083, 1153,  803,  811,  773,  981,
         825, 1062,  786,  843,  966, 1586,  965,  975, 1188, 1027,  832,
         687,  534, 1372,  699,  759,  919,  947, 1105,  846, 1067, 1636,
        1319,  899, 1225,  921,  509, 1267,  866,  616, 1032,  889,  839,
        1344, 1102,  705,  858, 1019, 1337, 1015, 1642,  412, 1062,  728,
        1435,  807,  912, 1262, 1398, 1189, 1289,  732,  501, 1242,  613,
        1110, 1240, 1264,  921,  913, 1008,  844,  819,  685,  987,  878,
         833, 1012,  744, 1300, 

In [31]:
import torch
import torch.utils.data

from li.model import NeuralNetwork, data_X_to_torch, data_to_torch, LIDataset_Single
        
# Training set: the pca32 frame paired with the clustering output as labels.
dataset = LIDataset_Single(data, result)

# Batches of 256 are drawn in random order over the frame's index labels
# (the index was shifted to start at 1 when the data was loaded).
# NOTE(review): confirm LIDataset_Single looks items up by label, not position.
sample_ids = data.index.values.tolist()
train_loader = torch.utils.data.DataLoader(
    dataset,
    sampler=torch.utils.data.SubsetRandomSampler(sample_ids),
    batch_size=256,
)

In [32]:
# Model with one input per column of `data` and 100 outputs.
# NOTE(review): output_dim=100 presumably has to equal the n_clusters value
# passed to the k-means cell — confirm and keep the two in sync.
# NOTE(review): the name `nn` would shadow the conventional `torch.nn` alias
# if `from torch import nn` were ever added to this notebook.
nn = NeuralNetwork(
    input_dim=data.shape[1], output_dim=100, lr=0.1, model_type='MLP'
)

In [33]:
# Train on the batches from `train_loader` for 100 epochs, logging through LOG;
# whatever train_batch returns is kept as `losses`.
losses = nn.train_batch(train_loader, epochs=100, logger=LOG)

[2023-07-06 14:30:13,762][INFO ][__main__] Epochs: 100, step: 10
[2023-07-06 14:30:28,029][INFO ][__main__] Epoch 10 | Loss 2.31404
[2023-07-06 14:30:40,289][INFO ][__main__] Epoch 20 | Loss 0.98153
[2023-07-06 14:30:52,629][INFO ][__main__] Epoch 30 | Loss 1.08155
[2023-07-06 14:31:04,930][INFO ][__main__] Epoch 40 | Loss 0.90105
[2023-07-06 14:31:17,355][INFO ][__main__] Epoch 50 | Loss 0.77351
[2023-07-06 14:31:29,569][INFO ][__main__] Epoch 60 | Loss 0.69315
[2023-07-06 14:31:41,864][INFO ][__main__] Epoch 70 | Loss 0.74654
[2023-07-06 14:31:54,170][INFO ][__main__] Epoch 80 | Loss 0.69056
[2023-07-06 14:32:06,409][INFO ][__main__] Epoch 90 | Loss 0.82842


In [34]:
# Predict a bucket for every object and store it alongside the features.
# The label column is excluded from the model input, so re-running this cell
# after `category_nn` already exists still feeds only the feature columns
# (errors='ignore' makes the drop a no-op on the first run).
features = data.drop(columns=['category_nn'], errors='ignore')
data['category_nn'] = nn.predict(data_X_to_torch(features))

In [35]:
# Number of objects assigned to each predicted category, largest first.
data['category_nn'].value_counts()

86    1984
87    1902
33    1851
83    1841
55    1792
      ... 
92     354
52     346
22     299
54     130
65     122
Name: category_nn, Length: 100, dtype: int64

In [36]:
# Run the pca32 queries through the model and time the call.
# The search cell below uses column 0 of `classes` as the bucket to scan;
# NOTE(review): presumably columns are ordered most-probable first — confirm in li.model.
%time probs, classes = nn.predict_proba(data_X_to_torch(loaded_queries))

CPU times: user 70 ms, sys: 88 µs, total: 70.1 ms
Wall time: 265 ms


In [46]:
%%time
k=10
nns = np.zeros((loaded_queries.shape[0], k), dtype=np.uint32)
dists = np.zeros((loaded_queries.shape[0], k), dtype=np.float32)
for class_ in np.unique(classes[:, 0]):
    cat_idxs = np.where(classes[:, 0] == class_)[0]
    bucket_obj_indexes = data.query(f'category_nn == {class_}', engine='python').index
    seq_search_dists = pairwise_cosine(loaded_queries_seq[cat_idxs], loaded_clip_data.loc[bucket_obj_indexes])
    ann_relative = seq_search_dists.argsort()[:, :k]
    nns[cat_idxs] = np.array(bucket_obj_indexes)[ann_relative]
    dists[cat_idxs] = np.take_along_axis(seq_search_dists, ann_relative, axis=1)

CPU times: user 1.89 s, sys: 0 ns, total: 1.89 s
Wall time: 1.98 s


In [47]:
# Sanity check: one row per query, k columns.
nns.shape

(10000, 10)

In [50]:
# Per-query count of returned neighbours that also appear in the ground truth.
# Only the first `top_k` ground-truth columns are compared: if the file stores
# more neighbours per query than were retrieved, matching against all of them
# would overstate the overlap (the slice is a no-op when it stores exactly k).
# NOTE(review): assumes ground-truth columns are ordered nearest-first — confirm.
top_k = nns.shape[1]
overlaps = [
    np.intersect1d(nns[i], loaded_gt[i, :top_k]).shape[0]
    for i in range(nns.shape[0])  # every query, not a hard-coded 10_000
]

In [51]:
# Average number of returned neighbours per query that also appear in the ground truth.
np.mean(overlaps)

9.1554