In [52]:
# Re-import edited modules automatically before each cell executes (autoreload mode 2),
# so changes to imported project code such as the `li` package need no kernel restart.
%reload_ext autoreload
%autoreload 2

In [53]:
import h5py
import pandas as pd
import pickle
from tqdm import tqdm
from li.utils import pairwise_cosine
import time
import logging
import numpy as np
import os
from scipy import sparse


In [54]:
# Root logger: emit INFO and above, prefixed with timestamp, level (padded/cut to
# 5 characters) and logger name (cut to 20 characters).
logging.basicConfig(
    format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s',
    level=logging.INFO,
)

# Logger used by the cells of this notebook.
LOG = logging.getLogger(__name__)

def increase_max_recursion_limit():
    """Raise the process stack limit and Python's recursion limit.

    Requests a soft stack limit of 2**29 bytes (512 MiB) and then sets the
    interpreter recursion limit to 10**6.

    The current hard stack limit is kept as it is: an unprivileged process may
    not raise its hard limit, so asking for an unlimited one makes
    ``resource.setrlimit`` raise ``ValueError`` whenever the existing hard
    limit is finite. If the hard limit is lower than 512 MiB, the soft limit
    is capped at the hard limit instead.

    Unix only, because it relies on the ``resource`` module.
    Source: https://stackoverflow.com/a/16248113
    """
    import sys
    import resource

    wanted_soft = 2**29
    _, hard = resource.getrlimit(resource.RLIMIT_STACK)
    if hard != resource.RLIM_INFINITY:
        # The soft limit may never exceed the hard limit.
        wanted_soft = min(wanted_soft, hard)
    resource.setrlimit(resource.RLIMIT_STACK, (wanted_soft, hard))
    sys.setrecursionlimit(10**6)


In [55]:
# Dataset size tag; it selects both the data sub-directories and the ground-truth file.
size = '10M'

# Every HDF5 file is opened in a `with` block so its handle is closed as soon as the
# array has been copied into memory (`[:, :]` reads the whole 2-D dataset into NumPy).

LOG.info('Loading pca32 data')
data_path = f'data/pca32v2/{size}/dataset.h5'
with h5py.File(data_path, 'r') as f:
    loaded_data = f['pca32'][:, :]
data = pd.DataFrame(loaded_data)
# Row labels become 1-based (presumably to line up with the ids in the ground truth - confirm).
data.index += 1

LOG.info('Loading queries')
# PCA32 queries: these are the ones passed to the classifier further down.
base_path = f'data/pca32v2/{size}/'
queries_path = f'{base_path}/query.h5'
with h5py.File(queries_path, 'r') as f2:
    loaded_queries = f2['pca32'][:, :]

# CLIP queries: these are the ones used for the distance computation inside a bucket.
base_path = f'data/clip768v2/{size}/'
queries_path = f'{base_path}/query.h5'
with h5py.File(queries_path, 'r') as f2:
    loaded_queries_seq = f2['emb'][:, :]

LOG.info('Loading clip data')
data_path = f'data/clip768v2/{size}/dataset.h5'
with h5py.File(data_path, 'r') as f:
    loaded_clip_data = f['emb'][:, :]
loaded_clip_data = pd.DataFrame(loaded_clip_data)
# Same 1-based labelling as `data`, so both frames can be addressed with the same ids.
loaded_clip_data.index += 1

LOG.info('Loading GT')
gt_path = f'data/groundtruth-{size}.h5'
with h5py.File(gt_path, 'r') as f3:
    loaded_gt = f3['knns'][:, :]


[2023-07-06 14:38:03,909][INFO ][__main__] Loading pca32 data
[2023-07-06 14:38:17,250][INFO ][__main__] Loading queries
[2023-07-06 14:38:17,706][INFO ][__main__] Loading clip data
[2023-07-06 14:40:57,614][INFO ][__main__] Loading GT


In [56]:
from li.BaseLMI import cluster_kmeans_faiss
from sklearn import preprocessing

In [57]:
# Partition the PCA32 vectors into 100 clusters with the project's k-means helper
# (FAISS-backed, judging by its name). `result` is used below as the per-row cluster label.
%time kmeans, result = cluster_kmeans_faiss(data, n_clusters=100)

CPU times: user 5.15 s, sys: 313 ms, total: 5.47 s
Wall time: 5.58 s


In [58]:
# Distinct cluster labels and how many rows carry each one (checks how balanced the clusters are).
np.unique(result, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
 array([137846,  84578,  64977,  63083, 121910,  66487, 112356, 143534,
        107602, 127883, 119215, 126814,  82047,  82712,  53245,  69419,
        133778, 121060, 139095,  83346,  58019,  46916, 119296,  60359,
         91497,  41448,  94526, 147443,  63823, 114869, 123470, 102668,
         92954,  55334, 103322, 179235,  83676,  94945,  69837, 163405,
        109906, 128452,  67458,  90685,  70172,  60274,  98134, 128141,
        111783, 134609, 113524,  70085,  85169,  84793,  92790,  90704,
        138608, 146013,  74300, 100450, 145770

In [59]:
import torch
import torch.utils.data

from li.model import NeuralNetwork, data_X_to_torch, data_to_torch, LIDataset_Single
        
# Wrap the PCA32 vectors together with their k-means labels for training.
dataset = LIDataset_Single(data, result)

# NOTE(review): the sampler draws its indices from `data.index`, which is 1-based after
# the `data.index += 1` in the loading cell. That is only right if LIDataset_Single looks
# rows up by label rather than by position - confirm in li.model.
sample_ids = data.index.values.tolist()
train_loader = torch.utils.data.DataLoader(
    dataset,
    sampler=torch.utils.data.SubsetRandomSampler(sample_ids),
    batch_size=256,
)

In [60]:
# Classifier over the data columns; output_dim=100 matches the n_clusters=100 used for k-means.
nn = NeuralNetwork(
    model_type='MLP',
    lr=0.1,
    input_dim=data.shape[1],
    output_dim=100,
)

In [None]:
# Train the network on the batches from `train_loader` for 100 epochs, logging through LOG.
# `losses` keeps whatever train_batch returns (presumably the loss history - confirm in li.model).
%time losses = nn.train_batch(train_loader, epochs=100, logger=LOG)

[2023-07-06 14:41:09,487][INFO ][__main__] Epochs: 100, step: 10
[2023-07-06 15:04:47,288][INFO ][__main__] Epoch 10 | Loss 1.78789


In [None]:
# Store the network's prediction for every dataset row in a new `category_nn` column.
# NOTE(review): this mutates `data` in place; on a re-run `data` already carries the extra
# column, so confirm that data_X_to_torch ignores it (otherwise the input width changes).
%time data['category_nn'] = nn.predict(data_X_to_torch(data))

In [None]:
# Number of dataset rows the network assigned to each predicted class.
data['category_nn'].value_counts()

In [None]:
# Class predictions for the PCA32 queries. `classes[:, 0]` is used below as each query's
# bucket (presumably classes are ordered by descending probability - confirm in li.model).
%time probs, classes = nn.predict_proba(data_X_to_torch(loaded_queries))

In [None]:
%%time
k=10
nns = np.zeros((loaded_queries.shape[0], k), dtype=np.uint32)
dists = np.zeros((loaded_queries.shape[0], k), dtype=np.float32)
for class_ in np.unique(classes[:, 0]):
    cat_idxs = np.where(classes[:, 0] == class_)[0]
    bucket_obj_indexes = data.query(f'category_nn == {class_}', engine='python').index
    seq_search_dists = pairwise_cosine(loaded_queries_seq[cat_idxs], loaded_clip_data.loc[bucket_obj_indexes])
    ann_relative = seq_search_dists.argsort()[:, :k]
    nns[cat_idxs] = np.array(bucket_obj_indexes)[ann_relative]
    dists[cat_idxs] = np.take_along_axis(seq_search_dists, ann_relative, axis=1)

In [None]:
# Sanity check: expect (number of queries, k).
nns.shape

In [None]:
# Per query: how many of the returned neighbour ids also occur in that query's ground-truth row.
# The loop covers every row of `nns` instead of a hard-coded 10_000, so a different
# query count neither raises IndexError nor silently skips queries.
# NOTE(review): the whole ground-truth row is compared; if `loaded_gt` stores more than
# k neighbours per query, restrict it to the first k columns for recall@k - confirm.
overlaps = [
    np.intersect1d(nns[i], loaded_gt[i]).shape[0]
    for i in range(nns.shape[0])
]

In [None]:
# Mean number of returned neighbours per query that also appear in the ground truth.
np.mean(overlaps)