In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import h5py
import pandas as pd
import pickle
from tqdm import tqdm
from li.utils import pairwise_cosine
import time
import logging
import numpy as np
import os
from scipy import sparse


In [3]:
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s'
)
LOG = logging.getLogger(__name__)

def increase_max_recursion_limit():
    """ Increases the maximum recursion limit.
    Source: https://stackoverflow.com/a/16248113
    """
    import sys
    import resource
    resource.setrlimit(resource.RLIMIT_STACK, (2**29, -1))
    sys.setrecursionlimit(10**6)


In [4]:
size = '10M'

LOG.info(f'Loading pca32 data')
data_path = f'data/pca32v2/{size}/dataset.h5'
f = h5py.File(data_path, 'r')
loaded_data = f['pca32'][:, :]
data = pd.DataFrame(loaded_data)
data.index += 1

LOG.info(f'Loading queries')
base_path = f'data/pca32v2/{size}/'
queries_path = f'{base_path}/query.h5'
f2 = h5py.File(queries_path, 'r')
#loaded_queries = f2['emb'][:, :]
loaded_queries = f2['pca32'][:, :]

base_path = f'data/clip768v2/{size}/'
queries_path = f'{base_path}/query.h5'
f2 = h5py.File(queries_path, 'r')
#loaded_queries = f2['emb'][:, :]
loaded_queries_seq = f2['emb'][:, :]

LOG.info(f'Loading clip data')
data_path = f'data/clip768v2/{size}/dataset.h5'
f = h5py.File(data_path, 'r')
loaded_clip_data = f['emb'][:, :]
loaded_clip_data = pd.DataFrame(loaded_clip_data)
loaded_clip_data.index += 1

LOG.info(f'Loading GT')
gt_path = f'data/groundtruth-{size}.h5'
f3 = h5py.File(gt_path, 'r')
loaded_gt = f3['knns'][:, :]


[2023-07-05 12:59:38,551][INFO ][__main__] Loading pca32 data
[2023-07-05 12:59:52,690][INFO ][__main__] Loading queries
[2023-07-05 12:59:53,164][INFO ][__main__] Loading clip data
[2023-07-05 13:02:32,962][INFO ][__main__] Loading GT


In [5]:
from li.BaseLMI import cluster_kmeans_faiss
from sklearn import preprocessing

[2023-07-05 13:02:41,316][INFO ][faiss.loader] Loading faiss with AVX2 support.
[2023-07-05 13:02:41,328][INFO ][faiss.loader] Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'",)
[2023-07-05 13:02:41,331][INFO ][faiss.loader] Loading faiss.
[2023-07-05 13:02:42,498][INFO ][faiss.loader] Successfully loaded faiss.


In [6]:
%time data_prep = preprocessing.normalize(loaded_clip_data.values)

CPU times: user 3min 12s, sys: 2.17 s, total: 3min 14s
Wall time: 3min 18s


In [None]:
%time kmeans, result = cluster_kmeans_faiss(data_prep, n_clusters=2000)

In [8]:
%time kmeans_wo, result_wo = cluster_kmeans_faiss(loaded_clip_data.values, n_clusters=1000)

CPU times: user 1min 27s, sys: 248 ms, total: 1min 27s
Wall time: 1min 30s


In [None]:
from sklearn import preprocessing
%time queries_prep = preprocessing.normalize(loaded_queries_seq)

In [None]:
def get_partitioning_quality(kmeans, queries, data, loaded_gt, basic_clustering):
    
    res = kmeans.index.search(np.array(queries).astype(np.float32), 1000)
    
    n_cats_covered = []
    n_objects_covered = []

    for i in tqdm(range(1000), position=0, leave=True):
        overall_sum = 0
        overall_objects_sum = 0
        argsorted = res[1][i]#np.argsort(res[0][i])[::-1]
        idx = 0
        while overall_sum < 9:
            overall_sum += np.sum(data.loc[loaded_gt[i][:10]].category == argsorted[idx])
            overall_objects_sum += np.sum(basic_clustering == argsorted[idx])
            #overall_objects_sum += np.sum(pred_positions == argsorted[idx])
            idx += 1
        n_cats_covered.append(idx)
        n_objects_covered.append(overall_objects_sum)
        if i%100 == 0 and i != 0:
            LOG.info(f'n_cats_covered: {np.mean(np.array(n_cats_covered))}')
            LOG.info(f'n_objects_covered: {np.mean(np.array(n_objects_covered))}')

    mean_cats_covered = np.mean(np.array(n_cats_covered))
    mean_objects_covered = np.mean(np.array(n_objects_covered))
    LOG.info(f'mean_cats_covered={mean_cats_covered}, mean_objects_covered={mean_objects_covered}')
    return mean_cats_covered, mean_objects_covered, np.array(n_cats_covered), np.array(n_objects_covered)


In [None]:
data_prep = pd.DataFrame(data_prep)
data_prep.index += 1
data_prep['category'] = result

In [None]:
%time mean_cats_covered_prep, mean_objects_covered_prep, cat_all_prep, objs_all_prep = get_partitioning_quality(kmeans, queries_prep, data_prep, loaded_gt, result)

In [38]:
from li.model import NeuralNetwork, data_X_to_torch, data_to_torch

In [18]:
nn = NeuralNetwork(
    input_dim=data_prep.drop('category', axis=1, errors='ignore').shape[1], output_dim=1000, lr=0.1, model_type='MLP'
)

In [26]:
data_prep.drop('category', axis=1).values.shape

(10120191, 768)

In [27]:
data_prep.category.values.shape

(10120191,)

In [39]:
import torch
import torch.utils.data

class LIDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset_x, self.dataset_y = data_to_torch(
            dataset.drop('category', axis=1).values, dataset.category.values
        )

    def __len__(self):
        return self.dataset_x.shape[0]
    
    def __getitem__(self, idx):
        return self.dataset_x[idx-1], self.dataset_y[idx-1]
        
dataset = LIDataset(data_prep)
        
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=256,
    sampler=torch.utils.data.SubsetRandomSampler(data_prep.index.values.tolist())
)

In [34]:
len(train_loader) * 256

10120192

In [35]:
a = iter(train_loader)

In [36]:
next(a)

[tensor([[ 0.0056, -0.0220,  0.0338,  ...,  0.0308, -0.0087,  0.0122],
         [ 0.0525,  0.0675, -0.0280,  ...,  0.0082,  0.0393, -0.0194],
         [ 0.0529, -0.0414, -0.0108,  ...,  0.0362,  0.0119,  0.0305],
         ...,
         [ 0.0080,  0.0487,  0.0260,  ..., -0.0218,  0.0243,  0.0201],
         [ 0.0185,  0.0645, -0.0069,  ...,  0.0223, -0.0088,  0.0223],
         [ 0.0177,  0.0296, -0.0213,  ..., -0.0275,  0.0010,  0.0159]],
        dtype=torch.float16),
 tensor([928, 581, 367, 798, 271, 787, 706, 575, 643, 163, 554, 992, 423, 213,
          46, 739, 267, 373, 448, 613, 377,  98, 849, 959, 985, 202, 651,  53,
         641, 342, 509, 422, 888, 709, 154, 506, 683, 657, 920, 926, 766, 772,
         706, 713, 286, 347, 859, 347, 407, 995, 219, 657, 694, 248, 163, 836,
         594, 161, 212, 407,  89, 231, 386, 511, 903,  73, 931, 480, 361, 856,
         590, 899, 387, 521, 144, 845, 780, 802, 225, 333, 222, 804,  46,  93,
         326,  44, 606, 308,  70, 866, 289, 883, 234, 4

In [40]:
%time losses = nn.train_batch(train_loader, epochs=5, logger=LOG)

[2023-07-05 11:14:02,879][INFO ][__main__] Epochs: 5, step: 1
[2023-07-05 11:26:16,120][INFO ][__main__] Epoch 1 | Loss 7.33984
[2023-07-05 11:32:19,579][INFO ][__main__] Epoch 2 | Loss 6.94843
[2023-07-05 11:38:18,453][INFO ][__main__] Epoch 3 | Loss 6.89305
[2023-07-05 11:44:15,565][INFO ][__main__] Epoch 4 | Loss 6.90468


CPU times: user 29min 4s, sys: 15.8 s, total: 29min 20s
Wall time: 30min 12s


In [41]:
def get_partitioning_quality_nn(nn, queries, data, loaded_gt, basic_clustering):
    
    res = nn.predict_proba(data_X_to_torch(queries))
    
    n_cats_covered = []
    n_objects_covered = []

    for i in tqdm(range(10_000), position=0, leave=True):
        overall_sum = 0
        overall_objects_sum = 0
        argsorted = np.argsort(res[0][i])[::-1]
        idx = 0
        while overall_sum < 9:
            overall_sum += np.sum(data.loc[loaded_gt[i][:10]].category_nn == argsorted[idx])
            overall_objects_sum += np.sum(basic_clustering == argsorted[idx])
            #overall_objects_sum += np.sum(pred_positions == argsorted[idx])
            idx += 1
        n_cats_covered.append(idx)
        n_objects_covered.append(overall_objects_sum)

    mean_cats_covered = np.mean(np.array(n_cats_covered))
    mean_objects_covered = np.mean(np.array(n_objects_covered))
    LOG.info(f'mean_cats_covered={mean_cats_covered}, mean_objects_covered={mean_objects_covered}')
    return mean_cats_covered, mean_objects_covered, np.array(n_cats_covered), np.array(n_objects_covered)


In [None]:
%%time
category_nn = nn.predict(data_X_to_torch(data_prep.drop(['category', 'category_nn'], axis=1, errors='ignore')))
data['category_nn'] = category_nn
mean_cats_covered, mean_objects_covered, cats_all, objs_all = get_partitioning_quality_nn(nn, queries_prep, data_prep, loaded_gt, category_nn)
mean_cats_covered, mean_objects_covered

In [None]:
get_partitioning_quality_nn

In [20]:
result_wo[:100]

array([580,  84, 185, 667, 999, 987, 430, 165, 972, 538, 966, 694, 645,
       462, 335, 209,  66, 607, 453, 828, 677,  41, 335,   1, 908,  45,
       595, 127, 462, 944, 301,  89,  32, 567,  10, 987, 403, 791, 320,
        27, 805,  10, 567, 629, 509, 540, 320, 797, 761, 168, 544, 351,
       152, 183, 925, 706, 865, 585, 159, 194, 874, 125, 560,  27, 643,
       808, 406, 696, 922, 825, 911, 496, 453, 792, 774, 647,  18, 932,
       212, 805, 599, 739, 626, 729,   6, 795, 323, 979, 907, 329, 982,
       788, 830, 490, 299, 352,  51, 652, 791, 379])

In [21]:
result[:100]

array([580,  84, 185,  84, 999, 987, 430, 165, 972, 538, 966, 694, 645,
       462, 335, 209,  66, 607, 453, 828, 677, 201, 335,   1, 908,  45,
       595, 127, 462, 944, 301,  89,  32, 567,  10, 987, 403, 791, 320,
        27, 805,  10, 567, 629, 509, 540, 320, 908, 761, 168, 544, 351,
       152, 183, 825, 706, 865, 795, 159, 194, 874, 932, 560,  27, 643,
       808, 406, 696, 922, 415, 911, 496, 453, 792, 774, 647,  18, 932,
       212, 805, 599, 739, 626, 729,   6, 795, 323, 979, 907, 329, 982,
       788, 830, 490, 299, 352,  51, 537, 791, 379])

In [None]:
cat_all_prep[cat_all_prep > 100].shape

In [59]:
mean_cats_covered_prep, mean_objects_covered_prep

(35.4671, 3448.8709)

## Problem: Many "outliers" (== difficult objects to find knns for)
- try a deeper structure rather than wider -> 10 cats on L1 -- can we minimize the spread of knns here?
    - could speed up the training of NN
- try clip instead of PCA

## How best to train
- Observation: My simple MLP cannot handle 10M data (probably overloads CPU and job terminated prematurely)
    - Solution: train iteratively with 100k subsets
        - Problem: Takes a long time
            - Proposed solution: Try to use just a subset of data, check train as well as validation performance

In [None]:
## batch training wih periodic check regarding quality of the splits (similar to get_partitioning_quality, but gauging the NN)


In [None]:
import torch
class LIDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset_x = dataset.drop('category', axis=1).values
        self.dataset_y = dataset.category.values

    def __len__(self):
        return self.dataset_x.shape[0]
    
    def __getitem__(self, idx):
        return self.dataset_x[idx], self.dataset_y[idx]
        
dataset = LIDataset(data_prep)
        
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=256,
    sampler=torch.utils.data.SubsetRandomSampler(data_prep.index)
)

In [None]:
a = iter(train_loader)
next(a)

In [29]:
res[1][0]

array([549, 235, 891, 693, 779, 848, 164,  51, 963, 694, 500, 932, 301,
       213, 940, 372, 698,  94, 769, 635, 489, 849, 728, 657, 223, 103,
       654, 143,  49, 529,  40, 716, 166, 420, 952, 291, 643, 677, 547,
       417, 672, 682, 559, 858, 799, 496, 441, 197, 435, 199, 491, 812,
       200, 828,  16,  56,   6,  61, 892, 738, 910, 938, 306, 374, 202,
        42, 777, 817, 571, 130,  62, 881, 596, 843, 113, 659, 582, 125,
       948,  69,  14, 818, 780, 376, 869, 537, 127, 416,  67, 919, 636,
       882, 316, 353, 766, 951, 311,  63,  37, 315,  45, 252, 683, 334,
       736, 997, 887, 332, 327, 773, 255, 342, 501, 820, 457, 832, 193,
       759,  20, 122, 279, 811, 308, 521, 254, 169, 748,  13, 646, 150,
       623, 266, 535, 484, 222, 606, 867, 292, 658, 317, 729, 368, 586,
       602,  41, 705,  77, 505, 493, 357, 739,  66, 191, 558, 300, 733,
       137, 348, 861, 390, 243, 391,  86, 815, 267, 752, 962, 715, 955,
       661, 555, 941, 794, 908, 585, 757, 701, 168, 825, 776, 32

In [151]:
np.array(queries_prep[0]).astype(np.float32)

array([-0.16376908, -0.26283354,  0.13444653, -0.27405524, -0.08517819,
        0.21695286,  0.17806946,  0.25665694,  0.09803253,  0.02090712,
       -0.1490172 , -0.40555754,  0.01857678, -0.23689921,  0.11829832,
       -0.28729078,  0.00851109, -0.18199685,  0.20004214, -0.07592768,
       -0.02429815,  0.17020628, -0.03283008, -0.25096416, -0.00096307,
       -0.00693814,  0.30826217,  0.13394353,  0.01870702,  0.02429006,
        0.14889674, -0.02364473], dtype=float32)

In [157]:
kmeans.index.search(np.array(queries_prep[:1]).astype(np.float32), 3)

(array([[0.31023687, 0.39056206, 0.39874762]], dtype=float32),
 array([[848, 891, 235]]))

In [156]:
result[loaded_gt[0] - 1]

array([235, 235, 235, 891, 891, 848, 848, 891, 891, 848, 891, 891, 891,
       235, 693, 625, 625, 625, 891, 549, 693, 549, 235,  94, 547, 848,
       891, 891, 963, 625, 779, 693, 235, 963, 891, 625, 549, 891, 779,
       848, 496, 891, 625, 489, 188, 919, 848, 301, 848, 848, 693, 683,
       500, 891, 891,  51, 143, 693, 166, 891, 223, 625, 891, 672, 705,
       223, 656, 891, 625, 625, 625, 625,  51,  51, 166, 698, 625, 848,
       891, 891, 693, 143,  51, 848, 683, 693, 549, 891, 848, 848, 848,
       301, 891, 891, 891,  51, 848, 683, 875, 869])

In [155]:
loaded_gt[0] - 1

array([79171, 15734, 22336,   230, 74172, 41078, 38158, 71848, 69014,
       92810, 99972, 79895, 13235, 64155, 86178, 55217, 60621, 64727,
       86340, 63838, 52856,   884, 83735, 87456, 36441, 24313, 73878,
        5984, 20969, 60558,  5413, 26293, 42585, 41369, 22972,  4414,
       23974, 82380, 15424, 79752, 19744, 74572, 40381,  8734, 70296,
       23883, 92500, 66708, 91231, 61307, 94072,  4326, 25524,  7179,
       12400,  5450, 59682, 17543, 53817, 52101, 85601,   897, 76552,
       18966, 37888, 50122, 80242, 35692, 73073, 31648, 55411, 13492,
       67225, 21471, 81543, 70032, 91178, 97386, 94425, 76537, 64252,
       29032, 87036, 38032, 80912, 70409, 83587, 44951,  6910, 65793,
       45783,  5491, 35204, 21689, 16710, 95027, 19960,  1707, 68449,
       20332], dtype=int32)

In [146]:
loaded_queries[0]

array([-0.06877425, -0.11037602,  0.05646035, -0.11508854, -0.03577028,
        0.09110859,  0.07477964,  0.10778218,  0.04116842,  0.00877987,
       -0.06257925, -0.17031246,  0.00780125, -0.099485  ,  0.04967897,
       -0.12064676,  0.0035742 , -0.07642894,  0.08400699, -0.03188556,
       -0.01020392,  0.07147753, -0.01378688, -0.10539152, -0.00040444,
       -0.00291365,  0.12945361,  0.05624911,  0.00785595,  0.01020052,
        0.06252866, -0.00992952], dtype=float32)

In [11]:
import h5py
import pandas as pd
import pickle
from tqdm import tqdm
from li.utils import pairwise_cosine
import time
import logging
import numpy as np
import os
from li.BaseLMI import cluster_kmeans_faiss, cluster_kmedoids
from li.BaseLMI import BaseLMI
prepare_data_cluster_kmedoids = BaseLMI.prepare_data_cluster_kmedoids
collect_predictions_kmedoids = BaseLMI.collect_predictions_kmedoids
from li.model import NeuralNetwork, data_X_to_torch, data_to_torch
import argparse
from datetime import datetime
from sklearn import preprocessing


In [12]:
k=10
n_categories=1000
epochs=100
lr=0.1
model_type='MLP'

In [13]:
nn = NeuralNetwork(
    input_dim=data.shape[1], output_dim=n_categories, lr=lr, model_type=model_type
)
data_x, data_y = data_to_torch(data_prep[:100_000], result[:100_000])
#data_x, data_y = data_to_torch(data_part, labels)
LOG.info(f'Starting training')
losses = nn.train(data_x, data_y, epochs=epochs, logger=LOG)

[2023-07-04 19:24:41,551][INFO ][__main__] Starting training
[2023-07-04 19:24:41,555][INFO ][__main__] Epochs: 100, step: 10
[2023-07-04 19:25:08,326][INFO ][__main__] Epoch 10 | Loss 0.8489696979522705
[2023-07-04 19:25:39,862][INFO ][__main__] Epoch 20 | Loss 0.347359836101532
[2023-07-04 19:27:47,805][INFO ][__main__] Epoch 30 | Loss 0.18590112030506134
[2023-07-04 19:31:56,197][INFO ][__main__] Epoch 40 | Loss 0.11469919979572296
[2023-07-04 19:36:55,095][INFO ][__main__] Epoch 50 | Loss 0.0794241800904274
[2023-07-04 19:41:57,903][INFO ][__main__] Epoch 60 | Loss 0.05990879237651825
[2023-07-04 19:46:50,506][INFO ][__main__] Epoch 70 | Loss 0.047846656292676926
[2023-07-04 19:51:25,096][INFO ][__main__] Epoch 80 | Loss 0.03951886296272278
[2023-07-04 19:55:19,481][INFO ][__main__] Epoch 90 | Loss 0.033408571034669876


In [None]:
data_x, data_y = data_to_torch(data_prep[100_000:200_000], result[100_000:200_000])
#data_x, data_y = data_to_torch(data_part, labels)
LOG.info(f'Starting training')
losses = nn.train(data_x, data_y, epochs=epochs, logger=LOG)

In [15]:
np.arange(0, 10_000_000, 100_000)

array([      0,  100000,  200000,  300000,  400000,  500000,  600000,
        700000,  800000,  900000, 1000000, 1100000, 1200000, 1300000,
       1400000, 1500000, 1600000, 1700000, 1800000, 1900000, 2000000,
       2100000, 2200000, 2300000, 2400000, 2500000, 2600000, 2700000,
       2800000, 2900000, 3000000, 3100000, 3200000, 3300000, 3400000,
       3500000, 3600000, 3700000, 3800000, 3900000, 4000000, 4100000,
       4200000, 4300000, 4400000, 4500000, 4600000, 4700000, 4800000,
       4900000, 5000000, 5100000, 5200000, 5300000, 5400000, 5500000,
       5600000, 5700000, 5800000, 5900000, 6000000, 6100000, 6200000,
       6300000, 6400000, 6500000, 6600000, 6700000, 6800000, 6900000,
       7000000, 7100000, 7200000, 7300000, 7400000, 7500000, 7600000,
       7700000, 7800000, 7900000, 8000000, 8100000, 8200000, 8300000,
       8400000, 8500000, 8600000, 8700000, 8800000, 8900000, 9000000,
       9100000, 9200000, 9300000, 9400000, 9500000, 9600000, 9700000,
       9800000, 9900

In [None]:
from sklearn import preprocessing
from sklearn.cluster import KMeans

kmeans = KMeans().fit(preprocessing.normalize(X))

In [133]:
%%time
clusters = DBSCAN(
    eps=0.2, min_samples=4, metric='cosine', leaf_size=9, p=0.005
).fit(data_s.values)
# eps = 0.5

KeyboardInterrupt: 

In [None]:
unique_result = np.unique(clusters.labels_, return_counts=True)

In [114]:
unique_result

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
 array([11172, 88727,     4,     4,     1,     6,     4,     3,    11,
            4,     1,     3,     4,     3,     4,     5,     4,     4,
            4,     3,     3,     2,     4,     4,     2,     2,     3,
            5,     4]))

In [115]:
unique_result[1].max()

88727

In [126]:
from os import listdir

In [127]:
r = listdir('/auto/brno12-cerit/nfs4/home/tslaninakova/sisap-challenge/results-dbscan')

In [128]:
min([int(r_.split('.csv')[0].split('-')[-1]) + int(r_.split('.csv')[0].split('-')[-2]) for r_ in r])

99916

In [129]:
[int(r_.split('.csv')[0].split('-')[-1]) + int(r_.split('.csv')[0].split('-')[-2]) for r_ in r].index(56796)

ValueError: 56796 is not in list

In [125]:
r[12]

'2023-07-04--17-12-10-28398-28398.csv'

In [110]:
unique_result[1][0]

11172

In [111]:
data_s = data.loc[100_000:200_000]

In [112]:
clusters2 = DBSCAN(
    eps=0.2, min_samples=4, metric='cosine', leaf_size=9, p=0.005
).fit(data_s.values)

In [113]:
unique_result2 = np.unique(clusters2.labels_, return_counts=True)
unique_result2[1][0]

11346

In [101]:
data_1 = data.loc[:100_000]
data_1 = data_1.loc[clusters.labels_ == -1]
data_1.shape

(72320, 32)

In [102]:
data_2 = data.loc[100_000:200_000]
data_2 = data_2.loc[clusters2.labels_ == -1]
data_2.shape

(71964, 32)

In [103]:
pd.concat([data_1, data_2])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
1,0.142266,0.121459,-0.054467,-0.058987,0.049365,-0.107044,0.123232,0.003732,0.044797,-0.118452,...,0.063983,0.037666,-0.089511,0.077113,-0.118540,0.058264,0.065489,0.028578,-0.089683,0.126804
2,0.213747,-0.011409,-0.015184,0.149988,0.106374,0.082410,-0.120636,-0.061065,0.068683,0.033811,...,-0.098325,-0.013612,0.004494,-0.046666,-0.043621,-0.086326,-0.011769,-0.047742,0.020944,-0.030367
3,0.069696,-0.213891,0.132873,-0.150283,-0.049209,0.080035,-0.023633,0.017212,-0.208762,0.053342,...,0.035556,0.023209,-0.048412,-0.051262,0.019094,-0.036998,0.006380,0.011270,-0.029592,0.076340
4,0.136413,-0.124811,-0.154160,0.239878,0.002287,0.145432,-0.069391,-0.006645,0.124315,0.054875,...,-0.045162,0.027401,0.072960,-0.035548,-0.003664,-0.007761,-0.065726,-0.013585,-0.012794,-0.030778
5,-0.389241,0.076076,-0.072001,0.103547,-0.058551,0.033468,0.017906,0.094761,0.034065,-0.220189,...,-0.051880,-0.016713,0.018730,0.050222,-0.021090,0.004709,0.043465,-0.049461,-0.052292,0.027773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,-0.016623,0.261374,-0.108525,-0.135211,0.032195,0.070787,0.079449,-0.092412,-0.100304,0.067358,...,0.081138,-0.013030,0.065910,-0.052886,0.022053,0.036298,-0.059659,0.027475,-0.006544,-0.060666
199996,-0.135043,0.058869,-0.033821,-0.066608,-0.032245,0.102940,0.085796,0.028704,0.096673,-0.053070,...,-0.040772,-0.024435,0.013444,0.064723,0.059284,-0.020601,0.065155,-0.004740,-0.011218,-0.096802
199997,0.056166,0.045453,-0.165639,-0.001400,0.177127,-0.021528,-0.009026,-0.040884,0.220318,-0.039731,...,-0.096172,0.015361,-0.034067,-0.060641,0.030858,0.005659,0.010588,0.035196,-0.021983,0.015925
199998,0.172810,0.136016,0.052640,-0.048010,0.118766,-0.002332,0.111289,-0.026583,-0.051066,-0.125786,...,0.088246,-0.000755,0.012355,0.057855,-0.032556,0.118518,-0.025377,0.024111,0.063948,-0.016831


In [104]:
clusters3 = DBSCAN(
    eps=0.1, min_samples=5, metric='cosine', leaf_size=10, p=0.01
).fit(pd.concat([data_1, data_2]).values)

In [105]:
unique_result3 = np.unique(clusters3.labels_, return_counts=True)
unique_result3[1][0]

140027

In [87]:
data_s.loc[clusters.labels_ == -1].shape

(72315, 32)

In [None]:
data.

In [88]:
clusters_2 = DBSCAN(
    eps=0.1, min_samples=5, metric='cosine', leaf_size=10, p=0.01
).fit(data_s.loc[clusters.labels_ == -1])

In [89]:
unique_result_2 = np.unique(clusters_2.labels_, return_counts=True)

In [90]:
unique_result_2[1][0]

72315

In [75]:
unique_result[1][0]

78487

In [None]:
%time clusters = DBSCAN(eps=3, min_samples=2).fit(data_s)

In [53]:
from scipy.spatial.distance import cdist

In [None]:
loaded_clip_data[:1_000_000]

In [58]:
loaded_queries[0].shape

(32,)

In [None]:
loaded_queries[0]

In [61]:
from scipy import sparse

In [None]:
data_part_sparse = sparse.csr_matrix(clip_data.loc[object_ids])
query_part_sparse = sparse.csr_matrix(loaded_queries_seq[i])

In [59]:
%%time
res = cdist(
    [loaded_queries_seq[0]], loaded_clip_data[:1_000_000], metric='cosine'
)

CPU times: user 4.19 s, sys: 2.2 s, total: 6.39 s
Wall time: 6.68 s


In [64]:
%%time
res = cdist(
    sparse.csr_matrix(loaded_queries_seq[:2]), sparse.csr_matrix(loaded_clip_data[:1_000_000]), metric='cosine'
)

ValueError: XA must be a 2-dimensional array.