In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import h5py
import pandas as pd
import pickle
from tqdm import tqdm
from li.utils import pairwise_cosine
import time
import logging
import numpy as np
import os
from li.BaseLMI import cluster_kmeans_faiss, cluster_kmedoids
from li.BaseLMI import BaseLMI
prepare_data_cluster_kmedoids = BaseLMI.prepare_data_cluster_kmedoids
collect_predictions_kmedoids = BaseLMI.collect_predictions_kmedoids
from li.model import NeuralNetwork, data_X_to_torch, data_to_torch
import argparse
from datetime import datetime

In [3]:
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s'
)
LOG = logging.getLogger(__name__)


In [4]:
dataset='pca32v2'#'clip768v2'
kind='pca32v2'#'clip768v2'
size='10M'
epochs=100
lr=0.1
n_categories=1000
model_type='MLP'

In [5]:
size_mapping = {
    "100K": 100_000,
    "300K": 300_000,
    "10M": 10_000_000
}

kind_mapping = {
    "clip768v2": 768,
    "pca32v2": 32,
    "pca96v2": 96
}

emb_mapping = {
    "clip768v2": 'emb',
    "pca32v2": 'pca32',
    "pca96v2": 'pca96'
}

base_data_path = f'/auto/brno12-cerit/nfs4/home/tslaninakova/sisap-challenge/repo/data'

LOG.info(f'Loading {kind} data')
data_path = f'{base_data_path}/{kind}/{size}/dataset.h5'
f = h5py.File(data_path, 'r')
loaded_data = f[emb_mapping[kind]][:, :]
data = pd.DataFrame(loaded_data)
data.index += 1

LOG.info(f'Loading queries')
base_path = f'{base_data_path}/{kind}/{size}/'
queries_path = f'{base_path}/query.h5'
f2 = h5py.File(queries_path, 'r')
#loaded_queries = f2['emb'][:, :]
loaded_queries = f2[emb_mapping[kind]][:, :]

search_kind = 'clip768v2'

LOG.info(f'Loading GT')
gt_path = f'{base_data_path}/groundtruth-{size}.h5'
f3 = h5py.File(gt_path, 'r')
loaded_gt = f3['knns'][:, :]

[2023-07-04 12:05:43,082][INFO ][__main__] Loading pca32v2 data
[2023-07-04 12:05:54,691][INFO ][__main__] Loading queries
[2023-07-04 12:05:54,717][INFO ][__main__] Loading GT


In [6]:
data.shape

(10120191, 32)

In [8]:
LOG.info(f'Clustering with K-Medoids and data of shape: {data.shape}')

(
    data_part,
    data_predict,
    data_part_index,
    data_all
) = prepare_data_cluster_kmedoids(
    data.values, n_categories, data.index.values.tolist()
)

[2023-07-04 12:07:36,403][INFO ][__main__] Clustering with K-Medoids and data of shape: (10120191, 32)


In [9]:
data_part.shape

(101201, 32)

In [None]:
%time data_sparse = sparse.csr_matrix(data.values)

In [16]:
data_part_sparse[:2, :]

<2x768 sparse matrix of type '<class 'numpy.float32'>'
	with 1536 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

#A =  np.array([[0, 1, 0, 0, 1], [0, 0, 1, 1, 1],[1, 1, 0, 1, 0]])
%time data_part_sparse = sparse.csr_matrix(data_part)

CPU times: user 3.36 s, sys: 1.12 s, total: 4.48 s
Wall time: 4.6 s


In [14]:
%time query_part_sparse = sparse.csr_matrix(loaded_queries[0])

CPU times: user 243 µs, sys: 516 µs, total: 759 µs
Wall time: 766 µs


In [None]:
data_part_sparse

In [11]:
%%time
similarities = cosine_similarity(query_part_sparse, data_part_sparse)
print('pairwise dense output:\n {}\n'.format(similarities))

pairwise dense output:
 [[ 0.22825958  0.5315493   0.06101639 ...  0.47762138 -0.2940306
   0.0380674 ]]

CPU times: user 77.8 ms, sys: 22.9 ms, total: 101 ms
Wall time: 101 ms


In [13]:
np.argsort(similarities[0])[:10]

array([63163,  8205, 87136, 39311, 44553, 47104, 87530, 56174,  3997,
       47954])

In [12]:
%%time
similarities_sparse = cosine_similarity(query_part_sparse, data_part_sparse, dense_output=False)
print('pairwise sparse output:\n {}\n'.format(similarities_sparse))

pairwise sparse output:
   (0, 101200)	0.0380674
  (0, 101199)	-0.2940306
  (0, 101198)	0.47762138
  (0, 101197)	0.028470194
  (0, 101196)	0.03202287
  (0, 101195)	-0.020826954
  (0, 101194)	-0.034779858
  (0, 101193)	0.1186546
  (0, 101192)	0.47139296
  (0, 101191)	-0.044738226
  (0, 101190)	-0.05661361
  (0, 101189)	0.15819879
  (0, 101188)	0.03317749
  (0, 101187)	0.07791596
  (0, 101186)	0.24315906
  (0, 101185)	0.48116136
  (0, 101184)	-0.20434326
  (0, 101183)	-0.25789338
  (0, 101182)	-0.19057587
  (0, 101181)	0.20158559
  (0, 101180)	0.06528239
  (0, 101179)	0.41035482
  (0, 101178)	-0.09057258
  (0, 101177)	0.06517075
  (0, 101176)	-0.045182317
  :	:
  (0, 24)	0.04692969
  (0, 23)	-0.05638562
  (0, 22)	-0.3653938
  (0, 21)	-0.081963904
  (0, 20)	-0.18360576
  (0, 19)	-0.2704143
  (0, 18)	0.026057381
  (0, 17)	0.12937975
  (0, 16)	-0.18929511
  (0, 15)	0.02480204
  (0, 14)	-0.01618684
  (0, 13)	-0.15286607
  (0, 12)	-0.16358636
  (0, 11)	0.4681005
  (0, 10)	-0.24830072
  (0, 9)

In [17]:
np.argsort(similarities_sparse[0])

array([0])

In [15]:
similarities_sparse[0]

<1x101201 sparse matrix of type '<class 'numpy.float32'>'
	with 101201 stored elements in Compressed Sparse Row format>

In [11]:
%time labels = cluster_kmedoids(data_part[:1000], n_clusters=10, LOG=LOG)

[2023-07-04 12:10:43,872][INFO ][__main__] Running pairwise_cosine on (1000, 32), this might take a while
[2023-07-04 12:10:43,879][INFO ][__main__] Created sparse matrix
[2023-07-04 12:10:44,035][INFO ][__main__] Running kmedoids on (1000, 32), this might take a while


CPU times: user 42.1 s, sys: 7.05 s, total: 49.2 s
Wall time: 49.7 s


In [12]:
%time labels = cluster_kmedoids(data_part[:10_000], n_clusters=10, LOG=LOG)

[2023-07-04 12:12:00,434][INFO ][__main__] Running pairwise_cosine on (10000, 32), this might take a while
[2023-07-04 12:12:00,451][INFO ][__main__] Created sparse matrix
[2023-07-04 12:12:10,606][INFO ][__main__] Running kmedoids on (10000, 32), this might take a while


CPU times: user 9min 34s, sys: 1min 37s, total: 11min 12s
Wall time: 11min 18s


In [13]:
%time labels = cluster_kmedoids(data_part[:10_000], n_clusters=100, LOG=LOG)

[2023-07-04 12:23:19,966][INFO ][__main__] Running pairwise_cosine on (10000, 32), this might take a while
[2023-07-04 12:23:19,987][INFO ][__main__] Created sparse matrix
[2023-07-04 12:23:30,103][INFO ][__main__] Running kmedoids on (10000, 32), this might take a while


CPU times: user 31min 1s, sys: 5min 5s, total: 36min 6s
Wall time: 36min 27s


In [None]:
# .txt file, kazdy objekt zacina @object

In [19]:
data.shape

(10120191, 32)

In [27]:
from tqdm import tqdm

In [None]:
def save_dataframe_into_file_row_by_row(df: pd.DataFrame, file_path: str):
    """ Saves a dataframe into a file row by row.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to save.
    file_path : str
        The path to the file to save the dataframe to.
    """
    with open(file_path, 'w') as f:
        for index, row in tqdm(df.iterrows()):
            f.write(f'#objectKey messif.objects.keys.AbstractObjectKey {index}\n')
            f.write(','.join([str(r) for r in row.values]) + '\n')

save_dataframe_into_file_row_by_row(data, file_path='10M-pca32-messif.txt')

3398913it [06:20, 9016.72it/s]

In [18]:
data_part[0]

array([ 0.10179286, -0.18555436,  0.1564548 , -0.05170799,  0.01524584,
        0.11560269,  0.09272191,  0.01423696, -0.05010636, -0.007695  ,
       -0.06595013, -0.0039556 , -0.12157702, -0.11014523, -0.05693046,
       -0.00997591,  0.0052503 ,  0.07412283,  0.01239138, -0.01504549,
        0.04909196,  0.02973638,  0.00649209,  0.03390019,  0.00310681,
        0.003009  , -0.06331033,  0.00972393, -0.01492207, -0.06803757,
       -0.05141252,  0.10497223], dtype=float32)

In [14]:
%time labels = cluster_kmedoids(data_part[:100_000], n_clusters=100, LOG=LOG)

[2023-07-04 12:59:48,361][INFO ][__main__] Running pairwise_cosine on (100000, 32), this might take a while
[2023-07-04 12:59:48,545][INFO ][__main__] Created sparse matrix


RuntimeError: nnz of the result is too large

In [15]:
%time labels = cluster_kmedoids(data_part[:1_000_000], n_clusters=100, max_iter=1, LOG=LOG)

[2023-07-04 13:01:14,727][INFO ][__main__] Running pairwise_cosine on (101201, 32), this might take a while
[2023-07-04 13:01:14,904][INFO ][__main__] Created sparse matrix


RuntimeError: nnz of the result is too large

In [None]:
%time labels = cluster_kmedoids(data_part[:1000], n_clusters=10, LOG=LOG)

In [10]:
pd.Series(labels).value_counts()

573    48
608    40
533    37
527    36
566    35
       ..
908     1
741     1
365     1
731     1
554     1
Length: 1000, dtype: int64

In [15]:
%%time
LOG.info(f'Instantiating NN')
nn = NeuralNetwork(
    input_dim=data_s.shape[1], output_dim=n_categories, lr=lr, model_type=model_type
)
#data_x, data_y = data_to_torch(loaded_data, basic_clustering)
data_x, data_y = data_to_torch(data_s, labels)
LOG.info(f'Starting training')
losses = nn.train(data_x, data_y, epochs=epochs, logger=LOG)

[2023-07-04 10:35:55,629][INFO ][__main__] Instantiating NN
[2023-07-04 10:35:55,641][INFO ][__main__] Starting training
[2023-07-04 10:35:55,645][INFO ][__main__] Epochs: 100, step: 10
[2023-07-04 10:35:59,017][INFO ][__main__] Epoch 10 | Loss 0.9958549737930298
[2023-07-04 10:36:02,114][INFO ][__main__] Epoch 20 | Loss 0.13743114471435547
[2023-07-04 10:36:07,606][INFO ][__main__] Epoch 30 | Loss 0.03208932653069496
[2023-07-04 10:36:21,813][INFO ][__main__] Epoch 40 | Loss 0.01167370192706585
[2023-07-04 10:36:43,249][INFO ][__main__] Epoch 50 | Loss 0.006462142337113619
[2023-07-04 10:37:08,504][INFO ][__main__] Epoch 60 | Loss 0.004445212427526712
[2023-07-04 10:37:35,506][INFO ][__main__] Epoch 70 | Loss 0.0035496086347848177
[2023-07-04 10:38:03,718][INFO ][__main__] Epoch 80 | Loss 0.003038597758859396
[2023-07-04 10:38:32,560][INFO ][__main__] Epoch 90 | Loss 0.002697634045034647


CPU times: user 2min 54s, sys: 6.13 s, total: 3min 1s
Wall time: 3min 6s


In [16]:
predictions = nn.predict(data_X_to_torch(data.loc[data.index.difference(data_s.index)]))

In [18]:
data['categories'] = np.NaN
data.loc[data.index.difference(data_s.index), 'categories'] = predictions
data.loc[data_s.index, 'categories'] = labels

In [20]:
data.categories = data.categories.astype(int)

In [21]:
%time res = nn.predict_proba(data_X_to_torch(loaded_queries))

CPU times: user 158 ms, sys: 39.8 ms, total: 198 ms
Wall time: 199 ms


In [22]:
i=0

In [25]:
argsorted = np.argsort(res[0][i])[::-1]
argsorted[:10]

array([ 17, 969, 335, 289, 367,  25, 575, 106,  40, 205])

In [28]:
idx = 0

In [35]:
data.loc[loaded_gt[i][:10]].categories == argsorted[5]

1831320    False
7535354    False
7059014    False
7563483    False
3003515    False
5278218    False
8190596    False
9060635    False
5509896    False
210839     False
Name: categories, dtype: bool

In [None]:
predictions