In [1]:
# Load (or reload) IPython's autoreload extension and switch it to mode 2,
# which re-imports all modules before each cell runs, so edits to the
# project's source are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Using the Database interface with the SIFT dataset

## Goal
Create the DLMI through the Database (orchestrator) interface, insert 100k objects into it in three batches, and evaluate k-NN search along the way.

##### Author: Terezia Slaninakova, xslanin@fi.muni.cz
##### Date: 2.1.2023

In [2]:
import pickle
from dlmi.Database import Database
import numpy as np
import pandas as pd

In [6]:
# Configure logging with the project's format string.
# NOTE(review): with level=WARNING the debug message below is filtered out
# (assuming dlmi.Logger re-exports the standard `logging` module - confirm).
# The INFO lines in the recorded outputs suggest the notebook was last run
# with a lower level than the one set here.
from dlmi.Logger import logging, get_logger_config
logging.basicConfig(level=logging.WARNING, format=get_logger_config())
logging.debug('Initialized logger')

In [3]:
import h5py
import pandas as pd
# Load the SIFT training vectors.
# The handle `f` is deliberately left open: the cell that samples `queries`
# reads f['test'] from it.
f = h5py.File('../data/sift-128-euclidean.hdf5', 'r')
# Read the whole 'train' dataset with one bulk HDF5 read (`[:]`) instead of
# list(f['train']), which iterates the dataset row by row and makes pandas
# assemble the frame from a list of separate row arrays.
# NOTE(review): dtype='float64' is meant to keep the dtype pandas infers when
# a frame is built from a list of row arrays; drop it if the dtype stored in
# the file is acceptable downstream.
data = pd.DataFrame(f['train'][:], dtype='float64')
# Shuffle all rows reproducibly. The original row numbers stay in the index;
# these labels are what `data.loc[object_ids]` looks up later.
data = data.sample(frac=1, random_state=2022)

In [5]:
# Preview the first rows of the data that will be inserted.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
672749,1.0,0.0,2.0,6.0,1.0,1.0,1.0,6.0,7.0,1.0,...,70.0,1.0,16.0,22.0,23.0,0.0,0.0,2.0,31.0,1.0
458750,26.0,77.0,6.0,7.0,8.0,0.0,1.0,4.0,9.0,80.0,...,87.0,81.0,95.0,7.0,0.0,0.0,0.0,0.0,1.0,88.0
457092,25.0,19.0,13.0,1.0,34.0,58.0,23.0,14.0,16.0,34.0,...,0.0,21.0,19.0,11.0,45.0,84.0,68.0,19.0,2.0,4.0
609764,7.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,58.0,...,0.0,37.0,0.0,0.0,18.0,37.0,0.0,0.0,0.0,0.0
120615,59.0,9.0,0.0,0.0,0.0,0.0,3.0,29.0,123.0,20.0,...,38.0,40.0,62.0,6.0,7.0,20.0,17.0,3.0,2.0,24.0


In [None]:
# Alternative input: 100,000 uniformly random 128-dimensional vectors instead
# of the SIFT data. Run this cell only if the SIFT file is not used.
# The generator is seeded (2022, the same seed used for the SIFT shuffle) so
# that every run indexes the same vectors.
rng = np.random.default_rng(2022)
data = pd.DataFrame(rng.random((100_000, 128)))

In [4]:
# Draw 100 query vectors reproducibly from the 'test' dataset of the HDF5
# file; this relies on the handle `f` opened in the data-loading cell.
queries = pd.DataFrame(f['test']).sample(100, random_state=2022)

In [6]:
# Print the configuration file that Database('../config.yml') is built from.
!cat ../config.yml 

Data:
    distance_function: 'L2'
    dataset: 'SIFT'
    subset: 1_000_000
    seed: 2022
    queries_subset: 1_000

LMI:
    leaf_node_capacity_min: 100
    leaf_node_capacity_max: 200
    children_min: 5
    children_max: 100
    violating_nodes: 0

Experiment:
    stop_conditions:
    - 0.0005
    - 0.001
    - 0.003
    - 0.005
    - 0.01
    - 0.03
    - 0.05
    - 0.1
    - 0.2
    - 0.3
    stop_conditions_absolute:
    - 1
    - 3
    - 5
    - 10
    - 13
    - 15
    - 20
    - 23
    - 25
    - 30
    stop_conditions_time:
    - 0.0001
    - 0.0003
    - 0.0005
    - 0.001
    - 0.003
    - 0.005
    - 0.01
    - 0.03
    - 0.05
    - 0.1
    - 0.3
    - 0.5


#### 1. Insert the first partition

In [8]:
# First batch: the first 1,000 rows of `data`.
data_part = data.iloc[:1_000]

In [10]:
# Empty operation log; it is passed to db.insert, which returns the updated
# frame (see the info_df.head() output for the rows it accumulates).
info_df = pd.DataFrame([], columns=['op', 'time-taken', 'size', '#-objects'])

In [12]:
# Build the Database (orchestrator) from the config file and insert the first
# batch. The frame returned by db.insert is rebound to info_df; %time reports
# the CPU and wall time of the insert.
# Note: re-running this cell creates a fresh Database object.
db = Database('../config.yml')
%time info_df = db.insert(data_part, info_df)

[2023-01-02 11:50:15,831][INFO ][dlmi.Database.Databa] [INSERT] [1000]
[2023-01-02 11:50:15,941][INFO ][dlmi.Database.Databa] OVERFLOW at (0,) | 1000
[2023-01-02 11:50:15,943][INFO ][dlmi.Database.Databa] [DEEPEN] [(0,)] [7]
[2023-01-02 11:51:03,471][INFO ][dlmi.Database.Databa] Total n. overflows: 2,                             n. allowed: 1                             overflows in current node ((0,)):                             2
[2023-01-02 11:51:03,474][INFO ][dlmi.Database.Databa] [RETRAIN] [(0,)] [8]


CPU times: user 2.83 s, sys: 810 ms, total: 3.64 s
Wall time: 48.4 s


In [13]:
# Second batch: rows 1,000-9,999, bringing the inserted total to 10,000.
data_part = data.iloc[1000:10_000]
%time info_df = db.insert(data_part, info_df)

[2023-01-02 11:51:04,287][INFO ][dlmi.Database.Databa] [INSERT] [9000]
[2023-01-02 11:51:04,845][INFO ][dlmi.Database.Databa] Total n. overflows: 8,                             n. allowed: 1                             overflows in current node ((0,)):                             8
[2023-01-02 11:51:04,848][INFO ][dlmi.Database.Databa] [RETRAIN] [(0,)] [66]
[2023-01-02 11:51:06,356][INFO ][dlmi.Database.Databa] Total n. overflows: 12,                             n. allowed: 1                             overflows in current node ((0,)):                             12
[2023-01-02 11:51:06,359][INFO ][dlmi.Database.Databa] [RETRAIN] [(0,)] [73]
[2023-01-02 11:51:07,860][INFO ][dlmi.Database.Databa] Total n. overflows: 9,                             n. allowed: 1                             overflows in current node ((0,)):                             9
[2023-01-02 11:51:07,862][INFO ][dlmi.Database.Databa] [RETRAIN] [(0,)] [78]
[2023-01-02 11:51:09,390][INFO ][dlmi.Database.Databa] Total

CPU times: user 10.9 s, sys: 119 ms, total: 11 s
Wall time: 12 s


In [14]:
# Operation log returned by db.insert: one row per recorded step (`op`) with
# its `time-taken`, `size` and `#-objects` values.
info_df.head()

Unnamed: 0,op,time-taken,size,#-objects
0,INSERT,0.002838,1.035558,1000
1,"DEEPEN-PART-(0,)-7",0.321392,,1000
2,"DEEPEN-TRAIN-(0,)-7",47.184151,,1000
3,"DEEPEN-REST-(0,)-7",0.00089,,1000
4,"RETRAIN-COLL-(0,)-8",0.001087,,1000


#### 2. Searching with the inserted data load

In [16]:
from dlmi.search_utils import get_knn_perf, get_objective_knns, sequential_search

In [15]:
# Preview the sampled query vectors; the index labels are the query IDs that
# appear in the get_knn_perf log lines and results.
queries.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
6487,70.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,125.0,8.0,...,73.0,0.0,0.0,12.0,22.0,0.0,0.0,8.0,49.0,2.0
8785,0.0,0.0,0.0,24.0,103.0,12.0,4.0,0.0,32.0,3.0,...,0.0,0.0,6.0,14.0,17.0,14.0,0.0,0.0,15.0,7.0
7390,33.0,3.0,0.0,1.0,4.0,0.0,0.0,0.0,135.0,33.0,...,135.0,24.0,0.0,0.0,0.0,0.0,0.0,103.0,73.0,0.0
7078,0.0,0.0,26.0,59.0,40.0,53.0,8.0,3.0,2.0,30.0,...,13.0,6.0,55.0,48.0,15.0,0.0,0.0,1.0,6.0,21.0
1230,1.0,0.0,0.0,1.0,0.0,0.0,1.0,4.0,14.0,2.0,...,44.0,23.0,55.0,24.0,45.0,12.0,7.0,4.0,6.0,16.0


In [22]:
# Evaluate 100-NN search for all sampled queries with stop_condition_leaf=5,
# using only the 10,000 rows that have been inserted so far as the data.
inserted_so_far = data.iloc[:10_000]
results_knn, knn_distributions = get_knn_perf(
    db.lmi,
    inserted_so_far,
    queries,
    stop_condition_leaf=5,
    k=100,
    metric='L2',
)

[2023-01-02 11:53:05,998][INFO ][get_knn_perf] K-NN perf for query=6487, time: 0.1053609848022461
[2023-01-02 11:53:06,086][INFO ][get_knn_perf] K-NN perf for query=8785, time: 0.08386826515197754
[2023-01-02 11:53:06,164][INFO ][get_knn_perf] K-NN perf for query=7390, time: 0.07467103004455566
[2023-01-02 11:53:06,307][INFO ][get_knn_perf] K-NN perf for query=7078, time: 0.1391584873199463
[2023-01-02 11:53:06,426][INFO ][get_knn_perf] K-NN perf for query=1230, time: 0.1154026985168457
[2023-01-02 11:53:06,527][INFO ][get_knn_perf] K-NN perf for query=3684, time: 0.09702682495117188
[2023-01-02 11:53:06,620][INFO ][get_knn_perf] K-NN perf for query=5263, time: 0.0895853042602539
[2023-01-02 11:53:06,712][INFO ][get_knn_perf] K-NN perf for query=3533, time: 0.08804750442504883
[2023-01-02 11:53:06,826][INFO ][get_knn_perf] K-NN perf for query=104, time: 0.11023259162902832
[2023-01-02 11:53:06,938][INFO ][get_knn_perf] K-NN perf for query=8080, time: 0.10790371894836426
[2023-01-02 11:

In [23]:
# One row per query. The columns start with the query ID, the leaf nodes the
# answer was collected from (query-predicted-pos) and the probabilities of
# those leaf nodes (probs); the remaining columns are n-objects,
# n-knns-found, time, time-seq-search and recall.
results_knn

Unnamed: 0,query,query-predicted-pos,probs,n-objects,n-knns-found,time,time-seq-search,recall
0,6487,"[(0, 52), (0, 70), (0, 48, 4), (0, 22), (0, 48...","[0.97, 0.02, 0.0, 1.0, 0.0, 0.0]",434,53,0.001586,0.002703,0.53
1,8785,"[(0, 1), (0, 51), (0, 47), (0, 82, 1), (0, 9)]","[0.78, 0.19, 0.03, 0.0, 1.0, 0.0]",531,80,0.001596,0.002792,0.80
2,7390,"[(0, 35), (0, 6), (0, 60), (0, 37), (0, 26)]","[0.93, 0.05, 0.02, 0.0, 0.0]",606,74,0.000991,0.003014,0.74
3,7078,"[(0, 80), (0, 51), (0, 9), (0, 77), (0, 73)]","[0.93, 0.07, 0.0, 0.0, 0.0]",643,61,0.000978,0.003023,0.61
4,1230,"[(0, 1), (0, 82, 1), (0, 82, 4), (0, 79), (0, ...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0]",431,92,0.001571,0.002538,0.92
...,...,...,...,...,...,...,...,...
95,8278,"[(0, 75), (0, 61, 4), (0, 61, 3), (0, 61, 0), ...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0]",290,88,0.001637,0.002173,0.88
96,9452,"[(0, 30), (0, 17), (0, 65), (0, 60), (0, 5)]","[1.0, 0.0, 0.0, 0.0, 0.0]",458,42,0.000979,0.002621,0.42
97,7840,"[(0, 28), (0, 83), (0, 13), (0, 56), (0, 22)]","[0.98, 0.01, 0.01, 0.0, 0.0]",425,51,0.001113,0.002451,0.51
98,1659,"[(0, 54), (0, 72), (0, 17), (0, 57), (0, 65)]","[0.59, 0.4, 0.0, 0.0, 0.0]",386,48,0.000982,0.002417,0.48


#### 3. Individual search

In [47]:
# Search the index for a single query (the first sampled one) with
# stop_condition_leaf=20. The call returns four values; judging by their
# names: the visited leaf-node positions, their probabilities, the number of
# candidate objects, and the search time.
query = queries.iloc[0]
pred_leaf_nodes, prob_distr, n_objects, time_taken = db.lmi.search(
    query, stop_condition_leaf=20
)

In [48]:
# Inspect the four values returned by the search.
pred_leaf_nodes, prob_distr, n_objects, time_taken 

([(0, 52)], [0.9714974164962769], 95, 0.001135110855102539)

In [49]:
# Gather the IDs of all objects stored in the leaf nodes returned by the search.
object_ids = [
    object_id
    for leaf_position in pred_leaf_nodes
    for object_id in db.lmi.nodes[leaf_position].object_ids
]

In [50]:
# Run sequential_search over the candidate objects only (100 is passed as its
# third argument) and map the positions it returns back to object IDs.
candidates = data.loc[object_ids]
nearest_positions = sequential_search(query, candidates, 100, metric='L2')
found_k_nns = candidates.index[nearest_positions]

In [51]:
# Number of neighbours returned and a peek at the first four IDs.
len(found_k_nns), found_k_nns[:4]

(95, Int64Index([458834, 876444, 451403, 512865], dtype='int64'))

In [52]:
# Ground truth for the query, computed against the full `data` frame (not only
# the rows inserted so far). nns[0] is used below as positions into data.index.
nns = get_objective_knns(query.values, data, metric='L2')

##### None of the true NNs were found. This is not surprising: only 10k of the 1M objects have been inserted so far.

In [53]:
# IDs returned by the index search that are also among the first 100
# ground-truth neighbours.
true_top_100 = set(data.index[nns[0]][:100])
found_nns = set(found_k_nns) & true_top_100
found_nns

set()

In [None]:
# Third batch: rows 10,000-99,999, bringing the inserted total to 100,000.
data_part = data.iloc[10_000:100_000]
%time info_df = db.insert(data_part, info_df)

In [57]:
# Display the index structure: one row per node key with its type and the
# value reported in the `children` column.
db.lmi.dump_structure()

Unnamed: 0_level_0,type,children
key,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0,)",InnerNode,100
"(0, 0)",InnerNode,11
"(0, 0, 0)",LeafNode,168
"(0, 0, 1)",LeafNode,124
"(0, 0, 2)",LeafNode,143
...,...,...
"(0, 82, 3)",LeafNode,156
"(0, 82, 4)",LeafNode,123
"(0, 82, 5)",LeafNode,162
"(0, 82, 6)",LeafNode,123


In [58]:
# Repeat the single-query search (same query, same stop_condition_leaf=20),
# now against the index after the third insert.
query = queries.iloc[0]
pred_leaf_nodes, prob_distr, n_objects, time_taken = db.lmi.search(
    query, stop_condition_leaf=20
)

In [59]:
# Gather the IDs of all objects stored in the leaf nodes returned by the search.
object_ids = [
    object_id
    for leaf_position in pred_leaf_nodes
    for object_id in db.lmi.nodes[leaf_position].object_ids
]

In [60]:
# Run sequential_search over the candidate objects only (100 is passed as its
# third argument) and map the positions it returns back to object IDs.
candidates = data.loc[object_ids]
nearest_positions = sequential_search(query, candidates, 100, metric='L2')
found_k_nns = candidates.index[nearest_positions]

##### After inserting 10% of the dataset, some kNNs were found!

In [61]:
# IDs returned by the index search that are also among the first 100
# ground-truth neighbours.
true_top_100 = set(data.index[nns[0]][:100])
found_nns = set(found_k_nns) & true_top_100
found_nns

{48355, 80017, 242591, 293605, 306966, 317668, 547080, 654078, 661804}

In [63]:
# Presumably the number of objects currently stored in the index; 100,000 is
# expected after the three inserts.
db.lmi.get_n_of_objects()

100000

In [64]:
# Overflow/underflow counts for inner and leaf nodes, as reported by the index.
# NOTE(review): presumably measured against the capacity and children limits
# in config.yml - confirm.
db.lmi.inconsistencies_stats()

Unnamed: 0,InnerNode-overflow,InnerNode-underflow,LeafNode-overflow,LeafNode-underflow
0,0,0,1,213
