In [32]:
import os
from typing import Dict, List, Annotated
import numpy as np
import faiss
from sklearn.neighbors import KNeighborsClassifier
import pickle
import faiss

In [33]:
# Name of the CSV file read from ./index/ below; the same name plus ".index"
# is used for the files the later cells write.
filename = "0000_0.csv"

In [34]:
if filename.endswith(".csv"):
    # Parse the CSV into (record id, embedding) pairs: the first column is the
    # integer id, every remaining column is one embedding component.
    bucket_rec = []
    with open(f"./index/{filename}", "r") as fin:
        # iterate the handle directly instead of materialising readlines()
        for row in fin:
            if not row.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            row_splits = row.split(",")
            # named `rec_id` rather than `id` so the builtin id() is not
            # shadowed for the rest of the notebook session
            rec_id = int(row_splits[0])
            embed = np.array([float(e) for e in row_splits[1:]])
            bucket_rec.append((rec_id, embed))

    # The CSV has been fully consumed, so fit and save outside the `with`
    # block instead of keeping the input file open.
    # KNN from sklearn with cosine distance; the record ids are passed as the
    # labels, one per embedding row.
    knn = KNeighborsClassifier(n_neighbors=10, metric="cosine")
    knn.fit(
        np.array([e[1] for e in bucket_rec]),
        np.array([e[0] for e in bucket_rec]),
    )
    # save the fitted model using pickle
    with open(f"./index/{filename}.index", "wb") as fout:
        pickle.dump(knn, fout)

In [35]:
# Query embedding with 70 components, reshaped into a single-row 2-D array
# (shape (1, 70)) for the nearest-neighbour lookups below.
query = np.array(
    [
        0.374540119,
        0.950714306,
        0.731993942,
        0.598658484,
        0.15601864,
        0.15599452,
        0.058083612,
        0.866176146,
        0.601115012,
        0.708072578,
        0.020584494,
        0.969909852,
        0.832442641,
        0.212339111,
        0.181824967,
        0.18340451,
        0.304242243,
        0.524756432,
        0.431945019,
        0.29122914,
        0.611852895,
        0.139493861,
        0.292144649,
        0.366361843,
        0.456069984,
        0.785175961,
        0.199673782,
        0.514234438,
        0.592414569,
        0.046450413,
        0.607544852,
        0.170524124,
        0.065051593,
        0.948885537,
        0.965632033,
        0.808397348,
        0.304613769,
        0.097672114,
        0.684233027,
        0.440152494,
        0.122038235,
        0.49517691,
        0.034388521,
        0.909320402,
        0.258779982,
        0.662522284,
        0.311711076,
        0.520068021,
        0.546710279,
        0.184854456,
        0.969584628,
        0.775132823,
        0.939498942,
        0.89482735,
        0.597899979,
        0.921874235,
        0.088492502,
        0.195982862,
        0.045227289,
        0.325330331,
        0.38867729,
        0.271349032,
        0.828737509,
        0.356753327,
        0.28093451,
        0.542696083,
        0.140924225,
        0.802196981,
        0.074550644,
        0.986886937,
    ]
).reshape(1, -1)

In [36]:
# kneighbors() returns the *row positions* of the neighbours inside the matrix
# passed to fit(), not the labels given as `y`, so the positions have to be
# mapped back to the record ids stored alongside the embeddings in bucket_rec.
distances, neighbor_rows = knn.kneighbors(query, n_neighbors=5)
record_ids = np.array([rec[0] for rec in bucket_rec])
labels = record_ids[neighbor_rows]
print("distances", distances)
print("labels", labels)
# pair each distance with the id of the matching record
print("Calculating score...")
scores = list(zip(distances[0], labels[0]))
scores = sorted(scores)[:5]
# (distance, id) pairs of the top_k records, closest first
print(scores)

distances [[0.17026952 0.17234684 0.17359812 0.18428113 0.18996226]]
labels [[ 71  17 468 292 260]]
Calculating score...
[(0.17026951776473387, 71), (0.172346836401846, 17), (0.17359812140504816, 468), (0.1842811340688324, 292), (0.18996225809097111, 260)]


In [37]:
def _HNSW_index( data, m, ef_construction, filename, ef_search):
    index = faiss.IndexHNSWFlat(70, m)
    # set efConstruction and efSearch parameters
    index.hnsw.efConstruction = ef_construction
    index.hnsw.efSearch = ef_search
    # Wrap the index with IDMap
    id_map = faiss.IndexIDMap(index)
    id_map.add_with_ids(
        np.array([e[1] for e in data]), np.array([e[0] for e in data])
    )
    # save the index
    faiss.write_index(id_map, f"./index/{filename}.index")

In [38]:
# Load (record id, embedding) pairs from the CSV: the first column is the
# integer id, every remaining column is one embedding component.
bucket_rec = []
with open(f"./index/{filename}", "r") as fin:
    # iterate the handle directly instead of materialising readlines()
    for row in fin:
        if not row.strip():
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        row_splits = row.split(",")
        # named `rec_id` rather than `id` so the builtin id() is not shadowed
        rec_id = int(row_splits[0])
        embed = np.array([float(e) for e in row_splits[1:]])
        bucket_rec.append((rec_id, embed))

In [39]:
# Build the HNSW index for the loaded records and persist it.
# NOTE: the target path ./index/{filename}.index is the same path the pickled
# KNN model was written to earlier in this notebook, so that file is replaced.
_HNSW_index(data=bucket_rec, m=128, ef_construction=200, filename=filename, ef_search=32)

In [40]:
# Read the saved index back from disk.
loaded_index = faiss.read_index(f"./index/{filename}.index")

In [41]:
# faiss searches float32 vectors; convert the float64 query explicitly so the
# cell does not depend on the Python wrapper converting it implicitly.
query_f32 = np.ascontiguousarray(query, dtype=np.float32)
distances, labels = loaded_index.search(query_f32, 10)
print("distances", distances)
print("labels", labels)
print("Calculating score...")
# faiss pads the result with label -1 when it finds fewer than k neighbours;
# drop those placeholders so they are never reported as record ids
scores = [
    (dist, label)
    for dist, label in zip(distances[0], labels[0])
    if label != -1
]
scores = sorted(scores)[:10]
# (distance, id) pairs of the top_k records, closest first
print(scores)

distances [[8.17793   8.179846  8.186981  8.198956  8.254056  8.345868  8.506533
  8.797141  8.8509865 8.917404 ]]
labels [[ 950  392 1632  189 6904 4317 3127 9746 7999 7906]]
Calculating score...
[(8.17793, 950), (8.179846, 392), (8.186981, 1632), (8.198956, 189), (8.254056, 6904), (8.345868, 4317), (8.506533, 3127), (8.797141, 9746), (8.8509865, 7999), (8.917404, 7906)]


In [3]:
from itertools import product

def hamming_distance(str1, str2):
    """Return the number of positions at which `str1` and `str2` differ.

    Args:
        str1: first sequence (e.g. a string of '0'/'1' characters).
        str2: second sequence, of the same length as `str1`.

    Returns:
        The count of indices i where str1[i] != str2[i].

    Raises:
        ValueError: if the inputs have different lengths. The Hamming distance
            is only defined for equal-length sequences; a plain zip() would
            silently ignore the tail of the longer input.
    """
    if len(str1) != len(str2):
        raise ValueError(
            f"inputs must have equal length, got {len(str1)} and {len(str2)}"
        )
    return sum(c1 != c2 for c1, c2 in zip(str1, str2))

def nearest_strings(binary_string, m):
    """Return up to `m` binary strings closest to `binary_string` in Hamming distance.

    Candidates are produced by flipping 1 bit, then 2 bits, and so on, so only
    as many distance levels as needed are generated instead of enumerating all
    2**n strings of length n. The input itself (distance 0) is never returned.
    Within one distance, results are in lexicographic order, which matches the
    order of a stable sort over the product('01', repeat=n) enumeration.

    Args:
        binary_string: sequence of '0'/'1' characters.
        m: maximum number of neighbours to return; values <= 0 yield [].

    Returns:
        A list of at most `m` strings of the same length as `binary_string`,
        ordered by increasing Hamming distance.

    Raises:
        ValueError: if `binary_string` contains a character other than '0'/'1'.
    """
    # Local import: the enclosing cell only imports `product` from itertools.
    from itertools import combinations

    if m <= 0:
        return []

    flipped = {'0': '1', '1': '0'}
    if any(bit not in flipped for bit in binary_string):
        raise ValueError("binary_string must contain only '0' and '1'")

    n = len(binary_string)
    nearest = []
    for distance in range(1, n + 1):
        if len(nearest) >= m:
            break
        # every string at exactly `distance` flips from the input
        level = []
        for positions in combinations(range(n), distance):
            bits = list(binary_string)
            for pos in positions:
                bits[pos] = flipped[bits[pos]]
            level.append(''.join(bits))
        # lexicographic tie-break inside one distance level
        level.sort()
        nearest.extend(level)
    return nearest[:m]

# Demo: print the m strings nearest to binary_string (the input itself is not
# part of the result). Change either value to experiment.
binary_string, m = '1010', 3
print(nearest_strings(binary_string, m))

['0010', '1000', '1011']
