In [1]:
import os
from typing import Dict, List, Annotated
import numpy as np
import faiss
from sklearn.neighbors import KNeighborsClassifier
import pickle
import faiss

In [33]:
filename = "0000_0.csv"

In [34]:
if filename.endswith(".csv"):
    bucket_rec = []
    with open(f"./index/{filename}", "r") as fin:
        for row in fin.readlines():
            row_splits = row.split(",")
            # the first element is id
            id = int(row_splits[0])
            # the rest are embed
            embed = [float(e) for e in row_splits[1:]]
            embed = np.array(embed)
            bucket_rec.append((id, embed))
        # build the HNSW index
        # bucket_rec = np.array(bucket_rec)
        # self._HNSW_index(
        #     data=bucket_rec,
        #     m=128,
        #     ef_construction=200,
        #     ef_search=32,
        #     filename=filename,
        # )
        # Knn using sklearn
        knn = KNeighborsClassifier(n_neighbors=10, metric="cosine")
        knn.fit(
            np.array([e[1] for e in bucket_rec]),
            np.array([e[0] for e in bucket_rec]),
        )
        # save the index using pickle
        with open(f"./index/{filename}.index", "wb") as fout:
            pickle.dump(knn, fout)

In [35]:
query = np.array(
    [
        0.374540119,
        0.950714306,
        0.731993942,
        0.598658484,
        0.15601864,
        0.15599452,
        0.058083612,
        0.866176146,
        0.601115012,
        0.708072578,
        0.020584494,
        0.969909852,
        0.832442641,
        0.212339111,
        0.181824967,
        0.18340451,
        0.304242243,
        0.524756432,
        0.431945019,
        0.29122914,
        0.611852895,
        0.139493861,
        0.292144649,
        0.366361843,
        0.456069984,
        0.785175961,
        0.199673782,
        0.514234438,
        0.592414569,
        0.046450413,
        0.607544852,
        0.170524124,
        0.065051593,
        0.948885537,
        0.965632033,
        0.808397348,
        0.304613769,
        0.097672114,
        0.684233027,
        0.440152494,
        0.122038235,
        0.49517691,
        0.034388521,
        0.909320402,
        0.258779982,
        0.662522284,
        0.311711076,
        0.520068021,
        0.546710279,
        0.184854456,
        0.969584628,
        0.775132823,
        0.939498942,
        0.89482735,
        0.597899979,
        0.921874235,
        0.088492502,
        0.195982862,
        0.045227289,
        0.325330331,
        0.38867729,
        0.271349032,
        0.828737509,
        0.356753327,
        0.28093451,
        0.542696083,
        0.140924225,
        0.802196981,
        0.074550644,
        0.986886937,
    ]
).reshape(1, -1)

In [36]:
distances, labels = knn.kneighbors(query, n_neighbors=5)
print("distances", distances)
print("labels", labels)
# calculate the score for each vector in the bucket
print("Calculating score...")
scores = [(distances[0][i], labels[0][i]) for i in range(len(labels[0]))]
scores = sorted(scores)[:5]
# return the ids of the top_k records
print(scores)

distances [[0.17026952 0.17234684 0.17359812 0.18428113 0.18996226]]
labels [[ 71  17 468 292 260]]
Calculating score...
[(0.17026951776473387, 71), (0.172346836401846, 17), (0.17359812140504816, 468), (0.1842811340688324, 292), (0.18996225809097111, 260)]


In [37]:
def _HNSW_index( data, m, ef_construction, filename, ef_search):
    index = faiss.IndexHNSWFlat(70, m)
    # set efConstruction and efSearch parameters
    index.hnsw.efConstruction = ef_construction
    index.hnsw.efSearch = ef_search
    # Wrap the index with IDMap
    id_map = faiss.IndexIDMap(index)
    id_map.add_with_ids(
        np.array([e[1] for e in data]), np.array([e[0] for e in data])
    )
    # save the index
    faiss.write_index(id_map, f"./index/{filename}.index")

In [38]:
bucket_rec = []
with open(f"./index/{filename}", "r") as fin:
    for row in fin.readlines():
        row_splits = row.split(",")
        # the first element is id
        id = int(row_splits[0])
        # the rest are embed
        embed = [float(e) for e in row_splits[1:]]
        embed = np.array(embed)
        bucket_rec.append((id, embed))

In [39]:
# bucket_rec = np.array(bucket_rec)
_HNSW_index(
    data=bucket_rec,
    m=128,
    ef_construction=200,
    ef_search=32,
    filename=filename,
)

In [40]:
loaded_index = faiss.read_index(
    f"./index/{filename}.index"
)

In [41]:
distances, labels = loaded_index.search(query, 10)
print("distances", distances)
print("labels", labels)
# calculate the score for each vector in the bucket
print("Calculating score...")
scores = [(distances[0][i], labels[0][i]) for i in range(len(labels[0]))]
scores = sorted(scores)[:10]
# return the ids of the top_k records
print(scores)
# return [s[1] for s in scores]

distances [[8.17793   8.179846  8.186981  8.198956  8.254056  8.345868  8.506533
  8.797141  8.8509865 8.917404 ]]
labels [[ 950  392 1632  189 6904 4317 3127 9746 7999 7906]]
Calculating score...
[(8.17793, 950), (8.179846, 392), (8.186981, 1632), (8.198956, 189), (8.254056, 6904), (8.345868, 4317), (8.506533, 3127), (8.797141, 9746), (8.8509865, 7999), (8.917404, 7906)]


In [1]:
import numpy as np
from scipy.cluster.vq import kmeans2
import faiss

In [2]:
num_part = 16  # number of IVF partitions
dataset = np.random.normal(size=(1000, 70))

In [3]:
(centroids, assignments) = kmeans2(dataset, num_part, iter=32)

In [5]:
centroids.shape

(16, 70)

In [6]:
centroids #the choesen cetroids for the 16 partitions

array([[ 0.51168185,  0.18225576,  0.37021322, ..., -0.31593024,
         0.11881143,  0.03136554],
       [-0.28565728, -0.49904209, -0.0612307 , ..., -0.09203601,
        -0.16423853, -0.05104231],
       [-0.06713186,  0.46291873,  0.7767644 , ...,  0.02974499,
        -0.9253968 , -0.16321818],
       ...,
       [-0.0770925 , -0.06430145, -0.41172783, ...,  0.09970878,
         0.2766169 , -0.2589768 ],
       [ 0.17554476,  0.2829007 ,  0.2984314 , ...,  0.01373098,
         0.11269911, -0.25094596],
       [ 0.21551573,  0.07543853, -0.57635857, ..., -0.15508406,
        -0.01352909, -0.35860672]])

In [14]:
min(assignments) #assignment for each vector in the dataset

0

In [15]:
test = [np.argmin(np.linalg.norm(vec - centroids, axis=1)) for vec in dataset]

In [17]:
np.all(test == assignments)

True

In [19]:
assignments[0]

12

In [21]:
index = [[] for _ in range(num_part)]
for n, k in enumerate(assignments):
    # n is the index of the vector
    # k is the index of the cluster
    index[k].append(n) # the nth vector gets added to the kth cluster

In [22]:
query = np.random.normal(size=(70,))
c = np.argmin(np.linalg.norm(centroids - query, axis=1))  # find the nearest partition
print(c)
nearest = np.argmin(np.linalg.norm(dataset[index[c]] - query, axis=1))  # find nearest neighbor
print(nearest)

3
71


In [26]:
# print(dataset[nearest])  # the index of the nearest neighbor in the dataset
# print(query)
print(np.linalg.norm(dataset[nearest] - query))

11.389149789375157


In [3]:
index = faiss.IndexFlatL2(70)
index.add(dataset)

In [10]:
k = 5
query = np.random.normal(size=(70,))

In [11]:
%%time
D, I = index.search(query.reshape(1, -1), k)

CPU times: total: 0 ns
Wall time: 0 ns


In [12]:
D

array([[78.15851, 80.8537 , 85.08509, 85.5335 , 87.41831]], dtype=float32)

In [13]:
I

array([[973, 868, 993, 588, 879]], dtype=int64)

In [2]:
import numpy as np  
fp = np.memmap("./index/index_peter.dta", dtype='float32', mode='r', shape=(2,71))
print(fp)
# fp[:] = np.random.randn(3,4)
# fp[0,0] = 1.1
# fp[0,1] = 2.2
# fp.flush()

FileNotFoundError: [Errno 2] No such file or directory: './index/index_peter.dta'

In [2]:
import numpy as np

# # Assuming you have two NumPy arrays of size n, array1 and array2
# array1 = np.array([1,2,3])  # Replace with your actual array1
# array2 = np.array([1,2,3])  # Replace with your actual array2

# # Concatenate array1 and array2 to form a new array of shape (n, 2)
# result = np.column_stack((array1, array2))

# print(result)
min(
    10**6,
    int(10**6 * 0.1)
    if 10**6 >= 10**6
    else None,
)

100000

In [14]:
import numpy as np
vec1= np.random.normal(size=(1000, 70))
vec2= np.random.normal(size=(1, 70))
# vec2_broadcasted = np.broadcast_to(vec2, vec1.shape)
# Calculate the dot product between each vector in vec1 and the broadcasted vec2
dot_product = np.sum(vec1 * vec2, axis=1)
# Calculate the dot product between each vector in vec1 and vec2
# dot_product = np.dot(vec1, vec2.T)

# Calculate the norm of each vector in vec1
norm_vec1 = np.linalg.norm(vec1, axis=1)

# Calculate the norm of vec2
norm_vec2 = np.linalg.norm(vec2)

# Calculate the cosine similarity for each pair of vectors
cosine_similarity = dot_product / (norm_vec1 * norm_vec2)

cosine_similarity.squeeze()
print(np.max(cosine_similarity))

0.38597861795300203


In [2]:
import numpy as np
x = np.loadtxt(
        "saved_db.csv",
        delimiter=",",
        skiprows=1000,
        dtype=np.float32,
        usecols=range(0, 71),
        max_rows=1
,
    )
print(x.shape)

(71,)
