## ANN Imputer

In [280]:
import numpy as np
import pandas as pd
import hnswlib
import faiss

from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import load_iris

from os import EX_OSFILE

### Подготовка датасета

In [281]:
iris = load_iris()
iris_data = iris.data
iris_feature_names = iris.feature_names
df_iris = pd.DataFrame(iris_data, columns=iris_feature_names)

In [282]:
df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


5% значений будут заменены на NaN

In [283]:
nan_percent = 0.05

Всего элементов в датасете и сколько хотим заменить:

In [284]:
total_values = np.prod(df_iris.shape)
nan_count = int(total_values * nan_percent)

print(total_values)
print(nan_count)

600
30


Случайные индексы для пропусков

In [285]:
rand_row_idx = np.random.randint(0, df_iris.shape[0], size=nan_count)
rand_col_idx = np.random.randint(0, df_iris.shape[1], size=nan_count)

Вставляем np.nan в случайно выбранные позиции

In [286]:
for row, col in zip(rand_row_idx, rand_col_idx):
    df_iris.iat[row, col] = np.nan

In [287]:
print(df_iris.isna().sum())

sepal length (cm)     8
sepal width (cm)      4
petal length (cm)     7
petal width (cm)     10
dtype: int64


### HNSW Imputer

In [288]:
def build_hnsw(build_data, space='l2', M=32, ef_construction=32, ef = 32):
    space = space
    M = M
    dim = build_data.shape[-1]
    ef_construction = ef
    index = hnswlib.Index(space=space, dim=dim)
    index.init_index(max_elements=build_data.shape[0], ef_construction=ef_construction, M=M)
    index.add_items(np.float32(build_data), np.arange(build_data.shape[0]))
    return index

def search_hnsw(index, query_data, k, efSearch=10):
    index.set_ef(efSearch)
    labels, distances = index.knn_query(np.float32(query_data), k=k)
    return distances, labels

class HNSWSearcher(object):
    def __init__(self, space='l2', M=32, ef_construction=32, ef = 32):
        self.index = None
        self.dim = None
        self.space = space
        self.M = M
        self.ef_construction = ef_construction
        self.ef = ef

    def fit(self, X):
        self.index = build_hnsw(X, space=self.space, M=self.M, ef_construction=self.ef_construction, ef = self.ef)
        return self

    def kneighbors(self, X, k, efSearch=10):
        if self.index is None:
            raise ValueError("Unfitted")
        
        distances, labels = search_hnsw(self.index, X, k, efSearch)
        return distances, labels

In [289]:
class HNSWImputer:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X):
        self.nn_model_ = HNSWSearcher()
        self.nn_model_.fit(X)

    def impute(self, X):
        missing_mask = np.isnan(X)
        if not np.any(missing_mask):
            return X

        # Find indices of missing values
        missing_indices = np.where(missing_mask)

        # Replace missing values with 0 for the purpose of finding nearest neighbors
        X_zeroed = X.fillna(0)

        # Find nearest neighbors for each missing value
        distances, indices = self.nn_model_.kneighbors(X_zeroed, k = self.n_neighbors)

        # Impute missing values with mean of nearest neighbors
        ## Тут проблема в том, что X имеет 150 строчек и 4 столбца. Здесь некорректно применять данный метод
        # imputed_values = np.mean(np.take_along_axis(X, indices, axis=0), axis=1)
        
        for row, col in zip(missing_indices[0], missing_indices[1]):
            mean_value = X_zeroed.iloc[indices[row], col].mean()
            X_zeroed.at[row, X_zeroed.columns[col]] = mean_value

        return X_zeroed

### Обучение HNSWImputer и получение ближайших соседей

In [290]:
hnsw_imputer = HNSWImputer(n_neighbors=5)

In [291]:
hnsw_imputer.fit(df_iris)

In [292]:
hnsw_imputer.impute(df_iris)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.20
1,4.9,3.0,1.4,0.20
2,4.7,3.2,1.3,0.20
3,4.6,3.1,1.5,0.20
4,5.0,3.6,1.4,0.12
...,...,...,...,...
145,6.7,3.0,5.2,2.30
146,6.3,2.5,5.0,1.90
147,6.5,3.0,5.2,2.00
148,6.2,3.4,5.4,2.30


### FAISS Imputer

In [297]:
def build_IVFPQ(build_data, coarse_index, nlist, m, nbits, metric, num_threads=1):
    dim = build_data.shape[1]
    faiss.omp_set_num_threads(num_threads)
    
    index = faiss.IndexIVFPQ(
        coarse_index,
        dim,
        nlist,
        m,
        nbits,
        metric
    )
    index.train(build_data)
    index.add(build_data)
    return index

def build_IVFFlat(build_data, coarse_index, nbits, metric, num_threads=1):
    dim = build_data.shape[1]
    faiss.omp_set_num_threads(num_threads)
    
    index = faiss.IndexIVFFlat(
        coarse_index,
        dim,
        nlist,
        metric
    )
    index.train(build_data)
    index.add(build_data)
    return index

def build_flat_l2(build_data , **fixed_params):
    dim = build_data.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.train(build_data)
    index.add(build_data)
    return index


def build_faiss_cosine(build_data, **fixed_params):
    dim = build_data.shape[1]
    faiss.normalize_L2(build_data.astype(np.float32))
    index = faiss.IndexFlatIP(dim)
    index.train(build_data)
    index.add(build_data)
    return index


def search_flat(index, query_data, k):
    distances, labels = index.search(x=query_data, k=k)
    return distances, labels

def search_faiss_cosine(index, query_data, k, nprobe=1):
    faiss.normalize_L2(query_data.astype(np.float32))
    index.nprobe = nprobe
    distances, labels = index.search(query_data, k)
    return distances, labels

def search_faiss(index, query_data, k, nprobe=1):
    index.nprobe = nprobe
    distances, labels = index.search(query_data, k)
    return distances, labels