## ANN Imputer

In [55]:
import numpy as np
import pandas as pd
import hnswlib
import faiss

from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import load_iris

from sklearn.preprocessing import StandardScaler

#from os import EX_OSFILE

### Подготовка датасета

In [3]:
iris = load_iris()
iris_data = iris.data
iris_feature_names = iris.feature_names
df_iris = pd.DataFrame(iris_data, columns=iris_feature_names)

In [4]:
df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


5% значений будут заменены на NaN

In [5]:
nan_percent = 0.05

Всего элементов в датасете и сколько хотим заменить:

In [6]:
total_values = np.prod(df_iris.shape)
nan_count = int(total_values * nan_percent)

print(total_values)
print(nan_count)

600
30


Случайные индексы для пропусков

In [7]:
rand_row_idx = np.random.randint(0, df_iris.shape[0], size=nan_count)
rand_col_idx = np.random.randint(0, df_iris.shape[1], size=nan_count)

Вставляем np.nan в случайно выбранные позиции

In [8]:
for row, col in zip(rand_row_idx, rand_col_idx):
    df_iris.iat[row, col] = np.nan

In [9]:
print(df_iris.isna().sum())

sepal length (cm)     4
sepal width (cm)      5
petal length (cm)     9
petal width (cm)     12
dtype: int64


### HNSW Imputer

In [10]:
def build_hnsw(build_data, space='l2', M=32, ef_construction=32, ef = 32):
    space = space
    M = M
    dim = build_data.shape[-1]
    ef_construction = ef
    index = hnswlib.Index(space=space, dim=dim)
    index.init_index(max_elements=build_data.shape[0], ef_construction=ef_construction, M=M)
    index.add_items(np.float32(build_data), np.arange(build_data.shape[0]))
    return index

def search_hnsw(index, query_data, k, efSearch=10):
    index.set_ef(efSearch)
    labels, distances = index.knn_query(np.float32(query_data), k=k)
    return distances, labels

class HNSWSearcher(object):
    def __init__(self, space='l2', M=32, ef_construction=32, ef = 32):
        self.index = None
        self.dim = None
        self.space = space
        self.M = M
        self.ef_construction = ef_construction
        self.ef = ef

    def fit(self, X):
        self.index = build_hnsw(X, space=self.space, M=self.M, ef_construction=self.ef_construction, ef = self.ef)
        return self

    def kneighbors(self, X, k, efSearch=10):
        if self.index is None:
            raise ValueError("Unfitted")
        
        distances, labels = search_hnsw(self.index, X, k, efSearch)
        return distances, labels

In [11]:
class HNSWImputer:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X):
        self.nn_model_ = HNSWSearcher()
        self.nn_model_.fit(X)

    def impute(self, X):
        missing_mask = np.isnan(X)
        if not np.any(missing_mask):
            return X

        # Find indices of missing values
        missing_indices = np.where(missing_mask)

        # Replace missing values with 0 for the purpose of finding nearest neighbors
        X_zeroed = X.fillna(0)

        # Find nearest neighbors for each missing value
        distances, indices = self.nn_model_.kneighbors(X_zeroed, k = self.n_neighbors)

        # Impute missing values with mean of nearest neighbors
        ## Тут проблема в том, что X имеет 150 строчек и 4 столбца. Здесь некорректно применять данный метод
        # imputed_values = np.mean(np.take_along_axis(X, indices, axis=0), axis=1)
        
        for row, col in zip(missing_indices[0], missing_indices[1]):
            mean_value = X_zeroed.iloc[indices[row], col].mean()
            X_zeroed.at[row, X_zeroed.columns[col]] = mean_value

        return X_zeroed

### Обучение HNSWImputer и получение ближайших соседей

In [56]:
hnsw_imputer = HNSWImputer(n_neighbors=10)

In [58]:
hnsw_imputer.fit(StandardScaler().fit_transform(df_iris))

In [59]:
hnsw_imputer.impute(df_iris)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.400,0.2
1,4.9,3.0,2.160,0.2
2,4.7,3.2,1.300,0.2
3,4.6,3.1,1.500,0.2
4,5.0,3.6,1.400,0.2
...,...,...,...,...
145,6.7,3.0,2.336,2.3
146,6.3,2.5,5.000,1.9
147,6.5,3.0,5.200,2.0
148,6.2,3.4,5.400,2.3


In [62]:
imputed = hnsw_imputer.impute(df_iris.apply(lambda x: StandardScaler().fit_transform(x.reshape(, 1))))

AttributeError: 'Series' object has no attribute 'reshape'

## Check metric

In [48]:
from sklearn.metrics import r2_score

In [49]:
df_iris_orig = pd.DataFrame(iris_data, columns=iris_feature_names)

In [50]:
def check(df_orig, df_nan, df_filled, col):
    imputed_ = df_filled[df_nan[col].isna()][col]
    orig = df_orig[df_nan[col].isna()][col]
    return pd.DataFrame({'imputed': imputed_, 'original':orig})

In [51]:
check(df_iris_orig, df_iris, imputed, 'sepal width (cm)')

Unnamed: 0,imputed,original
9,2.73,3.1
15,3.003,4.4
35,3.003,3.2
105,3.0236,3.0
120,3.0433,3.2


In [52]:
check(df_iris_orig, df_iris, imputed, 'petal length (cm)')

Unnamed: 0,imputed,original
1,1.32,1.4
46,1.402,1.6
57,1.37,3.3
60,1.37,3.5
94,1.36,4.2
112,1.422,5.5
130,1.472,6.1
144,1.422,5.7
145,1.422,5.2


In [53]:
check(df_iris_orig, df_iris, imputed, 'sepal length (cm)')

Unnamed: 0,imputed,original
51,5.19,6.4
65,5.19,6.7
113,5.19,5.7
122,5.17,7.7


In [54]:
check(df_iris_orig, df_iris, imputed, 'petal width (cm)')

Unnamed: 0,imputed,original
17,0.15,0.3
19,0.165,0.3
25,0.1815,0.2
36,0.19965,0.2
73,0.55965,1.2
84,0.56965,1.5
95,0.56965,1.2
109,0.59965,2.5
111,0.56965,1.9
118,0.55965,2.3


# План:

## HNSW:

1. Нормировать вход
2. Сделать так, чтобы в impute можно было подавать np.array
3. Сравнить с KNNImputer sklearn

## Faiss:
Все то же самое

## Тесты:

1. Отобрать 3-10 датасетов (неслучайные, хотя бы 2 - большие (N*M > 10000))
2. Протестить, сравнить качество (MSE, RMSE, R2, MAPE, time)

## Оформление результата:

1. --- Текст ---
2. Библиотека в PIP -> я сделаю

# Датасеты:

1. Отдельно для маленьких, отдельно для больших
2. OpenML (?), Kaggle(?) -> iris, fetch_california_housing, fraud_detection - банковский - поискать еще список на kaggle


In [63]:
from sklearn.datasets import fetch_california_housing

In [64]:
fetch_california_housing(return_X_y=True)[0].shape

(20640, 8)

### FAISS Imputer

In [297]:
def build_IVFPQ(build_data, coarse_index, nlist, m, nbits, metric, num_threads=1):
    dim = build_data.shape[1]
    faiss.omp_set_num_threads(num_threads)
    
    index = faiss.IndexIVFPQ(
        coarse_index,
        dim,
        nlist,
        m,
        nbits,
        metric
    )
    index.train(build_data)
    index.add(build_data)
    return index

def build_IVFFlat(build_data, coarse_index, nbits, metric, num_threads=1):
    dim = build_data.shape[1]
    faiss.omp_set_num_threads(num_threads)
    
    index = faiss.IndexIVFFlat(
        coarse_index,
        dim,
        nlist,
        metric
    )
    index.train(build_data)
    index.add(build_data)
    return index

def build_flat_l2(build_data , **fixed_params):
    dim = build_data.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.train(build_data)
    index.add(build_data)
    return index


def build_faiss_cosine(build_data, **fixed_params):
    dim = build_data.shape[1]
    faiss.normalize_L2(build_data.astype(np.float32))
    index = faiss.IndexFlatIP(dim)
    index.train(build_data)
    index.add(build_data)
    return index


def search_flat(index, query_data, k):
    distances, labels = index.search(x=query_data, k=k)
    return distances, labels

def search_faiss_cosine(index, query_data, k, nprobe=1):
    faiss.normalize_L2(query_data.astype(np.float32))
    index.nprobe = nprobe
    distances, labels = index.search(query_data, k)
    return distances, labels

def search_faiss(index, query_data, k, nprobe=1):
    index.nprobe = nprobe
    distances, labels = index.search(query_data, k)
    return distances, labels