## Approximate Nearest Neighbor from Scratch

Next steps (to do):
2. Add a priority queue for the predict function
3. Based on the max distance from every node

## Import Library

In [36]:
# import basic library for data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import joblib

# import sys library 
# and append src folder into path
import sys
sys.path.append("../")

# import approximate_nn and knn library
from src.approximate_nn import ApproximateNearestNeighbor
from src.approximate_nn import KNearestNeighbor

# import train_test_split
from sklearn.model_selection import train_test_split

# import word embedding model
import tensorflow as tf
# Load the saved Keras embedding model from disk. It is kept as a notebook-level
# global and is called directly inside search_similar_item to embed the query text.
embedding_nnlm = tf.keras.models.load_model("../../model/nnlm-id-dim50/")



## A. Data Preparation

In [37]:
# Load the pickled DataFrame of words and their embeddings
embedding_data_nnlm = joblib.load("../../data/processed/word_embedding_nnlm.pkl")
# Print the column / dtype / non-null summary to confirm what was loaded
embedding_data_nnlm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93566 entries, 0 to 93565
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   words      93566 non-null  object
 1   embedding  93566 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB


In [38]:
# Separate the embedding vectors (X) from the words they belong to (y).
# NOTE: no train/test split is performed here; both series cover every row.
X_nnlm = embedding_data_nnlm['embedding']
y_nnlm = embedding_data_nnlm['words']

# Save both series to disk so they can be reloaded on their own later
joblib.dump(X_nnlm, "../../data/processed/X_nnlm.pkl")
joblib.dump(y_nnlm, "../../data/processed/y_nnlm.pkl")

['../../data/processed/y_nnlm.pkl']

In [39]:
# Reload the X / y series from the pickle files written with joblib.dump above
X_nnlm = joblib.load("../../data/processed/X_nnlm.pkl")
y_nnlm = joblib.load("../../data/processed/y_nnlm.pkl")

## B. Model Training / Registering Data Into Model

### B.1. ANN Model

In [40]:
# Configure the approximate index: cosine similarity, a single tree,
# and a minimum split size of 100.
ann_model_nnlm = ApproximateNearestNeighbor(
    min_size_split=100,
    distance_type="cosine-similarity",
    n_tree=1,
)

# Register every embedding with its word as the label.
# The embedding column has object dtype, so it is round-tripped through a
# nested list to let NumPy build one array from the per-row values
# (presumably one vector per row -- confirm against the pickle contents).
ann_model_nnlm.fit(
    X=np.array(X_nnlm.to_numpy().tolist()),
    y=y_nnlm.values.astype('str'),
)

### B.2. KNN Model

In [41]:
# Exact nearest-neighbor baseline using the same cosine-similarity setting.
knn_model_nnlm = KNearestNeighbor(
    distance_type="cosine-similarity",
)

# Register every embedding with its word as the label.
# The embedding column has object dtype, so it is round-tripped through a
# nested list to let NumPy build one array from the per-row values
# (presumably one vector per row -- confirm against the pickle contents).
knn_model_nnlm.fit(
    X=np.array(X_nnlm.to_numpy().tolist()),
    y=y_nnlm.values.astype('str'),
)

## C. Model Testing / Searching Similar Items

In [42]:
def search_similar_item(embedding_model : str,
                        neighbors_model : object,
                        text_input : str):
    """
    Embed a piece of text and look up its most similar registered items.

    Parameters
    ----------
    embedding_model : str
        Name of the embedding model to use. Only "nnlm" is supported; it maps
        to the notebook-level `embedding_nnlm` model.
    neighbors_model : object
        Fitted neighbor index exposing `find_similar_items(query)`, e.g. the
        ANN or KNN models built in this notebook.
    text_input : str
        Text to embed and search for.

    Returns
    -------
    Whatever `neighbors_model.find_similar_items` returns for the query.

    Raises
    ------
    ValueError
        If `embedding_model` names a model other than "nnlm".
    """
    # Fail loudly rather than silently embedding with NNLM when a different
    # model was requested: only one embedding model is loaded in this notebook.
    if embedding_model != "nnlm":
        raise ValueError(
            f"Unsupported embedding_model {embedding_model!r}; only 'nnlm' is available."
        )

    # generate embedding for text input (relies on the global `embedding_nnlm`);
    # squeeze() drops the batch axis of the single-item batch
    input_embed = embedding_nnlm([text_input]).numpy().squeeze().tolist()

    # search neighbors by embedding; the vector is re-wrapped in a list so the
    # index receives a one-row batch. The list round-trip is kept on purpose:
    # dropping it could change the dtype of the query array.
    similar_item = neighbors_model.find_similar_items(np.array([input_embed]))

    return similar_item

### C.1. ANN Model

In [43]:
# Look up the items the ANN index returns for the query word "kucing"
sim_item = search_similar_item(embedding_model="nnlm", neighbors_model=ann_model_nnlm, text_input="kucing")
# Bare expression so the notebook displays the result
sim_item

array(['kucing', 'angsa', 'monyet', 'lalat', 'duyung', 'siamang',
       'timang gajah', 'pante kera', 'lelaki', 'cecak'], dtype='<U44')

### C.2. KNN Model

In [44]:
# Same query against the exact KNN index, for comparison with the ANN result
sim_item_ = search_similar_item(embedding_model="nnlm", neighbors_model=knn_model_nnlm, text_input="kucing")
# Bare expression so the notebook displays the result
sim_item_

array(['kucing', 'angsa', 'ular', 'anjing', 'batu kucing', 'monyet',
       'kelinci', 'binatang', 'beruang', 'sumur kucing'], dtype='<U44')

## D. Searching Time Comparison

In [50]:
%%timeit -r 1 -n 1000
# NOTE: each timed loop runs the whole helper, i.e. the text-embedding step plus
# the ANN lookup, not the lookup alone. With a single repeat (-r 1) the reported
# std. dev. is always 0.
sim_item = search_similar_item(embedding_model="nnlm", neighbors_model=ann_model_nnlm, text_input="kucing")

9.82 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1,000 loops each)


In [51]:
%%timeit -r 1 -n 1000
# NOTE: each timed loop runs the whole helper, i.e. the text-embedding step plus
# the KNN lookup, not the lookup alone. With a single repeat (-r 1) the reported
# std. dev. is always 0.
sim_item_ = search_similar_item(embedding_model="nnlm", neighbors_model=knn_model_nnlm, text_input="kucing")

30.8 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1,000 loops each)
