## Approximate Nearest Neighbor from Scratch

Next steps (TODO):

2. Add a priority queue for the predict function
3. Based on max distance from every node

## Import Library

In [16]:
# import basic library for data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import joblib

# import sys library 
# and append src folder into path
import sys
sys.path.append("../")

# import approximate_nn and knn library
from src.approximate_nn import ApproximateNearestNeighbor
from src.approximate_nn import KNearestNeighbor

# import train_test_split
from sklearn.model_selection import train_test_split

# import word embedding model
# Load the saved Keras model from disk into `embedding`. Later cells call it
# with a one-element list of text and read the result through `.numpy()`.
# NOTE(review): the directory name suggests a 50-dimensional Indonesian NNLM
# text embedding -- confirm against the model's provenance.
import tensorflow as tf
embedding = tf.keras.models.load_model("../../model/nnlm-id-dim50/")



## A. Data Preparation

In [2]:
# import embedding data
# Load the pickled word/embedding table and print its summary.
# Per the recorded output of this cell, it is a pandas DataFrame with
# 93,566 rows and two object-dtype columns: `words` and `embedding`.
# NOTE(review): joblib.load unpickles the file, so only load files that come
# from a trusted source.
embedding_data = joblib.load("../../data/processed/word_embedding.pkl")
embedding_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93566 entries, 0 to 93565
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   words      93566 non-null  object
 1   embedding  93566 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB


In [None]:
## create train and test data
# Features are the `embedding` column; labels are the `words` column.
X = embedding_data['embedding']
y = embedding_data['words']
# Hold out 1% of the rows as the test set; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=666)

# save training and test data
# Each split is written to its own pickle file; the next cell reloads these
# same four paths.
joblib.dump(X_train, "../../data/processed/X_train.pkl")
joblib.dump(X_test, "../../data/processed/X_test.pkl")
joblib.dump(y_train, "../../data/processed/y_train.pkl")
joblib.dump(y_test, "../../data/processed/y_test.pkl")

In [17]:
# load pickle data
# Reload the four train/test splits from the pickle files that the split cell
# writes, so the notebook can resume from here without recomputing the split.
X_train = joblib.load("../../data/processed/X_train.pkl")
X_test = joblib.load("../../data/processed/X_test.pkl")
y_train = joblib.load("../../data/processed/y_train.pkl")
y_test = joblib.load("../../data/processed/y_test.pkl")

## B. Model Training / Registering Data Into Model

### B.1. ANN Model

In [28]:
# Configure the approximate index: cosine-similarity metric, one tree and
# min_size_split=100.
# NOTE(review): the exact meaning of min_size_split and n_tree is defined in
# src/approximate_nn -- confirm there before tuning.
ann_model = ApproximateNearestNeighbor(
    min_size_split=100,
    distance_type="cosine-similarity",
    n_tree=1,
)
# Register the training data: the Series of per-word vectors is converted to a
# single ndarray (presumably one row per word -- TODO confirm) and the labels
# are cast to a string array.
ann_model.fit(
    X=np.array(X_train.to_numpy().tolist()),
    y=y_train.values.astype('str'),
)

### B.2. KNN Model

In [23]:
# Baseline model, configured with the cosine-similarity metric.
knn_model = KNearestNeighbor(
    distance_type="cosine-similarity",
)
# Register the training data: the Series of per-word vectors is converted to a
# single ndarray (presumably one row per word -- TODO confirm) and the labels
# are cast to a string array.
knn_model.fit(
    X=np.array(X_train.to_numpy().tolist()),
    y=y_train.values.astype('str'),
)

## C. Model Testing / Searching Similar Items

In [20]:
def search_similar_item(embedding_model : object,
                        neighbors_model : object,
                        text_input : str):
    """Embed ``text_input`` and look up its neighbours in ``neighbors_model``.

    Parameters
    ----------
    embedding_model : object
        Callable invoked with the one-element list ``[text_input]``; its
        result must expose ``.numpy()``.
    neighbors_model : object
        Object exposing ``find_similar_items``; it receives the query as a
        NumPy array.
    text_input : str
        Query text to embed.

    Returns
    -------
    Whatever ``neighbors_model.find_similar_items`` returns for the query.
    """
    # Embed the single query, drop length-1 axes and convert the result to a
    # plain Python list.
    query_vector = embedding_model([text_input]).numpy().squeeze().tolist()

    # Wrap the vector in an outer list so the model receives a batch of one.
    query_batch = np.array([query_vector])

    return neighbors_model.find_similar_items(query_batch)

### C.1. ANN Model

In [24]:
# Query the ANN model for the neighbours of "kucing"; the bare name on the
# last line makes the notebook display the result.
sim_item = search_similar_item(
    embedding_model=embedding,
    neighbors_model=ann_model,
    text_input="kucing",
)
sim_item

array(['kucing', 'ular', 'anjing', 'monyet', 'beruang', 'sumur kucing',
       'singa', 'rusa', 'kelelawar', 'piton'], dtype='<U44')

### C.2. KNN Model

In [25]:
# Same query against the KNN model, for comparison with the ANN result;
# the bare name on the last line makes the notebook display the result.
sim_item_ = search_similar_item(
    embedding_model=embedding,
    neighbors_model=knn_model,
    text_input="kucing",
)
sim_item_

array(['kucing', 'angsa', 'ular', 'anjing', 'batu kucing', 'monyet',
       'kelinci', 'binatang', 'beruang', 'sumur kucing'], dtype='<U44')

## D. Searching Time Comparison

In [29]:
%%timeit
# Times the whole search_similar_item call with the ANN model: embedding the
# text "kucing" plus the neighbour lookup, not the lookup alone.
# NOTE(review): the embedding step is identical in the KNN timing cell, so the
# gap between the two timings comes from the lookup step.
sim_item = search_similar_item(embedding_model=embedding, neighbors_model=ann_model, text_input="kucing")

11.5 ms ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
%%timeit
# Times the whole search_similar_item call with the KNN model: embedding the
# text "kucing" plus the neighbour lookup, not the lookup alone.
# NOTE(review): the embedding step is identical in the ANN timing cell, so the
# gap between the two timings comes from the lookup step.
sim_item_ = search_similar_item(embedding_model=embedding, neighbors_model=knn_model, text_input="kucing")

31.8 ms ± 735 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
