In [2]:
# Installing FAISS for efficient similarity search

!pip install faiss-cpu --no-cache

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
[K     |████████████████████████████████| 8.6 MB 1.9 MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.2


## FAISS - ANN (Approximate Nearest Neighbours)

In [5]:

import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
from Constants import Constants

GRAPH_FILE_NAME = Constants.GRAPH_FILE_NAME.value

df_node2vec = pd.read_parquet('../Data/Embedding_Data/node2vec_embedding_df_{}.parquet'.format(GRAPH_FILE_NAME.split('.')[0]))
df_node2vec.columns = ['product_id', 'embedding_vector']

df_full = pd.read_parquet('../Data/optimised_raw_data.parquet').drop_duplicates(subset=['product_id'])
df_full = df_full[['product_id', 'category_code']]

df_node2vec = df_node2vec.merge(df_full, how='left', on='product_id')

In [6]:
xb = np.array(df_node2vec.embedding_vector.tolist())

In [7]:
xb.shape

(211861, 128)

In [8]:
# Initializing and training a FAISS index with L2 distance metric


import faiss                  
index = faiss.IndexFlatL2(128)   
print(index.is_trained)
index.add(xb)                
print(index.ntotal)

True
211861


In [9]:
# Performing a similarity search for the first 5000 products

k = 10                         
D, I = index.search(xb[:5000], k) 
I

array([[     0, 211448,     10, ..., 211705,      8,    460],
       [     1, 211484, 210977, ...,    733, 152873,    734],
       [     2, 211072,    942, ..., 211434, 211068, 179393],
       ...,
       [  4997,   3874,   5066, ...,   5100,   5705,   4324],
       [  4998,   6100, 179616, ..., 178522, 177862, 179103],
       [  4999,   5001,   3977, ...,   4317,   4661,   4663]])

In [10]:
# Retrieving similar products for product query - 4321

df_node2vec[df_node2vec.index.isin(I[4321])]

Unnamed: 0,product_id,embedding_vector,category_code
3948,10500709,"[0.7503771, -0.18183377, 0.16479845, -0.100275...",kids.toys
3994,10500914,"[0.45275557, 0.2702926, -0.07311262, -0.314343...",kids.toys
4146,10501504,"[0.5677568, 0.52260596, 0.034401912, -0.361972...",kids.toys
4285,10501901,"[0.64030284, 0.2345686, -0.061874636, -0.25763...",kids.toys
4321,10502004,"[0.87546676, -0.17221348, -0.03916068, -0.1922...",kids.toys
15542,44100021,"[0.8373087, 0.0653706, -0.26724932, -0.17449, ...",
131691,100021262,"[0.9261285, -0.060547136, -0.27329034, -0.1380...",
143311,100036427,"[0.76979965, 0.5984715, -0.10999734, -0.332528...",kids.toys
143313,100036434,"[0.81136036, 0.38808438, -0.23255286, -0.30186...",kids.toys
143314,100036436,"[0.7408471, 0.3930096, -0.18140738, -0.3619474...",kids.toys


In [11]:
# Retrieving similar products for product query - 1234

df_node2vec[df_node2vec.index.isin(I[1234])]

Unnamed: 0,product_id,embedding_vector,category_code
10,17301515,"[-0.10090834, 0.20500384, -0.35713068, 0.09066...",apparel.shoes.sandals
11,17301516,"[-0.08727922, 0.294588, -0.22248454, 0.0759693...",apparel.shoes.sandals
1234,17303153,"[0.13292949, 0.24028468, 0.258352, -0.19490324...",apparel.shoes.sandals
1239,17303160,"[0.2926829, 0.20645146, 0.12906672, -0.1899009...",apparel.shoes.sandals
1245,17303176,"[-0.17870961, 0.09260236, -0.15189257, 0.07243...",apparel.shoes.sandals
1258,17303190,"[-0.030680787, -0.19200186, -0.3513529, 0.1216...",apparel.shoes.sandals
1259,17303191,"[-0.002899726, 0.4434748, 0.04633778, -0.09894...",apparel.shoes.sandals
1269,17303201,"[0.22060238, -0.03629059, -0.27219886, 0.25183...",apparel.shoes.sandals
211434,17301022,"[0.051331908, 0.27771783, -0.24839675, 0.09112...",apparel.shoes.sandals
211843,17301484,"[0.18427522, 0.36696106, -0.346714, 0.14238462...",apparel.shoes.sandals


### Product Recommendation 

Given a product id -> we have a list of other most relevant product ids. 

In above example: 
Query = `1234	17303153	[0.82914335, 0.83423114, -0.1470292, 0.2176020...]	apparel.shoes.sandals`
Product Id (Parent) = `17303153`