# Product Quantization

In [4]:
!apt install libomp-dev
!python -m pip install --upgrade faiss faiss-cpu

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (5.0.1-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 5 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[K     |████████████████████████████████| 17.0 MB 6.8 MB/s 
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3


In [5]:
import pickle
import faiss

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
def load_data():
    with open('/content/drive/MyDrive/movies.pickle', 'rb') as f:
        data = pickle.load(f)
    return data

data = load_data()
data

{'name': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'vector': array([[-0.01780608, -0.14265831,  0.10308606, ...,  0.09659795,
         -0.17529577, -0.03061521],
        [-0.03357764,  0.16418771,  0.21801303, ...,  0.16502103,
         -0.09166156,  0.05047869],
        [-0.2761452 , -0.01991325, -0.04969981, ...,  0.0258275 ,
         -0.08328608, -0.0152858 ],
        ...,
        [ 0.05142734, -0.01683608, -0.20441587, ...,  0.00045828,
          0.14679626,  0.2462584 ],
        [ 0.04491899, -0.02819411, -0.09472758, ..., -0.02152078,
          0.16223577,  0.19897607],
        [ 0.02531924,  0.03099714,  0.06437534, ..., -0.07260127,
          0.0467432 ,  0.07893164]], dtype=float32)}

* Use IndexFlatL2 and IndexIVFPQ methods to get the index.

In [12]:
class IVPQIndex():
    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels


    def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8):
        quantizer = faiss.IndexFlatL2(self.dimension)
        self.index = faiss.IndexIVFPQ(quantizer, 
                                      self.dimension, 
                                      number_of_partition, 
                                      search_in_x_partitions, 
                                      subvector_size)
        self.index.train(self.vectors)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        return [self.labels[i] for i in indices[0]]

In [13]:
index = IVPQIndex(data["vector"], data["name"])
index.build()

Similar movies related to Fifth Element, The (1997)(249 index)

In [14]:
movie_index = 249
movie_vector = data['vector'][movie_index:movie_index+1]
print(f"The most similar movies to {data['name'][movie_index]} are:")
index.query(movie_vector)

The most similar movies to Fifth Element, The (1997) are:


['Fifth Element, The (1997)',
 'Men in Black (1997)',
 'Breakdown (1997)',
 'Lost World: Jurassic Park, The (1997)',
 'Con Air (1997)',
 'Chasing Amy (1997)',
 'Private Parts (1997)',
 'Austin Powers: International Man of Mystery (1997)',
 'Face/Off (1997)',
 'Grosse Pointe Blank (1997)']

# Exhaustive Search

In [15]:
class BruteForceIndex():
    def __init__(self, vectors, labels):
        self.vectors = vectors.astype('float32')
        self.labels = labels
        self.index = faiss.IndexFlatL2(vectors.shape[1])
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        return [self.labels[i] for i in indices[0]]

In [16]:
index = BruteForceIndex(data["vector"], data["name"])

In [17]:
movie_vector, movie_name = data['vector'][249:250], data['name'][249]
simlar_movies_names = '\n* '.join(index.query(movie_vector))
print(f"The most similar movies to {movie_name} are:\n* {simlar_movies_names}")

The most similar movies to Fifth Element, The (1997) are:
* Fifth Element, The (1997)
* Men in Black (1997)
* Face/Off (1997)
* Con Air (1997)
* Austin Powers: International Man of Mystery (1997)
* Private Parts (1997)
* Grosse Pointe Blank (1997)
* Rumble in the Bronx (1995)
* Romy and Michele's High School Reunion (1997)
* Mars Attacks! (1996)


# Trees and Forests

Instead of FAISS Library, use annoy

In [18]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.1.tar.gz (647 kB)
[K     |████████████████████████████████| 647 kB 6.7 MB/s 
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.1-cp37-cp37m-linux_x86_64.whl size=395185 sha256=582e4484ae6693261622be154ba6815901956b6694fdd77d4a0400d79e99667a
  Stored in directory: /root/.cache/pip/wheels/81/94/bf/92cb0e4fef8770fe9c6df0ba588fca30ab7c306b6048ae8a54
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.1


In [19]:
import annoy

class AnnoyIndex():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels


    def build(self, number_of_trees=5):
        self.index = annoy.AnnoyIndex(self.dimention)
        for i, vec in enumerate(self.vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(number_of_trees)
        
    def query(self, vector, k=10):
        indices = self.index.get_nns_by_vector(vector.tolist(), k)
        return [self.labels[i] for i in indices]

In [20]:
index = AnnoyIndex(data["vector"], data["name"])
index.build()

  # This is added back by InteractiveShellApp.init_path()


In [21]:
movie_vector, movie_name = data['vector'][90], data['name'][90]
simlar_movies_names = '\n* '.join(index.query(movie_vector))
print(f"The most similar movies to {movie_name} are:\n* {simlar_movies_names}")

The most similar movies to Nightmare Before Christmas, The (1993) are:
* Nightmare Before Christmas, The (1993)
* Beauty and the Beast (1991)
* Fantasia (1940)
* Heavy Metal (1981)
* Snow White and the Seven Dwarfs (1937)
* Lion King, The (1994)
* Pink Floyd - The Wall (1982)
* Sound of Music, The (1965)
* Monty Python's Life of Brian (1979)
* Sirens (1994)


# HNSW

In [22]:
!pip install nmslib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nmslib
  Downloading nmslib-2.1.1-cp37-cp37m-manylinux2010_x86_64.whl (13.5 MB)
[K     |████████████████████████████████| 13.5 MB 6.5 MB/s 
Collecting pybind11<2.6.2
  Downloading pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 56.6 MB/s 
[?25hInstalling collected packages: pybind11, nmslib
Successfully installed nmslib-2.1.1 pybind11-2.6.1


In [24]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits

* Load the dataset

In [25]:
digits = load_digits(n_class = 9)
X = digits.data
y = digits.target
n_samples, n_features = X.shape
X.shape

(1617, 64)

In [26]:
X


array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  6., ...,  6.,  0.,  0.],
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

* Transform the dataset

In [27]:
from sklearn.manifold import TSNE
tsne_2 = TSNE()
data_embeddings_tsne_2 = tsne_2.fit_transform(X)



In [28]:
data_embeddings_tsne_2.shape

(1617, 2)

In [29]:
data_embeddings_tsne_2

array([[ -7.386362 , -61.89731  ],
       [ 23.134464 ,  -3.5537174],
       [  5.472348 ,  19.396969 ],
       ...,
       [  2.1004508, -58.27798  ],
       [  7.237931 ,   8.314335 ],
       [  1.3318087,   9.990942 ]], dtype=float32)

In [30]:
import nmslib

In [31]:
hnsw_index = nmslib.init(method = 'hnsw', space = 'cosinesimil')
hnsw_index.addDataPointBatch(data_embeddings_tsne_2)
hnsw_index.createIndex({'post': 2})

In [32]:
y[10]

1

In [33]:
hnsw_output = hnsw_index.knnQuery(data_embeddings_tsne_2[10], k = 10)
hnsw_output_list = [y[i] for i in hnsw_output[0]]
hnsw_output_list

[4, 1, 1, 4, 4, 1, 4, 1, 1, 4]