<a href="https://colab.research.google.com/github/Shreya-07/CMPE255_ANN/blob/main/cmpe255ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preparing the Dataset

In [None]:
!pip install lightfm
from lightfm import LightFM
from lightfm.datasets import fetch_stackexchange
import pickle
!pip install faiss-cpu
import faiss


Collecting faiss-cpu
  Downloading faiss_cpu-1.7.1.post2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.4 MB)
[K     |████████████████████████████████| 8.4 MB 5.1 MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.1.post2


In [None]:
# Download the 'crossvalidated' StackExchange interaction data, holding out
# 10% of it as a test set. `tag_features=True` with `indicator_features=False`
# presumably means items are described by their tags only -- confirm against
# the LightFM dataset documentation.
data = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

train = data['train']
test = data['test']

# WARP-loss model with 64 latent components. `random_state` is pinned so that
# the embeddings (and therefore every nearest-neighbour result computed from
# them further down) are the same on each Restart & Run All.
model = LightFM(learning_rate=0.05, loss='warp', no_components=64, item_alpha=0.001,
                random_state=42)
model.fit_partial(train, item_features=data['item_features'], epochs=20)

# Combine the item-feature matrix with the learned embeddings to obtain one
# dense vector per item; these vectors are what the ANN indexes are built on.
item_vectors = data['item_features'] * model.item_embeddings

In [None]:
# Persist the item-feature matrix (stored under "name") together with the
# dense item vectors (stored under "vector") so later cells can reload them.
with open('stack.pickle', mode='wb') as pickle_file:
    pickle.dump(
        {"name": data['item_features'], "vector": item_vectors},
        pickle_file,
    )

LSH - Locality Sensitive Hashing

In [None]:
def load_data(path='stack.pickle'):
    """Load and return the object stored in a pickle file.

    Args:
        path: Location of the pickle file. Defaults to 'stack.pickle', the
            file this notebook writes with "name" and "vector" entries.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        FileNotFoundError: If `path` does not exist.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by this notebook or another trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Reload the pickled dict from disk and show it as the cell output.
data1 = load_data()
data1

{'name': <72360x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 198963 stored elements in Compressed Sparse Row format>,
 'vector': array([[-0.31005383, -0.5460872 , -0.06690072, ..., -0.31407112,
          0.5473798 , -0.07498293],
        [-0.22133675, -0.31808484,  0.12196617, ...,  0.57256573,
          0.03614874, -0.25965038],
        [ 0.66195554,  0.198521  ,  0.05890319, ...,  0.4117253 ,
          0.05028956,  0.04528536],
        ...,
        [-0.43684655, -0.4595806 , -0.05338642, ..., -0.4427461 ,
         -0.09659885,  0.0546329 ],
        [ 0.5091615 , -0.2869155 , -0.3321965 , ..., -0.21657251,
          0.23571734,  0.03303249],
        [-0.767992  , -0.9125018 ,  0.5902929 , ...,  0.03145292,
         -0.09047353,  0.06938331]], dtype=float32)}

HNSW

In [None]:
!pip install nmslib
import nmslib



In [None]:
def load_data(path='stack.pickle'):
    """Load and return the object stored in a pickle file.

    Args:
        path: Location of the pickle file. Defaults to 'stack.pickle', the
            file this notebook writes with "name" and "vector" entries.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        FileNotFoundError: If `path` does not exist.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by this notebook or another trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Reload the pickled dict from disk (rebinding `data1`) and show it as the
# cell output.
data1 = load_data()
data1

{'name': <72360x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 198963 stored elements in Compressed Sparse Row format>,
 'vector': array([[-0.31005383, -0.5460872 , -0.06690072, ..., -0.31407112,
          0.5473798 , -0.07498293],
        [-0.22133675, -0.31808484,  0.12196617, ...,  0.57256573,
          0.03614874, -0.25965038],
        [ 0.66195554,  0.198521  ,  0.05890319, ...,  0.4117253 ,
          0.05028956,  0.04528536],
        ...,
        [-0.43684655, -0.4595806 , -0.05338642, ..., -0.4427461 ,
         -0.09659885,  0.0546329 ],
        [ 0.5091615 , -0.2869155 , -0.3321965 , ..., -0.21657251,
          0.23571734,  0.03303249],
        [-0.767992  , -0.9125018 ,  0.5902929 , ...,  0.03145292,
         -0.09047353,  0.06938331]], dtype=float32)}

In [None]:
class NMSLIBIndex():
    """Nearest-neighbour lookup over item vectors using an nmslib HNSW index.

    The index is created in the 'cosinesimil' space. `build()` must be called
    before `query()`, because `self.index` only exists after building.
    """

    def __init__(self, vectors, labels):
        """Keep a float32 copy of `vectors` and the label container.

        Args:
            vectors: 2-D array-like exposing `.shape` and `.astype`; one row
                per item.
            labels: Container indexed by row position; `labels[i]` is what
                `query()` returns for row `i`.
        """
        self.labels = labels
        # Attribute name (including its spelling) is kept for compatibility.
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')

    def build(self):
        """Create the HNSW index and load every stored vector into it."""
        self.index = hnsw = nmslib.init(method='hnsw', space='cosinesimil')
        hnsw.addDataPointBatch(self.vectors)
        hnsw.createIndex({'post': 2})

    def query(self, vector, k=10):
        """Return the labels of the `k` nearest stored vectors to `vector`."""
        # knnQuery yields a pair; its first element holds the neighbour ids.
        result = self.index.knnQuery(vector, k=k)
        neighbour_ids = result[0]
        matched = []
        for position in neighbour_ids:
            matched.append(self.labels[position])
        return matched

In [None]:
# Build the HNSW index over the reloaded item vectors. data1["name"] is passed
# as the label container, so queries return entries of that object.
index = NMSLIBIndex(data1["vector"], data1["name"])
index.build()

In [None]:
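# NOTE: disabled example query against the HNSW index. The '\n* '.join(...) call
# needs string labels, while data1['name'] holds sparse-matrix rows, so the
# labels would have to be converted (e.g. with str()) before re-enabling this.
#stack_vector, stack_name = data1['vector'][90], data1['name'][90]
#similar_questions = '\n* '.join(index.query(stack_vector))
#similar_questions = '\n'.join([' '.join([x for i in stack_vector(1,j)]) for j in stack_vector(2,6)])
#print(f"The most similar {stack_name} are:\n* {similar_questions}")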

Product Quantization

In [None]:
class IVPQIndex():
    """Nearest-neighbour lookup using a faiss inverted-file product-quantization index.

    `build()` must be called before `query()`, because `self.index` only
    exists after building.
    """

    def __init__(self, vectors, labels):
        """Keep a float32 copy of `vectors` and the label container.

        Args:
            vectors: 2-D array-like exposing `.shape` and `.astype`; one row
                per item.
            labels: Container indexed by row position; `labels[i]` is what
                `query()` returns for row `i`.
        """
        # Attribute name (including its spelling) is kept for compatibility.
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8,
              bits_per_code=8):
        """Train the IVFPQ index on the stored vectors and add them to it.

        Args:
            number_of_partition: Number of inverted-list partitions (faiss `nlist`).
            search_in_x_partitions: Number of partitions visited per query
                (faiss `nprobe`).
            subvector_size: Number of dimensions in each product-quantizer
                sub-vector; must divide the vector dimensionality evenly.
            bits_per_code: Bits used to encode each sub-vector (faiss `nbits`).

        Raises:
            ValueError: If `subvector_size` is not a positive divisor of the
                vector dimensionality.
        """
        if subvector_size <= 0 or self.dimention % subvector_size != 0:
            raise ValueError(
                f"subvector_size ({subvector_size}) must be a positive divisor "
                f"of the vector dimension ({self.dimention})"
            )
        # faiss.IndexIVFPQ takes (quantizer, d, nlist, M, nbits), where M is the
        # number of sub-vectors. The partition count to search is a separate
        # attribute (`nprobe`), not a constructor argument.
        number_of_subvectors = self.dimention // subvector_size
        quantizer = faiss.IndexFlatL2(self.dimention)
        self.index = faiss.IndexIVFPQ(quantizer,
                                      self.dimention,
                                      number_of_partition,
                                      number_of_subvectors,
                                      bits_per_code)
        self.index.nprobe = search_in_x_partitions
        self.index.train(self.vectors)
        self.index.add(self.vectors)

    def query(self, vectors, k=10):
        """Return labels of up to `k` nearest stored vectors to the first query row.

        Args:
            vectors: 2-D batch of query vectors; only results for row 0 are
                returned.
            k: Maximum number of neighbours to return.
        """
        distances, indices = self.index.search(vectors, k)
        # faiss pads the result with -1 when fewer than k neighbours are found;
        # skip those so they are not mistaken for "last label" (labels[-1]).
        return [self.labels[int(i)] for i in indices[0] if i >= 0]

In [None]:
# Build the IVFPQ index over the same vectors. This rebinds `index`, so the
# previously built index is no longer reachable under that name.
index = IVPQIndex(data1["vector"], data1["name"])
index.build()

In [None]:
# Row position of the item to look up neighbours for.
movie_index = 90
# Slice (rather than index) so the query stays 2-D: IVPQIndex.query reads
# the results for row 0 of a batch.
movie_vector = data1['vector'][movie_index:movie_index+1]
# The dataset is StackExchange questions, so the message says "questions".
print(f"The most similar questions to {data1['name'][movie_index]} are:")
index.query(movie_vector)

The most simillar movies to   (0, 115)	1.0
  (0, 116)	1.0 are:


[<1x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 <1x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 <1x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 <1x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 <1x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 <1x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 <1x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 <1x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 <1x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 2 stored

Trees and Graphs

In [None]:
class AnnoyIndex():
    """Nearest-neighbour lookup over item vectors using an Annoy index.

    `build()` must be called before `query()`, because `self.index` only
    exists after building.
    """

    def __init__(self, vectors, labels):
        """Keep a float32 copy of `vectors` and the label container.

        Args:
            vectors: 2-D array-like exposing `.shape` and `.astype`; one row
                per item.
            labels: Container indexed by row position; `labels[i]` is what
                `query()` returns for row `i`.
        """
        # Attribute name (including its spelling) is kept for compatibility.
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self, number_of_trees=5, metric='angular'):
        """Add every stored vector to a new Annoy index and build it.

        Args:
            number_of_trees: Number of trees passed to Annoy's `build`.
            metric: Annoy distance metric name. Passed explicitly because
                constructing `annoy.AnnoyIndex` without a metric is
                deprecated; 'angular' is the value Annoy used implicitly.
        """
        self.index = annoy.AnnoyIndex(self.dimention, metric)
        for i, vec in enumerate(self.vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(number_of_trees)

    def query(self, vector, k=10):
        """Return the labels of the `k` nearest stored vectors to `vector`.

        Args:
            vector: 1-D array exposing `.tolist()`.
            k: Number of neighbours to request.

        Returns:
            List of `labels[i]` entries, returned as stored (they are not
            converted to strings).
        """
        indices = self.index.get_nns_by_vector(vector.tolist(), k)
        return [self.labels[i] for i in indices]

In [None]:
!pip install annoy
import annoy
# Build the Annoy index over the same vectors. This rebinds `index`, so the
# previously built index is no longer reachable under that name.
index = AnnoyIndex(data1["vector"], data1["name"])
index.build()

Collecting annoy
  Downloading annoy-1.17.0.tar.gz (646 kB)
[?25l[K     |▌                               | 10 kB 22.3 MB/s eta 0:00:01[K     |█                               | 20 kB 12.2 MB/s eta 0:00:01[K     |█▌                              | 30 kB 8.0 MB/s eta 0:00:01[K     |██                              | 40 kB 8.4 MB/s eta 0:00:01[K     |██▌                             | 51 kB 5.1 MB/s eta 0:00:01[K     |███                             | 61 kB 5.6 MB/s eta 0:00:01[K     |███▌                            | 71 kB 5.5 MB/s eta 0:00:01[K     |████                            | 81 kB 6.2 MB/s eta 0:00:01[K     |████▋                           | 92 kB 6.0 MB/s eta 0:00:01[K     |█████                           | 102 kB 5.2 MB/s eta 0:00:01[K     |█████▋                          | 112 kB 5.2 MB/s eta 0:00:01[K     |██████                          | 122 kB 5.2 MB/s eta 0:00:01[K     |██████▋                         | 133 kB 5.2 MB/s eta 0:00:01[K     |███████ 

  if __name__ == '__main__':


In [None]:
# Look up the neighbours of the item at row 90 in the current index.
movie_vector, movie_name = data1['vector'][90], data1['name'][90]
# The labels returned by index.query() are entries of data1['name'], which are
# not strings; str.join() raises TypeError on them, so convert each one first.
simlar_movies_names = '\n* '.join(str(label) for label in index.query(movie_vector))
# The dataset is StackExchange questions, so the message says "questions".
print(f"The most similar questions to {movie_name} are:\n* {simlar_movies_names}")

TypeError: ignored