<a href="https://colab.research.google.com/github/Shreya-07/CMPE255_ANN/blob/main/cmpe255ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preparing the Dataset

In [84]:
!pip install lightfm
from lightfm import LightFM
from lightfm.datasets import fetch_stackexchange
import pickle
!pip install faiss-cpu
import faiss




In [85]:
data = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

train = data['train']
test = data['test']

model = LightFM(learning_rate=0.05, loss='warp', no_components=64, item_alpha=0.001)
model.fit_partial(train, item_features=data['item_features'], epochs=20 )

item_vectors = data['item_features'] * model.item_embeddings

In [79]:
with open('stack.pickle', 'wb') as f:
    pickle.dump({"name": data['item_features'], "vector": item_vectors}, f)

LSH - Locality Sensitive Hashing

In [80]:
def load_data():
    with open('stack.pickle', 'rb') as f:
        data1 = pickle.load(f)
    return data1

data1 = load_data()
data1

{'name': <72360x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 198963 stored elements in Compressed Sparse Row format>,
 'vector': array([[ 0.08971766, -0.01931789, -0.39299065, ..., -0.32398626,
         -0.04930801,  0.03139853],
        [ 0.42719924, -0.13446862, -0.11744323, ...,  0.05327304,
         -0.16749631, -0.5094366 ],
        [ 0.17469001, -0.3165027 ,  0.34241223, ..., -0.10355807,
          0.34335202,  0.18460143],
        ...,
        [-0.22121345, -0.7440063 ,  0.09974044, ..., -0.08692014,
         -0.01444667,  0.30797526],
        [ 0.5214379 , -0.11684816,  0.00835054, ...,  0.24716628,
          0.33209053, -0.17896186],
        [-0.18122649, -0.11276215, -0.02549529, ...,  0.76297367,
         -0.30785683, -0.69704616]], dtype=float32)}

In [81]:
class FalconIndex():
     def __init__(self, vectors, labels):
         self.dimention = vectors.shape[1]
         self.vectors = vectors.astype('float32')
         self.labels = labels


     def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8):
         quantizer = faiss.IndexFlatL2(self.dimention)
         self.index = faiss.IndexIVFPQ(quantizer, self.dimention, number_of_partition, search_in_x_partitions, subvector_size)
         self.index.train(self.vectors)
         self.index.add(self.vectors)
        
     def query(self, vectors, k=10):
         distances, indices = self.index.search(vectors, k) 
         return [self.labels[i] for i in indices[0]]

In [82]:
 index = IVPQIndex(data1["vector"], data1["name"])
 index.build()

In [83]:
 stack_vector, stack_name = data1['vector'][90:91], data1['name'][90]
 simlar_stack_names = '\n* '.join(index.query(stack_vector))
 print(f"The most similar questions to {stack_name} are:\n* {simlar_stack_names}")

TypeError: ignored

HNSW - Hierarchical Navigable Small Words

In [86]:
!pip install nmslib
import nmslib



In [87]:
def load_data():
    with open('stack.pickle', 'rb') as f:
        data1 = pickle.load(f)
    return data1

data1 = load_data()
data1

{'name': <72360x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 198963 stored elements in Compressed Sparse Row format>,
 'vector': array([[ 0.08971766, -0.01931789, -0.39299065, ..., -0.32398626,
         -0.04930801,  0.03139853],
        [ 0.42719924, -0.13446862, -0.11744323, ...,  0.05327304,
         -0.16749631, -0.5094366 ],
        [ 0.17469001, -0.3165027 ,  0.34241223, ..., -0.10355807,
          0.34335202,  0.18460143],
        ...,
        [-0.22121345, -0.7440063 ,  0.09974044, ..., -0.08692014,
         -0.01444667,  0.30797526],
        [ 0.5214379 , -0.11684816,  0.00835054, ...,  0.24716628,
          0.33209053, -0.17896186],
        [-0.18122649, -0.11276215, -0.02549529, ...,  0.76297367,
         -0.30785683, -0.69704616]], dtype=float32)}

In [88]:
class NMSLIBIndex():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self):
        self.index = nmslib.init(method='hnsw', space='cosinesimil')
        self.index.addDataPointBatch(self.vectors)
        self.index.createIndex({'post': 2})
        
    def query(self, vector, k=10):
        indices = self.index.knnQuery(vector, k=k)
        return [self.labels[i] for i in indices[0]]

In [89]:
index = NMSLIBIndex(data1["vector"], data1["name"])
index.build()

In [90]:
stack_vector, stack_name = data1['vector'][90], data1['name'][90]
simlar_questions = '\n* '.join(index.query(stack_vector))
similar_questions = '\n'.join([' '.join([str(x) for i in stack_vector(1,j)]) for j in stack_vector(2,6)])
print(f"The most similar {stack_name} are:\n* {simlar_questions}")

TypeError: ignored

Product Quantization

In [None]:
class IVPQIndex():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels


    def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8):
        quantizer = faiss.IndexFlatL2(self.dimention)
        self.index = faiss.IndexIVFPQ(quantizer, 
                                      self.dimention, 
                                      number_of_partition, 
                                      search_in_x_partitions, 
                                      subvector_size)
        self.index.train(self.vectors)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        return [self.labels[i] for i in indices[0]]

In [None]:
index = IVPQIndex(data1["vector"], data1["name"])
index.build()

In [None]:
stack_index = 90
stack_vector = data1['vector'][stack_index:stack_index+1]
print(f"The most similar questions {data1['name']stack_index]} are:")
index.query(stack_vector)

Trees and Graphs

In [None]:
class AnnoyIndex():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels


    def build(self, number_of_trees=5):
        self.index = annoy.AnnoyIndex(self.dimention)
        for i, vec in enumerate(self.vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(number_of_trees)
        
    def query(self, vector, k=10):
        indices = self.index.get_nns_by_vector(vector.tolist(), k)
        return [self.labels[i] for i in indices]

In [None]:
!pip install annoy
import annoy
index = AnnoyIndex(data1["vector"], data1["name"])
index.build()

In [None]:
stack_vector, stack_name = data1['vector'][90], data1['name'][90]
simlar_stack_names = '\n* '.join(index.query(stack_vector))
print(f"The most similar questions to {stack_name} are:\n* {simlar_stack_names}")