<a href="https://colab.research.google.com/github/PriyankaMath/ANN_Algorithms/blob/main/ANN_Algotithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install lightfm
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
import numpy as np
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
import pickle



In [2]:
movielensdata = fetch_movielens()

In [3]:
for key, value in movielensdata.items():
    print(key, type(value), value.shape)

train <class 'scipy.sparse.coo.coo_matrix'> (943, 1682)
test <class 'scipy.sparse.coo.coo_matrix'> (943, 1682)
item_features <class 'scipy.sparse.csr.csr_matrix'> (1682, 1682)
item_feature_labels <class 'numpy.ndarray'> (1682,)
item_labels <class 'numpy.ndarray'> (1682,)


In [4]:
train = movielensdata['train']
test = movielensdata['test']

In [5]:
model = LightFM(learning_rate=0.05, loss='warp')

model.fit_partial(train, epochs=10)

train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.61, test 0.11.
AUC: train 0.94, test 0.90.


In [6]:
item_vectors = movielensdata['item_features'] * model.item_embeddings

In [7]:
model = LightFM(learning_rate=0.05, loss='bpr')
model.fit(train, epochs=10)

train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.59, test 0.10.
AUC: train 0.89, test 0.86.


In [8]:
with open('movielens.pickle', 'wb') as f:
    pickle.dump({"name": movielensdata['item_feature_labels'], "vector": item_vectors}, f)

In [9]:
def load_data():
    with open('movielens.pickle', 'rb') as f:
        data = pickle.load(f)
    return data

data = load_data()
data

{'name': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'vector': array([[ 0.44773567, -0.5421608 , -0.8530715 , ..., -0.42705998,
          0.2976313 , -0.24506229],
        [-0.9270871 ,  0.12023643,  0.00364534, ..., -0.40871066,
         -0.1807808 , -0.02832063],
        [-0.34158242, -0.95900655,  0.15841396, ..., -0.36590555,
          0.5346886 , -0.01360991],
        ...,
        [-0.00530609,  0.0372894 ,  0.56539595, ...,  0.49840134,
          0.30304453, -0.10677519],
        [-0.09163828, -0.10213464,  0.43350768, ...,  0.36996385,
          0.13224396, -0.25808433],
        [-0.06250183, -0.09214886,  0.44835532, ...,  0.27758336,
          0.09235444, -0.1141343 ]], dtype=float32)}

In [10]:
!pip install faiss-gpu
import faiss



# Locality_Sensitive_Hashing

In [11]:
class LSHIndex():
    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
   
    def build(self, num_bits=8):
        self.index = faiss.IndexLSH(self.dimension, num_bits)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        return [self.labels[i] for i in indices[0]]

In [12]:
index = LSHIndex(data["vector"], data["name"])
index.build()

In [13]:
movie_vector, movie_name = data['vector'][90:91], data['name'][90]
simlar_movie_questions = '\n* '.join(index.query(movie_vector))
print(f"The most similar movies to {movie_name} are:\n* {simlar_movie_questions}")

The most similar movies to Nightmare Before Christmas, The (1993) are:
* Outbreak (1995)
* Star Trek: The Wrath of Khan (1982)
* Die Hard 2 (1990)
* Heavy Metal (1981)
* Star Trek III: The Search for Spock (1984)
* Star Trek IV: The Voyage Home (1986)
* Last of the Mohicans, The (1992)
* Clueless (1995)
* Bram Stoker's Dracula (1992)
* Nightmare Before Christmas, The (1993)


#Trees and Graphs using Annoy

In [14]:
class TreesIndex():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels


    def build(self, number_of_trees=5):
        self.index = annoy.AnnoyIndex(self.dimention)
        for i, vec in enumerate(self.vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(number_of_trees)
        
    def query(self, vector, k=10):
        indices = self.index.get_nns_by_vector(vector.tolist(), k)
        return [self.labels[i] for i in indices]

In [15]:
!pip install annoy
import annoy



In [16]:
index = TreesIndex(data["vector"], data["name"])
index.build()

  if __name__ == '__main__':


In [17]:
movie_vector, movie_name = data['vector'][90], data['name'][90]
similar_movie_questions = '\n* '.join(index.query(movie_vector))
print(f"The most similar movie to {movie_name} are:\n* {simlar_movie_questions}")

The most similar movie to Nightmare Before Christmas, The (1993) are:
* Outbreak (1995)
* Star Trek: The Wrath of Khan (1982)
* Die Hard 2 (1990)
* Heavy Metal (1981)
* Star Trek III: The Search for Spock (1984)
* Star Trek IV: The Voyage Home (1986)
* Last of the Mohicans, The (1992)
* Clueless (1995)
* Bram Stoker's Dracula (1992)
* Nightmare Before Christmas, The (1993)


In [18]:
!pip install nmslib
import nmslib



#HNSW: Hierarchical Navigable Small World Algorithm

In [19]:
class HNSWIndex():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels
    def build(self):
        self.index = nmslib.init(method='hnsw', space='cosinesimil')
        self.index.addDataPointBatch(self.vectors)
        self.index.createIndex({'post': 2})
        
    def query(self, vector, k=10):
        indices = self.index.knnQuery(vector, k=k)
        return [self.labels[i] for i in indices[0]]

In [20]:
index = HNSWIndex(data["vector"], data["name"])
index.build()

In [21]:
movie_vector, movie_name = data['vector'][90], data['name'][90]
simlar_movie_questions = '\n* '.join(index.query(movie_vector))
print(f"The most similar stack to {movie_name} are:\n* {simlar_movie_questions}")

The most similar stack to Nightmare Before Christmas, The (1993) are:
* Nightmare Before Christmas, The (1993)
* Clear and Present Danger (1994)
* Top Gun (1986)
* Batman (1989)
* True Lies (1994)
* Star Trek IV: The Voyage Home (1986)
* Grease (1978)
* Star Trek: The Wrath of Khan (1982)
* Last of the Mohicans, The (1992)
* Star Trek III: The Search for Spock (1984)


#Product Quantization

In [22]:
class ProductIndex():
    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
    def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8):
        quantizer = faiss.IndexFlatL2(self.dimension)
        self.index = faiss.IndexIVFPQ(quantizer, 
                                      self.dimension, 
                                      number_of_partition, 
                                      search_in_x_partitions, 
                                      subvector_size)
        self.index.train(self.vectors)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        # I expect only query on one vector thus the slice
        return [self.labels[i] for i in indices[0]]

In [23]:
index = ProductIndex(data["vector"], data["name"])
index.build()

In [24]:
movie_index = 90
movie_vector = data['vector'][movie_index:movie_index+1]
print(f"The most simillar movie to {data['name'][movie_index]} are:")
index.query(movie_vector)

The most simillar movie to Nightmare Before Christmas, The (1993) are:


['Nightmare Before Christmas, The (1993)',
 'Rob Roy (1995)',
 'Last of the Mohicans, The (1992)',
 'Clear and Present Danger (1994)',
 "Monty Python's Life of Brian (1979)",
 'Pink Floyd - The Wall (1982)',
 'Mask, The (1994)',
 'Ghost (1990)',
 'Grease (1978)',
 'Sneakers (1992)']

#Exhaustive Search

In [25]:
class ExhaustiveIndex():
    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
   
    def build(self):
        self.index = faiss.IndexFlatL2(self.dimension,)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        return [self.labels[i] for i in indices[0]]

In [26]:
index = ExhaustiveIndex(data["vector"], data["name"])
index.build()

In [27]:
movie_vector, movie_name = data['vector'][90:91], data['name'][90]
simlar_movie_questions = '\n* '.join(index.query(movie_vector))
print(f"The most similar movie to {movie_name} are:\n* {simlar_movie_questions}")

The most similar movie to Nightmare Before Christmas, The (1993) are:
* Nightmare Before Christmas, The (1993)
* Clear and Present Danger (1994)
* Last of the Mohicans, The (1992)
* Maverick (1994)
* Rob Roy (1995)
* Monty Python's Life of Brian (1979)
* Ghost (1990)
* Grease (1978)
* Pink Floyd - The Wall (1982)
* Sneakers (1992)
