# Vectorization Module

###
---

### CLIP Model

In [None]:
import os
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from torch.nn.functional import cosine_similarity

In [None]:
def load_clip_model(model_path):
    """Load a CLIP model together with its processor.

    Both objects are created with ``from_pretrained`` using
    ``local_files_only=True``.

    Args:
        model_path: Location handed to ``from_pretrained`` for both the
            model and the processor.

    Returns:
        A ``(model, processor)`` tuple.
    """
    pretrained_options = {"local_files_only": True}
    clip_model = CLIPModel.from_pretrained(model_path, **pretrained_options)
    clip_processor = CLIPProcessor.from_pretrained(model_path, **pretrained_options)
    return clip_model, clip_processor

def get_text_embedding(model, processor, text):
    """Compute the text features of a single string.

    Args:
        model: Object exposing ``get_text_features(**inputs)``.
        processor: Callable that turns ``text=[...]`` into model inputs
            (called with ``return_tensors="pt"`` and ``padding=True``).
        text: The string to embed; it is wrapped in a one-element list.

    Returns:
        Whatever ``model.get_text_features`` returns for the processed input.
    """
    encoded = processor(text=[text], return_tensors="pt", padding=True)

    # Inference only: gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        return model.get_text_features(**encoded)

def get_image_embedding(model, processor, image_path):
    """Compute the image features of the image stored at *image_path*.

    Args:
        model: Object exposing ``get_image_features(**inputs)``.
        processor: Callable that turns ``images=<PIL image>`` into model
            inputs (called with ``return_tensors="pt"`` and ``padding=True``).
        image_path: Path of the image file to open with PIL.

    Returns:
        Whatever ``model.get_image_features`` returns for the processed image.

    Raises:
        OSError: Propagated from ``Image.open`` when the file cannot be
            opened or is not a readable image.
    """
    # Image.open keeps the underlying file open until the image is closed.
    # The context manager releases the handle as soon as the processor has
    # turned the pixels into tensors, instead of leaking one handle per call.
    # No manual resizing is done here; the image is handed to the processor as-is.
    with Image.open(image_path) as image:
        inputs = processor(images=image, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)

    return image_features


def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python float.

    Args:
        embedding1: Tensor of shape ``(D,)`` or ``(1, D)``.
        embedding2: Tensor with a shape compatible with ``embedding1``.

    Returns:
        The cosine similarity as a ``float``.

    Raises:
        RuntimeError: From ``.item()`` when the inputs hold more than one
            embedding, because the result is then not a single value.
    """
    # dim=-1 always reduces over the feature axis. The default dim=1 only
    # works for batched (1, D) inputs and raises for plain 1-D embeddings
    # such as ``features[0]``.
    return cosine_similarity(embedding1, embedding2, dim=-1).item()



# Relative location of the CLIP checkpoint directory.
model_path = "../CLIP"
# Load the model/processor pair once at module level; both names are reused
# by the embedding calls further down in this file.
embedding_model, embedding_processor = load_clip_model(model_path=model_path)
   

###
---

### FAISS VectorStore

In [None]:
import numpy as np
import faiss
import pickle
import os
import logging
import uuid



In [None]:


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class LocalVectorStore:
    """In-memory collection of named FAISS ``IndexFlatL2`` indexes.

    Three dictionaries keyed by index name are kept in step with each other:

    * ``indexes``  -- the FAISS index objects used for searching,
    * ``vectors``  -- the added vectors in insertion order, each stored as a
      float32 array of shape ``(1, dimension)``,
    * ``metadata`` -- the caller-supplied metadata, in the same order.
    """

    def __init__(self):
        self.indexes = {}
        self.vectors = {}
        self.metadata = {}

    def create_index(self, index_name, dimension):
        """Create an empty L2 index called *index_name*.

        Args:
            index_name: Key under which the index is registered.
            dimension: Vector dimensionality passed to ``faiss.IndexFlatL2``.

        Raises:
            ValueError: If an index with that name already exists.
        """
        if index_name in self.indexes:
            raise ValueError(f"Index '{index_name}' already exists")

        self.indexes[index_name] = faiss.IndexFlatL2(dimension)
        self.vectors[index_name] = []
        self.metadata[index_name] = []
        logger.info(f"Created index '{index_name}' with dimension {dimension}")

    def add_vector(self, index_name, vector, metadata=None):
        """Append one vector (and optional metadata) to an index.

        Args:
            index_name: Name of an existing index.
            vector: Array-like; it is converted to float32 and reshaped to a
                single row.
            metadata: Arbitrary object stored alongside the vector.

        Raises:
            ValueError: If the index does not exist or the vector length does
                not match the index dimension.
        """
        if index_name not in self.indexes:
            raise ValueError(f"Index '{index_name}' does not exist")

        index = self.indexes[index_name]
        vector = np.array(vector).astype('float32').reshape(1, -1)
        # Validate before touching the index so that a bad vector leaves the
        # index and the two parallel lists untouched and reports a clear error.
        if vector.shape[1] != index.d:
            raise ValueError(
                f"Vector has dimension {vector.shape[1]}, "
                f"but index '{index_name}' expects {index.d}"
            )

        index.add(vector)
        self.vectors[index_name].append(vector)
        self.metadata[index_name].append(metadata)
        logger.info(f"Added vector to index '{index_name}'")

    def search(self, index_name, query_vector, k=5):
        """Return up to *k* nearest neighbours of *query_vector*.

        Args:
            index_name: Name of an existing index.
            query_vector: Array-like; converted to a float32 row vector.
            k: Maximum number of neighbours to return.

        Returns:
            A list of dicts with keys ``'vector'`` (nested list, one row),
            ``'metadata'`` and ``'distance'`` (float), in the order returned
            by FAISS. The list is empty when the index is empty or ``k`` is
            not positive.

        Raises:
            ValueError: If the index does not exist or the query length does
                not match the index dimension.
        """
        if index_name not in self.indexes:
            raise ValueError(f"Index '{index_name}' does not exist")

        logger.info(f"Searching in index '{index_name}' for {k} nearest neighbors")

        try:
            index = self.indexes[index_name]
            query_vector = np.array(query_vector).astype('float32').reshape(1, -1)

            if index.ntotal == 0:
                logger.warning(f"Index '{index_name}' is empty. No search performed.")
                return []

            if k <= 0:
                # Asking for zero or fewer neighbours yields nothing; do not
                # forward such a value to FAISS.
                logger.warning(f"Non-positive k={k} requested. No search performed.")
                return []

            if query_vector.shape[1] != index.d:
                raise ValueError(
                    f"Query vector has dimension {query_vector.shape[1]}, "
                    f"but index '{index_name}' expects {index.d}"
                )

            # Never request more neighbours than the index holds.
            distances, indices = index.search(query_vector, min(k, index.ntotal))

            stored_vectors = self.vectors[index_name]
            stored_metadata = self.metadata[index_name]
            results = []
            for distance, idx in zip(distances[0], indices[0]):
                if idx < 0:
                    # FAISS uses -1 for "no neighbour"; indexing a list with
                    # it would silently return the last stored vector.
                    continue
                results.append({
                    'vector': stored_vectors[idx].tolist(),  # Convert to list for JSON serialization
                    'metadata': stored_metadata[idx],
                    'distance': float(distance)  # Convert to float for JSON serialization
                })

            logger.info(f"Found {len(results)} results")
            return results
        except Exception as e:
            logger.error(f"Error during search: {str(e)}")
            raise

    def save(self, filepath):
        """Pickle all vectors, metadata and serialized FAISS indexes to *filepath*.

        Any exception is logged and re-raised.
        """
        try:
            data = {
                'vectors': {k: [v.tolist() for v in vecs] for k, vecs in self.vectors.items()},
                'metadata': self.metadata
            }

            index_data = {}
            for index_name, index in self.indexes.items():
                index_data[index_name] = faiss.serialize_index(index).tobytes()

            data['index_data'] = index_data

            with open(filepath, 'wb') as f:
                pickle.dump(data, f)

            logger.info(f"Vector store saved to {filepath}")
        except Exception as e:
            logger.error(f"Error saving vector store: {str(e)}")
            raise

    @classmethod
    def load(cls, filepath):
        """Rebuild a store from a file written by :meth:`save`.

        SECURITY: the file is read with ``pickle.load``, which can execute
        arbitrary code. Only load files from a trusted source.

        Any exception is logged and re-raised.
        """
        try:
            with open(filepath, 'rb') as f:
                data = pickle.load(f)

            vector_store = cls()
            vector_store.vectors = {k: [np.array(v, dtype='float32') for v in vecs] for k, vecs in data['vectors'].items()}
            vector_store.metadata = data['metadata']

            for index_name, index_bytes in data['index_data'].items():
                index = faiss.deserialize_index(np.frombuffer(index_bytes, dtype='uint8'))
                vector_store.indexes[index_name] = index

            logger.info(f"Vector store loaded from {filepath}")
            return vector_store
        except Exception as e:
            logger.error(f"Error loading vector store: {str(e)}")
            raise



In [None]:
# # Example usage
# vector_store = LocalVectorStore()

# # Create indexes for different sets
# vector_store.create_index("set1", dimension=128)
# vector_store.create_index("set2", dimension=64)

# # Add vectors to set1
# vector_store.add_vector("set1", np.random.rand(128), metadata={"id": 1, "name": "Vector 1"})
# vector_store.add_vector("set1", np.random.rand(128), metadata={"id": 2, "name": "Vector 2"})

# # Add vectors to set2
# vector_store.add_vector("set2", np.random.rand(64), metadata={"id": 1, "name": "Vector A"})
# vector_store.add_vector("set2", np.random.rand(64), metadata={"id": 2, "name": "Vector B"})

# # Save the vector store
# vector_store.save("vector_store.pkl")

# # Load the vector store
# loaded_vector_store = LocalVectorStore.load("vector_store.pkl")

# # Search for similar vectors using the loaded store
# query_vector = np.random.rand(128)
# results = loaded_vector_store.search("set1", query_vector, k=2)

# print("Search results:")
# for result in results:
#     print(f"Vector: {result['vector']}")
#     print(f"Metadata: {result['metadata']}")
#     print(f"Distance: {result['distance']}")
#     print()

###
---

### Numpy VectorStore

In [None]:
import numpy as np
import pickle
import logging


In [None]:

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class NumPyVectorStore:
    """In-memory vector store with brute-force Euclidean search.

    Two dictionaries keyed by index name are kept in step with each other:

    * ``vectors``  -- the added vectors in insertion order (1-D float32 arrays),
    * ``metadata`` -- the caller-supplied metadata, in the same order.
    """

    def __init__(self):
        self.vectors = {}
        self.metadata = {}

    def create_index(self, index_name, dimension):
        """Register an empty index called *index_name*.

        Args:
            index_name: Key under which the index is registered.
            dimension: Only written to the log; it is not stored. The
                dimension of an index is fixed by the first vector added.

        Raises:
            ValueError: If an index with that name already exists.
        """
        if index_name in self.vectors:
            raise ValueError(f"Index '{index_name}' already exists")

        self.vectors[index_name] = []
        self.metadata[index_name] = []
        logger.info(f"Created index '{index_name}' with dimension {dimension}")

    def add_vector(self, index_name, vector, metadata=None):
        """Append one vector (and optional metadata) to an index.

        Args:
            index_name: Name of an existing index.
            vector: Array-like; converted to float32 and flattened to 1-D,
                so both ``(D,)`` and ``(1, D)`` inputs are accepted.
            metadata: Arbitrary object stored alongside the vector.

        Raises:
            ValueError: If the index does not exist, or the vector length
                differs from the vectors already stored in that index.
        """
        if index_name not in self.vectors:
            raise ValueError(f"Index '{index_name}' does not exist")

        stored = self.vectors[index_name]
        vector = np.array(vector).astype('float32').reshape(-1)
        # All vectors of an index are stacked into one matrix at search time,
        # so reject a mismatching length here instead of breaking every later
        # search on this index.
        if stored and stored[0].size != vector.size:
            raise ValueError(
                f"Vector has dimension {vector.size}, but index '{index_name}' "
                f"holds vectors of dimension {stored[0].size}"
            )

        stored.append(vector)
        self.metadata[index_name].append(metadata)
        logger.info(f"Added vector to index '{index_name}'")

    def search(self, index_name, query_vector, k=5):
        """Return up to *k* stored vectors closest to *query_vector* (L2 norm).

        Args:
            index_name: Name of an existing index.
            query_vector: Array-like; converted to float32 and flattened.
            k: Maximum number of neighbours to return.

        Returns:
            A list of dicts with keys ``'vector'`` (list), ``'metadata'`` and
            ``'distance'`` (float), ordered by increasing distance. The list
            is empty when the index is empty or ``k`` is not positive.

        Raises:
            ValueError: If the index does not exist or the query length does
                not match the stored vectors.
        """
        if index_name not in self.vectors:
            raise ValueError(f"Index '{index_name}' does not exist")

        logger.info(f"Searching in index '{index_name}' for {k} nearest neighbors")

        query_vector = np.array(query_vector).astype('float32').reshape(-1)
        vectors = np.array(self.vectors[index_name])

        if len(vectors) == 0:
            logger.warning(f"Index '{index_name}' is empty. No search performed.")
            return []

        if k <= 0:
            # A negative k would make the ``[:k]`` slice below return
            # "all but the last |k|" neighbours instead of none.
            logger.warning(f"Non-positive k={k} requested. No search performed.")
            return []

        # Without this check a query of length 1 would broadcast silently
        # against every stored vector and produce meaningless distances.
        if vectors.shape[-1] != query_vector.shape[0]:
            raise ValueError(
                f"Query vector has dimension {query_vector.shape[0]}, but index "
                f"'{index_name}' holds vectors of dimension {vectors.shape[-1]}"
            )

        distances = np.linalg.norm(vectors - query_vector, axis=1)
        nearest_indices = np.argsort(distances)[:k]

        results = []
        for idx in nearest_indices:
            results.append({
                'vector': self.vectors[index_name][idx].tolist(),
                'metadata': self.metadata[index_name][idx],
                'distance': float(distances[idx])
            })

        logger.info(f"Found {len(results)} results")
        return results

    def save(self, filepath):
        """Pickle all vectors (as lists) and metadata to *filepath*.

        Any exception is logged and re-raised.
        """
        try:
            data = {
                'vectors': {k: [v.tolist() for v in vecs] for k, vecs in self.vectors.items()},
                'metadata': self.metadata
            }

            with open(filepath, 'wb') as f:
                pickle.dump(data, f)

            logger.info(f"Vector store saved to {filepath}")
        except Exception as e:
            logger.error(f"Error saving vector store: {str(e)}")
            raise

    @classmethod
    def load(cls, filepath):
        """Rebuild a store from a file written by :meth:`save`.

        SECURITY: the file is read with ``pickle.load``, which can execute
        arbitrary code. Only load files from a trusted source.

        Any exception is logged and re-raised.
        """
        try:
            with open(filepath, 'rb') as f:
                data = pickle.load(f)

            vector_store = cls()
            vector_store.vectors = {k: [np.array(v, dtype='float32') for v in vecs] for k, vecs in data['vectors'].items()}
            vector_store.metadata = data['metadata']

            logger.info(f"Vector store loaded from {filepath}")
            return vector_store
        except Exception as e:
            logger.error(f"Error loading vector store: {str(e)}")
            raise


In [None]:
# # Example usage
# vector_store = NumPyVectorStore()

# # Create index
# vector_store.create_index("purchase_data", dimension=128)

# # Add vectors
# vector_store.add_vector("purchase_data", np.random.rand(128), metadata={"id": 1, "product": "Book"})
# vector_store.add_vector("purchase_data", np.random.rand(128), metadata={"id": 2, "product": "Laptop"})

# # Save the vector store
# vector_store.save("vector_store.pkl")

# # Load the vector store
# loaded_vector_store = NumPyVectorStore.load("vector_store.pkl")

# # Search for similar vectors
# query_vector = np.random.rand(128)
# results = loaded_vector_store.search("purchase_data", query_vector, k=2)

# print("Search results:")
# for result in results:
#     print(f"Vector: {result['vector'][:5]}...")  # Showing only first 5 elements
#     print(f"Metadata: {result['metadata']}")
#     print(f"Distance: {result['distance']}")
#     print()

###
---

### Final Run

In [None]:
import uuid

In [None]:
vector_store = NumPyVectorStore()
namespace = uuid.NAMESPACE_DNS
vector_db_folder = "../data/app_data/vector_indices"

folder_paths = [
    "../data/app_data/purchase_data",
    "../data/app_data/available_stocks",
    "../data/app_data/wardrobe_images",
]
for folder_path in folder_paths:
    # The last path component doubles as the index name, e.g. "purchase_data".
    index = folder_path.split("/")[-1]
    # NOTE(review): 768 assumes the loaded CLIP checkpoint produces 768-d
    # image features -- confirm against the model in use.
    vector_store.create_index(index, dimension=768)
    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of the stored vectors reproducible between runs.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith("png"):
            continue
        image_path = f"{folder_path}/{file}"

        # uuid3 is name-based, so the same path yields the same id each time.
        image_id = uuid.uuid3(namespace, image_path)

        image_embeddings = get_image_embedding(embedding_model, embedding_processor, image_path)
        # Only the first row of the returned features is stored.
        vector_store.add_vector(index, image_embeddings[0], metadata={"id": image_id, "image_path": image_path})

# save() opens the target file directly and fails if the folder is missing,
# so make sure it exists first.
os.makedirs(vector_db_folder, exist_ok=True)
vector_store.save(f"{vector_db_folder}/vector_db.pkl")

       

In [None]:
# Embed one purchase-history image to use as the search query.
# NOTE(review): this is a machine-specific absolute path, while the rest of
# the file uses relative "../data/app_data/..." paths -- confirm and align.
query_image_path = "/Users/t.sumukhflexday/Desktop/Projects/Test/fashion_trend/fashion_trend/data/app_data/purchase_data/purchase_hist_1.png"
query_features = get_image_embedding(embedding_model, embedding_processor, query_image_path)
# Keep only the first row of the returned features, as a NumPy array.
query_vector = np.array(query_features[0])

In [None]:
# Read the pickled store back from disk into a fresh NumPyVectorStore.
vector_db_path = "../data/app_data/vector_indices/vector_db.pkl"
loaded_vector_store = NumPyVectorStore.load(vector_db_path)

In [None]:
# Search with the image embedding held in ``query_vector``. A random vector
# (as used before) returns arbitrary neighbours and leaves the computed
# query embedding unused.
results = loaded_vector_store.search("available_stocks", query_vector, k=3)

In [None]:
# Print the file path of every match, in the order the search returned them.
for hit in results:
    hit_metadata = hit['metadata']
    print(hit_metadata['image_path'])

###
---