In [1]:
from src.indexing import build_index_from_url, build_chunks_from_tree
import time
import os
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

In [5]:
# Top-level `await` is only valid because IPython/Jupyter runs cells inside an
# event loop; this line will not work as-is in a plain Python script.
# NOTE(review): the second argument (1) looks like a crawl depth / link limit —
# confirm against build_index_from_url in src.indexing.
tree = await build_index_from_url("https://en.wikipedia.org/wiki/peru", 1)

In [6]:
# Derive chunks from the tree built in the previous cell.
# NOTE(review): presumably a list of text chunks suitable for embedding —
# confirm the element type returned by build_chunks_from_tree.
chunks = build_chunks_from_tree(tree)

In [None]:
class EmbeddingManager:
    """Manage a SentenceTransformer model and a FAISS index built from its embeddings.

    The model is cached under ``model_dir/model_name`` and FAISS indexes are
    stored as ``index_dir/<index_name>.index``. Both directories are created
    when the manager is constructed.

    Attributes:
        model_name: Name passed to ``SentenceTransformer`` when the model is
            not cached locally; also the sub-directory name used for the cache.
        model_dir: Directory holding locally saved models.
        index_dir: Directory holding saved FAISS index files.
        model: The loaded ``SentenceTransformer``, or ``None`` until loaded.
        index: The current FAISS index, or ``None`` until created or loaded.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', model_dir='data/models', index_dir='data/faiss_index'):
        """Store configuration and make sure the model/index directories exist."""
        self.model_name = model_name
        self.model_dir = model_dir
        self.index_dir = index_dir
        self.model = None
        self.index = None

        # Create directories if they don't exist
        os.makedirs(self.model_dir, exist_ok=True)
        os.makedirs(self.index_dir, exist_ok=True)

    def load_or_download_model(self):
        """Load the model from ``model_dir`` or download it and cache it there.

        Returns:
            The loaded ``SentenceTransformer`` instance (also stored on ``self.model``).
        """
        model_path = os.path.join(self.model_dir, self.model_name)

        if os.path.exists(model_path):
            print("Loading model from local directory...")
            self.model = SentenceTransformer(model_path)
        else:
            print("Downloading model...")
            self.model = SentenceTransformer(self.model_name)
            # Persist the downloaded model so the next call takes the local branch.
            self.model.save(model_path)
        return self.model

    def generate_embeddings(self, chunks):
        """Encode text chunks into embeddings, loading the model on first use.

        Args:
            chunks: The texts to encode; passed straight to ``model.encode``.

        Returns:
            The result of ``model.encode(chunks, convert_to_numpy=True)``.
        """
        # Compare against None explicitly instead of relying on the model
        # object's truthiness.
        if self.model is None:
            self.load_or_download_model()

        print("Generating embeddings...")
        return self.model.encode(chunks, convert_to_numpy=True)

    def create_faiss_index(self, embeddings):
        """Build an in-memory 8-bit scalar-quantizer L2 FAISS index from embeddings.

        Args:
            embeddings: Array-like of shape ``(n_vectors, dimension)``.

        Returns:
            The trained and populated FAISS index (also stored on ``self.index``).

        Raises:
            ValueError: If ``embeddings`` is not a non-empty 2-D array.
        """
        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(embeddings, dtype='float32')

        # Fail early with a clear message instead of an IndexError on shape[1]
        # or an opaque FAISS error when training on zero vectors.
        if embeddings.ndim != 2 or 0 in embeddings.shape:
            raise ValueError(
                "embeddings must be a non-empty 2-D array of shape "
                f"(n_vectors, dimension); got shape {embeddings.shape}"
            )

        dimension = embeddings.shape[1]

        # `faiss.IndexScalarQuantizerL2` does not exist; the scalar-quantizer
        # index takes the quantizer type and metric as constructor arguments.
        index = faiss.IndexScalarQuantizer(
            dimension, faiss.ScalarQuantizer.QT_8bit, faiss.METRIC_L2
        )
        # Scalar-quantizer indexes must be trained before vectors can be added.
        index.train(embeddings)
        index.add(embeddings)

        # Publish the index only once it is fully built.
        self.index = index

        print(f"Created FAISS index with {self.index.ntotal} vectors")
        return self.index

    def save_faiss_index(self, index_name='my_index'):
        """Write the current index to ``index_dir/<index_name>.index``.

        Raises:
            ValueError: If no index has been created or loaded yet.
        """
        if self.index is None:
            raise ValueError("Index not initialized. Create index first.")

        index_path = os.path.join(self.index_dir, f"{index_name}.index")
        faiss.write_index(self.index, index_path)
        print(f"Index saved to {index_path}")

    def load_faiss_index(self, index_name='my_index'):
        """Read ``index_dir/<index_name>.index`` into ``self.index``.

        Returns:
            The loaded FAISS index.

        Raises:
            FileNotFoundError: If the index file does not exist.
        """
        index_path = os.path.join(self.index_dir, f"{index_name}.index")
        if not os.path.exists(index_path):
            raise FileNotFoundError(f"No index found at {index_path}")

        self.index = faiss.read_index(index_path)
        print(f"Loaded index with {self.index.ntotal} vectors")
        return self.index



In [None]:
# Usage example
if __name__ == "__main__":
    # Initialize manager
    em = EmbeddingManager()

    # Demo texts live under their own name so this cell does not overwrite the
    # notebook-level `chunks` produced by build_chunks_from_tree in an earlier
    # cell (replace with your actual text chunks when indexing real data).
    example_chunks = [
        "This is the first text chunk",
        "Here's another piece of text",
        "Finally, a third example chunk"
    ]

    # Generate embeddings
    embeddings = em.generate_embeddings(example_chunks)

    # Create and save FAISS index
    em.create_faiss_index(embeddings)
    em.save_faiss_index()

    # Later, to load everything:
    em.load_or_download_model()
    em.load_faiss_index()

    # Sanity check: the index read back from disk holds one vector per chunk.
    assert em.index.ntotal == len(example_chunks)