In [1]:
import faiss
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize
import nltk
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Download tokenizer models.
# NLTK >= 3.8.2 loads the Punkt tokenizer from the "punkt_tab" resource instead
# of the pickled "punkt" one, so fetch both; word_tokenize raises LookupError
# when the resource its NLTK version expects is missing.
nltk.download('punkt')
nltk.download('punkt_tab')

# Toy corpus: four short sentences that are embedded and indexed further down.
documents = [
    "AI is transforming the world.",
    "Machine learning enables systems to learn from data.",
    "Natural Language Processing deals with human language.",
    "Deep learning is a subset of machine learning.",
]

# Lower-case every document, then split it into word tokens.
tokenized_docs = list(map(word_tokenize, map(str.lower, documents)))

# Show the token list of each document, numbered from 1.
print("Tokenized documents:")
for i, tokens in enumerate(tokenized_docs):
    print(f"Doc {i+1}: {tokens}")

# Convert text to embeddings (one row per document)
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(documents)

# Create FAISS index sized to the embedding width
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance index

# FAISS only accepts C-contiguous float32 matrices; make that explicit rather
# than relying on whatever dtype/layout the encoder happened to return.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

# Row i of the index must correspond to documents[i] for later lookups.
assert index.ntotal == len(documents), "every document should have been indexed"

# Write the index and the list of document texts to the working directory.
faiss.write_index(index, "faiss_index.index")

# The metadata file holds the pickled `documents` list.
with open("doc_metadata.pkl", "wb") as f:
    f.write(pickle.dumps(documents))

print("\nFAISS index and document metadata saved locally.")

# Reload FAISS index and metadata for verification.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# metadata files that this notebook (or another trusted source) wrote.
reloaded_index = faiss.read_index("faiss_index.index")
with open("doc_metadata.pkl", "rb") as f:
    reloaded_docs = pickle.load(f)

# Search results are row positions that get used to index `reloaded_docs`,
# so the two artefacts must describe the same number of documents.
assert reloaded_index.ntotal == len(reloaded_docs), (
    f"index holds {reloaded_index.ntotal} vectors "
    f"but metadata lists {len(reloaded_docs)} documents"
)

# Query example: embed the query with the same model and fetch the 2 nearest documents.
query = "AI and deep learning"
query_embedding = model.encode([query])

# D holds the distances and I the matching row positions, one row per query.
# FAISS expects a C-contiguous float32 matrix, so convert explicitly.
D, I = reloaded_index.search(
    np.ascontiguousarray(query_embedding, dtype=np.float32), k=2
)

print("\nQuery results:")
for idx, dist in zip(I[0], D[0]):
    # FAISS pads the result with -1 when the index holds fewer than k vectors;
    # without this guard reloaded_docs[-1] would silently report the last
    # document as a match.
    if idx < 0:
        continue
    print(f"Matched Doc: {reloaded_docs[idx]} (Distance: {dist:.4f})")
