In [1]:
# # %% [markdown]
# # # Text Embeddings with Amazon Titan and FAISS Index Creation
# # This notebook processes text files using Amazon Titan Embeddings and builds a FAISS index

# # %%
# # Install required packages
# !pip install faiss-cpu numpy boto3 tqdm

# %%
import os
import json
import numpy as np
import boto3
import faiss
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# %% [markdown]
# ## 1. Configuration

# %%
# Configuration
# Directory containing the .txt files to index. The original workstation path
# remains the default; set the TEXT_FILES_DIR environment variable to point
# somewhere else without editing the notebook.
TEXT_FILES_DIR = os.environ.get(
    "TEXT_FILES_DIR", r"A:\NLP Projects\Chatbot\python_code\products"
)
FAISS_INDEX_PATH = "plantify_faiss_index.index"  # Where to save the FAISS index
METADATA_PATH = "metadata.json"                  # File to store document metadata
# Output size requested from Titan Text Embeddings V2. The model accepts
# 256, 512 or 1024 (1024 is its default), so 512 is a choice made here rather
# than a fixed property of the model.
EMBEDDING_DIMENSIONS = 512

In [4]:

# AWS settings
AWS_REGION = "us-east-1"  # Region used for the Bedrock client; change to yours
BEDROCK_MODEL_ID = "amazon.titan-embed-text-v2:0"

# %%
# Bedrock runtime client shared by the embedding calls in this notebook
bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)

# Verify text files directory
# Fail early with a clear message, distinguishing a missing path from a path
# that exists but is not a directory (os.listdir would otherwise fail below).
if not os.path.exists(TEXT_FILES_DIR):
    raise FileNotFoundError(f"Directory not found: {TEXT_FILES_DIR}")
if not os.path.isdir(TEXT_FILES_DIR):
    raise NotADirectoryError(f"Not a directory: {TEXT_FILES_DIR}")

# Sorted so the listing is identical on every run; os.listdir order is arbitrary.
txt_files = sorted(f for f in os.listdir(TEXT_FILES_DIR) if f.endswith('.txt'))
print(f"Found {len(txt_files)} text files for processing")

Found 2 text files for processing


In [5]:
# %% [markdown]
# ## 2. Embedding Generation with Amazon Titan

# %%
def get_titan_embeddings(text):
    """Embed one piece of text with the configured Bedrock Titan model.

    Sends ``text`` to ``BEDROCK_MODEL_ID`` through ``bedrock_runtime``,
    requesting ``EMBEDDING_DIMENSIONS`` components and normalized output.

    Returns:
        ``np.float32`` array built from the ``embedding`` field of the JSON
        response body.
    """
    request_payload = {
        "inputText": text,
        "dimensions": EMBEDDING_DIMENSIONS,
        "normalize": True,
    }

    raw_response = bedrock_runtime.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="*/*",
        body=json.dumps(request_payload),
    )

    # The response body is a stream; read it once and decode the JSON document.
    decoded = json.loads(raw_response['body'].read())
    vector = decoded['embedding']
    return np.array(vector, dtype=np.float32)

def batch_embed_texts(texts, batch_size=10):
    """Embed every text and stack the vectors into one 2-D array.

    Each text is sent to ``get_titan_embeddings`` individually; ``batch_size``
    only controls how many texts are handled per progress-bar step.

    Args:
        texts: Sequence of strings to embed. Must not be empty.
        batch_size: Number of texts handled per progress step (positive int).

    Returns:
        Array with one row per text, in input order, produced by stacking the
        individual embedding vectors.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if len(texts) == 0:
        # np.vstack([]) would fail with an unrelated-looking message
        # ("need at least one array to concatenate"); say what is wrong instead.
        raise ValueError("No texts to embed: 'texts' is empty")

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        embeddings.extend(get_titan_embeddings(text) for text in batch)
    return np.vstack(embeddings)


In [7]:
# %% [markdown]
# ## 3. Process Text Files

# %%
def process_text_files(text_files_dir, chunk_size=500):
    """Split every ``.txt`` file in a directory into fixed-size character chunks.

    Args:
        text_files_dir: Directory to scan (not recursive). Entries whose name
            does not end in ``.txt`` are skipped.
        chunk_size: Maximum number of characters per chunk. Defaults to 500.

    Returns:
        ``(text_chunks, metadata_records)``: two parallel lists. Entry ``i`` of
        the metadata describes entry ``i`` of the chunks with the keys
        ``source_file``, ``chunk_id``, ``start_char`` and ``end_char``
        (character offsets into the file, end exclusive).

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    text_chunks = []
    metadata_records = []

    # os.listdir returns names in arbitrary order; sorting keeps the chunk order
    # (and so the row order of anything built from it) stable across runs.
    for filename in tqdm(sorted(os.listdir(text_files_dir))):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(text_files_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Basic fixed-width chunking with no overlap - adjust as needed
        for chunk_id, start in enumerate(range(0, len(text), chunk_size)):
            end = min(start + chunk_size, len(text))
            text_chunks.append(text[start:end])
            metadata_records.append({
                "source_file": filename,
                "chunk_id": chunk_id,
                "start_char": start,
                "end_char": end
            })

    return text_chunks, metadata_records

# %%
# Chunk every text file found in the configured directory
text_chunks, metadata = process_text_files(text_files_dir=TEXT_FILES_DIR)
print(f"Created {len(text_chunks)} text chunks from {len(txt_files)} files")

100%|██████████| 3/3 [00:00<00:00, 551.28it/s]

Created 9 text chunks from 2 files





In [8]:
# %% [markdown]
# ## 4. Generate Embeddings with Amazon Titan

# %%
# Generate embeddings for all text chunks
print("Generating embeddings with Amazon Titan...")
embeddings = batch_embed_texts(text_chunks)
print(f"Embeddings shape: {embeddings.shape}")  # (num_chunks, EMBEDDING_DIMENSIONS)

# Row i of `embeddings` must describe text_chunks[i], and the width must match
# the size requested from the model; stop here rather than build a bad index.
assert embeddings.shape == (len(text_chunks), EMBEDDING_DIMENSIONS), (
    f"Unexpected embeddings shape {embeddings.shape}; "
    f"expected ({len(text_chunks)}, {EMBEDDING_DIMENSIONS})"
)

Generating embeddings with Amazon Titan...


100%|██████████| 1/1 [00:09<00:00,  9.57s/it]

Embeddings shape: (9, 512)





In [10]:
# %% [markdown]
# ## 5. Create and Save FAISS Index

# %%
def create_faiss_index(embeddings):
    """Build a flat inner-product FAISS index holding ``embeddings``.

    Args:
        embeddings: 2-D array with one vector per row; the index dimension is
            taken from its second axis.

    Returns:
        A ``faiss.IndexFlatIP`` to which every row of ``embeddings`` was added.
    """
    vector_dim = embeddings.shape[1]

    # Vectors are added as-is, with no normalization step here: the embedding
    # requests in this notebook ask for normalized output, in which case the
    # inner product is the cosine similarity.
    flat_index = faiss.IndexFlatIP(vector_dim)
    flat_index.add(embeddings)
    return flat_index

# %%
# Build the index from the embedding matrix
index = create_faiss_index(embeddings)
print(f"FAISS index contains {index.ntotal} vectors")

# %%
# Persist the index to disk
faiss.write_index(index, FAISS_INDEX_PATH)
print(f"Saved FAISS index to {FAISS_INDEX_PATH}")

# %%
# Persist the per-chunk metadata records as a JSON document
with open(METADATA_PATH, 'w') as f:
    f.write(json.dumps(metadata))
print(f"Saved metadata to {METADATA_PATH}")

FAISS index contains 9 vectors
Saved FAISS index to plantify_faiss_index.index
Saved metadata to metadata.json


In [15]:
# %% [markdown]
# ## 6. Verification

# %%
# Verify the saved index can be loaded
test_index = faiss.read_index(FAISS_INDEX_PATH)
# Search results are mapped back to chunks by FAISS row id, so the reloaded
# index must hold exactly one vector per metadata record.
assert test_index.ntotal == len(metadata), (
    f"Index has {test_index.ntotal} vectors but metadata has {len(metadata)} records"
)
print(f"Verified index loading. Contains {test_index.ntotal} vectors")

# %%
# Sample search function
def search_index(query, index, metadata, k=3, texts=None):
    """Return the chunks most similar to ``query`` from a FAISS index.

    Args:
        query: Text to search for; embedded with ``get_titan_embeddings``.
        index: FAISS index to search.
        metadata: Sequence of per-chunk records, positioned by FAISS row id.
        k: Maximum number of results to request from the index.
        texts: Optional sequence of chunk texts positioned by FAISS row id.
            When omitted, the notebook-level ``text_chunks`` list is used.

    Returns:
        List of dicts with the keys ``score``, ``text`` and ``metadata``, in
        the order the index returned them. It holds fewer than ``k`` entries
        when the index has fewer than ``k`` matches.
    """
    if texts is None:
        texts = text_chunks  # notebook-level list built alongside `metadata`

    # The index searches a 2-D batch of queries, so add a leading axis of size 1.
    query_embedding = np.expand_dims(get_titan_embeddings(query), axis=0)

    # Search the index
    distances, indices = index.search(query_embedding, k)

    results = []
    for row_id, score in zip(indices[0], distances[0]):
        # FAISS pads with id -1 when it finds fewer than k neighbours; indexing
        # a Python list with -1 would silently return the *last* chunk instead.
        if row_id < 0:
            continue
        results.append({
            # Value reported by the index; for an inner-product index over
            # unit-length vectors this is the cosine similarity.
            "score": score,
            "text": texts[row_id],
            "metadata": metadata[row_id]
        })

    return results

Verified index loading. Contains 9 vectors


In [None]:
# %%
# Test search
sample_query = "price of Peace Lily"
results = search_index(sample_query, test_index, metadata)
# Guard against an empty result list so the cell reports it instead of
# failing with an IndexError on results[0].
if results:
    print(f"Top result for '{sample_query}':")
    print(results[0]['text'][:300] + "...")  # Print first 300 chars of top result
else:
    print(f"No results for '{sample_query}'")


