In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List

class TextChunker:
    """Split raw texts into overlapping chunks for embedding.

    Wraps a ``RecursiveCharacterTextSplitter`` configured with the given
    chunk size and overlap.
    """

    def __init__(self, chunk_size: int = 300, chunk_overlap: int = 50):
        """Create the underlying splitter.

        Args:
            chunk_size: Value forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Value forwarded to the splitter as ``chunk_overlap``.
        """
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    def chunk_texts(self, texts: List[str]) -> List[str]:
        """Split every text in ``texts`` and return all chunks as one flat list.

        Chunks appear in input order. Entries that are not strings (for
        example ``None`` or a NaN read from a CSV) and blank strings are
        skipped because there is nothing to split.

        Args:
            texts: Texts to split.

        Returns:
            The chunks of all usable texts, concatenated in order.
        """
        chunks: List[str] = []
        for text in texts:
            if not isinstance(text, str) or not text.strip():
                continue
            # The splitter exposes ``split_text`` (one string at a time);
            # there is no plural ``split_texts`` method.
            chunks.extend(self.splitter.split_text(text))
        return chunks


In [None]:
from sentence_transformers import SentenceTransformer
from typing import List
import numpy as np

class Embedder:
    """Thin wrapper around a ``SentenceTransformer`` model for encoding text."""

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """Load the sentence-transformers model identified by ``model_name``."""
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the wrapped model and return its output.

        The model is asked for NumPy output and to show a progress bar.
        """
        encode_options = {"convert_to_numpy": True, "show_progress_bar": True}
        vectors = self.model.encode(texts, **encode_options)
        return vectors


In [None]:
import os
import pickle
from typing import List, Dict

import faiss
import numpy as np

class VectorStoreBuilder:
    """Accumulate embeddings in a FAISS L2 index alongside per-vector metadata.

    ``self.metadata[i]`` is intended to describe the i-th vector added to
    ``self.index``; ``add_embeddings`` enforces that both grow together.
    """

    def __init__(self, dim: int):
        """Create an empty flat L2 index for vectors of dimension ``dim``."""
        self.index = faiss.IndexFlatL2(dim)
        self.metadata = []

    def add_embeddings(self, embeddings: np.ndarray, meta: List[Dict]):
        """Add a batch of vectors and their metadata records.

        Args:
            embeddings: 2-D array with one row per vector.
            meta: One metadata dict per row of ``embeddings``, in the same order.

        Raises:
            ValueError: If ``embeddings`` is not 2-D, or its row count differs
                from ``len(meta)``. Nothing is added in that case.
        """
        # FAISS expects C-contiguous float32 input; coerce so float64 or
        # non-contiguous arrays do not fail deep inside the bindings.
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

        # An empty batch is a no-op (an empty encode result may be 1-D).
        if vectors.size == 0 and not meta:
            return

        if vectors.ndim != 2:
            raise ValueError(
                f"embeddings must be a 2-D array, got shape {vectors.shape}"
            )
        if vectors.shape[0] != len(meta):
            # Validate before mutating so index and metadata never desync.
            raise ValueError(
                f"got {vectors.shape[0]} embeddings but {len(meta)} metadata records"
            )

        self.index.add(vectors)
        self.metadata.extend(meta)

    def save(self, index_path: str, metadata_path: str):
        """Write the index to ``index_path`` and pickle metadata to ``metadata_path``.

        Missing parent directories of either path are created first.
        """
        for path in (index_path, metadata_path):
            parent = os.path.dirname(path)
            if parent:
                os.makedirs(parent, exist_ok=True)

        faiss.write_index(self.index, index_path)
        # NOTE: the metadata file is a pickle; only load it from trusted sources.
        with open(metadata_path, 'wb') as f:
            pickle.dump(self.metadata, f)


In [None]:
import pandas as pd
from tqdm import tqdm

# Load preprocessed data
df = pd.read_csv("../data/processed/filtered_complaints.csv")

# Rows without a narrative cannot be chunked (pandas reads blank CSV cells as
# NaN, a float rather than a string), so leave them out of the indexing pass.
# `df` itself is left untouched.
complaints = df.dropna(subset=["Consumer complaint narrative"])

chunker = TextChunker(chunk_size=300, chunk_overlap=50)
embedder = Embedder()

all_chunks = []
all_metadata = []

# Build one metadata record per chunk so all_chunks[i] and all_metadata[i]
# always describe the same piece of text.
for _, row in tqdm(complaints.iterrows(), total=len(complaints)):
    chunks = chunker.chunk_texts([row['Consumer complaint narrative']])
    meta = [{
        "product": row["Product_Mapped"],
        "complaint_id": row["Complaint ID"],
        "original_text": chunk
    } for chunk in chunks]

    all_chunks.extend(chunks)
    all_metadata.extend(meta)

if not all_chunks:
    raise ValueError("No complaint narratives produced any chunks; nothing to index.")

# Embed all chunks at once
embeddings = embedder.embed_texts(all_chunks)

# Size the index from the embeddings actually produced rather than a
# hard-coded number, so changing the embedding model cannot cause a
# dimension mismatch.
vector_store = VectorStoreBuilder(dim=embeddings.shape[1])

# Store vectors and metadata
vector_store.add_embeddings(embeddings, all_metadata)
vector_store.save("../vector_store/index.faiss", "../vector_store/metadata.pkl")
