In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import os
import pickle
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the filtered complaints table.
# The location can be overridden with the COMPLAINTS_CSV environment variable so
# the notebook is not tied to one machine; when the variable is unset the
# original absolute path is used, so existing behaviour is unchanged.
df = pd.read_csv(
    os.environ.get(
        'COMPLAINTS_CSV',
        r'C:\Users\bless\OneDrive\Desktop\week _6\crediTrust-rag-chatbot\data\filtered\filtered_complaints.csv',
    )
)

In [3]:
# Text splitter used to cut each narrative into chunks before embedding.
# chunk_size / chunk_overlap are passed straight to the splitter; consecutive
# chunks share up to `chunk_overlap` units of text.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)


In [4]:
# Split every narrative into chunks. `documents` holds the chunk texts and
# `metadata` holds, at the same position, the product and the DataFrame index
# label of the row the chunk came from.
documents = []
metadata = []
# Iterate only the columns that are needed: df.iterrows() builds a full Series
# for every row, which is much slower on a frame of this size.
for idx, product, narrative in tqdm(
    zip(df.index, df['Product'], df['cleaned_narrative']),
    total=len(df),
):
    # A missing narrative is not a str (pandas reads empty CSV cells as NaN)
    # and would make split_text raise mid-loop; skip such rows instead.
    if not isinstance(narrative, str):
        continue
    chunks = splitter.split_text(narrative)
    documents.extend(chunks)
    # One independent dict per chunk keeps metadata[i] aligned with documents[i].
    metadata.extend({'product': product, 'source_idx': idx} for _ in chunks)

100%|██████████| 80667/80667 [00:39<00:00, 2032.49it/s]


In [5]:
# Load the sentence-embedding model and encode every chunk in `documents`.
# NOTE: this is the expensive step of the notebook (the recorded run took
# about 2h36m for 7088 batches).
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(
    documents,
    show_progress_bar=True,
)

Batches: 100%|██████████| 7088/7088 [2:36:31<00:00,  1.32s/it]     


In [6]:
# Build an exact (brute-force) L2 index over the chunk embeddings.
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
# FAISS expects a C-contiguous float32 matrix. ascontiguousarray enforces that
# and, unlike np.array(), does not copy when the input already satisfies it,
# so the full embedding matrix is not duplicated in memory.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
# Vector i in the index must correspond to documents[i] / metadata[i].
assert index.ntotal == len(documents), "index size does not match number of documents"

In [7]:
# Persist the FAISS index under ./vector_store (relative to the current
# working directory), creating the directory if it does not exist yet.
os.makedirs('vector_store', exist_ok=True)
faiss.write_index(
    index,
    'vector_store/faiss_index.index',
)

In [8]:
# Save the chunk texts and their metadata next to the index so that a search
# hit (a row position in the index) can be mapped back to text and product.
for pkl_path, payload in (
    ('vector_store/documents.pkl', documents),
    ('vector_store/metadata.pkl', metadata),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)