In [1]:
import pandas as pd
import sys
sys.path.append('../src')

from embedding import embed_texts

In [2]:
# Read the pre-chunked complaint table from disk and preview its first rows.
# NOTE(review): the recorded preview shows an "Unnamed: 0" column, which
# suggests the CSV was written together with its index -- confirm, and consider
# reading with index_col=0 (or writing with index=False upstream).
chunks_csv_path = '../data/chunked_complaints.csv'
chunked_df = pd.read_csv(chunks_csv_path)
chunked_df.head()

Unnamed: 0,chunk_text,chunk_id,Product
0,a xxxx xxxx card was opened under my name by a...,0_0,Credit card
1,have failed to remove this from the three cred...,0_1,Credit card
2,dear cfpb i have a secured credit card with ci...,1_0,Credit card
3,who where and when did it happen i requested a...,1_1,Credit card
4,told me same thing no info no help at all plea...,1_2,Credit card


In [3]:
# Embed every chunk's text with the project helper `embed_texts`.
# The result exposes `.shape`; in the recorded run it was (311806, 384),
# i.e. one 384-wide row per chunk.
texts_to_embed = chunked_df['chunk_text']
embeddings = embed_texts(texts_to_embed)
print("Embeddings shape:", embeddings.shape)

Batches:   0%|          | 0/9744 [00:00<?, ?it/s]

Embeddings shape: (311806, 384)


In [4]:
import faiss
import numpy as np
import os

# Build a flat L2 FAISS index over the chunk embeddings.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# The vectors are cast to float32 before being added. ascontiguousarray copies
# at most once (and not at all if the data is already a C-contiguous float32
# array), whereas np.array(...).astype(...) can copy the whole matrix twice.
index.add(np.ascontiguousarray(embeddings, dtype='float32'))

# Save the index
# Make sure the output directory exists first: write_index fails on a fresh
# checkout otherwise, after the expensive embedding step has already run.
os.makedirs('../vector_store', exist_ok=True)
faiss.write_index(index, '../vector_store/complaints_faiss.index')
print("FAISS index saved to ../vector_store/complaints_faiss.index")

FAISS index saved to ../vector_store/complaints_faiss.index


In [5]:
# Persist the per-chunk metadata columns (chunk_id and Product) next to the
# index so they can be loaded again for later retrieval. The DataFrame index
# is not written to the file (index=False).
metadata_columns = ['chunk_id', 'Product']
metadata_df = chunked_df.loc[:, metadata_columns]
metadata_df.to_csv('../vector_store/complaints_metadata.csv', index=False)
print("Metadata saved to ../vector_store/complaints_metadata.csv")

Metadata saved to ../vector_store/complaints_metadata.csv


In [6]:
# Example: retrieve the 5 chunks most similar to the embedding of the FIRST
# chunk (row 0 of `embeddings` -- not a random one). Because that vector is
# itself in the index, the closest hit is expected to be chunk 0 at distance 0.
query = np.asarray(embeddings[0]).astype('float32').reshape(1, -1)  # one query row
D, I = index.search(query, 5)  # D: distances, I: row positions of the hits
print("Indices of top 5 similar chunks:", I)
print("Distances:", D)
print("Corresponding chunk texts:")
text_column = chunked_df['chunk_text']  # select the column once, not per hit
for idx in I[0]:
    if idx < 0:
        # FAISS uses -1 as the label for "no neighbour found"; without this
        # guard iloc[-1] would silently print the last row of the table.
        continue
    print(text_column.iloc[idx])
    print("---")

Indices of top 5 similar chunks: [[     0 258418 106956  42639  23782]]
Distances: [[0.         0.24230747 0.32806346 0.33932364 0.34341097]]
Corresponding chunk texts:
a xxxx xxxx card was opened under my name by a fraudster i received a notice from xxxx that an account was just opened under my name i reached out to xxxx xxxx to state that this activity was unauthorized and not me xxxx xxxx confirmed this was fraudulent and immediately closed the card however they have failed to remove this from the three credit agencies and this fraud is now impacting my credit
---
i was notified by xxxx that a xxxx xxxx credit card was opened in my name i did not open this credit card i looked at my credit report and the credit card was opened on xxxx and there was xxxx charged to the account i called xxxx xxxx on xxxxyear and spoke to their fraud department and they were going to close the account
---
someone opened 2 credit cards in my name using my information i called the company am i called the