In [1]:
import sys

import faiss
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline

sys.path.append('../src')

from rag_utils import retrieve_relevant_chunks, build_prompt, generate_answer

In [2]:
# Vector index over the complaint chunks, read from disk
faiss_index = faiss.read_index('../vector_store/complaints_faiss.index')

# Chunk metadata table; rag_answer reads its 'chunk_text' column.
# NOTE(review): presumably row i describes vector i of the index -- confirm.
metadata_df = pd.read_csv(filepath_or_buffer='../data/chunked_complaints.csv')

# Sentence-embedding model passed to the retrieval helper to encode questions
embed_model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2'
)

In [9]:
# Text2text generator used to produce the final answers.
# Runs on the first CUDA GPU when one is present and falls back to CPU (-1)
# otherwise, so the notebook also runs on machines without a GPU.
llm_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_length=256,
    min_length=50,
    device=0 if torch.cuda.is_available() else -1,
)

config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


Defining the RAG pipeline function

In [10]:
def rag_answer(
    question,
    embed_model,
    faiss_index,
    metadata_df,
    llm_pipeline,
    k=5
):
    """Answer ``question`` with retrieval-augmented generation.

    Retrieves up to ``k`` chunks via ``retrieve_relevant_chunks``, makes sure
    every chunk carries a ``'chunk_text'`` entry, builds a prompt with
    ``build_prompt`` and generates the answer with ``generate_answer``.

    Args:
        question: The user question.
        embed_model: Embedding model; must expose ``encode(list_of_texts)``.
        faiss_index: FAISS index; must expose ``search(vectors, k)``.
        metadata_df: DataFrame with a ``'chunk_text'`` column.
        llm_pipeline: Generation pipeline forwarded to ``generate_answer``.
        k: Number of chunks to retrieve.

    Returns:
        A tuple ``(answer, chunks)``. When retrieval yields no chunks, the
        answer is a fixed "nothing found" message and the LLM is not called.
    """
    chunks = retrieve_relevant_chunks(
        question, embed_model, faiss_index, metadata_df, k=k
    )

    # Nothing retrieved: do not index chunks[0] and do not ask the LLM to
    # answer without any supporting context.
    if not chunks:
        return "No relevant complaint excerpts were found for this question.", chunks

    # Fill in chunk_text only for chunks that lack it, using the row ids that
    # FAISS actually returned rather than the loop position (which would always
    # attach the first k metadata rows regardless of the question).
    # NOTE(review): assumes retrieve_relevant_chunks returns chunks in FAISS
    # rank order for the same raw query embedding -- confirm against rag_utils.
    if any('chunk_text' not in chunk for chunk in chunks):
        query_vec = np.asarray(embed_model.encode([question]), dtype='float32')
        _, hit_ids = faiss_index.search(query_vec, k)
        n_rows = len(metadata_df)
        for chunk, row_id in zip(chunks, hit_ids[0]):
            if 'chunk_text' in chunk:
                continue
            row_id = int(row_id)
            # FAISS pads missing results with -1; never let that wrap around.
            if 0 <= row_id < n_rows:
                chunk['chunk_text'] = metadata_df.iloc[row_id]['chunk_text']
            else:
                chunk['chunk_text'] = ""

    prompt = build_prompt(chunks, question)
    answer = generate_answer(prompt, llm_pipeline)
    return answer, chunks

In [12]:
# Smoke-test the RAG pipeline with representative questions about the dataset
questions = [
    "Why are people unhappy with Buy Now, Pay Later?",
    "What are the most common complaints about credit cards?",
    "Are there issues with money transfers?",
    "What problems do customers report with savings accounts?",
    "Are there any fraud-related complaints?"
]

# One record per question: the answer plus the text of the top two sources
results = []
for q in questions:
    print(f"Question: {q}")
    answer, sources = rag_answer(q, embed_model, faiss_index, metadata_df, llm_pipeline, k=5)
    print("Generated Answer:", answer)
    print("Top 2 Source Chunks:")
    top_texts = [s['chunk_text'] for s in sources[:2]]
    for text in top_texts:
        print("-", text[:300], "...")
    print("="*80)
    # Pad with empty strings so both source columns are always filled, even
    # when retrieval returned fewer than two chunks (previously sources[0]
    # was indexed unguarded and raised IndexError on an empty result).
    source_1, source_2 = (top_texts + ["", ""])[:2]
    results.append({
        "Question": q,
        "Generated Answer": answer,
        "Source 1": source_1,
        "Source 2": source_2,
    })

Question: Why are people unhappy with Buy Now, Pay Later?
Generated Answer: They want more money which is not fair --- and investments etc it seems like synchrony is getting greedy worse yet they are making it much easier for people to miss a payment that consequence would engender a late payment fee and decrease the person s credit score this would force many people into financial ruin i have exceptional credit and have never had late payments it is risky and unfair to shift these costs to the consumer also i am --- on time this also hurts the stores because they lose tons of business from good paying customers who pay their bills --- when i have not i am very frustrated i am sure i am not the only consumer this is happening to we should not be penalized for making payments early there should be a system in place to prevent this misapplying of payments and charging unfair late fees to consumers especially ones like me who have been faithful in paying every single month --- practices a

In [15]:
from tabulate import tabulate

# Manual assessment of each generated answer: one (score, comment) pair per
# entry of `results`, in the same order as the questions were asked.
manual_reviews = [
    ("5/5", "Excellent - detailed analysis of late fees, credit score impacts, corporate greed. Specific examples and clear patterns identified."),
    ("5/5", "Comprehensive - covers customer service issues, formal complaints, and systemic problems. Good structure and detail."),
    ("4/5", "Good - specific examples with dollar amounts and security issues. Could be more concise but informative."),
    ("4/5", "Detailed - covers account closures, rate changes, processing issues. Comprehensive but could be more structured."),
    ("4/5", "Good - shows specific fraud detection problems and poor response times. Clear examples provided."),
]

# Create evaluation dataframe from the collected results
eval_df = pd.DataFrame(results)

# Row-by-row .loc assignment would silently append NaN rows (or leave scores
# missing) if the number of results ever differed from the number of reviews,
# so fail loudly instead.
assert len(manual_reviews) == len(eval_df), (
    f"{len(manual_reviews)} manual reviews for {len(eval_df)} results"
)

# Add evaluation columns
eval_df['Quality Score (1-5)'] = [score for score, _ in manual_reviews]
eval_df['Comments/Analysis'] = [comment for _, comment in manual_reviews]

# Display the table as GitHub-flavoured markdown
print("## RAG Pipeline Evaluation Table")
print()
print(tabulate(eval_df, headers='keys', tablefmt='github', showindex=False))

## RAG Pipeline Evaluation Table

| Question                                                 | Generated Answer                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         