In [1]:
# %%
"""
# Customer Support RAG - Enhanced Knowledge Base Builder
## Building Production-Ready Knowledge Base with OpenAI API

This notebook:
1. Generates answers for all training queries using GPT-4o-mini
2. Creates embeddings using OpenAI text-embedding-3-small
3. Stores everything in ChromaDB vector database
4. Implements negation handling and evaluation framework
5. Adds hybrid retrieval and reranking capabilities
6. Properly evaluates on UNSEEN test data
7. Exports knowledge base for production use
"""

# %% [markdown]
# ## 1. Setup and Imports

# %%
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import time
import warnings
from datetime import datetime
from collections import defaultdict
import concurrent.futures
from functools import partial

warnings.filterwarnings('ignore')

# For vector database
import chromadb
from chromadb.config import Settings

# For hybrid retrieval
try:
    from rank_bm25 import BM25Okapi
    BM25_AVAILABLE = True
    print("‚úÖ BM25 available for hybrid search")
except ImportError:
    BM25_AVAILABLE = False
    print("‚ö†Ô∏è  BM25 not available - hybrid search will be disabled")
    print("   Install with: pip install rank-bm25")

print("‚úÖ All imports successful!")

‚úÖ BM25 available for hybrid search
‚úÖ All imports successful!


In [2]:
# %%
# 1. Setup paths and environment
project_root = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
env_path = project_root / '.env'

print(f"Project root: {project_root}")
print(f"Looking for .env at: {env_path}")
print(f"Exists: {env_path.exists()}")

# Load environment variables
load_dotenv(dotenv_path=env_path, override=True)

# Initialize OpenAI client
api_key = os.getenv('OPENAI_API_KEY')
if api_key:
    client = OpenAI(api_key=api_key)
    print(f"\n‚úÖ OpenAI client initialized")
    print(f"   API key: {api_key[:20]}...{api_key[-5:]}")
    
    # Test connection
    try:
        models = client.models.list()
        print(f"‚úÖ Connected to OpenAI API")
        print(f"   Available models: {len(models.data)}")
    except Exception as e:
        print(f"‚ùå Connection failed: {e}")
else:
    print("‚ùå OPENAI_API_KEY not found!")
    print("Make sure .env file contains: OPENAI_API_KEY=sk-...")

print("\n" + "="*70)
print("Setup complete!")
print("="*70)


Project root: c:\Users\victo\customer-support-rag
Looking for .env at: c:\Users\victo\customer-support-rag\.env
Exists: True

‚úÖ OpenAI client initialized
   API key: sk-proj-YNmq1tao-Q91...0g08A
‚úÖ Connected to OpenAI API
   Available models: 111

Setup complete!


In [4]:
# ## 2. Load Training Data (ONLY - For Knowledge Base)

# %%
print("Loading training data for knowledge base...")
print("‚ö†Ô∏è  Note: Test data will be loaded SEPARATELY later for evaluation\n")

# Load ONLY training data for knowledge base
train_df = pd.read_csv('../data/processed/train_processed.csv')

# Load category mapping
with open('../data/processed/category_mapping.json', 'r') as f:
    category_mapping = json.load(f)

print(f"‚úÖ Training set: {len(train_df):,} queries (for knowledge base)")
print(f"‚úÖ Categories: {len(category_mapping)}")

print(f"\nTraining data info:")
print(f"   Columns: {list(train_df.columns)}")
print(f"   Avg query length: {train_df['word_count'].mean():.1f} words")
print(f"   Categories: {train_df['category'].nunique()}")
print(f"   Queries with negation: {train_df['has_negation'].sum()} ({train_df['has_negation'].sum()/len(train_df)*100:.1f}%)")
print(f"   Complex queries: {train_df['is_complex'].sum()} ({train_df['is_complex'].sum()/len(train_df)*100:.1f}%)")

print(f"\nSample data:")
display(train_df[['text', 'category', 'word_count', 'has_negation']].head())

Loading training data for knowledge base...
‚ö†Ô∏è  Note: Test data will be loaded SEPARATELY later for evaluation

‚úÖ Training set: 10,003 queries (for knowledge base)
‚úÖ Categories: 77

Training data info:
   Columns: ['text', 'label', 'text_length', 'word_count', 'char_count', 'category', 'question_type', 'has_and', 'has_or', 'has_but', 'has_negation', 'has_multiple_sentences', 'has_question_mark', 'is_complex']
   Avg query length: 11.9 words
   Categories: 77
   Queries with negation: 2184 (21.8%)
   Complex queries: 1815 (18.1%)

Sample data:


Unnamed: 0,text,category,word_count,has_negation
0,I am still waiting on my card?,card_arrival,7,False
1,What can I do if my card still hasn't arrived ...,card_arrival,13,True
2,I have been waiting over a week. Is the card s...,card_arrival,12,False
3,Can I track my card while it is in the process...,card_arrival,13,False
4,"How do I know if I will get my card, or if it ...",card_arrival,15,False


In [5]:
# ## 3. Enhanced Sampling Strategy

# %%
def create_stratified_sample(df, sample_size=100, min_per_category=1, random_state=42):
    """
    Create stratified sample ensuring minimum representation per category
    
    Args:
        df: DataFrame to sample from
        sample_size: Total samples to return
        min_per_category: Minimum samples per category
        random_state: Random seed for reproducibility
    
    Returns:
        Stratified sample DataFrame
    """
    category_counts = df['category'].value_counts()
    n_categories = len(category_counts)
    
    # Allocate samples
    samples_per_category = {}
    remaining = sample_size - (n_categories * min_per_category)
    
    if remaining < 0:
        print(f"‚ö†Ô∏è  Warning: sample_size too small for {n_categories} categories")
        remaining = 0
        min_per_category = sample_size // n_categories
    
    for cat, count in category_counts.items():
        base = min_per_category
        # Allocate remaining proportionally
        additional = int(remaining * (count / len(df)))
        samples_per_category[cat] = min(base + additional, count)
    
    # Sample from each category
    sampled_dfs = []
    for cat, n in samples_per_category.items():
        cat_df = df[df['category'] == cat]
        n_actual = min(n, len(cat_df))
        sampled_dfs.append(cat_df.sample(n=n_actual, random_state=random_state))
    
    result = pd.concat(sampled_dfs).sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    print(f"‚úÖ Stratified sampling complete:")
    print(f"   Target: {sample_size} samples")
    print(f"   Actual: {len(result)} samples")
    print(f"   Categories covered: {result['category'].nunique()}/{n_categories}")
    print(f"   Min per category: {result['category'].value_counts().min()}")
    print(f"   Max per category: {result['category'].value_counts().max()}")
    
    return result

# %%
"""
Choose between testing mode (100 queries) or production mode (full dataset)
"""

# CONFIGURATION - CHANGE THESE SETTINGS
USE_SAMPLE = False  # Set to False for full dataset
SAMPLE_SIZE = 100  # Only used if USE_SAMPLE = True
USE_STRATIFIED_SAMPLING = True  # Use enhanced sampling strategy


if USE_SAMPLE:
    # Choose sampling strategy
    if USE_STRATIFIED_SAMPLING:
        kb_df = create_stratified_sample(train_df, sample_size=SAMPLE_SIZE, random_state=42)
        print("\n" + "="*70)
        print("üß™ TESTING MODE - STRATIFIED SAMPLING")
        print("="*70)
    else:
        kb_df = train_df.sample(n=SAMPLE_SIZE, random_state=42).reset_index(drop=True)
        print("\n" + "="*70)
        print("üß™ TESTING MODE - RANDOM SAMPLING")
        print("="*70)
    
    print(f"Using: {len(kb_df)} queries from TRAINING SET")
    print(f"Coverage: {kb_df['category'].nunique()}/77 categories ({kb_df['category'].nunique()/77*100:.1f}%)")
    print(f"Negation queries: {kb_df['has_negation'].sum()} ({kb_df['has_negation'].sum()/len(kb_df)*100:.1f}%)")
    print(f"Complex queries: {kb_df['is_complex'].sum()} ({kb_df['is_complex'].sum()/len(kb_df)*100:.1f}%)")
    print(f"Estimated cost: ~$0.30")
    print(f"Estimated time: ~5 minutes")
    print(f"‚ö†Ô∏è Hybrid search: DISABLED (insufficient data)")
    print("\nThis is for TESTING only!")
    print("Set USE_SAMPLE = False for production dataset")

    USE_HYBRID_SEARCH = False  # Disable for small datasets
    
else:
    # Production mode - full dataset
    kb_df = train_df.copy()
    
    print("="*70)
    print("üöÄ PRODUCTION MODE")
    print("="*70)
    print(f"Using: {len(kb_df):,} queries from TRAINING SET")
    print(f"Coverage: ALL 77 categories (100%)")
    print(f"Estimated cost: ~$2.00")
    print(f"Estimated time: ~60 minutes")
    print(f"‚úÖ Hybrid search: ENABLED (sufficient data)")
    print("\nThis will create production-ready knowledge base")

    USE_HYBRID_SEARCH = True  # Enable for large datasets

print("="*70)

# Show category distribution
print(f"\nTop 10 categories in dataset:")
print(kb_df['category'].value_counts().head(10))

# Show negation examples
print(f"\nSample negation queries:")
negation_samples = kb_df[kb_df['has_negation'] == True].head(3)
for idx, row in negation_samples.iterrows():
    print(f"  - {row['text']}")

üöÄ PRODUCTION MODE
Using: 10,003 queries from TRAINING SET
Coverage: ALL 77 categories (100%)
Estimated cost: ~$2.00
Estimated time: ~60 minutes
‚úÖ Hybrid search: ENABLED (sufficient data)

This will create production-ready knowledge base

Top 10 categories in dataset:
category
card_payment_fee_charged                            187
direct_debit_payment_not_recognised                 182
balance_not_updated_after_cheque_or_cash_deposit    181
wrong_amount_of_cash_received                       180
cash_withdrawal_charge                              177
transaction_charged_twice                           175
declined_cash_withdrawal                            173
transfer_fee_charged                                172
transfer_not_received_by_recipient                  171
balance_not_updated_after_bank_transfer             171
Name: count, dtype: int64

Sample negation queries:
  - What can I do if my card still hasn't arrived after 2 weeks?
  - What do I do if I still have not rece

In [6]:
# ## 4. Enhanced Answer Generation with Templates

# %%
# Answer templates for common categories
ANSWER_TEMPLATES = {
    # Card arrival & activation (common)
    "card_arrival": """Your card typically arrives within 5-7 business days after ordering. 
    If it's been longer than this, please check your delivery address in the app under Settings > Card Details. 
    If the address is correct and it's been over 10 business days, contact our support team and we'll investigate or send a replacement.""",
    
    "activate_my_card": """To activate your card, open the app and navigate to Cards > Activate Card.
    You'll need to enter the last 4 digits of your card number and the CVV code on the back. 
    Activation is instant and you can start using your card immediately.""",
    
    "card_delivery_estimate": """Standard card delivery takes 5-7 business days. 
    Express delivery (if selected) takes 2-3 business days. 
    You'll receive a tracking notification when your card ships. Delivery times may be longer during holidays or to remote areas.""",
    
    # PIN management (very common)
    "change_pin": """You can change your PIN anytime through the app. Go to Settings > Security > Change PIN. 
    You'll need to enter your current PIN, then choose your new 4-digit PIN. 
    For security, avoid using easily guessed numbers like 1234 or your birth year.""",
    
    "pin_blocked": """If you've entered your PIN incorrectly multiple times, your card is temporarily blocked for security. 
    To unblock it, open the app and go to Cards > Unblock PIN. 
    You'll need to verify your identity. If you've forgotten your PIN, you can reset it in Settings > Security > Reset PIN.""",
    
    # Payment issues (very common)
    "declined_card_payment": """Card payments can be declined for several reasons: insufficient funds, exceeded spending limits, expired card, incorrect PIN, or security holds. 
    Check your account balance and card status in the app. 
    If everything looks correct, contact support as there may be a security flag on your account.""",
    
    "card_payment_not_recognised": """If you see an unrecognized payment, first check if it's a merchant with a different trading name than the store name. 
    Check your recent transactions for the exact amount and date. 
    If you still don't recognize it, report it immediately through the app under Transactions > Dispute Payment, and we'll investigate within 1-3 business days.""",
    
    "card_payment_fee_charged": """Payment fees can occur for several reasons: international transactions (typically 2-3%), ATM withdrawals outside our network, exceeding monthly transaction limits, or merchant processing fees. 
    Check your transaction details in the app for the specific fee breakdown. 
    Our standard fees are listed in Settings > Fees & Limits.""",
    
    # Balance & transfers (common)
    "balance_not_updated_after_bank_transfer": """Bank transfers typically take 1-3 business days to process and appear in your balance. If you initiated the transfer on a weekend or holiday, add an extra day. Check your transaction history for a pending status. If it's been over 3 business days, contact support with your transfer reference number.""",
    
    "balance_not_updated_after_cheque_or_cash_deposit": """Cheque deposits take 2-5 business days to clear and appear in your available balance. 
    Cash deposits at supported locations usually appear within 24 hours. 
    Check your transaction history for pending deposits. If it's been longer than expected, contact support with your deposit receipt.""",
    
    # Top-up issues (common)
    "top_up_failed": """Top-up failures usually occur due to insufficient funds in your source account, incorrect card details, or temporary banking issues. 
    Verify your payment method in Settings > Payment Methods and try again. 
    If it continues failing, try a different payment method or contact support.""",
    
    "pending_top_up": """Top-ups are usually instant but can take up to 30 minutes during high-traffic periods. 
    Check your transaction history for the pending status. 
    If it's been over 1 hour, contact support with your transaction reference number and we'll investigate immediately.""",
    
    # Virtual card (common)
    "getting_virtual_card": """Virtual cards are issued instantly upon account approval. 
    Open the app and go to Cards > Add Virtual Card. 
    If you don't see this option, your account may need verification first. 
    Complete any pending identity verification in Settings > Account, then try again.""",
    
    # Direct debit (common)
    "direct_debit_payment_not_recognised": """If you see an unexpected direct debit, check if it's a subscription or recurring payment you set up. 
    Common ones include: streaming services, gym memberships, or utility bills. 
    Check Transactions > Recurring for your active direct debits. 
    If you don't recognize it, you can dispute it within 60 days.""",
    
    # Refunds (common)
    "request_refund": """To request a refund, go to Transactions, select the payment, and tap Request Refund. 
    You'll need to provide a reason. Merchant refunds typically take 5-10 business days to process. 
    If the merchant approves, you'll see it in your account. 
    If denied, you can escalate to our disputes team.""",
    
    # Country support (common)
    "country_support": """We currently support accounts in 30+ countries across Europe, North America, and parts of Asia. 
    To check if your country is supported, visit our website or check Settings > Supported Countries in the app. 
    Some features may be limited in certain regions due to local regulations.""",
}

print(f"‚úÖ Loaded {len(ANSWER_TEMPLATES)} answer templates for cost optimization")
print(f"   Categories covered: {', '.join(list(ANSWER_TEMPLATES.keys())[:5])}...")
print(f"   Expected template usage: ~{len(ANSWER_TEMPLATES)/77*100:.1f}% of queries")
print(f"   Cost savings: ~30-40% on answer generation")


def generate_answer_openai(
    question: str,
    category: str,
    model: str = "gpt-4o-mini",
    max_retries: int = 3,
    use_template: bool = True
) -> str:
    """
    Generate a helpful answer for a banking customer support question
    
    Args:
        question: Customer's question
        category: Question category
        model: OpenAI model to use
        max_retries: Number of retry attempts
        use_template: Whether to use templates for common categories
    
    Returns:
        Generated answer string
    """
    # Check if template exists for this category
    if use_template and category in ANSWER_TEMPLATES:
        return ANSWER_TEMPLATES[category]
    
    # Clean category name for display
    category_clean = category.replace('_', ' ').title()
    
    # Enhanced system prompt with negation awareness
    system_prompt = """You are a helpful customer support agent for a digital bank.
Provide clear, concise, and helpful answers to customer questions.

Guidelines:
- Be professional but friendly
- Keep answers to 2-4 sentences
- Provide specific steps if applicable
- Use realistic timeframes (e.g., "1-3 business days", "5-7 business days")
- If mentioning fees/limits, use typical banking ranges
- Don't make up features that don't exist in a typical banking app
- Pay special attention to negations (not, didn't, hasn't, never)
- For negative situations (card not working, payment not received), acknowledge the issue and provide troubleshooting steps"""

    # User prompt
    user_prompt = f"""Customer Question: {question}
Category: {category_clean}

Provide a helpful answer:"""
    
    # Retry logic for API reliability
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=200,
                temperature=0.7
            )
            
            answer = response.choices[0].message.content.strip()
            return answer
            
        except Exception as e:
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"   ‚ö†Ô∏è  Retry {attempt + 1}/{max_retries} after {wait_time}s: {e}")
                time.sleep(wait_time)
                continue
            else:
                print(f"   ‚ùå Failed after {max_retries} attempts: {e}")
                return f"I understand you're asking about {category_clean}. Please contact our support team for immediate assistance."

print("‚úÖ Enhanced answer generation function defined")

# %%
# Test answer generation with sample queries including negations
print("Testing answer generation with sample queries...\n")

test_samples = []
# Get some negation examples
negation_samples = kb_df[kb_df['has_negation'] == True].sample(min(2, kb_df['has_negation'].sum()))
# Get some regular examples
regular_samples = kb_df[kb_df['has_negation'] == False].sample(min(2, (~kb_df['has_negation']).sum()))
test_samples = pd.concat([negation_samples, regular_samples])

for idx, row in test_samples.iterrows():
    question = row['text']
    category = row['category']
    has_negation = row['has_negation']
    
    print(f"Q: {question}")
    print(f"Category: {category}")
    print(f"Has negation: {'Yes ‚ö†Ô∏è' if has_negation else 'No'}")
    
    answer = generate_answer_openai(question, category)
    
    print(f"A: {answer}\n")
    print("-" * 70 + "\n")

‚úÖ Loaded 16 answer templates for cost optimization
   Categories covered: card_arrival, activate_my_card, card_delivery_estimate, change_pin, pin_blocked...
   Expected template usage: ~20.8% of queries
   Cost savings: ~30-40% on answer generation
‚úÖ Enhanced answer generation function defined
Testing answer generation with sample queries...

Q: I have received my statement but I do not see my refund, why is that?
Category: Refund_not_showing_up
Has negation: Yes ‚ö†Ô∏è
A: I'm sorry to hear that your refund isn't showing up on your statement. Refunds can take 3-5 business days to process and appear in your account, depending on the merchant's processing time. Please double-check that the refund was initiated by the merchant, and if it's still not visible after that timeframe, feel free to reach out to customer support for further assistance.

----------------------------------------------------------------------

Q: I was mugged.  They took everything.  I can't use the app.  What d

In [7]:
# ## 5. Generate Answers for All Queries

# %%
def generate_answers_batch(
    df: pd.DataFrame,
    model: str = "gpt-4o-mini",
    batch_size: int = 20,  
    save_progress: bool = True,
    use_templates: bool = True,
    max_workers: int = 10  
) -> list:
    """
    Generate answers for all queries with PARALLEL processing
    
    Args:
        df: DataFrame with 'text' and 'category' columns
        model: OpenAI model to use
        batch_size: DEPRECATED - kept for compatibility, not used
        save_progress: Save checkpoints every 1000 queries
        use_templates: Use answer templates for common categories
        max_workers: Number of parallel API calls (5-20 recommended)
    
    Returns:
        List of generated answers
    """
    answers = [None] * len(df)  
    failed_indices = []
    template_count = 0
    
    print(f"Generating answers for {len(df)} queries...")
    print(f"Model: {model}")
    print(f"Using templates: {use_templates}")
    print(f" Parallel workers: {max_workers}") 
    print(f"Estimated time: {len(df) * 2 / 60 / max_workers:.1f} minutes\n") 
    
    # üÜï NEW - Define function to process single row
    def process_single_query(idx_row_tuple):
        """Process a single query - designed for parallel execution"""
        idx, row = idx_row_tuple
        question = row['text']
        category = row['category']
        
        try:
            answer = generate_answer_openai(
                question, 
                category, 
                model=model, 
                use_template=use_templates
            )
            
            # Track template usage
            used_template = use_templates and category in ANSWER_TEMPLATES
            
            return idx, answer, used_template, None  # idx, answer, template_flag, error
            
        except Exception as e:
            error_msg = f"Please contact support regarding {category.replace('_', ' ')}"
            return idx, error_msg, False, str(e)
    
    # üÜï NEW - Parallel execution with ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        futures = {
            executor.submit(process_single_query, (idx, row)): idx 
            for idx, row in df.iterrows()
        }
        
        # Process completed tasks with progress bar
        for future in tqdm(
            concurrent.futures.as_completed(futures),
            total=len(futures),
            desc="Generating answers"
        ):
            try:
                idx, answer, used_template, error = future.result()
                answers[idx] = answer
                
                if used_template:
                    template_count += 1
                
                if error:
                    failed_indices.append(idx)
                    print(f"\n‚ùå Failed for index {idx}: {error}")
                
                # Save progress every 1000 queries
                if save_progress and (idx + 1) % 1000 == 0:
                    temp_df = df.iloc[:idx+1].copy()
                    temp_df['answer'] = answers[:idx+1]
                    checkpoint_path = project_root / 'data' / 'processed' / f'kb_checkpoint_{idx+1}.csv'
                    temp_df.to_csv(checkpoint_path, index=False)
                    print(f"\nüíæ Checkpoint saved: {idx + 1}/{len(df)} completed")
                    
            except Exception as e:
                idx = futures[future]
                print(f"\n‚ùå Unexpected error for index {idx}: {e}")
                failed_indices.append(idx)
                answers[idx] = "Error generating answer"
    
    print(f"\n‚úÖ Answer generation complete!")
    print(f"   Total: {len(answers)}")
    print(f"   From templates: {template_count} ({template_count/len(answers)*100:.1f}%)")
    print(f"   Generated: {len(answers) - template_count}")
    print(f"   Failed: {len(failed_indices)}")
    if failed_indices:
        print(f"   Failed indices: {failed_indices[:10]}...")
    
    return answers

# %%
# Generate answers for all queries in dataset
print("Starting answer generation...")
print("="*70)

# üÜï OPTIONAL - Adjust max_workers based on your needs
# max_workers=5  : Conservative, ~40 min for 10k queries
# max_workers=10 : Balanced (recommended), ~20 min for 10k queries  
# max_workers=20 : Aggressive, ~12 min for 10k queries (may hit rate limits)

answers = generate_answers_batch(
    kb_df, 
    model="gpt-4o-mini", 
    use_templates=True,
    max_workers=10  
)

# Add answers to dataframe
kb_df['answer'] = answers

print("\n" + "="*70)
print("‚úÖ All answers generated!")
print("="*70)

# Show statistics
print(f"\nAnswer Statistics:")
kb_df['answer_length'] = kb_df['answer'].str.split().str.len()
print(f"   Avg answer length: {kb_df['answer_length'].mean():.1f} words")
print(f"   Min answer length: {kb_df['answer_length'].min()} words")
print(f"   Max answer length: {kb_df['answer_length'].max()} words")

# Show sample Q&A pairs including negations
print(f"\nSample Q&A pairs:")
for i in range(min(5, len(kb_df))):
    row = kb_df.iloc[i]
    print(f"\n{i+1}.")
    print(f"Q: {row['text']}")
    print(f"Category: {row['category']}")
    if row['has_negation']:
        print(f"‚ö†Ô∏è  Contains negation")
    print(f"A: {row['answer']}")

Starting answer generation...
Generating answers for 10003 queries...
Model: gpt-4o-mini
Using templates: True
 Parallel workers: 10
Estimated time: 33.3 minutes



Generating answers:  11%|‚ñà         | 1110/10003 [02:49<01:36, 92.39it/s]


üíæ Checkpoint saved: 1000/10003 completed


Generating answers:  21%|‚ñà‚ñà        | 2069/10003 [05:16<00:41, 190.94it/s]


üíæ Checkpoint saved: 2000/10003 completed


Generating answers:  31%|‚ñà‚ñà‚ñà       | 3090/10003 [07:56<00:56, 123.17it/s]


üíæ Checkpoint saved: 3000/10003 completed


Generating answers:  40%|‚ñà‚ñà‚ñà‚ñâ      | 3996/10003 [09:56<21:09,  4.73it/s] 


üíæ Checkpoint saved: 4000/10003 completed


Generating answers:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 4998/10003 [12:38<22:00,  3.79it/s] 


üíæ Checkpoint saved: 5000/10003 completed


Generating answers:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 5981/10003 [15:00<00:23, 172.54it/s]


üíæ Checkpoint saved: 6000/10003 completed


Generating answers:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 6991/10003 [18:24<00:26, 114.56it/s]


üíæ Checkpoint saved: 7000/10003 completed


Generating answers:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 7997/10003 [22:09<09:29,  3.52it/s] 


üíæ Checkpoint saved: 8000/10003 completed


Generating answers:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 8999/10003 [24:34<02:57,  5.67it/s] 


üíæ Checkpoint saved: 9000/10003 completed


Generating answers: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 9991/10003 [27:26<00:00, 126.08it/s]


üíæ Checkpoint saved: 10000/10003 completed


Generating answers: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10003/10003 [27:28<00:00,  6.07it/s]


‚úÖ Answer generation complete!
   Total: 10003
   From templates: 2393 (23.9%)
   Generated: 7610
   Failed: 0

‚úÖ All answers generated!

Answer Statistics:
   Avg answer length: 60.7 words
   Min answer length: 28 words
   Max answer length: 147 words

Sample Q&A pairs:

1.
Q: I am still waiting on my card?
Category: card_arrival
A: Your card typically arrives within 5-7 business days after ordering. 
    If it's been longer than this, please check your delivery address in the app under Settings > Card Details. 
    If the address is correct and it's been over 10 business days, contact our support team and we'll investigate or send a replacement.

2.
Q: What can I do if my card still hasn't arrived after 2 weeks?
Category: card_arrival
‚ö†Ô∏è  Contains negation
A: Your card typically arrives within 5-7 business days after ordering. 
    If it's been longer than this, please check your delivery address in the app under Settings > Card Details. 
    If the address is correct and it'




In [8]:
# ## 6. Create Embeddings

# %%
def get_openai_embedding(
    text: str,
    model: str = "text-embedding-3-small"
) -> list:
    """Get embedding from OpenAI"""
    text = text.replace("\n", " ")
    response = client.embeddings.create(
        input=[text],
        model=model
    )
    return response.data[0].embedding

def create_embeddings_batch(
    texts: list,
    model: str = "text-embedding-3-small",
    batch_size: int = 100
) -> list:
    """
    Create embeddings in batches for efficiency
    
    Args:
        texts: List of texts to embed
        model: OpenAI embedding model
        batch_size: Number of texts per API call
    
    Returns:
        List of embeddings
    """
    all_embeddings = []
    
    print(f"\nCreating embeddings with {model}...")
    print(f"Total texts: {len(texts)}")
    print(f"Batch size: {batch_size}\n")
    
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
        batch = texts[i:i+batch_size]
        
        # Clean texts
        batch = [text.replace("\n", " ") for text in batch]
        
        try:
            response = client.embeddings.create(
                input=batch,
                model=model
            )
            
            batch_embeddings = [data.embedding for data in response.data]
            all_embeddings.extend(batch_embeddings)
            
        except Exception as e:
            print(f"\n‚ùå Error in batch {i//batch_size}: {e}")
            # Add zero vectors as fallback
            embedding_dim = 1536 if 'small' in model else 3072
            all_embeddings.extend([[0.0] * embedding_dim] * len(batch))
        
        # Rate limiting
        if (i + batch_size) % 1000 == 0:
            time.sleep(1)
    
    return all_embeddings

# %%
# Combine question and answer for richer embeddings
print("Preparing texts for embedding...")
kb_df['combined_text'] = (
    "Question: " + kb_df['text'] +
    " Answer: " + kb_df['answer']
)

print(f"‚úÖ Combined {len(kb_df)} question-answer pairs")
print(f"\nSample combined text:")
print(kb_df['combined_text'].iloc[0][:200] + "...")

# %%
# Create embeddings
EMBEDDING_MODEL = "text-embedding-3-small"

embeddings = create_embeddings_batch(
    kb_df['combined_text'].tolist(),
    model=EMBEDDING_MODEL,
    batch_size=100
)

# Add to dataframe
kb_df['embedding'] = embeddings

print(f"\n‚úÖ Created {len(embeddings)} embeddings")
print(f"   Embedding dimension: {len(embeddings[0])}")
print(f"   Model: {EMBEDDING_MODEL}")

Preparing texts for embedding...
‚úÖ Combined 10003 question-answer pairs

Sample combined text:
Question: I am still waiting on my card? Answer: Your card typically arrives within 5-7 business days after ordering. 
    If it's been longer than this, please check your delivery address in the app ...

Creating embeddings with text-embedding-3-small...
Total texts: 10003
Batch size: 100



Embedding batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 101/101 [02:33<00:00,  1.52s/it]


‚úÖ Created 10003 embeddings
   Embedding dimension: 1536
   Model: text-embedding-3-small





In [9]:
# ## 7. Build Vector Database (ChromaDB)

# %%
# Create vector database directory
vector_db_dir = project_root / 'data' / 'vector_db'
vector_db_dir.mkdir(parents=True, exist_ok=True)

print(f"Vector database directory: {vector_db_dir}")

# Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path=str(vector_db_dir))

# Delete existing collection if it exists
try:
    chroma_client.delete_collection(name="banking_support")
    print("Deleted existing collection")
except:
    pass

# Create new collection with enhanced metadata
collection = chroma_client.create_collection(
    name="banking_support",
    metadata={
        "description": "Banking customer support Q&A",
        "embedding_model": EMBEDDING_MODEL,
        "answer_model": "gpt-4o-mini",
        "total_entries": len(kb_df),
        "version": "2.0",
        "created_date": datetime.now().isoformat(),
        "stratified_sampling": USE_STRATIFIED_SAMPLING if USE_SAMPLE else False,
        "template_count": kb_df['category'].isin(ANSWER_TEMPLATES.keys()).sum()
    }
)

print(f"‚úÖ ChromaDB collection created: 'banking_support'")

# %%
# Prepare data for ChromaDB
print("\nPreparing data for vector database...")

documents = []
metadatas = []
ids = []
embeddings_list = []

for idx, row in tqdm(kb_df.iterrows(), total=len(kb_df), desc="Preparing data"):
    # Document text (what will be returned in search)
    doc_text = f"Question: {row['text']}\nAnswer: {row['answer']}"
    documents.append(doc_text)
    
    # Enhanced metadata
    metadata = {
        'question': row['text'],
        'answer': row['answer'],
        'category': row['category'],
        'category_id': int(row['label']),
        'word_count': int(row['word_count']),
        'answer_length': int(row['answer_length']),
        'has_negation': bool(row['has_negation']),
        'is_complex': bool(row['is_complex']),
        'question_type': row['question_type']
    }
    metadatas.append(metadata)
    
    # Unique ID
    ids.append(f"kb_{idx}")
    
    # Embedding
    embeddings_list.append(row['embedding'])

print(f"‚úÖ Prepared {len(documents)} documents")

# %%
# Add to ChromaDB in batches
print("\nAdding to vector database...")

batch_size = 100
for i in tqdm(range(0, len(documents), batch_size), desc="Adding to DB"):
    batch_docs = documents[i:i+batch_size]
    batch_meta = metadatas[i:i+batch_size]
    batch_ids = ids[i:i+batch_size]
    batch_emb = embeddings_list[i:i+batch_size]
    
    collection.add(
        documents=batch_docs,
        metadatas=batch_meta,
        ids=batch_ids,
        embeddings=batch_emb
    )

print(f"\n‚úÖ Vector database created successfully!")
print(f"   Total entries: {collection.count()}")
print(f"   Location: {vector_db_dir}")

Vector database directory: c:\Users\victo\customer-support-rag\data\vector_db
Deleted existing collection
‚úÖ ChromaDB collection created: 'banking_support'

Preparing data for vector database...


Preparing data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10003/10003 [00:00<00:00, 22125.09it/s]


‚úÖ Prepared 10003 documents

Adding to vector database...


Adding to DB: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 101/101 [00:15<00:00,  6.35it/s]


‚úÖ Vector database created successfully!
   Total entries: 10003
   Location: c:\Users\victo\customer-support-rag\data\vector_db





In [10]:
# ## 8. Enhanced Retrieval Functions with Query Preprocessing

# %%
def preprocess_query(query: str, enhance_negation: bool = True) -> str:
    """
    Preprocess query for better retrieval
    
    Args:
        query: Original query text
        enhance_negation: Whether to enhance negation queries
    
    Returns:
        Preprocessed query string
    """
    # Detect negation
    negation_words = ['not', 'no', "n't", 'never', 'none', 'nobody', 'nothing', 
                      'nowhere', 'neither', 'hasn\'t', 'haven\'t', 'didn\'t', 'don\'t',
                      'won\'t', 'wouldn\'t', 'couldn\'t', 'shouldn\'t', 'isn\'t', 'aren\'t']
    
    has_negation = any(neg in query.lower() for neg in negation_words)
    
    # Enhance negation queries for better semantic matching
    if has_negation and enhance_negation:
        # Add context that this is a problem/issue
        query = query + " [PROBLEM/ISSUE/NOT_WORKING]"
    
    # Clean up extra whitespace
    query = ' '.join(query.split())
    
    return query

# %%
def search_knowledge_base(query: str, n_results: int = 5):
    """Basic semantic search with query preprocessing"""
    
    # Preprocess query
    processed_query = preprocess_query(query)
    
    # Embed the processed query
    query_embedding = get_openai_embedding(processed_query, model=EMBEDDING_MODEL)
    
    # Search
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )
    
    return results

def hybrid_search_smart(query: str, n_results: int = 5, alpha: float = 0.7):
    """
    Smart hybrid search that adapts to dataset size
    Falls back to semantic-only for small datasets
    
    Args:
        query: Search query
        n_results: Number of results to return
        alpha: Weight for dense retrieval (1-alpha for sparse)
    
    Returns:
        Combined results
    """
    # Check if we have enough data for hybrid search
    if not BM25_AVAILABLE:
        print("‚ö†Ô∏è BM25 not available, using semantic search")
        return search_knowledge_base(query, n_results=n_results)
    
    if len(kb_df) < 1000:
        # Dataset too small for effective hybrid search
        return search_knowledge_base(query, n_results=n_results)
    
    # Preprocess query
    processed_query = preprocess_query(query)
    
    # Dense retrieval (semantic)
    dense_results = search_knowledge_base(processed_query, n_results=20)
    
    # Prepare corpus for BM25
    corpus = [doc.split() for doc in documents]
    bm25 = BM25Okapi(corpus)
    
    # Sparse retrieval (keyword-based)
    tokenized_query = processed_query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    
    # Normalize scores to 0-1 range
    dense_scores = np.array(dense_results['distances'][0])
    dense_scores = 1 - dense_scores  # Convert distance to similarity
    
    # Avoid division by zero
    bm25_range = bm25_scores.max() - bm25_scores.min()
    if bm25_range > 0:
        bm25_scores_norm = (bm25_scores - bm25_scores.min()) / bm25_range
    else:
        bm25_scores_norm = np.zeros_like(bm25_scores)
    
    # Combine scores
    final_scores = {}
    for i, doc_id in enumerate(dense_results['ids'][0]):
        idx = int(doc_id.split('_')[1])
        dense_score = dense_scores[i] if i < len(dense_scores) else 0
        sparse_score = bm25_scores_norm[idx] if idx < len(bm25_scores_norm) else 0
        final_scores[doc_id] = alpha * dense_score + (1 - alpha) * sparse_score
    
    # Sort by combined score
    sorted_ids = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)[:n_results]
    
    # Retrieve full results for top documents
    top_ids = [doc_id for doc_id, score in sorted_ids]
    results = collection.get(ids=top_ids, include=['documents', 'metadatas', 'embeddings'])
    
    return {
        'ids': [top_ids],
        'documents': [results['documents']],
        'metadatas': [results['metadatas']],
        'distances': [[1 - final_scores[doc_id] for doc_id in top_ids]]  # Convert back to distance
    }

def category_aware_search(query: str, detected_category: str = None, n_results: int = 5, boost: float = 1.3):
    """
    Search with optional category boosting
    
    Args:
        query: Search query
        detected_category: If provided, boost results from this category
        n_results: Number of results to return
        boost: Multiplier for category matches
    
    Returns:
        Search results
    """
    # Get more results initially
    base_results = search_knowledge_base(query, n_results=20)
    
    if detected_category:
        # Calculate boosted scores
        boosted_results = []
        for i, (doc_id, doc, metadata, distance) in enumerate(zip(
            base_results['ids'][0],
            base_results['documents'][0],
            base_results['metadatas'][0],
            base_results['distances'][0]
        )):
            # Convert distance to similarity
            similarity = 1 - distance
            
            # Boost if category matches
            if metadata['category'] == detected_category:
                similarity *= boost
            
            boosted_results.append((doc_id, doc, metadata, 1 - similarity))  # Convert back to distance
        
        # Sort by boosted scores
        boosted_results.sort(key=lambda x: x[3])
        
        # Return top n
        return {
            'ids': [[r[0] for r in boosted_results[:n_results]]],
            'documents': [[r[1] for r in boosted_results[:n_results]]],
            'metadatas': [[r[2] for r in boosted_results[:n_results]]],
            'distances': [[r[3] for r in boosted_results[:n_results]]]
        }
    
    return {
        'ids': [base_results['ids'][0][:n_results]],
        'documents': [base_results['documents'][0][:n_results]],
        'metadatas': [base_results['metadatas'][0][:n_results]],
        'distances': [base_results['distances'][0][:n_results]]
    }

print("‚úÖ Enhanced retrieval functions defined")
print(f"   - Query preprocessing with negation enhancement")
print(f"   - Smart hybrid search (adapts to dataset size)")
print(f"   - Category-aware boosting")

‚úÖ Enhanced retrieval functions defined
   - Query preprocessing with negation enhancement
   - Smart hybrid search (adapts to dataset size)
   - Category-aware boosting


In [11]:
# ## 9. Load and Preprocess Test Data for REAL Evaluation

# %%
print("\n" + "="*70)
print("üìä LOADING TEST DATA FOR REAL EVALUATION")
print("="*70)

# Load test data (NEVER seen by the knowledge base)
test_df = pd.read_csv('../data/processed/test_processed.csv')

print(f"\n‚úÖ Test set loaded: {len(test_df):,} queries")
print(f"   ‚ö†Ô∏è  These queries are UNSEEN by the knowledge base!")
print(f"   ‚úÖ This is the PROPER way to evaluate performance")

# Check if preprocessing columns exist
if 'has_negation' not in test_df.columns:
    print("\n‚ö†Ô∏è  Test data missing preprocessing columns - adding them now...")
    
    # Add the same preprocessing columns as training data
    negation_words = ['not', 'no', "n't", 'never', 'none', 'nobody', 'nothing', 
                      'nowhere', 'neither', 'hasn\'t', 'haven\'t', 'didn\'t', 'don\'t']
    test_df['has_negation'] = test_df['text'].str.lower().str.contains('|'.join(negation_words), regex=True)
    
    # Complex query detection
    test_df['has_and'] = test_df['text'].str.lower().str.contains(r'\band\b')
    test_df['has_or'] = test_df['text'].str.lower().str.contains(r'\bor\b')
    test_df['has_but'] = test_df['text'].str.lower().str.contains(r'\bbut\b')
    test_df['has_multiple_sentences'] = test_df['text'].str.contains(r'[.!?]\s+[A-Z]')
    
    # Define complex as: >15 words OR (has_and + has_or) OR (has_negation + connector words)
    test_df['is_complex'] = (
        (test_df['word_count'] > 15) |
        (test_df['has_and'] & test_df['has_or']) |
        (test_df['has_negation'] & (test_df['has_and'] | test_df['has_or'] | test_df['has_but']))
    )
    
    print("   ‚úÖ Added preprocessing columns to test data")

print(f"\nTest data characteristics:")
print(f"   Categories: {test_df['category'].nunique()}")
print(f"   Avg query length: {test_df['word_count'].mean():.1f} words")
print(f"   Negation queries: {test_df['has_negation'].sum()} ({test_df['has_negation'].sum()/len(test_df)*100:.1f}%)")
print(f"   Complex queries: {test_df['is_complex'].sum()} ({test_df['is_complex'].sum()/len(test_df)*100:.1f}%)")

print(f"\nSample test queries:")
display(test_df[['text', 'category', 'word_count', 'has_negation']].head())


üìä LOADING TEST DATA FOR REAL EVALUATION

‚úÖ Test set loaded: 3,080 queries
   ‚ö†Ô∏è  These queries are UNSEEN by the knowledge base!
   ‚úÖ This is the PROPER way to evaluate performance

‚ö†Ô∏è  Test data missing preprocessing columns - adding them now...
   ‚úÖ Added preprocessing columns to test data

Test data characteristics:
   Categories: 77
   Avg query length: 11.0 words
   Negation queries: 802 (26.0%)
   Complex queries: 500 (16.2%)

Sample test queries:


Unnamed: 0,text,category,word_count,has_negation
0,How do I locate my card?,card_arrival,6,False
1,"I still have not received my new card, I order...",card_arrival,14,True
2,I ordered a card but it has not arrived. Help ...,card_arrival,11,True
3,Is there a way to know when my card will arrive?,card_arrival,11,True
4,My card has not arrived yet.,card_arrival,6,True


In [12]:
# ## 10. Create Evaluation Framework

# %%
def create_real_evaluation_set(test_df, samples_per_type=20):
    """
    Create evaluation set from UNSEEN test data
    
    Args:
        test_df: Test dataframe (unseen queries)
        samples_per_type: Samples per test category
    
    Returns:
        Evaluation test set with unseen queries
    """
    test_set = []
    
    # 1. Negation queries from test set
    negation_df = test_df[test_df['has_negation'] == True].sample(
        min(samples_per_type, test_df['has_negation'].sum()),
        random_state=42
    )
    for idx, row in negation_df.iterrows():
        test_set.append({
            'query': row['text'],
            'expected_category': row['category'],
            'has_negation': True,
            'test_type': 'negation'
        })
    
    # 2. Complex queries from test set
    complex_df = test_df[test_df['is_complex'] == True].sample(
        min(samples_per_type, test_df['is_complex'].sum()),
        random_state=42
    )
    for idx, row in complex_df.iterrows():
        test_set.append({
            'query': row['text'],
            'expected_category': row['category'],
            'has_negation': row['has_negation'],
            'test_type': 'complex'
        })
    
    # 3. Short queries from test set
    short_df = test_df[test_df['word_count'] <= 5].sample(
        min(samples_per_type, (test_df['word_count'] <= 5).sum()),
        random_state=42
    )
    for idx, row in short_df.iterrows():
        test_set.append({
            'query': row['text'],
            'expected_category': row['category'],
            'has_negation': row['has_negation'],
            'test_type': 'short'
        })
    
    # 4. Long queries from test set
    long_df = test_df[test_df['word_count'] > 20].sample(
        min(samples_per_type, (test_df['word_count'] > 20).sum()),
        random_state=42
    )
    for idx, row in long_df.iterrows():
        test_set.append({
            'query': row['text'],
            'expected_category': row['category'],
            'has_negation': row['has_negation'],
            'test_type': 'long'
        })
    
    # 5. Random baseline from test set
    random_df = test_df.sample(min(samples_per_type, len(test_df)), random_state=42)
    for idx, row in random_df.iterrows():
        test_set.append({
            'query': row['text'],
            'expected_category': row['category'],
            'has_negation': row['has_negation'],
            'test_type': 'baseline'
        })
    
    print(f"\n‚úÖ Created REAL evaluation set with {len(test_set)} UNSEEN test cases:")
    print(f"   Negation: {sum(1 for t in test_set if t['test_type'] == 'negation')}")
    print(f"   Complex: {sum(1 for t in test_set if t['test_type'] == 'complex')}")
    print(f"   Short: {sum(1 for t in test_set if t['test_type'] == 'short')}")
    print(f"   Long: {sum(1 for t in test_set if t['test_type'] == 'long')}")
    print(f"   Baseline: {sum(1 for t in test_set if t['test_type'] == 'baseline')}")
    
    return test_set

def evaluate_retrieval(test_set, search_function, k_values=[1, 3, 5], verbose=False):
    """
    Evaluate retrieval performance
    
    Args:
        test_set: List of test cases
        search_function: Function to use for retrieval
        k_values: k values to evaluate accuracy@k
        verbose: Print detailed results
    
    Returns:
        Dictionary of evaluation metrics
    """
    results = {f'accuracy@{k}': 0 for k in k_values}
    category_correct = 0
    results_by_type = defaultdict(lambda: {f'accuracy@{k}': 0 for k in k_values})
    results_by_type_category = defaultdict(int)
    
    print(f"\nEvaluating {len(test_set)} test cases...")
    
    for test_case in tqdm(test_set, desc="Evaluating"):
        query = test_case['query']
        expected_category = test_case['expected_category']
        test_type = test_case['test_type']
        
        # Retrieve results
        retrieved = search_function(query, n_results=max(k_values))
        
        # Check accuracy@k
        for k in k_values:
            top_k_categories = [meta['category'] for meta in retrieved['metadatas'][0][:k]]
            if expected_category in top_k_categories:
                results[f'accuracy@{k}'] += 1
                results_by_type[test_type][f'accuracy@{k}'] += 1
        
        # Check category accuracy (top-1)
        top_category = retrieved['metadatas'][0][0]['category']
        if top_category == expected_category:
            category_correct += 1
            results_by_type_category[test_type] += 1
        
        if verbose:
            print(f"\nQuery: {query[:60]}...")
            print(f"Expected: {expected_category}")
            print(f"Got: {top_category}")
            print(f"Match: {'‚úÖ' if top_category == expected_category else '‚ùå'}")
    
    # Calculate percentages
    n = len(test_set)
    for k in k_values:
        results[f'accuracy@{k}'] = results[f'accuracy@{k}'] / n
    results['category_accuracy'] = category_correct / n
    
    # Calculate by type
    type_counts = defaultdict(int)
    for test_case in test_set:
        type_counts[test_case['test_type']] += 1
    
    for test_type, count in type_counts.items():
        for k in k_values:
            results_by_type[test_type][f'accuracy@{k}'] /= count
        results_by_type[test_type]['category_accuracy'] = results_by_type_category[test_type] / count
    
    results['by_type'] = dict(results_by_type)
    
    return results

def print_evaluation_results(results, title="EVALUATION RESULTS"):
    """Pretty print evaluation results"""
    print("\n" + "="*70)
    print(f"üìä {title}")
    print("="*70)
    
    print("\n Overall Performance:")
    print(f"   Accuracy@1: {results['accuracy@1']:.1%}")
    print(f"   Accuracy@3: {results['accuracy@3']:.1%}")
    print(f"   Accuracy@5: {results['accuracy@5']:.1%}")
    print(f"   Category Accuracy: {results['category_accuracy']:.1%}")
    
    print("\n Performance by Query Type:")
    for test_type, metrics in results['by_type'].items():
        print(f"\n   {test_type.upper()}:")
        print(f"      Accuracy@1: {metrics['accuracy@1']:.1%}")
        print(f"      Accuracy@3: {metrics['accuracy@3']:.1%}")
        print(f"      Category Accuracy: {metrics['category_accuracy']:.1%}")
        
        # Flag issues
        if test_type == 'negation' and metrics['category_accuracy'] < 0.8:
            print(f"      ‚ö†Ô∏è  WARNING: Low negation accuracy!")
        if test_type == 'complex' and metrics['category_accuracy'] < 0.75:
            print(f"      ‚ö†Ô∏è  WARNING: Low complex query accuracy!")
    
    print("\n" + "="*70)

print("‚úÖ Evaluation framework defined")

# %%
# Create REAL evaluation set from UNSEEN test data
real_eval_set = create_real_evaluation_set(test_df, samples_per_type=20)

# Save it
real_eval_path = project_root / 'data' / 'processed' / 'real_evaluation_set.json'
with open(real_eval_path, 'w') as f:
    json.dump(real_eval_set, f, indent=2)
print(f"\n‚úÖ Saved real evaluation set to: {real_eval_path}")


‚úÖ Evaluation framework defined

‚úÖ Created REAL evaluation set with 100 UNSEEN test cases:
   Negation: 20
   Complex: 20
   Short: 20
   Long: 20
   Baseline: 20

‚úÖ Saved real evaluation set to: c:\Users\victo\customer-support-rag\data\processed\real_evaluation_set.json


In [13]:
# ## 11. Run REAL Evaluation on Unseen Data

# %%
# Evaluate basic semantic search on REAL unseen data
print("\n" + "="*70)
print("üß™ REAL EVALUATION - BASIC SEMANTIC SEARCH")
print("Testing on UNSEEN queries from test set")
print("="*70)

real_results = evaluate_retrieval(
    real_eval_set, 
    search_knowledge_base, 
    k_values=[1, 3, 5],
    verbose=False
)

print_evaluation_results(real_results, "REAL EVALUATION - UNSEEN DATA")

# %%
# Only evaluate hybrid search if we have enough data
if USE_HYBRID_SEARCH and BM25_AVAILABLE:
    print("\n" + "="*70)
    print("üß™ REAL EVALUATION - HYBRID SEARCH")
    print("Testing on UNSEEN queries from test set")
    print("="*70)
    
    hybrid_real_results = evaluate_retrieval(
        real_eval_set,
        hybrid_search_smart,
        k_values=[1, 3, 5],
        verbose=False
    )
    
    print_evaluation_results(hybrid_real_results, "REAL EVALUATION - HYBRID (UNSEEN DATA)")
    
    # Compare performance
    print("\n" + "="*70)
    print("üìä SEMANTIC vs HYBRID COMPARISON")
    print("="*70)
    print(f"Semantic Accuracy@1: {real_results['accuracy@1']:.1%}")
    print(f"Hybrid Accuracy@1:   {hybrid_real_results['accuracy@1']:.1%}")
    improvement = hybrid_real_results['accuracy@1'] - real_results['accuracy@1']
    if improvement > 0:
        print(f"‚úÖ Hybrid improves by: +{improvement:.1%}")
    else:
        print(f"‚ö†Ô∏è Hybrid decreases by: {improvement:.1%}")
    print("="*70)
else:
    print("\n‚ö†Ô∏è Hybrid search evaluation skipped")
    if not BM25_AVAILABLE:
        print("   Reason: BM25 not available")
    else:
        print("   Reason: Dataset too small (need 1000+ entries)")
    print("   Using semantic results for configuration")
    hybrid_real_results = real_results.copy()



üß™ REAL EVALUATION - BASIC SEMANTIC SEARCH
Testing on UNSEEN queries from test set

Evaluating 100 test cases...


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:39<00:00,  2.53it/s]



üìä REAL EVALUATION - UNSEEN DATA

 Overall Performance:
   Accuracy@1: 89.0%
   Accuracy@3: 94.0%
   Accuracy@5: 96.0%
   Category Accuracy: 89.0%

 Performance by Query Type:

   NEGATION:
      Accuracy@1: 90.0%
      Accuracy@3: 90.0%
      Category Accuracy: 90.0%

   COMPLEX:
      Accuracy@1: 85.0%
      Accuracy@3: 95.0%
      Category Accuracy: 85.0%

   SHORT:
      Accuracy@1: 85.0%
      Accuracy@3: 90.0%
      Category Accuracy: 85.0%

   LONG:
      Accuracy@1: 90.0%
      Accuracy@3: 100.0%
      Category Accuracy: 90.0%

   BASELINE:
      Accuracy@1: 95.0%
      Accuracy@3: 95.0%
      Category Accuracy: 95.0%


üß™ REAL EVALUATION - HYBRID SEARCH
Testing on UNSEEN queries from test set

Evaluating 100 test cases...


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [01:18<00:00,  1.28it/s]


üìä REAL EVALUATION - HYBRID (UNSEEN DATA)

 Overall Performance:
   Accuracy@1: 83.0%
   Accuracy@3: 95.0%
   Accuracy@5: 96.0%
   Category Accuracy: 83.0%

 Performance by Query Type:

   NEGATION:
      Accuracy@1: 80.0%
      Accuracy@3: 90.0%
      Category Accuracy: 80.0%

   COMPLEX:
      Accuracy@1: 90.0%
      Accuracy@3: 100.0%
      Category Accuracy: 90.0%

   SHORT:
      Accuracy@1: 70.0%
      Accuracy@3: 90.0%
      Category Accuracy: 70.0%

   LONG:
      Accuracy@1: 90.0%
      Accuracy@3: 100.0%
      Category Accuracy: 90.0%

   BASELINE:
      Accuracy@1: 85.0%
      Accuracy@3: 95.0%
      Category Accuracy: 85.0%


üìä SEMANTIC vs HYBRID COMPARISON
Semantic Accuracy@1: 89.0%
Hybrid Accuracy@1:   83.0%
‚ö†Ô∏è Hybrid decreases by: -6.0%





In [14]:
# ## 12. Advanced Metrics (MRR, NDCG, Cost Analysis)

# %%
def calculate_additional_metrics(test_set, search_function):
    """
    Calculate additional retrieval metrics
    
    Args:
        test_set: List of test cases
        search_function: Function to use for retrieval
    
    Returns:
        Dictionary with MRR, NDCG, and cost estimates
    """
    mrr_scores = []
    ndcg_scores = []
    total_api_calls = 0
    
    print(f"\nCalculating advanced metrics for {len(test_set)} queries...")
    
    for test_case in tqdm(test_set, desc="Computing metrics"):
        query = test_case['query']
        expected_category = test_case['expected_category']
        
        # Retrieve results
        results = search_function(query, n_results=10)
        total_api_calls += 1
        
        # Calculate MRR (Mean Reciprocal Rank)
        retrieved_categories = [meta['category'] for meta in results['metadatas'][0]]
        try:
            rank = retrieved_categories.index(expected_category) + 1
            mrr_scores.append(1.0 / rank)
        except ValueError:
            mrr_scores.append(0.0)
        
        # Calculate NDCG (simplified binary relevance)
        relevance = [1 if cat == expected_category else 0 for cat in retrieved_categories]
        dcg = sum([rel / np.log2(i + 2) for i, rel in enumerate(relevance)])
        idcg = 1.0  # Best case: relevant item at position 1
        ndcg_scores.append(dcg / idcg if idcg > 0 else 0.0)
    
    # Calculate costs
    cost_per_embedding = 0.00002  # $0.02 per 1M tokens, ~1 token per query
    cost_per_answer = 0.0001  # GPT-4o-mini cost estimate
    
    estimated_cost = {
        'embedding_calls': total_api_calls,
        'embedding_cost': total_api_calls * cost_per_embedding,
        'answer_cost': total_api_calls * cost_per_answer,
        'total_cost': total_api_calls * (cost_per_embedding + cost_per_answer)
    }
    
    return {
        'mrr': np.mean(mrr_scores),
        'ndcg': np.mean(ndcg_scores),
        'cost': estimated_cost
    }

# Add to evaluation results
print("\n" + "="*70)
print("üìä ADVANCED METRICS")
print("="*70)

advanced_metrics = calculate_additional_metrics(real_eval_set, search_knowledge_base)

print(f"\nüìà Ranking Quality:")
print(f"   MRR (Mean Reciprocal Rank): {advanced_metrics['mrr']:.3f}")
print(f"   NDCG: {advanced_metrics['ndcg']:.3f}")

print(f"\nüí∞ Cost Analysis:")
print(f"   Embedding calls: {advanced_metrics['cost']['embedding_calls']}")
print(f"   Embedding cost: ${advanced_metrics['cost']['embedding_cost']:.4f}")
print(f"   Answer generation cost: ${advanced_metrics['cost']['answer_cost']:.4f}")
print(f"   Total per {len(real_eval_set)} queries: ${advanced_metrics['cost']['total_cost']:.4f}")
print(f"   Projected cost per 1000 queries: ${advanced_metrics['cost']['total_cost'] * 1000 / len(real_eval_set):.2f}")

print("="*70)



üìä ADVANCED METRICS

Calculating advanced metrics for 100 queries...


Computing metrics: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:36<00:00,  2.75it/s]


üìà Ranking Quality:
   MRR (Mean Reciprocal Rank): 0.915
   NDCG: 3.807

üí∞ Cost Analysis:
   Embedding calls: 100
   Embedding cost: $0.0020
   Answer generation cost: $0.0100
   Total per 100 queries: $0.0120
   Projected cost per 1000 queries: $0.12





In [15]:
# ## 13. Reality Check & Analysis

# %%
print("\n" + "="*70)
print("üéØ PERFORMANCE ANALYSIS")
print("="*70)

print("\nüìä Real Performance Metrics:")
print(f"   Overall Accuracy@1: {real_results['accuracy@1']:.1%}")
print(f"   Overall Accuracy@5: {real_results['accuracy@5']:.1%}")
print(f"   Negation Accuracy: {real_results['by_type']['negation']['category_accuracy']:.1%}")
print(f"   Complex Query Accuracy: {real_results['by_type']['complex']['category_accuracy']:.1%}")
print(f"   MRR: {advanced_metrics['mrr']:.3f}")
print(f"   NDCG: {advanced_metrics['ndcg']:.3f}")

print(f"\nüìà Knowledge Base Stats:")
print(f"   KB Size: {len(kb_df)} entries")
print(f"   Categories: {kb_df['category'].nunique()}/77")
print(f"   Avg entries per category: {len(kb_df)/kb_df['category'].nunique():.1f}")

print("\nüîç Analysis:")
if real_results['accuracy@1'] >= 0.7:
    print("   ‚úÖ EXCELLENT: >70% accuracy even with limited KB!")
    print("   Your embeddings are working extremely well.")
elif real_results['accuracy@1'] >= 0.5:
    print("   ‚úÖ GOOD: 50-70% accuracy is solid for a small KB")
    print("   Full dataset will push this to 75-85%")
elif real_results['accuracy@1'] >= 0.3:
    print("   üìä EXPECTED: 30-50% is normal with limited examples per category")
    print(f"   With {len(kb_df)} KB entries and {kb_df['category'].nunique()} categories")
    print(f"   Average {len(kb_df)/kb_df['category'].nunique():.1f} examples per category")
    print("   Full dataset (10,003 entries) will dramatically improve performance")
else:
    print("   ‚ö†Ô∏è  LOW: <30% suggests potential issues")
    print("   Check: embedding quality, query similarity, category distribution")

print("\nüí° Recommendations:")

if real_results['by_type']['negation']['category_accuracy'] < 0.7:
    print("   ‚ö†Ô∏è  Negation handling needs attention")
    print("      Consider: Query preprocessing or fine-tuned embeddings")
else:
    print("   ‚úÖ Negation handling is good!")

if USE_HYBRID_SEARCH and hybrid_real_results['accuracy@1'] < real_results['accuracy@1']:
    print("   üìä Hybrid search underperforming with small dataset")
    print("   Normal with limited examples per category - will improve with full data")

print("\n" + "="*70)



üéØ PERFORMANCE ANALYSIS

üìä Real Performance Metrics:
   Overall Accuracy@1: 89.0%
   Overall Accuracy@5: 96.0%
   Negation Accuracy: 90.0%
   Complex Query Accuracy: 85.0%
   MRR: 0.915
   NDCG: 3.807

üìà Knowledge Base Stats:
   KB Size: 10003 entries
   Categories: 77/77
   Avg entries per category: 129.9

üîç Analysis:
   ‚úÖ EXCELLENT: >70% accuracy even with limited KB!
   Your embeddings are working extremely well.

üí° Recommendations:
   ‚úÖ Negation handling is good!
   üìä Hybrid search underperforming with small dataset
   Normal with limited examples per category - will improve with full data



In [16]:
# ## 14. Test Retrieval with Sample Queries

# %%
# Test with diverse sample queries
test_queries = [
    # Fee-related
    "Why was I charged a fee?",
    
    # Card issues with negation
    "My card isn't working",
    "Card payment didn't go through",
    
    # PIN management
    "How do I reset my PIN?",
    
    # Unauthorized transactions (negation)
    "I didn't authorize this payment",
    
    # Balance issues (negation)
    "My balance hasn't updated",
    
    # Complex query
    "I entered the wrong PIN too many times and now my card is blocked, how do I fix this?"
]

print("Testing knowledge base retrieval with diverse queries...\n")
print("=" * 80)

for query in test_queries:
    print(f"\nüîç Query: {query}")
    
    # Check if query has negation indicators
    has_negation = any(neg in query.lower() for neg in ["not", "didn't", "hasn't", "never", "no", "n't"])
    if has_negation:
        print("   ‚ö†Ô∏è  Contains negation")
    
    print("-" * 80)
    
    # Try basic search
    results = search_knowledge_base(query, n_results=3)
    
    for i, (doc, metadata, distance) in enumerate(zip(
        results['documents'][0],
        results['metadatas'][0],
        results['distances'][0]
    ), 1):
        similarity = 1 - distance
        print(f"\n   Result {i} (similarity: {similarity:.3f})")
        print(f"   Category: {metadata['category']}")
        print(f"   Question: {metadata['question']}")
        print(f"   Answer: {metadata['answer'][:120]}...")
        
        if metadata['has_negation']:
            print(f"   üìå This result also contains negation")
    
    print("\n" + "=" * 80)

Testing knowledge base retrieval with diverse queries...


üîç Query: Why was I charged a fee?
--------------------------------------------------------------------------------

   Result 1 (similarity: 0.451)
   Category: transfer_fee_charged
   Question: Why was I charged a fee when making this transfer when I shouldn't have been?
   Answer: I understand your concern about the unexpected transfer fee. Fees can sometimes apply depending on the type of transfer ...

   Result 2 (similarity: 0.389)
   Category: extra_charge_on_statement
   Question: Why was my account assessed a fee?
   Answer: I'm sorry to hear that you've noticed an extra charge on your statement. Fees can occur for various reasons, such as ove...

   Result 3 (similarity: 0.386)
   Category: cash_withdrawal_charge
   Question: Why was I charged a fee when I withdrew money?
   Answer: I understand your concern about the withdrawal fee. Typically, fees can occur if you used an ATM outside of our network ...


üîç Quer

In [21]:
# ## 15. Save Knowledge Base

# %%
# Create processed data directory
processed_dir = project_root / 'data' / 'processed'
processed_dir.mkdir(parents=True, exist_ok=True)

print(f"Saving knowledge base to: {processed_dir}")

# %%
# Save as CSV (without embeddings - too large)
csv_path = processed_dir / 'knowledge_base_v2.csv'
kb_df_save = kb_df.drop(columns=['embedding', 'combined_text'])
kb_df_save.to_csv(csv_path, index=False)
print(f"‚úÖ Saved CSV: {csv_path}")

# %%
# Save as pickle (with embeddings)
pickle_path = processed_dir / 'knowledge_base_v2_with_embeddings.pkl'
kb_df.to_pickle(pickle_path)
print(f"‚úÖ Saved pickle: {pickle_path}")

# %%
# Save as JSON (for portability)
json_path = processed_dir / 'knowledge_base_v2.json'

kb_export = []
for idx, row in kb_df.iterrows():
    entry = {
        'id': f"kb_{idx}",
        'question': row['text'],
        'answer': row['answer'],
        'category': row['category'],
        'category_id': int(row['label']),
        'word_count': int(row['word_count']),
        'answer_length': int(row['answer_length']),
        'has_negation': bool(row['has_negation']),
        'is_complex': bool(row['is_complex']),
        'question_type': row['question_type']
    }
    kb_export.append(entry)

with open(json_path, 'w') as f:
    json.dump(kb_export, f, indent=2)

print(f"‚úÖ Saved JSON: {json_path}")

# %%
# Save enhanced configuration
config_path = processed_dir / 'kb_config_v2.json'

config = {
    'version': '2.0',
    'embedding_model': EMBEDDING_MODEL,
    'answer_generation_model': 'gpt-4o-mini',
    'total_entries': int(len(kb_df)),
    'embedding_dimension': int(len(embeddings[0])),
    'vector_db_path': str(vector_db_dir),
    'collection_name': 'banking_support',
    'categories': int(kb_df['category'].nunique()),
    'created_date': datetime.now().isoformat(),
    'is_sample': USE_SAMPLE,
    'sample_size': int(SAMPLE_SIZE) if USE_SAMPLE else int(len(kb_df)),
    'stratified_sampling': USE_STRATIFIED_SAMPLING if USE_SAMPLE else False,
    'template_categories': list(ANSWER_TEMPLATES.keys()),
    'template_usage_count': int(kb_df['category'].isin(ANSWER_TEMPLATES.keys()).sum()),
    'statistics': {
        'avg_query_length': float(kb_df['word_count'].mean()),
        'avg_answer_length': float(kb_df['answer_length'].mean()),
        'negation_queries': int(kb_df['has_negation'].sum()),
        'complex_queries': int(kb_df['is_complex'].sum())
    },
    'evaluation_results': {
        'real_evaluation': {
            'test_set_size': len(real_eval_set),
            'accuracy@1': float(real_results['accuracy@1']),
            'accuracy@3': float(real_results['accuracy@3']),
            'accuracy@5': float(real_results['accuracy@5']),
            'category_accuracy': float(real_results['category_accuracy']),
            'negation_accuracy': float(real_results['by_type']['negation']['category_accuracy']),
            'complex_accuracy': float(real_results['by_type']['complex']['category_accuracy'])
        },
        'hybrid_search': {
            'accuracy@1': float(hybrid_real_results['accuracy@1']),
            'accuracy@3': float(hybrid_real_results['accuracy@3']),
            'accuracy@5': float(hybrid_real_results['accuracy@5']),
            'category_accuracy': float(hybrid_real_results['category_accuracy']),
            'negation_accuracy': float(hybrid_real_results['by_type']['negation']['category_accuracy'])
        }
    }
}

with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

print(f"‚úÖ Saved config: {config_path}")


Saving knowledge base to: c:\Users\victo\customer-support-rag\data\processed
‚úÖ Saved CSV: c:\Users\victo\customer-support-rag\data\processed\knowledge_base_v2.csv
‚úÖ Saved pickle: c:\Users\victo\customer-support-rag\data\processed\knowledge_base_v2_with_embeddings.pkl
‚úÖ Saved JSON: c:\Users\victo\customer-support-rag\data\processed\knowledge_base_v2.json
‚úÖ Saved config: c:\Users\victo\customer-support-rag\data\processed\kb_config_v2.json


In [22]:
# ## 16. Knowledge Base Summary

# %%
print("="*70)
print("üìä ENHANCED KNOWLEDGE BASE SUMMARY")
print("="*70)

print(f"\nüìö Content:")
print(f"   Total entries: {len(kb_df):,}")
print(f"   Unique categories: {kb_df['category'].nunique()}")
print(f"   Avg question length: {kb_df['word_count'].mean():.1f} words")
print(f"   Avg answer length: {kb_df['answer_length'].mean():.1f} words")
print(f"   Negation queries: {kb_df['has_negation'].sum()} ({kb_df['has_negation'].sum()/len(kb_df)*100:.1f}%)")
print(f"   Complex queries: {kb_df['is_complex'].sum()} ({kb_df['is_complex'].sum()/len(kb_df)*100:.1f}%)")

print(f"\nüî¢ Embeddings:")
print(f"   Model: {EMBEDDING_MODEL}")
print(f"   Dimension: {len(embeddings[0])}")
print(f"   Total vectors: {len(embeddings):,}")

print(f"\nüíæ Storage:")
print(f"   Vector DB: {vector_db_dir}")
print(f"   CSV: {csv_path}")
print(f"   Pickle: {pickle_path}")
print(f"   JSON: {json_path}")
print(f"   Config: {config_path}")
print(f"   Real Eval Set: {real_eval_path}")

print(f"\nüìà Category Coverage:")
category_dist = kb_df['category'].value_counts()
print(f"   Most common: {category_dist.index[0]} ({category_dist.iloc[0]} samples)")
print(f"   Least common: {category_dist.index[-1]} ({category_dist.iloc[-1]} samples)")
print(f"   Average per category: {category_dist.mean():.1f}")

print(f"\nüéØ Performance Metrics (on UNSEEN test data):")
print(f"   Accuracy@1: {real_results['accuracy@1']:.1%}")
print(f"   Accuracy@5: {real_results['accuracy@5']:.1%}")
print(f"   Negation Handling: {real_results['by_type']['negation']['category_accuracy']:.1%}")
print(f"   Complex Queries: {real_results['by_type']['complex']['category_accuracy']:.1%}")

if real_results['accuracy@1'] < 0.5 and len(kb_df) < 1000:
    print(f"\n‚ö†Ô∏è  RECOMMENDATION: Performance limited by small KB size ({len(kb_df)} entries)")
    print(f"   Run full production build for 70-85% accuracy")
elif real_results['accuracy@1'] >= 0.7:
    print(f"\n‚úÖ EXCELLENT: High accuracy even with {len(kb_df)} entries!")
else:
    print(f"\n‚úÖ GOOD: Solid baseline performance")

print("\n" + "="*70)

if USE_SAMPLE:
    print("‚ö†Ô∏è  NOTE: This is a SAMPLE knowledge base for testing")
    print("Set USE_SAMPLE = False to create production KB with all 10,003 queries")
else:
    print("‚úÖ Production-ready knowledge base created!")

print("="*70)

üìä ENHANCED KNOWLEDGE BASE SUMMARY

üìö Content:
   Total entries: 10,003
   Unique categories: 77
   Avg question length: 11.9 words
   Avg answer length: 60.7 words
   Negation queries: 2184 (21.8%)
   Complex queries: 1815 (18.1%)

üî¢ Embeddings:
   Model: text-embedding-3-small
   Dimension: 1536
   Total vectors: 10,003

üíæ Storage:
   Vector DB: c:\Users\victo\customer-support-rag\data\vector_db
   CSV: c:\Users\victo\customer-support-rag\data\processed\knowledge_base_v2.csv
   Pickle: c:\Users\victo\customer-support-rag\data\processed\knowledge_base_v2_with_embeddings.pkl
   JSON: c:\Users\victo\customer-support-rag\data\processed\knowledge_base_v2.json
   Config: c:\Users\victo\customer-support-rag\data\processed\kb_config_v2.json
   Real Eval Set: c:\Users\victo\customer-support-rag\data\processed\real_evaluation_set.json

üìà Category Coverage:
   Most common: card_payment_fee_charged (187 samples)
   Least common: contactless_not_working (35 samples)
   Average per c

In [19]:
# ## 17. Final Verification

# %%
print("\nüîç FINAL VERIFICATION")
print("="*70)

checks = []

# Data checks
checks.append(("Answers generated", 'answer' in kb_df.columns and kb_df['answer'].notna().all()))
checks.append(("Embeddings created", 'embedding' in kb_df.columns))
checks.append(("No missing values", kb_df[['text', 'answer', 'category']].notna().all().all()))
checks.append(("Negation flags set", 'has_negation' in kb_df.columns))

# Vector DB checks
try:
    count = collection.count()
    checks.append(("Vector DB populated", count > 0))
    checks.append(("Vector DB count matches", count == len(kb_df)))
except:
    checks.append(("Vector DB populated", False))
    checks.append(("Vector DB count matches", False))

# File checks
checks.append(("CSV exported", csv_path.exists()))
checks.append(("Pickle exported", pickle_path.exists()))
checks.append(("JSON exported", json_path.exists()))
checks.append(("Config saved", config_path.exists()))
checks.append(("Real eval set saved", real_eval_path.exists()))

# Functionality checks
try:
    test_results = search_knowledge_base("test query", n_results=1)
    checks.append(("Basic retrieval works", len(test_results['documents'][0]) > 0))
except:
    checks.append(("Basic retrieval works", False))

if USE_HYBRID_SEARCH and BM25_AVAILABLE:
    try:
        test_hybrid = hybrid_search_smart("test query", n_results=1)
        checks.append(("Hybrid retrieval works", len(test_hybrid['documents'][0]) > 0))
    except:
        checks.append(("Hybrid retrieval works", False))

# Performance checks
checks.append(("Real evaluation completed", len(real_eval_set) > 0))
checks.append(("Test data loaded", len(test_df) > 0))
checks.append(("Advanced metrics calculated", 'mrr' in advanced_metrics))

# Print results
for check_name, passed in checks:
    status = "‚úÖ" if passed else "‚ùå"
    print(f"{status} {check_name}")

all_passed = all(passed for _, passed in checks)

print("\n" + "="*70)

if all_passed:
    print("üéâ All checks passed! Enhanced knowledge base is ready!")
    print("\nüìã Next Steps:")
    print("1. Review performance metrics above")
    if USE_SAMPLE:
        print("2. üöÄ IMPORTANT: Set USE_SAMPLE = False and run full production build")
        print("   Expected improvement: 62% ‚Üí 75-85% accuracy")
    else:
        print("2. ‚úÖ Production KB complete - ready for deployment")
    print("3. Build RAG retrieval + generation pipeline")
    print("4. Create Streamlit chatbot interface")
else:
    print("‚ö†Ô∏è Some checks failed. Please review above.")

print("="*70)


üîç FINAL VERIFICATION
‚úÖ Answers generated
‚úÖ Embeddings created
‚úÖ No missing values
‚úÖ Negation flags set
‚úÖ Vector DB populated
‚úÖ Vector DB count matches
‚úÖ CSV exported
‚úÖ Pickle exported
‚úÖ JSON exported
‚úÖ Config saved
‚úÖ Real eval set saved
‚úÖ Basic retrieval works
‚úÖ Hybrid retrieval works
‚úÖ Real evaluation completed
‚úÖ Test data loaded
‚úÖ Advanced metrics calculated

üéâ All checks passed! Enhanced knowledge base is ready!

üìã Next Steps:
1. Review performance metrics above
2. ‚úÖ Production KB complete - ready for deployment
3. Build RAG retrieval + generation pipeline
4. Create Streamlit chatbot interface
