In [1]:
"""
Knowledge Base Gap Analysis Script
Evaluates overall accuracy and finds categories with <80% accuracy
Uses 10 workers for fast parallel evaluation
"""

import os
import sys
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import concurrent.futures

# Add src to path
# Handle both script and notebook execution
try:
    # Running as script
    project_root = Path(__file__).parent.parent
except NameError:
    # Running in Jupyter notebook
    project_root = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()

sys.path.append(str(project_root / 'src'))

from retrieval.retriever import KnowledgeBaseRetriever

# Load environment
load_dotenv()

print("="*80)
print("KNOWLEDGE BASE GAP ANALYSIS")
print("="*80)

KNOWLEDGE BASE GAP ANALYSIS


In [2]:
# ============================================================================
# CONFIGURATION
# ============================================================================

VECTOR_DB_PATH = str(project_root / 'data' / 'vector_db')
TEST_DATA_PATH = str(project_root / 'data' / 'processed' / 'test_processed.csv')
MAX_WORKERS = 10  # Parallel processing
K_VALUES = [1, 3, 5]
ACCURACY_THRESHOLD = 0.80  # Flag categories below 80%

print(f"\nConfiguration:")
print(f"  Vector DB: {VECTOR_DB_PATH}")
print(f"  Test data: {TEST_DATA_PATH}")
print(f"  Workers: {MAX_WORKERS}")
print(f"  Threshold: {ACCURACY_THRESHOLD:.0%}")


Configuration:
  Vector DB: c:\Users\victo\customer-support-rag\data\vector_db
  Test data: c:\Users\victo\customer-support-rag\data\processed\test_processed.csv
  Workers: 10
  Threshold: 80%


In [5]:
# ============================================================================
# LOAD DATA
# ============================================================================

print(f"\n{'='*80}")
print("LOADING DATA")
print("="*80)

# Load test set
test_df = pd.read_csv(TEST_DATA_PATH)
print(f"‚úÖ Loaded test set: {len(test_df):,} queries")
print(f"   Categories: {test_df['category'].nunique()}")
print(f"   Avg query length: {test_df['word_count'].mean():.1f} words")

# Initialize retriever
retriever = KnowledgeBaseRetriever(
    vector_db_path=VECTOR_DB_PATH,
    use_reranking=False
)
print(f"‚úÖ Initialized retriever (use reranking: OFF)")


LOADING DATA
‚úÖ Loaded test set: 3,080 queries
   Categories: 77
   Avg query length: 11.0 words
‚úÖ Initialized retriever (use reranking: OFF)


In [6]:
# ============================================================================
# FAST PARALLEL EVALUATION
# ============================================================================

def evaluate_single_query(args):
    """Evaluate a single query - designed for parallel execution"""
    idx, row, k = args
    query = row['text']
    expected_category = row['category']
    
    try:
        # Retrieve results
        results = retriever.retrieve(query, n_results=k)
        
        # Get retrieved categories
        retrieved_categories = [meta['category'] for meta in results['metadatas']]
        
        # Calculate accuracy@k
        correct = expected_category in retrieved_categories
        
        # Get top-1 prediction
        top1_category = retrieved_categories[0] if retrieved_categories else None
        
        # Get similarity scores
        distances = results['distances']
        similarities = [1 - d for d in distances]
        avg_similarity = np.mean(similarities) if similarities else 0.0
        
        return {
            'idx': idx,
            'query': query,
            'expected': expected_category,
            'predicted': top1_category,
            'correct_at_k': correct,
            'correct_at_1': top1_category == expected_category,
            'similarity': avg_similarity,
            'retrieved_categories': retrieved_categories,
            'error': None
        }
        
    except Exception as e:
        return {
            'idx': idx,
            'query': query,
            'expected': expected_category,
            'predicted': None,
            'correct_at_k': False,
            'correct_at_1': False,
            'similarity': 0.0,
            'retrieved_categories': [],
            'error': str(e)
        }

def evaluate_all_queries(test_df, k=5, max_workers=10):
    """Evaluate all queries in parallel"""
    print(f"\n{'='*80}")
    print(f"EVALUATING ALL {len(test_df):,} QUERIES (K={k}, WORKERS={max_workers})")
    print("="*80)
    
    results = []
    
    # Prepare arguments for parallel processing
    args_list = [(idx, row, k) for idx, row in test_df.iterrows()]
    
    # Process in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(evaluate_single_query, args) for args in args_list]
        
        for future in tqdm(concurrent.futures.as_completed(futures), 
                          total=len(futures), 
                          desc="Evaluating"):
            result = future.result()
            results.append(result)
    
    # Sort by original index
    results = sorted(results, key=lambda x: x['idx'])
    
    return pd.DataFrame(results)

In [7]:
# ============================================================================
# RUN EVALUATION
# ============================================================================

# Evaluate on all queries
results_df = evaluate_all_queries(test_df, k=max(K_VALUES), max_workers=MAX_WORKERS)

# Calculate overall metrics
print(f"\n{'='*80}")
print("OVERALL PERFORMANCE")
print("="*80)

total = len(results_df)
errors = results_df['error'].notna().sum()

print(f"\nTotal queries: {total:,}")
print(f"Failed queries: {errors}")
print(f"Successful: {total - errors:,}")

# Calculate accuracy@k for each k value
print(f"\nAccuracy Metrics:")
for k in K_VALUES:
    # For accuracy@k, we need to check if expected is in top-k
    if k == 1:
        acc = results_df['correct_at_1'].mean()
    else:
        # Re-evaluate for this k
        k_results = evaluate_all_queries(test_df, k=k, max_workers=MAX_WORKERS)
        acc = k_results['correct_at_k'].mean()
    print(f"  Accuracy@{k}: {acc:.1%}")

overall_accuracy = results_df['correct_at_1'].mean()
print(f"\nüéØ Overall Accuracy@1: {overall_accuracy:.1%}")

# Average similarity
avg_sim = results_df['similarity'].mean()
print(f"üìä Average Similarity: {avg_sim:.3f}")



EVALUATING ALL 3,080 QUERIES (K=5, WORKERS=10)


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3080/3080 [02:02<00:00, 25.13it/s]



OVERALL PERFORMANCE

Total queries: 3,080
Failed queries: 4
Successful: 3,076

Accuracy Metrics:
  Accuracy@1: 90.4%

EVALUATING ALL 3,080 QUERIES (K=3, WORKERS=10)


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3080/3080 [02:02<00:00, 25.18it/s]


  Accuracy@3: 94.7%

EVALUATING ALL 3,080 QUERIES (K=5, WORKERS=10)


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3080/3080 [02:04<00:00, 24.76it/s]

  Accuracy@5: 95.8%

üéØ Overall Accuracy@1: 90.4%
üìä Average Similarity: 0.409





In [8]:
# ============================================================================
# PER-CATEGORY ANALYSIS
# ============================================================================

print(f"\n{'='*80}")
print("PER-CATEGORY PERFORMANCE ANALYSIS")
print("="*80)

# Group by expected category
category_stats = []

for category in test_df['category'].unique():
    cat_results = results_df[results_df['expected'] == category]
    
    if len(cat_results) == 0:
        continue
    
    total_queries = len(cat_results)
    correct = cat_results['correct_at_1'].sum()
    accuracy = correct / total_queries if total_queries > 0 else 0
    avg_similarity = cat_results['similarity'].mean()
    
    # Get most common wrong predictions
    wrong_predictions = cat_results[cat_results['correct_at_1'] == False]['predicted'].value_counts()
    top_wrong = wrong_predictions.head(3).to_dict() if len(wrong_predictions) > 0 else {}
    
    category_stats.append({
        'category': category,
        'total_queries': total_queries,
        'correct': int(correct),
        'accuracy': accuracy,
        'avg_similarity': avg_similarity,
        'top_wrong_predictions': top_wrong
    })

category_df = pd.DataFrame(category_stats).sort_values('accuracy')

print(f"\nTotal categories: {len(category_df)}")
print(f"Categories with 100% accuracy: {(category_df['accuracy'] == 1.0).sum()}")
print(f"Categories with ‚â•90% accuracy: {(category_df['accuracy'] >= 0.9).sum()}")
print(f"Categories with ‚â•80% accuracy: {(category_df['accuracy'] >= 0.8).sum()}")
print(f"Categories with <80% accuracy: {(category_df['accuracy'] < ACCURACY_THRESHOLD).sum()}")


PER-CATEGORY PERFORMANCE ANALYSIS

Total categories: 77
Categories with 100% accuracy: 13
Categories with ‚â•90% accuracy: 56
Categories with ‚â•80% accuracy: 67
Categories with <80% accuracy: 10


In [9]:
# ============================================================================
# IDENTIFY GAP CATEGORIES
# ============================================================================

print(f"\n{'='*80}")
print(f"GAP CATEGORIES (ACCURACY < {ACCURACY_THRESHOLD:.0%})")
print("="*80)

gap_categories = category_df[category_df['accuracy'] < ACCURACY_THRESHOLD].copy()

if len(gap_categories) == 0:
    print(f"\nüéâ NO CATEGORIES BELOW {ACCURACY_THRESHOLD:.0%}!")
    print("Your knowledge base is performing excellently across all categories.")
else:
    print(f"\nFound {len(gap_categories)} categories needing improvement:\n")
    
    for idx, row in gap_categories.iterrows():
        print(f"\n{'‚îÄ'*80}")
        print(f"üìç Category: {row['category']}")
        print(f"   Accuracy: {row['accuracy']:.1%} ({row['correct']}/{row['total_queries']})")
        print(f"   Avg Similarity: {row['avg_similarity']:.3f}")
        
        if row['top_wrong_predictions']:
            print(f"   Most confused with:")
            for wrong_cat, count in list(row['top_wrong_predictions'].items())[:3]:
                print(f"      ‚Üí {wrong_cat} ({count} times)")



GAP CATEGORIES (ACCURACY < 80%)

Found 10 categories needing improvement:


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìç Category: balance_not_updated_after_bank_transfer
   Accuracy: 50.0% (20/40)
   Avg Similarity: 0.351
   Most confused with:
      ‚Üí transfer_not_received_by_recipient (8 times)
      ‚Üí transfer_timing (5 times)
      ‚Üí pending_transfer (3 times)

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìç Category: top_up_failed
   Accuracy: 55.0% (22/40)
   Avg Similarity: 0.472
   Most confused with:
      ‚Üí top_up_reverted (15 times)
      ‚Üí verify_top_up (1 times)
      ‚

In [10]:
# ============================================================================
# DETAILED FAILURE ANALYSIS
# ============================================================================

print(f"\n{'='*80}")
print("FAILURE EXAMPLES")
print("="*80)

# Show examples from each gap category
if len(gap_categories) > 0:
    print(f"\nShowing failed queries from top 5 weakest categories:\n")
    
    for idx, row in gap_categories.head(5).iterrows():
        category = row['category']
        
        # Get failed queries for this category
        failed = results_df[
            (results_df['expected'] == category) & 
            (results_df['correct_at_1'] == False)
        ].head(3)
        
        if len(failed) > 0:
            print(f"\n{'‚îÄ'*80}")
            print(f"Category: {category} ({row['accuracy']:.1%})")
            print()
            
            for _, failure in failed.iterrows():
                print(f"  ‚ùå Query: {failure['query']}")
                print(f"     Expected: {failure['expected']}")
                print(f"     Got: {failure['predicted']}")
                print(f"     Similarity: {failure['similarity']:.3f}")
                print()



FAILURE EXAMPLES

Showing failed queries from top 5 weakest categories:


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Category: balance_not_updated_after_bank_transfer (50.0%)

  ‚ùå Query: the balance on my account didnt change when i transferred money
     Expected: balance_not_updated_after_bank_transfer
     Got: beneficiary_not_allowed
     Similarity: 0.099

  ‚ùå Query: How long will it take for my transferred money to show up?
     Expected: balance_not_updated_after_bank_transfer
     Got: transfer_timing
     Similarity: 0.510

  ‚ùå Query: How long does it take for an international transfer into my account?
     Expected: balance_not_updated_after_bank_transfer
     Got: transfer_timing
     Similarity: 0.530


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚

In [11]:
"""
Deep Dive: Why These 3 Categories Fail
=======================================
Analyzing root causes and testing hypotheses
"""

# 1. Analyze failure patterns
gap_categories = [
    'pending_transfer',
    'card_payment_not_recognised', 
    'balance_not_updated_after_bank_transfer'
]

for category in gap_categories:
    print(f"\n{'='*80}")
    print(f"ANALYZING: {category}")
    print('='*80)
    
    # Get all failures
    failures = results_df[
        (results_df['expected'] == category) & 
        (results_df['correct_at_1'] == False)
    ]
    
    # Confusion matrix
    print("\nMost common misclassifications:")
    print(failures['predicted'].value_counts().head(5))
    
    # Similarity distribution
    print(f"\nSimilarity scores:")
    print(f"  Mean: {failures['similarity'].mean():.3f}")
    print(f"  Median: {failures['similarity'].median():.3f}")
    print(f"  Min: {failures['similarity'].min():.3f}")
    
    # Analyze query characteristics
    print("\nQuery characteristics of failures:")
    print(f"  Avg length: {failures['query'].str.split().str.len().mean():.1f} words")
    
    # Compare successful vs failed queries
    successes = results_df[
        (results_df['expected'] == category) & 
        (results_df['correct_at_1'] == True)
    ]
    
    print(f"\nSuccess vs Failure comparison:")
    print(f"  Success similarity: {successes['similarity'].mean():.3f}")
    print(f"  Failure similarity: {failures['similarity'].mean():.3f}")
    print(f"  Difference: {successes['similarity'].mean() - failures['similarity'].mean():.3f}")


ANALYZING: pending_transfer

Most common misclassifications:
predicted
transfer_timing                            2
transfer_not_received_by_recipient         2
failed_transfer                            2
balance_not_updated_after_bank_transfer    1
transfer_fee_charged                       1
Name: count, dtype: int64

Similarity scores:
  Mean: 0.361
  Median: 0.353
  Min: 0.240

Query characteristics of failures:
  Avg length: 8.6 words

Success vs Failure comparison:
  Success similarity: 0.413
  Failure similarity: 0.361
  Difference: 0.052

ANALYZING: card_payment_not_recognised

Most common misclassifications:
predicted
direct_debit_payment_not_recognised    4
compromised_card                       4
cash_withdrawal_not_recognised         3
extra_charge_on_statement              2
reverted_card_payment?                 1
Name: count, dtype: int64

Similarity scores:
  Mean: 0.190
  Median: 0.229
  Min: -0.073

Query characteristics of failures:
  Avg length: 11.9 words

Succes