In [None]:
!pip install voyageai

Collecting voyageai
  Downloading voyageai-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting aiolimiter (from voyageai)
  Downloading aiolimiter-1.2.1-py3-none-any.whl.metadata (4.5 kB)
Downloading voyageai-0.3.5-py3-none-any.whl (28 kB)
Downloading aiolimiter-1.2.1-py3-none-any.whl (6.7 kB)
Installing collected packages: aiolimiter, voyageai
Successfully installed aiolimiter-1.2.1 voyageai-0.3.5


In [None]:
# Import required packages
import numpy as np
import pandas as pd
import json
import time
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import os
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import voyageai

# Upload your zip file
from google.colab import files
uploaded = files.upload()

# Get the actual filename from upload.
# Fail with a readable message instead of a bare IndexError when nothing was uploaded.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select the embeddings zip archive.")
zip_filename = next(iter(uploaded))
print(f"Uploaded file: {zip_filename}")

# Extract the zip
import zipfile
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall('/content/')

# Check what was extracted.
# The loop variables are deliberately NOT called `files`: that name is bound to
# the google.colab `files` module imported above and must not be overwritten.
print("Extracted contents:")
for root, _dirs, filenames in os.walk('/content/'):
    for filename in filenames:
        if filename.endswith('.npy'):
            print(os.path.join(root, filename))

# FIXED: Load embeddings and metadata with correct paths and filenames
def load_embedding_data(dataset_name, model_name, base_path='/content/Emebddings'):
    """Load the saved embeddings, metadata and texts for one dataset/model pair.

    Files are read from
    ``{base_path}/{dataset_name}_{model_name}_embeddings_{kind}.npy`` where
    ``kind`` is ``embeddings``, ``metadata`` or ``texts``.

    Args:
        dataset_name: Dataset prefix of the file names (e.g. 'Structural_Balanced').
        model_name: Model part of the file names (e.g. 'qwen').
        base_path: Directory holding the ``.npy`` files. The default keeps the
            spelling "Emebddings" because that is the folder name inside the
            uploaded archive.

    Returns:
        ``(embeddings, metadata, texts)`` as loaded by ``np.load``, or
        ``(None, None, None)`` when any of the three files is missing (the
        missing path is printed).
    """
    # Build the prefix from the *parameter*; the previous version interpolated
    # a global named `model`, which only worked inside one specific loop.
    prefix = f'{base_path}/{dataset_name}_{model_name}_embeddings'

    try:
        embeddings = np.load(f'{prefix}_embeddings.npy')
        # NOTE(security): allow_pickle=True can execute code embedded in the
        # file while unpickling -- only load archives from a trusted source.
        metadata = np.load(f'{prefix}_metadata.npy', allow_pickle=True)
        texts = np.load(f'{prefix}_texts.npy', allow_pickle=True)
        return embeddings, metadata, texts
    except FileNotFoundError as e:
        print(f"File not found: {e}")
        return None, None, None

# Load every dataset/model combination that is present on disk
datasets = ['Structural_Balanced', 'Structural_Hierarchical']
models = ['qwen', 'gemma', 'voyage']

embedding_data = {}
for dataset in datasets:
    embedding_data[dataset] = {}
    for model in models:
        embeddings, metadata, texts = load_embedding_data(dataset, model)
        if embeddings is None:
            # load_embedding_data returns None values when a file is missing
            print(f"Could not load {dataset}_{model}")
            continue
        embedding_data[dataset][model] = dict(
            embeddings=embeddings,
            metadata=metadata,
            texts=texts,
        )
        print(f"Loaded {dataset}_{model}: {embeddings.shape[0]} embeddings, dim {embeddings.shape[1]}")

# Summary of what ended up in memory
print("Data loading complete!")
print(f"Successfully loaded datasets: {list(embedding_data.keys())}")
for dataset_name, models_dict in embedding_data.items():
    print(f"  {dataset_name}: {list(models_dict.keys())}")

Saving emebdding_results1.zip to emebdding_results1.zip
Uploaded file: emebdding_results1.zip
Extracted contents:
/content/Emebddings/Structural_Balanced_voyage_embeddings_texts.npy
/content/Emebddings/Structural_Balanced_qwen_embeddings_texts.npy
/content/Emebddings/Structural_Balanced_qwen_embeddings_metadata.npy
/content/Emebddings/Structural_Hierarchical_qwen_embeddings_embeddings.npy
/content/Emebddings/Structural_Hierarchical_voyage_embeddings_embeddings.npy
/content/Emebddings/Structural_Hierarchical_gemma_embeddings_metadata.npy
/content/Emebddings/Structural_Hierarchical_gemma_embeddings_texts.npy
/content/Emebddings/Structural_Hierarchical_voyage_embeddings_texts.npy
/content/Emebddings/Structural_Balanced_qwen_embeddings_embeddings.npy
/content/Emebddings/Structural_Balanced_voyage_embeddings_metadata.npy
/content/Emebddings/Structural_Balanced_gemma_embeddings_texts.npy
/content/Emebddings/Structural_Hierarchical_qwen_embeddings_texts.npy
/content/Emebddings/Structural_Hier

In [None]:
def create_jiopay_evaluation_queries():
    """Build the fixed list of JioPay evaluation queries.

    Returns:
        A new list of dicts, one per query, each with the keys ``query``,
        ``expected_topics`` (list of keyword strings), ``category`` and
        ``difficulty``.
    """
    field_names = ("query", "expected_topics", "category", "difficulty")

    # One row per query: (query, expected_topics, category, difficulty)
    rows = [
        # Onboarding & Registration
        ("Steps to create JioPay merchant account",
         ["merchant", "account", "registration", "signup"], "onboarding", "simple"),
        ("Required documents for business verification JioPay",
         ["kyc", "documents", "business", "verification"], "onboarding", "medium"),

        # Payment Processing
        ("How does UPI payment work through JioPay gateway",
         ["upi", "payment", "gateway", "process"], "payments", "medium"),
        ("JioPay payment gateway integration tutorial",
         ["integration", "api", "gateway", "developer"], "technical", "complex"),

        # Troubleshooting
        ("Payment declined error resolution JioPay",
         ["payment", "declined", "error", "troubleshooting"], "support", "medium"),
        ("Customer refund process in JioPay system",
         ["refund", "customer", "process", "policy"], "support", "medium"),

        # Features & Services
        ("Dynamic QR code generation for payments",
         ["qr", "dynamic", "generation", "payment"], "features", "medium"),
        ("Bulk payment collection using JioPay links",
         ["bulk", "collection", "links", "payment"], "features", "complex"),

        # Security & Compliance
        ("PCI compliance requirements for JioPay merchants",
         ["pci", "compliance", "security", "merchant"], "compliance", "complex"),
        ("Two-factor authentication setup JioPay",
         ["2fa", "authentication", "security", "setup"], "security", "simple"),

        # Pricing & Fees
        ("Transaction charges for different payment methods",
         ["charges", "fees", "transaction", "pricing"], "pricing", "simple"),
        ("Settlement timeline for JioPay transactions",
         ["settlement", "timeline", "payout", "transaction"], "pricing", "medium"),
    ]

    return [dict(zip(field_names, row)) for row in rows]

eval_queries = create_jiopay_evaluation_queries()
print(f"Created {len(eval_queries)} evaluation queries")

Created 12 evaluation queries


In [None]:
from google.colab import userdata
from huggingface_hub import login

# API keys are read from Colab secrets; v_api / g_api are consumed by later cells.
v_api, g_api = userdata.get('Voyageai'), userdata.get('GEMINI_API_KEY')

# Authenticate with Hugging Face using the 'HF_TOKEN' secret.
login(token=userdata.get('HF_TOKEN'))

In [None]:
# Model Loading Cell - Run this first
import torch
from transformers import AutoTokenizer, AutoModel
import voyageai
from google.colab import userdata
from huggingface_hub import login

# Login to Hugging Face (the token is read from the Colab secret 'HF_TOKEN')
login(token=userdata.get('HF_TOKEN'))

# Add debug prints for model loading
def setup_qwen_for_query():
    """Load the Qwen checkpoint and tokenizer used to embed queries.

    Returns:
        ``(model, tokenizer)`` with the model in eval mode and moved to the GPU
        when CUDA is available, or ``(None, None)`` if anything fails (the
        error is printed, not raised).
    """
    print("  Loading Qwen model...")
    try:
        checkpoint = "Qwen/Qwen3-0.6B"
        qwen_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        qwen = AutoModel.from_pretrained(checkpoint)
        qwen.eval()
        use_gpu = torch.cuda.is_available()
        if use_gpu:
            qwen = qwen.cuda()
        print("  ✓ Qwen model loaded successfully")
    except Exception as err:
        # Best-effort: the caller skips models that come back as None.
        print(f"  ✗ Error loading Qwen: {err}")
        return None, None
    return qwen, qwen_tokenizer

def setup_gemma_for_query():
    """Load the EmbeddingGemma checkpoint and tokenizer used to embed queries.

    Returns:
        ``(model, tokenizer)`` with the model in eval mode and moved to the GPU
        when CUDA is available, or ``(None, None)`` if anything fails (the
        error is printed, not raised).
    """
    print("  Loading Gemma model...")
    try:
        checkpoint = "google/embeddinggemma-300m"
        gemma_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        # The model (not the tokenizer) is loaded with trust_remote_code=True.
        gemma = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)
        gemma.eval()
        use_gpu = torch.cuda.is_available()
        if use_gpu:
            gemma = gemma.cuda()
        print("  ✓ Gemma model loaded successfully")
    except Exception as err:
        # Best-effort: the caller skips models that come back as None.
        print(f"  ✗ Error loading Gemma: {err}")
        return None, None
    return gemma, gemma_tokenizer

def setup_voyage_for_query():
    """Create a Voyage AI client from the Colab secret 'Voyageai'.

    Returns:
        A ``voyageai.Client``, or ``None`` when the secret is empty/missing or
        client construction raises (a message is printed in both cases).
    """
    print("  Setting up Voyage client...")
    try:
        api_key = userdata.get('Voyageai')
        if api_key:
            client = voyageai.Client(api_key=api_key)
            print("  ✓ Voyage client setup successfully")
            return client
        print("  ✗ Voyage API key not found")
        return None
    except Exception as err:
        print(f"  ✗ Error setting up Voyage: {err}")
        return None

# Load the query-side models; each setup_* helper returns None value(s) on failure
print("Loading models for query embedding...")
qwen_model, qwen_tokenizer = setup_qwen_for_query()
gemma_model, gemma_tokenizer = setup_gemma_for_query()
voyage_client = setup_voyage_for_query()

# Register only the models that loaded completely.
# Hugging Face entries are (model, tokenizer) pairs; the Voyage entry is the bare client.
model_configs = {
    name: pair
    for name, pair in (
        ('qwen', (qwen_model, qwen_tokenizer)),
        ('gemma', (gemma_model, gemma_tokenizer)),
    )
    if pair[0] is not None and pair[1] is not None
}
if voyage_client is not None:
    model_configs['voyage'] = voyage_client

print(f"\nSuccessfully loaded models: {list(model_configs.keys())}")
print("Model loading complete!")

Loading models for query embedding...
  Loading Qwen model...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

  ✓ Qwen model loaded successfully
  Loading Gemma model...


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

  ✓ Gemma model loaded successfully
  Setting up Voyage client...
  ✓ Voyage client setup successfully

Successfully loaded models: ['qwen', 'gemma', 'voyage']
Model loading complete!


In [None]:
@retry(
    retry=retry_if_exception_type((Exception,)),
    wait=wait_exponential(multiplier=1, min=25, max=60),
    stop=stop_after_attempt(6),
    reraise=True,
)
def embed_query_with_voyage_retry(vo, query_text):
    """Embed a single query string with Voyage, retrying on failure.

    The decorator retries on any ``Exception`` for up to 6 attempts, using an
    exponential wait configured with ``min=25`` and ``max=60``, and re-raises
    the last error once the attempts are exhausted.

    Args:
        vo: Client object exposing ``embed(texts, model=..., input_type=...)``.
        query_text: The query to embed.

    Returns:
        The first (only) embedding from the response's ``embeddings`` list.
    """
    # "voyage-3.5" is presumably the model that produced the stored document
    # embeddings -- verify against the indexing notebook.
    response = vo.embed([query_text], model="voyage-3.5", input_type="query")
    return response.embeddings[0]

def _embed_with_hf_encoder(query_text, model, tokenizer, prefer_pooler):
    """Tokenize one query, run the encoder and return an L2-normalised CPU vector.

    When ``prefer_pooler`` is true and the output carries a non-None
    ``pooler_output`` that is used; otherwise the last hidden state is averaged
    over the sequence dimension.
    """
    if tokenizer.pad_token is None:
        # Padding needs a pad token; fall back to the EOS token.
        tokenizer.pad_token = tokenizer.eos_token

    encoded = tokenizer([query_text], padding=True, truncation=True,
                        max_length=512, return_tensors="pt")
    if torch.cuda.is_available():
        encoded = {name: tensor.cuda() for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**encoded)
        pooled = getattr(outputs, 'pooler_output', None) if prefer_pooler else None
        if pooled is None:
            pooled = outputs.last_hidden_state.mean(dim=1)
        pooled = F.normalize(pooled, p=2, dim=1)

    return pooled.cpu().squeeze()


def embed_query_with_model(query_text, model_name, model_config):
    """Embed ``query_text`` with the backend selected by ``model_name``.

    Args:
        query_text: Query string to embed.
        model_name: 'voyage', 'qwen' or 'gemma'; any other value yields None.
        model_config: The Voyage client for 'voyage', otherwise a
            ``(model, tokenizer)`` pair.

    Returns:
        A torch tensor holding the query embedding, or ``None`` when the
        backend raised (the error is printed) or ``model_name`` is unknown.
    """
    if model_name == 'voyage':
        try:
            return torch.tensor(embed_query_with_voyage_retry(model_config, query_text))
        except Exception as err:
            print(f"Voyage API error: {err}")
            return None

    # Qwen always mean-pools; Gemma prefers the pooler output when present.
    if model_name == 'qwen':
        label, prefer_pooler = "Qwen", False
    elif model_name == 'gemma':
        label, prefer_pooler = "Gemma", True
    else:
        return None

    model, tokenizer = model_config
    try:
        return _embed_with_hf_encoder(query_text, model, tokenizer, prefer_pooler)
    except Exception as err:
        print(f"{label} embedding error: {err}")
        return None

def perform_similarity_search_with_model(query_text, embeddings, model_name, model_config, top_k=5):
    """Embed the query with ``model_name`` and rank documents by cosine similarity.

    Args:
        query_text: Query string.
        embeddings: Document embeddings; converted to a float32 tensor unless
            already a ``torch.Tensor``.
        model_name: Backend name passed to ``embed_query_with_model``.
        model_config: Backend handle passed to ``embed_query_with_model``.
        top_k: Maximum number of hits to return.

    Returns:
        Dict with ``top_k_indices`` and ``top_k_scores`` (numpy arrays, best
        first) and ``query_time_ms``. If the query could not be embedded the
        arrays are empty and ``query_time_ms`` is 0.
    """
    started = time.time()

    query_vec = embed_query_with_model(query_text, model_name, model_config)
    if query_vec is None:
        return {
            'top_k_indices': np.array([]),
            'top_k_scores': np.array([]),
            'query_time_ms': 0
        }

    # Bring both operands to CPU tensors (arrays/lists become float32)
    doc_matrix = embeddings
    if not isinstance(doc_matrix, torch.Tensor):
        doc_matrix = torch.tensor(doc_matrix, dtype=torch.float32)
    if not isinstance(query_vec, torch.Tensor):
        query_vec = torch.tensor(query_vec, dtype=torch.float32)
    doc_matrix = doc_matrix.cpu()
    query_vec = query_vec.cpu()

    # One similarity score per document row
    scores = F.cosine_similarity(query_vec.unsqueeze(0), doc_matrix, dim=1)

    # Never ask topk for more entries than there are documents
    hits = min(top_k, len(scores))
    best_scores, best_indices = torch.topk(scores, k=hits)

    elapsed_ms = (time.time() - started) * 1000

    return {
        'top_k_indices': best_indices.numpy(),
        'top_k_scores': best_scores.numpy(),
        'query_time_ms': elapsed_ms
    }

def _text_matches_any_topic(text, expected_topics):
    """Return True when at least one expected topic occurs in ``text``.

    Matching is a case-insensitive substring test; a multi-word topic also
    matches when every one of its words occurs somewhere in the text.
    """
    haystack = text.lower()
    for topic in expected_topics:
        needle = topic.lower().strip()
        if needle in haystack:
            return True
        words = needle.split()
        if len(words) > 1 and all(word in haystack for word in words):
            return True
    return False


def evaluate_retrieval_quality_with_models_fixed(embeddings, texts, metadata, queries, model_name, model_config, top_k=5):
    """Score keyword-based retrieval quality for one model over a set of queries.

    A retrieved chunk counts as "relevant" when its text contains any of the
    query's ``expected_topics`` (see ``_text_matches_any_topic``). Per query:

    * precision = relevant chunks in the top-k / ``top_k``
    * recall    = min(relevant chunks / number of expected topics, 1.0).
      This is a bounded proxy, not true recall: the number of relevant
      documents in the corpus is not known here.
    * mrr       = 1 / rank of the first relevant chunk (0 if none).

    Every query contributes exactly one entry to each metric list, to the
    latency list and to ``detailed_results``. A query that raises is scored
    0.0 and keeps whatever latency was measured before the failure.

    Args:
        embeddings: Document embeddings handed to the similarity search.
        texts: Chunk texts, indexable by the returned document indices.
        metadata: Accepted for signature compatibility; not used here.
        queries: Iterable of dicts with ``query`` and ``expected_topics``.
        model_name: Backend name used to embed the query.
        model_config: Backend handle for ``model_name``.
        top_k: Number of chunks to retrieve per query.

    Returns:
        Dict with ``recall_at_k``, ``precision_at_k``, ``mrr``,
        ``avg_latency_ms``, ``std_latency_ms``, ``successful_queries``
        (queries with recall > 0), ``total_queries``, ``success_rate`` and
        ``detailed_results``. Averages are 0.0 when ``queries`` is empty.
    """
    recall_scores = []
    mrr_scores = []
    precision_scores = []
    latency_times = []
    detailed_results = []

    print(f"    Evaluating {model_name} on {len(queries)} queries...")

    for i, query_data in enumerate(tqdm(queries, desc=f"Evaluating {model_name}")):
        query_text = query_data["query"]
        expected_topics = query_data["expected_topics"]
        total_expected = len(expected_topics)

        # Start from the "nothing found" outcome; overwritten on success.
        query_time = 0.0
        record = {
            'query': query_text,
            'expected_topics': expected_topics,
            'found_relevant': 0,
            'total_expected': total_expected,
            'recall': 0.0,
            'precision': 0.0,
            'mrr': 0.0
        }

        try:
            search_results = perform_similarity_search_with_model(
                query_text, embeddings, model_name, model_config, top_k
            )
            top_k_indices = search_results['top_k_indices']
            top_k_scores = search_results['top_k_scores']
            query_time = search_results['query_time_ms']

            if len(top_k_indices) > 0:
                relevant_found = 0
                first_relevant_rank = None
                for rank, idx in enumerate(top_k_indices, 1):
                    if _text_matches_any_topic(texts[idx], expected_topics):
                        relevant_found += 1
                        if first_relevant_rank is None:
                            first_relevant_rank = rank

                recall = min(relevant_found / total_expected, 1.0) if total_expected > 0 else 0.0
                precision = relevant_found / top_k
                mrr = 1.0 / first_relevant_rank if first_relevant_rank else 0.0

                # Ensure bounds are correct
                assert 0.0 <= recall <= 1.0, f"Invalid recall: {recall}"
                assert 0.0 <= precision <= 1.0, f"Invalid precision: {precision}"
                assert 0.0 <= mrr <= 1.0, f"Invalid MRR: {mrr}"

                record.update({
                    'found_relevant': relevant_found,
                    'recall': recall,
                    'precision': precision,
                    'mrr': mrr,
                    'top_scores': top_k_scores.tolist(),
                    'first_relevant_rank': first_relevant_rank
                })

        except Exception as e:
            print(f"      Error processing query {i+1} with {model_name}: {e}")
            record['error'] = str(e)

        # Exactly one append per query, whatever happened above. Previously a
        # failure after the search appended the latency twice.
        recall_scores.append(record['recall'])
        precision_scores.append(record['precision'])
        mrr_scores.append(record['mrr'])
        latency_times.append(query_time)
        detailed_results.append(record)

        # Print progress every 5 queries
        if (i + 1) % 5 == 0:
            print(f"      Processed {i+1}/{len(queries)} queries")

    # Calculate final metrics
    successful_queries = sum(1 for score in recall_scores if score > 0)
    has_queries = len(recall_scores) > 0  # np.mean([]) would return nan

    return {
        'recall_at_k': np.mean(recall_scores) if has_queries else 0.0,
        'precision_at_k': np.mean(precision_scores) if has_queries else 0.0,
        'mrr': np.mean(mrr_scores) if has_queries else 0.0,
        'avg_latency_ms': np.mean(latency_times) if has_queries else 0.0,
        'std_latency_ms': np.std(latency_times) if has_queries else 0.0,
        'successful_queries': successful_queries,
        'total_queries': len(queries),
        'success_rate': successful_queries / len(queries) if queries else 0,
        'detailed_results': detailed_results  # For debugging
    }

def calculate_index_metrics(embeddings, texts):
    """Summarise the size of an embedding index relative to its source texts.

    Args:
        embeddings: 2-D array exposing ``nbytes`` and ``shape``.
        texts: Sequence of strings, one per embedding row.

    Returns:
        Dict with ``index_size_mb``, ``num_chunks``, ``embedding_dimension``,
        ``avg_text_length`` (characters), ``compression_ratio`` (index MB
        divided by UTF-8 text MB, 0 when the texts are empty) and
        ``total_text_size_mb``.
    """
    bytes_per_mb = 1024 * 1024

    index_mb = embeddings.nbytes / bytes_per_mb
    chunk_count = len(embeddings)
    dimension = embeddings.shape[1]

    char_counts = [len(text) for text in texts]
    mean_chars = np.mean(char_counts)
    utf8_bytes = sum(len(text.encode('utf-8')) for text in texts)
    text_mb = utf8_bytes / bytes_per_mb

    if text_mb > 0:
        ratio = index_mb / text_mb
    else:
        ratio = 0

    return {
        'index_size_mb': index_mb,
        'num_chunks': chunk_count,
        'embedding_dimension': dimension,
        'avg_text_length': mean_chars,
        'compression_ratio': ratio,
        'total_text_size_mb': text_mb
    }

def run_ablation_study_with_proper_metrics(embedding_data, eval_queries, model_configs):
    """Evaluate every loaded dataset/model pair and collect its metrics.

    Args:
        embedding_data: ``{dataset: {model: {'embeddings', 'texts', 'metadata'}}}``.
        eval_queries: Evaluation queries passed to the retrieval evaluation.
        model_configs: ``{model: handle}``; models without an entry are skipped.

    Returns:
        ``{dataset: {model: metrics}}`` where ``metrics`` merges the retrieval
        metrics, the index metrics, ``model_name`` and ``dataset_name``. A
        model whose evaluation raises is reported (with traceback) and left out.
    """
    print("\nRunning Embedding Ablation Study with PROPER Metrics")
    print("=" * 70)

    print(f"Available datasets: {list(embedding_data.keys())}")
    print(f"Available model configs: {list(model_configs.keys())}")
    print(f"Number of evaluation queries: {len(eval_queries)}")

    results = {}

    for dataset_name, dataset_models in embedding_data.items():
        print(f"\nEvaluating Dataset: {dataset_name}")
        print("-" * 40)

        per_model = {}
        results[dataset_name] = per_model

        for model_name, data in dataset_models.items():
            print(f"  Model: {model_name}")

            if model_name not in model_configs:
                print(f"    ✗ Skipping {model_name} - model config not available")
                continue

            embeddings = data['embeddings']
            texts = data['texts']
            metadata = data['metadata']
            model_config = model_configs[model_name]

            print(f"    Data shapes: embeddings={embeddings.shape}, texts={len(texts)}")

            try:
                retrieval_metrics = evaluate_retrieval_quality_with_models_fixed(
                    embeddings, texts, metadata, eval_queries, model_name, model_config, top_k=5
                )
                index_metrics = calculate_index_metrics(embeddings, texts)

                # Merge order matters: index metrics override retrieval metrics on key clashes
                combined = dict(retrieval_metrics)
                combined.update(index_metrics)
                combined['model_name'] = model_name
                combined['dataset_name'] = dataset_name
                per_model[model_name] = combined

                recall = retrieval_metrics['recall_at_k']
                mrr = retrieval_metrics['mrr']
                precision = retrieval_metrics['precision_at_k']
                success_rate = retrieval_metrics['success_rate']

                # Sanity-check the aggregated metrics before reporting them
                assert 0.0 <= recall <= 1.0, f"Invalid recall: {recall}"
                assert 0.0 <= mrr <= 1.0, f"Invalid MRR: {mrr}"
                assert 0.0 <= precision <= 1.0, f"Invalid precision: {precision}"

                print(f"    Results:")
                print(f"      Recall@5: {recall:.3f}")
                print(f"      MRR: {mrr:.3f}")
                print(f"      Precision@5: {precision:.3f}")
                print(f"      Index Size: {index_metrics['index_size_mb']:.1f} MB")
                print(f"      Avg Latency: {retrieval_metrics['avg_latency_ms']:.2f} ms")
                print(f"      Success Rate: {success_rate:.1%} ({retrieval_metrics['successful_queries']}/{retrieval_metrics['total_queries']})")

            except Exception as err:
                print(f"    ✗ Error evaluating {model_name}: {err}")
                import traceback
                traceback.print_exc()

    return results

# Debug function to analyze your current evaluation queries
def debug_evaluation_setup(eval_queries):
    """Print each evaluation query with its expected topics and flag odd ones.

    A warning is printed for a query with no expected topics and for one with
    more than five. Nothing is returned.
    """
    print("\n🔍 EVALUATION QUERY ANALYSIS")
    print("=" * 50)

    print(f"Total queries: {len(eval_queries)}")

    for number, entry in enumerate(eval_queries, start=1):
        text = entry["query"]
        topics = entry.get("expected_topics", [])
        topic_count = len(topics)

        print(f"\nQuery {number}:")
        print(f"  Text: {text}")
        print(f"  Expected topics ({topic_count}): {topics}")

        if topic_count == 0:
            print("  ⚠️  WARNING: No expected topics defined!")

        if topic_count > 5:
            print(f"  ⚠️  WARNING: Many expected topics ({topic_count}) - might inflate recall")

# Usage banner: lists the fixes in this cell and how to invoke the helpers.
print("\n".join([
    "\n🛠️ Ready to run ablation study with FIXED metrics!",
    "The key fixes:",
    "1. Recall can't exceed 1.0",
    "2. Better relevance matching",
    "3. Proper bounds checking",
    "4. Debug info for query analysis",
    "\nTo debug your queries first, run:",
    "debug_evaluation_setup(eval_queries)",
    "\nThen run the fixed evaluation:",
    "fixed_results = run_ablation_study_with_proper_metrics(embedding_data, eval_queries, model_configs)",
]))


🛠️ Ready to run ablation study with FIXED metrics!
The key fixes:
1. Recall can't exceed 1.0
2. Better relevance matching
3. Proper bounds checking
4. Debug info for query analysis

To debug your queries first, run:
debug_evaluation_setup(eval_queries)

Then run the fixed evaluation:
fixed_results = run_ablation_study_with_proper_metrics(embedding_data, eval_queries, model_configs)


In [None]:
# Print every evaluation query with its expected topics, plus warnings for
# queries that have no topics or more than five.
debug_evaluation_setup(eval_queries)


🔍 EVALUATION QUERY ANALYSIS
Total queries: 12

Query 1:
  Text: Steps to create JioPay merchant account
  Expected topics (4): ['merchant', 'account', 'registration', 'signup']

Query 2:
  Text: Required documents for business verification JioPay
  Expected topics (4): ['kyc', 'documents', 'business', 'verification']

Query 3:
  Text: How does UPI payment work through JioPay gateway
  Expected topics (4): ['upi', 'payment', 'gateway', 'process']

Query 4:
  Text: JioPay payment gateway integration tutorial
  Expected topics (4): ['integration', 'api', 'gateway', 'developer']

Query 5:
  Text: Payment declined error resolution JioPay
  Expected topics (4): ['payment', 'declined', 'error', 'troubleshooting']

Query 6:
  Text: Customer refund process in JioPay system
  Expected topics (4): ['refund', 'customer', 'process', 'policy']

Query 7:
  Text: Dynamic QR code generation for payments
  Expected topics (4): ['qr', 'dynamic', 'generation', 'payment']

Query 8:
  Text: Bulk payment co

In [None]:
# Evaluate every loaded dataset/model pair; metrics end up in
# fixed_results[dataset_name][model_name].
fixed_results = run_ablation_study_with_proper_metrics(embedding_data, eval_queries, model_configs)


Running Embedding Ablation Study with PROPER Metrics
Available datasets: ['Structural_Balanced', 'Structural_Hierarchical']
Available model configs: ['qwen', 'gemma', 'voyage']
Number of evaluation queries: 12

Evaluating Dataset: Structural_Balanced
----------------------------------------
  Model: qwen
    Data shapes: embeddings=(368, 1024), texts=368
    Evaluating qwen on 12 queries...


Evaluating qwen:  58%|█████▊    | 7/12 [00:01<00:00,  8.10it/s]

      Processed 5/12 queries


Evaluating qwen: 100%|██████████| 12/12 [00:01<00:00,  8.32it/s]


      Processed 10/12 queries
    Results:
      Recall@5: 0.396
      MRR: 0.367
      Precision@5: 0.333
      Index Size: 1.4 MB
      Avg Latency: 119.56 ms
      Success Rate: 83.3% (10/12)
  Model: gemma
    Data shapes: embeddings=(368, 768), texts=368
    Evaluating gemma on 12 queries...


Evaluating gemma:  67%|██████▋   | 8/12 [00:00<00:00, 19.63it/s]

      Processed 5/12 queries


Evaluating gemma: 100%|██████████| 12/12 [00:00<00:00, 19.49it/s]


      Processed 10/12 queries
    Results:
      Recall@5: 0.271
      MRR: 0.379
      Precision@5: 0.217
      Index Size: 1.1 MB
      Avg Latency: 50.78 ms
      Success Rate: 91.7% (11/12)
  Model: voyage
    Data shapes: embeddings=(368, 1024), texts=368
    Evaluating voyage on 12 queries...


Evaluating voyage:  50%|█████     | 6/12 [01:15<01:24, 14.04s/it]

      Processed 5/12 queries


Evaluating voyage: 100%|██████████| 12/12 [03:46<00:00, 18.88s/it]

      Processed 10/12 queries





    Results:
      Recall@5: 0.958
      MRR: 0.903
      Precision@5: 0.883
      Index Size: 2.9 MB
      Avg Latency: 18883.79 ms
      Success Rate: 100.0% (12/12)

Evaluating Dataset: Structural_Hierarchical
----------------------------------------
  Model: qwen
    Data shapes: embeddings=(380, 1024), texts=380
    Evaluating qwen on 12 queries...


Evaluating qwen:  75%|███████▌  | 9/12 [00:00<00:00, 21.51it/s]

      Processed 5/12 queries


Evaluating qwen: 100%|██████████| 12/12 [00:00<00:00, 21.37it/s]


      Processed 10/12 queries
    Results:
      Recall@5: 0.417
      MRR: 0.417
      Precision@5: 0.350
      Index Size: 1.5 MB
      Avg Latency: 46.30 ms
      Success Rate: 75.0% (9/12)
  Model: gemma
    Data shapes: embeddings=(380, 768), texts=380
    Evaluating gemma on 12 queries...


Evaluating gemma:  75%|███████▌  | 9/12 [00:00<00:00, 21.64it/s]

      Processed 5/12 queries


Evaluating gemma: 100%|██████████| 12/12 [00:00<00:00, 21.74it/s]


      Processed 10/12 queries
    Results:
      Recall@5: 0.188
      MRR: 0.188
      Precision@5: 0.150
      Index Size: 1.1 MB
      Avg Latency: 45.46 ms
      Success Rate: 66.7% (8/12)
  Model: voyage
    Data shapes: embeddings=(380, 1024), texts=380
    Evaluating voyage on 12 queries...


Evaluating voyage:  42%|████▏     | 5/12 [02:30<03:02, 26.01s/it]

      Processed 5/12 queries


Evaluating voyage: 100%|██████████| 12/12 [05:01<00:00, 25.14s/it]

      Processed 10/12 queries
    Results:
      Recall@5: 0.958
      MRR: 0.917
      Precision@5: 0.883
      Index Size: 3.0 MB
      Avg Latency: 25136.86 ms
      Success Rate: 100.0% (12/12)





In [None]:
import voyageai
import google.generativeai as genai
import torch
import torch.nn.functional as F
import time
# from colab import userdata

# API Setup -- v_api / g_api are the keys read from Colab secrets in an earlier cell
VOYAGE_API_KEY, GEMINI_API_KEY = v_api, g_api

# Voyage client embeds queries; Gemini generates the final answer
voyage_client = voyageai.Client(api_key=VOYAGE_API_KEY)
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-2.0-flash-exp")

def retrieve_chunks(query, embeddings, texts, metadata, top_k=3):
    """Retrieve the chunks most similar to ``query`` using Voyage embeddings.

    The query is embedded with the module-level ``voyage_client`` and compared
    against ``embeddings`` by cosine similarity. Both sides are cast to
    float32, matching the conversion done in the evaluation search, so the two
    code paths compare vectors in the same dtype.

    Args:
        query: Query string.
        embeddings: Document embeddings (array-like or tensor), one row per chunk.
        texts: Chunk texts, indexable by row number.
        metadata: Per-chunk metadata, indexable by row number.
        top_k: Maximum number of chunks to return.

    Returns:
        ``(results, retrieval_time_ms)`` where ``results`` is a list of dicts
        with ``rank`` (1-based), ``content``, ``score`` (Python float) and
        ``metadata``, best match first. An empty index yields ``([], elapsed)``
        without calling the embedding API.
    """
    start_time = time.time()

    doc_embeddings = torch.as_tensor(embeddings, dtype=torch.float32)
    if doc_embeddings.numel() == 0:
        # Nothing to search; skip the (billable) embedding request.
        return [], (time.time() - start_time) * 1000

    # Embed query
    result = voyage_client.embed([query], model="voyage-3.5", input_type="query")
    query_embedding = torch.tensor(result.embeddings[0], dtype=torch.float32)

    # Calculate similarities
    similarities = F.cosine_similarity(query_embedding.unsqueeze(0), doc_embeddings, dim=1)

    # Get top-k (never more than the number of documents)
    top_scores, top_indices = torch.topk(similarities, k=min(top_k, len(similarities)))

    retrieval_time = (time.time() - start_time) * 1000

    # .tolist() yields plain ints/floats, so texts and metadata are indexed
    # with ordinary integers rather than 0-d tensors.
    results = [
        {
            'rank': rank,
            'content': texts[idx],
            'score': score,
            'metadata': metadata[idx]
        }
        for rank, (idx, score) in enumerate(zip(top_indices.tolist(), top_scores.tolist()), 1)
    ]

    return results, retrieval_time

def generate_answer(query, retrieved_chunks):
    """Ask the module-level ``gemini_model`` to answer ``query`` from the retrieved chunks.

    Args:
        query: The user question.
        retrieved_chunks: Iterable of dicts providing ``rank``, ``score`` and
            ``content``; each becomes one labelled context section.

    Returns:
        ``(answer, generation_time)``. ``answer`` is the response text, or an
        ``"Error generating answer: ..."`` string if the call raised.
        ``generation_time`` is in milliseconds and includes prompt building.
    """
    started = time.time()

    # One labelled section per chunk, each followed by a blank line.
    context = "".join(
        f"[Context {chunk['rank']} - Score: {chunk['score']:.3f}]\n"
        f"{chunk['content']}\n\n"
        for chunk in retrieved_chunks
    )

    prompt = f"""Answer the question based only on the provided context.

Context:
{context}

Question: {query}

Answer:"""

    generation_config = {
        'temperature': 0.1,
        'max_output_tokens': 512,
    }

    # Best effort: a failed generation is reported in the answer text rather
    # than raised.
    try:
        answer = gemini_model.generate_content(
            prompt, generation_config=generation_config
        ).text
    except Exception as e:
        answer = f"Error generating answer: {e}"

    return answer, (time.time() - started) * 1000

def simple_rag_query(query, embeddings, texts, metadata, top_k=3):
    """Run retrieval then generation for one query and print a short report.

    Args:
        query: The user question.
        embeddings, texts, metadata: Passed through to ``retrieve_chunks``.
        top_k: Number of chunks to retrieve.

    Returns:
        ``(answer, chunks)``: the generated answer and the chunk dicts
        returned by ``retrieve_chunks``.
    """
    divider = "-" * 50
    print(f"\nQuery: {query}")
    print(divider)

    # Retrieve supporting chunks, then answer from them.
    chunks, retrieval_time = retrieve_chunks(query, embeddings, texts, metadata, top_k)
    answer, generation_time = generate_answer(query, chunks)

    # Answer and timing summary
    total_time = retrieval_time + generation_time
    print(f"Answer: {answer}")
    print(f"\nRetrieved {len(chunks)} chunks in {retrieval_time:.0f}ms")
    print(f"Generated answer in {generation_time:.0f}ms")
    print(f"Total time: {total_time:.0f}ms")

    # Per-chunk preview: score, first 100 characters, and source if recorded.
    print("\nTop retrieved chunks:")
    for chunk in chunks:
        print(f"  {chunk['rank']}. Score: {chunk['score']:.3f}")
        preview = chunk['content'][:100]
        print(f"     {preview}...")
        chunk_meta = chunk['metadata']
        if 'source' in chunk_meta:
            print(f"     Source: {chunk_meta['source']}")

    return answer, chunks

# Test with your data
def test_rag():
    """Print a walkthrough for trying the RAG pipeline on your own embeddings.

    Nothing is loaded or queried here: the function prints the assignments
    needed to pull ``embeddings``, ``texts`` and ``metadata`` out of
    ``embedding_data`` and then one ``simple_rag_query`` call per sample
    query, for the user to run by hand.
    """
    # Step 1: how to pull the three arrays out of `embedding_data`.
    print("Load your embeddings first:")
    for field in ("embeddings", "texts", "metadata"):
        print(f"{field} = embedding_data['Structural_Hierarchical']['voyage']['{field}']")
    print()

    # Step 2: the sample queries, shown as ready-to-paste calls.
    test_queries = (
        "What is the interest rate?",
        "How to reset password?",
        "What are the payment methods?",
        "Customer support contact details",
    )

    print("Then run queries:")
    for query in test_queries:
        print(f"simple_rag_query('{query}', embeddings, texts, metadata)")

    # Once the arrays are loaded, the printed calls can be replaced by a real
    # loop over `test_queries` calling
    # simple_rag_query(query, embeddings, texts, metadata).

# Entry point when this cell/script is executed directly: print a banner and
# then the usage walkthrough from test_rag() (which only prints instructions).
if __name__ == "__main__":
    print("Simple RAG Test - Structural Hierarchical + Voyage + Gemini")
    print("="*60)
    test_rag()

Simple RAG Test - Structural Hierarchical + Voyage + Gemini
Load your embeddings first:
embeddings = embedding_data['Structural_Hierarchical']['voyage']['embeddings']
texts = embedding_data['Structural_Hierarchical']['voyage']['texts']
metadata = embedding_data['Structural_Hierarchical']['voyage']['metadata']

Then run queries:
simple_rag_query('What is the interest rate?', embeddings, texts, metadata)
simple_rag_query('How to reset password?', embeddings, texts, metadata)
simple_rag_query('What are the payment methods?', embeddings, texts, metadata)
simple_rag_query('Customer support contact details', embeddings, texts, metadata)


In [None]:
# Run one query end to end. `embeddings`, `texts` and `metadata` are not
# defined in this cell and must already be loaded in the kernel. The returned
# (answer, chunks) tuple is the cell's displayed value.
simple_rag_query("What is the interest rate?", embeddings, texts, metadata)


Query: What is the interest rate?
--------------------------------------------------
Answer: Based on the provided context, interest rate is mentioned in relation to "Mortgage Interest Rate" in Context 2. There is also a mention of RBI guidelines that may affect the amount, in Context 1.


Retrieved 3 chunks in 171ms
Generated answer in 2814ms
Total time: 2986ms

Top retrieved chunks:
  1. Score: 0.449
     *Amount subjective to change based on RBI guidelines....
     Source: unknown
  2. Score: 0.362
     • Mortgage Interest Rate or Payment Reduction Services

• Money Service Business: Check Cashiers, Tr...
     Source: unknown
  3. Score: 0.346
     What is the potential earning structure within the JioPay Business Partner Program? With the JioPay ...
     Source: unknown


('Based on the provided context, interest rate is mentioned in relation to "Mortgage Interest Rate" in Context 2. There is also a mention of RBI guidelines that may affect the amount, in Context 1.\n',
 [{'rank': 1,
   'content': np.str_('*Amount subjective to change based on RBI guidelines.'),
   'score': 0.44922080243410367,
   'metadata': {'chunk_id': 279,
    'token_count': 8,
    'source': 'unknown',
    'type': 'unknown',
    'char_count': 53}},
  {'rank': 2,
   'content': np.str_('• Mortgage Interest Rate or Payment Reduction Services\n\n• Money Service Business: Check Cashiers, Travelers checks, Currency Dealer & Exchanger'),
   'score': 0.3616061451722381,
   'metadata': {'chunk_id': 154,
    'token_count': 20,
    'source': 'unknown',
    'type': 'unknown',
    'char_count': 143}},
  {'rank': 3,
   'content': np.str_('What is the potential earning structure within the JioPay Business Partner Program? With the JioPay Business Partner Program, you receive recurring payments bas

In [None]:
def run_ablation_study_with_models():
    """Evaluate every (dataset, model) pair in ``embedding_data`` and collect the metrics.

    Reads the module-level ``embedding_data``, ``model_configs`` and
    ``eval_queries``. Models without an entry in ``model_configs`` are
    skipped with a message.

    Returns:
        ``{dataset_name: {model_name: metrics}}`` where ``metrics`` merges
        the retrieval metrics, the index metrics and the ``model_name`` /
        ``dataset_name`` labels.
    """
    print("\nRunning Comprehensive Embedding Ablation Study with Actual Models")
    print("=" * 70)

    results = {}

    for dataset_name, dataset_models in embedding_data.items():
        print(f"\nEvaluating Dataset: {dataset_name}")
        print("-" * 40)

        dataset_results = {}
        results[dataset_name] = dataset_results

        for model_name, data in dataset_models.items():
            if model_name not in model_configs:
                print(f"  Skipping {model_name} - model config not available")
                continue

            print(f"  Model: {model_name}")

            # Retrieval quality, embedding each query with this model.
            retrieval_metrics = evaluate_retrieval_quality_with_models(
                data['embeddings'],
                data['texts'],
                data['metadata'],
                eval_queries,
                model_name,
                model_configs[model_name],
                top_k=5,
            )

            # Size and shape statistics of the stored index.
            index_metrics = calculate_index_metrics(data['embeddings'], data['texts'])

            # Merge order matters for key collisions: retrieval, then index,
            # then the identifying labels.
            combined = dict(retrieval_metrics)
            combined.update(index_metrics)
            combined.update(model_name=model_name, dataset_name=dataset_name)
            dataset_results[model_name] = combined

            # Per-model summary
            print(f"    Recall@5: {retrieval_metrics['recall_at_k']:.3f}")
            print(f"    MRR: {retrieval_metrics['mrr']:.3f}")
            print(f"    Precision@5: {retrieval_metrics['precision_at_k']:.3f}")
            print(f"    Index Size: {index_metrics['index_size_mb']:.1f} MB")
            print(f"    Avg Latency: {retrieval_metrics['avg_latency_ms']:.2f} ms")
            print(f"    Successful Queries: {retrieval_metrics['successful_queries']}/{len(eval_queries)}")

    return results

# Run the ablation study with actual models
# NOTE(review): within the visible notebook, `model_configs` and the helpers
# this call uses (`evaluate_retrieval_quality_with_models`,
# `calculate_index_metrics`) are defined in a later cell. Unless they are also
# defined earlier, this cell fails on a fresh top-to-bottom run.
ablation_results = run_ablation_study_with_models()

In [None]:
def create_comparison_table(results):
    """Flatten nested ablation results into one display-ready row per (dataset, model).

    Args:
        results: ``{dataset_name: {model_name: metrics}}`` where ``metrics``
            provides ``recall_at_k``, ``mrr``, ``precision_at_k``,
            ``index_size_mb``, ``avg_latency_ms``, ``embedding_dimension``
            and ``num_chunks``.

    Returns:
        A ``pandas.DataFrame`` whose metric columns are pre-formatted strings
        (``Embedding_Dim`` and ``Num_Chunks`` are passed through unchanged).
    """

    def _as_row(dataset_name, model_name, metrics):
        # Simplified cost model: cost is just latency scaled by 0.001.
        latency_ms = metrics['avg_latency_ms']
        cost_per_1k = latency_ms * 0.001
        return {
            'Dataset': dataset_name,
            'Model': model_name,
            'Recall@5': f"{metrics['recall_at_k']:.3f}",
            'MRR': f"{metrics['mrr']:.3f}",
            'Precision@5': f"{metrics['precision_at_k']:.3f}",
            'Index_Size_MB': f"{metrics['index_size_mb']:.1f}",
            'Avg_Latency_ms': f"{metrics['avg_latency_ms']:.2f}",
            'Embedding_Dim': metrics['embedding_dimension'],
            'Num_Chunks': metrics['num_chunks'],
            'Est_Cost_per_1k': f"${cost_per_1k:.4f}",
        }

    rows = [
        _as_row(dataset_name, model_name, metrics)
        for dataset_name, dataset_results in results.items()
        for model_name, metrics in dataset_results.items()
    ]
    return pd.DataFrame(rows)

# Re-import the Colab helper under an alias. The bare name `files` is rebound
# to a plain list by the `for root, dirs, files in os.walk(...)` loop in the
# upload/extract cell, so `files.download(...)` would raise AttributeError.
from google.colab import files as colab_files

RESULTS_CSV_PATH = '/content/embedding_ablation_results.csv'

# Generate comparison table
comparison_df = create_comparison_table(ablation_results)
print("\nCOMPREHENSIVE EMBEDDING COMPARISON")
print("=" * 80)
print(comparison_df.to_string(index=False))

# Save results
comparison_df.to_csv(RESULTS_CSV_PATH, index=False)

# Download results to the local machine (Colab only)
colab_files.download(RESULTS_CSV_PATH)

print("\nAblation study complete! Results saved and downloaded.")

In [None]:
# Setup functions for your models
def setup_qwen_for_query():
    """Load the ``Qwen/Qwen3-0.6B`` tokenizer and model for query embedding.

    The model has ``.eval()`` called on it and is moved to the GPU when CUDA
    is available.

    Returns:
        ``(model, tokenizer)``
    """
    checkpoint = "Qwen/Qwen3-0.6B"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoder = AutoModel.from_pretrained(checkpoint)
    encoder.eval()
    use_gpu = torch.cuda.is_available()
    return (encoder.cuda() if use_gpu else encoder), tokenizer

def setup_gemma_for_query():
    """Load the ``google/embeddinggemma-300m`` tokenizer and model for query embedding.

    The model is loaded with ``trust_remote_code=True``, has ``.eval()``
    called on it, and is moved to the GPU when CUDA is available.

    Returns:
        ``(model, tokenizer)``
    """
    checkpoint = "google/embeddinggemma-300m"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoder = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)
    encoder.eval()
    use_gpu = torch.cuda.is_available()
    return (encoder.cuda() if use_gpu else encoder), tokenizer

def setup_voyage_for_query():
    """Return a Voyage client built from the module-level ``v_api`` key."""
    return voyageai.Client(api_key=v_api)

# Instantiate every query encoder once, up front.
print("Loading models for query embedding...")
qwen_model, qwen_tokenizer = setup_qwen_for_query()
gemma_model, gemma_tokenizer = setup_gemma_for_query()
voyage_client = setup_voyage_for_query()

# Registry looked up by model name during evaluation: local models map to a
# (model, tokenizer) pair, Voyage maps to its API client.
model_configs = dict(
    qwen=(qwen_model, qwen_tokenizer),
    gemma=(gemma_model, gemma_tokenizer),
    voyage=voyage_client,
)

@retry(
    retry=retry_if_exception_type(Exception),
    wait=wait_exponential(multiplier=1, min=25, max=60),
    stop=stop_after_attempt(6),
    reraise=True,
)
def embed_query_with_voyage_retry(vo, query_text):
    """Embed one search query with Voyage, retrying on failure.

    The decorator retries on any ``Exception`` for up to 6 attempts, with an
    exponential wait bounded to 25-60 and ``reraise=True``.

    Args:
        vo: Voyage client exposing ``embed``.
        query_text: The query string.

    Returns:
        The first embedding in the response.
    """
    # "voyage-3.5" should match the model used for the document embeddings;
    # input_type="query" marks this text as a search query.
    response = vo.embed([query_text], model="voyage-3.5", input_type="query")
    return response.embeddings[0]

def embed_query_with_model(query_text, model_name, model_config):
    """Embed ``query_text`` with the encoder selected by ``model_name``.

    Args:
        query_text: The query string.
        model_name: ``'voyage'``, ``'qwen'`` or ``'gemma'``.
        model_config: The Voyage client for ``'voyage'``; a
            ``(model, tokenizer)`` pair for the local models.

    Returns:
        A tensor holding the query embedding, or ``None`` when the Voyage
        call fails or ``model_name`` is not one of the three known names.
        Local-model embeddings are L2-normalised, moved to CPU and squeezed.
    """
    # Hosted model: delegate to the retrying helper, report failures as None.
    if model_name == 'voyage':
        try:
            return torch.tensor(embed_query_with_voyage_retry(model_config, query_text))
        except Exception as e:
            print(f"Voyage API error: {e}")
            return None

    # Local models differ only in tokenizer length limit and pooling choice.
    max_lengths = {'qwen': 32768, 'gemma': 8192}
    if model_name not in max_lengths:
        return None

    model, tokenizer = model_config
    inputs = tokenizer([query_text], padding=True, truncation=True,
                       max_length=max_lengths[model_name], return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

        # Only gemma considers a pooler output; everything else (and gemma
        # without one) falls back to the mean over the token axis.
        pooled = None
        if model_name == 'gemma':
            pooled = getattr(outputs, 'pooler_output', None)
        if pooled is None:
            pooled = outputs.last_hidden_state.mean(dim=1)

        pooled = F.normalize(pooled, p=2, dim=1)

    return pooled.cpu().squeeze()

def perform_similarity_search_with_model(query_text, embeddings, model_name, model_config, top_k=5):
    """Embed a query with the given model and rank document embeddings by cosine similarity.

    Args:
        query_text: The query string.
        embeddings: Document embeddings, one row per chunk (tensor or
            anything ``torch.tensor`` accepts).
        model_name, model_config: Passed to ``embed_query_with_model``.
        top_k: Maximum number of results.

    Returns:
        Dict with ``top_k_indices`` and ``top_k_scores`` (NumPy arrays, best
        first) and ``query_time_ms`` (embedding time included). When the
        query could not be embedded, both arrays are empty and
        ``query_time_ms`` is 0.
    """
    started = time.time()

    # The query must be embedded by the same model as the documents.
    query_vec = embed_query_with_model(query_text, model_name, model_config)
    if query_vec is None:
        return {
            'top_k_indices': np.array([]),
            'top_k_scores': np.array([]),
            'query_time_ms': 0,
        }

    def _as_cpu_tensor(value):
        # Non-tensors become float32 tensors; everything ends up on the CPU.
        if not isinstance(value, torch.Tensor):
            value = torch.tensor(value, dtype=torch.float32)
        return value.cpu()

    doc_matrix = _as_cpu_tensor(embeddings)
    query_vec = _as_cpu_tensor(query_vec)

    # One similarity per document row.
    similarities = F.cosine_similarity(query_vec.unsqueeze(0), doc_matrix, dim=1)

    # Never request more results than there are rows.
    k = min(top_k, len(similarities))
    best_scores, best_indices = torch.topk(similarities, k=k)

    elapsed_ms = (time.time() - started) * 1000

    return {
        'top_k_indices': best_indices.numpy(),
        'top_k_scores': best_scores.numpy(),
        'query_time_ms': elapsed_ms,
    }

def evaluate_retrieval_quality_with_models(embeddings, texts, metadata, queries, model_name, model_config, top_k=5):
    """Score top-k retrieval for ``queries`` using keyword matching as the relevance signal.

    A retrieved chunk counts as relevant when its lower-cased text contains
    any of the query's ``expected_topics`` (case-insensitive substring match).

    Args:
        embeddings: Document embeddings, one row per chunk.
        texts: Chunk texts, indexable by the retrieved row indices.
        metadata: Unused here; kept for interface compatibility.
        queries: Iterable of dicts with ``"query"`` and ``"expected_topics"``.
        model_name, model_config: Passed to
            ``perform_similarity_search_with_model``.
        top_k: Number of results requested per query.

    Returns:
        Dict with ``recall_at_k``, ``precision_at_k``, ``mrr``,
        ``avg_latency_ms``, ``std_latency_ms`` (means / std over queries) and
        ``successful_queries`` (number of queries with recall > 0). All
        aggregates are 0.0 when ``queries`` is empty.
    """

    recall_scores = []
    mrr_scores = []
    precision_scores = []
    latency_times = []

    print(f"Evaluating {model_name} on {len(queries)} queries...")

    for query_data in tqdm(queries, desc=f"Evaluating {model_name}"):
        query_text = query_data["query"]
        expected_topics = query_data["expected_topics"]

        # Failed or empty retrievals score zero on every metric.
        recall = precision = mrr = 0.0
        # Guards against logging two latencies for one query when an error
        # happens after the search itself succeeded.
        latency_recorded = False

        try:
            # Perform similarity search
            search_results = perform_similarity_search_with_model(
                query_text, embeddings, model_name, model_config, top_k
            )

            top_k_indices = search_results['top_k_indices']
            latency_times.append(search_results['query_time_ms'])
            latency_recorded = True

            if len(top_k_indices) > 0:
                relevant_count = 0
                first_relevant_rank = None

                for rank, idx in enumerate(top_k_indices, 1):
                    text_content = texts[idx].lower()

                    # Relevant if any expected topic appears in the chunk text
                    if any(topic.lower() in text_content for topic in expected_topics):
                        relevant_count += 1
                        if first_relevant_rank is None:
                            first_relevant_rank = rank

                # Several chunks can match the same topic, so the hit count
                # may exceed the assumed number of relevant items; clamp so
                # recall stays within [0, 1]. With no expected topics recall
                # is defined as 0 instead of dividing by zero.
                max_relevant = min(top_k, len(expected_topics))
                if max_relevant > 0:
                    recall = min(1.0, relevant_count / max_relevant)
                precision = relevant_count / top_k
                mrr = 1.0 / first_relevant_rank if first_relevant_rank else 0.0

        except Exception as e:
            print(f"Error processing query with {model_name}: {e}")
            # Zero scores for failed queries; exactly one latency entry each.
            recall = precision = mrr = 0.0
            if not latency_recorded:
                latency_times.append(0.0)

        recall_scores.append(recall)
        precision_scores.append(precision)
        mrr_scores.append(mrr)

    def _mean(values):
        # np.mean([]) is NaN (with a warning); report 0.0 for "no queries".
        return np.mean(values) if values else 0.0

    return {
        'recall_at_k': _mean(recall_scores),
        'precision_at_k': _mean(precision_scores),
        'mrr': _mean(mrr_scores),
        'avg_latency_ms': _mean(latency_times),
        'std_latency_ms': np.std(latency_times) if latency_times else 0.0,
        'successful_queries': sum(1 for x in recall_scores if x > 0)
    }

def calculate_index_metrics(embeddings, texts):
    """Summarise the size of an embedding index relative to its source texts.

    Args:
        embeddings: Array exposing ``nbytes`` and ``shape``, one row per chunk.
        texts: Iterable of chunk strings.

    Returns:
        Dict with ``index_size_mb``, ``num_chunks``, ``embedding_dimension``,
        ``avg_text_length`` (characters), ``compression_ratio`` (index MB
        divided by text MB, 0 when there is no text) and
        ``total_text_size_mb`` (UTF-8 bytes).
    """
    bytes_per_mb = 1024 * 1024

    # Footprint of the raw embedding array.
    index_mb = embeddings.nbytes / bytes_per_mb
    chunk_count, dimension = len(embeddings), embeddings.shape[1]

    # Text side: mean length in characters, total size in UTF-8 bytes.
    char_lengths = [len(text) for text in texts]
    utf8_bytes = sum(len(text.encode('utf-8')) for text in texts)
    text_mb = utf8_bytes / bytes_per_mb

    if text_mb > 0:
        ratio = index_mb / text_mb
    else:
        ratio = 0

    return {
        'index_size_mb': index_mb,
        'num_chunks': chunk_count,
        'embedding_dimension': dimension,
        'avg_text_length': np.mean(char_lengths),
        'compression_ratio': ratio,
        'total_text_size_mb': text_mb,
    }

def run_ablation_study_with_models():
    """Evaluate every (dataset, model) pair in ``embedding_data`` and collect the metrics.

    Reads the module-level ``embedding_data``, ``model_configs``,
    ``eval_queries`` and, for the debug banner, ``qwen_model``,
    ``gemma_model`` and ``voyage_client``. Models without an entry in
    ``model_configs`` are skipped with a message.

    Returns:
        ``{dataset_name: {model_name: metrics}}`` where ``metrics`` merges
        the retrieval metrics, the index metrics and the ``model_name`` /
        ``dataset_name`` labels.
    """

    print("\nRunning Comprehensive Embedding Ablation Study with Actual Models")
    print("=" * 70)

    results = {}

    for dataset_name, dataset_models in embedding_data.items():
        print(f"\nEvaluating Dataset: {dataset_name}")
        print("-" * 40)

        results[dataset_name] = {}

        for model_name, data in dataset_models.items():
            if model_name not in model_configs:
                print(f"  Skipping {model_name} - model config not available")
                continue

            print(f"  Model: {model_name}")
            print("=== DEBUG INFO ===")
            print(f"Available models in model_configs: {list(model_configs.keys())}")
            print(f"Models in embedding_data:")
            # Use dedicated loop names here. Reusing `dataset_name` /
            # `dataset_models` would overwrite the outer loop's variables and
            # file the results below under the wrong dataset (KeyError with
            # more than one dataset).
            for debug_dataset_name, debug_dataset_models in embedding_data.items():
                print(f"  {debug_dataset_name}: {list(debug_dataset_models.keys())}")

            print(f"Qwen model loaded: {qwen_model is not None}")
            print(f"Gemma model loaded: {gemma_model is not None}")
            print(f"Voyage client loaded: {voyage_client is not None}")
            print("==================")
            embeddings = data['embeddings']
            texts = data['texts']
            metadata = data['metadata']
            model_config = model_configs[model_name]

            # Retrieval quality evaluation using actual model
            retrieval_metrics = evaluate_retrieval_quality_with_models(
                embeddings, texts, metadata, eval_queries, model_name, model_config, top_k=5
            )

            # Index metrics
            index_metrics = calculate_index_metrics(embeddings, texts)

            # Combine all metrics; on key collisions later entries win.
            results[dataset_name][model_name] = {
                **retrieval_metrics,
                **index_metrics,
                'model_name': model_name,
                'dataset_name': dataset_name
            }

            # Print summary
            print(f"    Recall@5: {retrieval_metrics['recall_at_k']:.3f}")
            print(f"    MRR: {retrieval_metrics['mrr']:.3f}")
            print(f"    Precision@5: {retrieval_metrics['precision_at_k']:.3f}")
            print(f"    Index Size: {index_metrics['index_size_mb']:.1f} MB")
            print(f"    Avg Latency: {retrieval_metrics['avg_latency_ms']:.2f} ms")
            print(f"    Successful Queries: {retrieval_metrics['successful_queries']}/{len(eval_queries)}")

    return results

# Run the ablation study with actual models. The nested
# {dataset: {model: metrics}} result is kept in `ablation_results`, the name
# the comparison-table cell passes to `create_comparison_table`.
ablation_results = run_ablation_study_with_models()