In [None]:
# Retrieval Methods Evaluation for StarTech Dataset
# Google Colab Notebook for comparing different retrieval approaches

"""
This notebook evaluates various retrieval methods including:
1. Semantic Search (Vector Similarity)
2. Keyword Search (BM25)
3. Hybrid Retrieval (Semantic + Keyword)
4. Reranking Approaches
5. Advanced Hybrid Methods

We'll use the chunked StarTech product dataset and evaluate retrieval quality.
"""

# ==========================================
# SECTION 1: INSTALLATION & IMPORTS
# ==========================================

# Install required packages
!pip install sentence-transformers faiss-cpu rank-bm25 chromadb transformers torch datasets scikit-learn plotly seaborn


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from ch

In [None]:

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Search and retrieval libraries
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
import chromadb
from chromadb.config import Settings

# NLP and ML utilities
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ndcg_score, precision_score, recall_score
import re
import json
from typing import List, Dict, Any, Tuple
import time
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

print("✅ All packages installed and imported successfully!")

# ==========================================
# SECTION 2: DATA LOADING & PREPARATION
# ==========================================

# Upload your chunked data or original CSV file
from google.colab import files

print("Please upload your StarTech dataset (original CSV or chunked data):")
uploaded = files.upload()

# Get the filename
filename = list(uploaded.keys())[0]
print(f"📁 Loaded file: {filename}")

# Load the dataset
df = pd.read_csv(filename)

print(f"📊 Dataset shape: {df.shape}")
print(f"🏷️  Columns: {list(df.columns)}")
print("\n📋 First few rows:")
display(df.head())

# ==========================================
# SECTION 3: DATA PREPROCESSING FOR RETRIEVAL
# ==========================================

# Check if this is chunked data or original data
if 'strategy' in df.columns and 'text' in df.columns:
    print("📄 Detected chunked data from previous step")
    chunks_df = df.copy()
    documents = chunks_df['text'].tolist()
else:
    print("📄 Detected original product data - creating text representations")

    def create_searchable_text(row):
        """Build the retrieval text for one product row.

        Every field that is not missing (per ``pd.notna``) contributes one
        fragment: the name verbatim, every other field as ``Label: value``.
        Fragments keep a fixed order (name, brand, model, category,
        subcategory, price, availability) and are joined with ". ".
        Returns an empty string when every field is missing.
        """
        # (column, prefix) pairs in output order; the product name has no label.
        labelled_fields = (
            ('name', ''),
            ('brand', 'Brand: '),
            ('model', 'Model: '),
            ('category', 'Category: '),
            ('subcategory', 'Subcategory: '),
            ('price', 'Price: '),
            ('availability', 'Availability: '),
        )

        return ". ".join(
            f"{prefix}{row[column]}"
            for column, prefix in labelled_fields
            if pd.notna(row[column])
        )

    # Create text representations
    df['searchable_text'] = df.apply(create_searchable_text, axis=1)
    documents = df['searchable_text'].tolist()

    # Create chunks dataframe for consistency
    chunks_df = pd.DataFrame({
        'chunk_id': range(len(documents)),
        'text': documents,
        'strategy': 'original',
        'length': [len(text) for text in documents],
        'product_id': df.index,
        'name': df['name'],
        'category': df['category'],
        'price': df['price']
    })

print(f"📝 Prepared {len(documents)} documents for retrieval evaluation")
print(f"📊 Average document length: {np.mean([len(doc) for doc in documents]):.1f} characters")

# ==========================================
# SECTION 4: CREATE TEST QUERIES
# ==========================================

print("\n" + "="*60)
print("🔍 CREATING TEST QUERIES FOR EVALUATION")
print("="*60)

# Create diverse test queries based on the StarTech dataset
test_queries = [
    # Brand-specific queries
    "ASUS laptops with good performance",
    "Samsung monitors for gaming",
    "Apple products under 50000 taka",
    "Dell workstation computers",
    "HP printers for office use",

    # Category-specific queries
    "gaming laptops with high refresh rate",
    "4K monitors for video editing",
    "wireless mechanical keyboards",
    "budget smartphones under 20000",
    "SSD storage 1TB capacity",

    # Feature-specific queries
    "RGB lighting gaming peripherals",
    "noise cancelling headphones",
    "fast charging power banks",
    "waterproof smartwatches",
    "high DPI gaming mouse",

    # Price-range queries
    "cheap accessories under 5000 taka",
    "premium laptops over 100000",
    "mid-range graphics cards",
    "affordable tablets for students",
    "expensive gaming setups",

    # Technical specifications
    "Intel core i7 processors",
    "NVIDIA RTX graphics cards",
    "16GB RAM computers",
    "USB-C charging cables",
    "Bluetooth wireless speakers"
]

print(f"📋 Created {len(test_queries)} test queries for evaluation")
for i, query in enumerate(test_queries[:5]):
    print(f"  {i+1}. {query}")
print("  ...")

# ==========================================
# SECTION 5: SEMANTIC SEARCH SETUP
# ==========================================

print("\n" + "="*60)
print("🧠 SETTING UP SEMANTIC SEARCH")
print("="*60)

# Initialize sentence transformer model
print("📥 Loading embedding model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast and good quality
print("✅ Embedding model loaded successfully!")

# Generate embeddings for all documents.
# normalize_embeddings=True makes every vector unit-length, which is what lets
# the inner-product index below return cosine similarities. Requesting it
# explicitly keeps that guarantee if the embedding model is ever swapped.
print("🔄 Generating document embeddings...")
start_time = time.time()
document_embeddings = embedding_model.encode(
    documents,
    show_progress_bar=True,
    normalize_embeddings=True,
)
embedding_time = time.time() - start_time
print(f"✅ Generated embeddings in {embedding_time:.2f} seconds")
print(f"📊 Embedding shape: {document_embeddings.shape}")

# Set up FAISS index for fast similarity search
print("🔧 Setting up FAISS index...")
dimension = document_embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(dimension)  # Inner product == cosine similarity on unit-length vectors
faiss_index.add(document_embeddings.astype('float32'))
print(f"✅ FAISS index created with {faiss_index.ntotal} vectors")

# Generate query embeddings (normalized the same way as the documents)
print("🔄 Generating query embeddings...")
query_embeddings = embedding_model.encode(
    test_queries,
    show_progress_bar=True,
    normalize_embeddings=True,
)
print(f"✅ Generated {len(query_embeddings)} query embeddings")

# ==========================================
# SECTION 6: KEYWORD SEARCH SETUP (BM25)
# ==========================================

print("\n" + "="*60)
print("🔤 SETTING UP KEYWORD SEARCH (BM25)")
print("="*60)

def preprocess_text(text):
    """Tokenize text for BM25: lowercase it, then pull out runs of word characters.

    Punctuation and other non-word characters act as separators and are
    dropped. Returns the tokens as a list, in order of appearance.
    """
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, text.lower())

# Preprocess documents for BM25
print("🔄 Preprocessing documents for BM25...")
tokenized_docs = [preprocess_text(doc) for doc in documents]
tokenized_queries = [preprocess_text(query) for query in test_queries]

# Initialize BM25
print("🔧 Initializing BM25 index...")
bm25 = BM25Okapi(tokenized_docs)
print(f"✅ BM25 index created for {len(tokenized_docs)} documents")

# ==========================================
# SECTION 7: RETRIEVAL FUNCTIONS
# ==========================================

print("\n" + "="*60)
print("⚙️ DEFINING RETRIEVAL FUNCTIONS")
print("="*60)

def semantic_search(query_embedding, top_k=10):
    """Return the top_k nearest documents to a query embedding using FAISS.

    Args:
        query_embedding: 1-D query vector; it is reshaped to a single-row
            float32 matrix before being passed to the index.
        top_k: maximum number of neighbours to return.

    Returns:
        (indices, scores): two 1-D arrays of equal length, best match first.
        The length is min(top_k, number of indexed vectors).
    """
    # FAISS pads the result with label -1 when asked for more neighbours than
    # the index holds; used as a Python index, -1 would silently select the
    # last document. Clamp the request so only real hits come back.
    effective_k = min(int(top_k), faiss_index.ntotal)
    if effective_k <= 0:
        return np.array([], dtype=np.int64), np.array([], dtype=np.float32)

    query_matrix = np.ascontiguousarray(query_embedding, dtype='float32').reshape(1, -1)
    scores, indices = faiss_index.search(query_matrix, effective_k)
    return indices[0], scores[0]

def keyword_search(query_tokens, top_k=10):
    """Score every document with BM25 and return the top_k best.

    Returns (indices, scores), both ordered from highest to lowest BM25 score.
    """
    all_scores = bm25.get_scores(query_tokens)
    descending_order = np.argsort(all_scores)[::-1]
    best = descending_order[:top_k]
    return best, all_scores[best]

def hybrid_search(query_embedding, query_tokens, top_k=10, alpha=0.5):
    """Fuse semantic and keyword results with a weighted sum of normalized scores.

    Each retriever contributes its top ``2 * top_k`` candidates. Scores are
    min-max scaled per retriever, then combined as
    ``alpha * semantic + (1 - alpha) * keyword``; a document found by only one
    retriever gets just that retriever's weighted share.

    Returns (indices, scores) for the top_k fused documents, best first.
    """
    def _min_max(values):
        # Scale into [0, 1]; the epsilon keeps the division defined when all scores are equal.
        lowest = values.min()
        return (values - lowest) / (values.max() - lowest + 1e-8)

    pool_size = top_k * 2
    sem_ids, sem_raw = semantic_search(query_embedding, pool_size)
    key_ids, key_raw = keyword_search(query_tokens, pool_size)

    sem_norm = _min_max(sem_raw)
    key_norm = _min_max(key_raw)

    # Start from the semantic share, then fold in the keyword share.
    fused = {doc_id: alpha * value for doc_id, value in zip(sem_ids, sem_norm)}
    keyword_weight = 1 - alpha
    for doc_id, value in zip(key_ids, key_norm):
        fused[doc_id] = fused.get(doc_id, 0) + keyword_weight * value

    ranked = sorted(fused.items(), key=lambda pair: pair[1], reverse=True)[:top_k]
    best_ids = [doc_id for doc_id, _ in ranked]
    best_scores = [value for _, value in ranked]

    return np.array(best_ids), np.array(best_scores)

def reciprocal_rank_fusion(sem_indices, key_indices, k=60, top_k=10):
    """Combine two ranked lists with Reciprocal Rank Fusion (RRF).

    Every document receives ``1 / (k + rank)`` from each list it appears in,
    where ``rank`` is its 1-based position in that list; the contributions
    are summed and the documents sorted by the total.

    Args:
        sem_indices: document ids ranked by semantic search, best first.
        key_indices: document ids ranked by keyword search, best first.
        k: smoothing constant that damps the influence of top ranks.
        top_k: number of fused results to return.

    Returns:
        (indices, scores): arrays of the top_k fused ids and their RRF scores,
        highest score first.
    """
    rrf_scores = {}

    for ranking in (sem_indices, key_indices):
        # Ranks start at 1, as in the RRF formula. Starting at 0 shifts every
        # score and divides by zero for k=0.
        for rank, idx in enumerate(ranking, start=1):
            rrf_scores[idx] = rrf_scores.get(idx, 0.0) + 1.0 / (k + rank)

    # sorted() is stable, so tied documents keep first-seen order.
    top_items = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)[:top_k]
    final_indices = [idx for idx, _ in top_items]
    final_scores = [score for _, score in top_items]

    return np.array(final_indices), np.array(final_scores)

print("✅ Retrieval functions defined successfully!")

# ==========================================
# SECTION 8: RERANKING SETUP
# ==========================================

print("\n" + "="*60)
print("🎯 SETTING UP RERANKING MODELS")
print("="*60)

# Initialize cross-encoder for reranking
print("📥 Loading reranking model...")
try:
    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    print("✅ Reranking model loaded successfully!")
    reranking_available = True
except Exception as e:
    print(f"⚠️  Could not load reranking model: {e}")
    print("Continuing without reranking...")
    reranking_available = False

def rerank_results(query, retrieved_docs, retrieved_indices, top_k=10):
    """Rerank first-stage candidates with the cross-encoder.

    Args:
        query: the query text.
        retrieved_docs: candidate document texts.
        retrieved_indices: document ids, aligned position-by-position with
            ``retrieved_docs``.
        top_k: number of reranked results to keep.

    Returns:
        (indices, scores): arrays of the kept ids and their cross-encoder
        scores, best first. When no reranker was loaded, the candidates are
        returned in their original order with a placeholder score of 1.0.
    """
    if not reranking_available:
        kept = np.asarray(retrieved_indices)[:top_k]
        return kept, np.ones(len(kept))

    # Nothing to score: return empty arrays instead of handing the model an empty batch.
    if len(retrieved_docs) == 0:
        return np.array([], dtype=np.int64), np.array([], dtype=np.float32)

    # Create query-document pairs
    pairs = [[query, doc] for doc in retrieved_docs]

    # Get reranking scores
    rerank_scores = reranker.predict(pairs)

    # Sort by reranking scores (stable, so ties keep first-stage order)
    ranked = sorted(zip(retrieved_indices, rerank_scores), key=lambda item: item[1], reverse=True)[:top_k]

    reranked_indices = [idx for idx, _ in ranked]
    reranked_scores = [score for _, score in ranked]

    return np.array(reranked_indices), np.array(reranked_scores)

# ==========================================
# SECTION 9: EVALUATION METRICS
# ==========================================

print("\n" + "="*60)
print("📊 DEFINING EVALUATION METRICS")
print("="*60)

def calculate_retrieval_metrics(query, retrieved_indices, method_name, top_k=10, relevance_threshold=0.1):
    """Compute proxy relevance metrics for one ranked result list.

    Relevance is approximated by the Jaccard similarity between the token set
    of the query and the token set of each retrieved document. No ground-truth
    labels are involved.
    NOTE(review): because the proxy is purely lexical, it is expected to
    favour retrievers that match query tokens literally; treat cross-method
    comparisons accordingly.

    Args:
        query: the query text.
        retrieved_indices: ranked document ids, best first.
        method_name: label stored in the returned metrics.
        top_k: number of leading results that are scored.
        relevance_threshold: a document counts as relevant when its Jaccard
            score is strictly greater than this value.

    Returns:
        (metrics, retrieved_docs): the metrics dict and the scored document
        texts. With no results, all relevance values are 0.
    """
    retrieved_docs = [documents[i] for i in retrieved_indices[:top_k]]

    # Simple relevance scoring based on query terms in documents
    query_terms = set(preprocess_text(query))
    relevance_scores = []

    for doc in retrieved_docs:
        doc_terms = set(preprocess_text(doc))
        overlap = len(query_terms & doc_terms)
        union_size = len(query_terms | doc_terms)
        relevance_scores.append(overlap / union_size if union_size > 0 else 0)

    # np.max raises and np.mean yields NaN on an empty list, so guard both.
    has_results = bool(relevance_scores)

    metrics = {
        'method': method_name,
        'query': query,
        'avg_relevance': np.mean(relevance_scores) if has_results else 0.0,
        'max_relevance': np.max(relevance_scores) if has_results else 0.0,
        'relevant_docs': sum(1 for score in relevance_scores if score > relevance_threshold),
        'top_relevance': relevance_scores[0] if has_results else 0,
        'retrieved_docs': len(retrieved_docs)
    }

    return metrics, retrieved_docs

def evaluate_diversity(retrieved_indices):
    """Measure how dissimilar the retrieved documents are from one another.

    Diversity is ``1 - mean pairwise cosine similarity`` of the documents'
    embeddings, taken over each unordered pair once. Fewer than two results
    yield 0.
    """
    if len(retrieved_indices) < 2:
        return 0

    pairwise = cosine_similarity(document_embeddings[retrieved_indices])

    # Strict upper triangle: every pair once, self-similarity excluded.
    rows, cols = np.triu_indices_from(pairwise, k=1)
    off_diagonal = pairwise[rows, cols]
    mean_similarity = np.mean(off_diagonal) if len(off_diagonal) > 0 else 0

    return 1 - mean_similarity

print("✅ Evaluation metrics defined successfully!")

# ==========================================
# SECTION 10: RUN COMPREHENSIVE EVALUATION
# ==========================================

print("\n" + "="*60)
print("🚀 RUNNING COMPREHENSIVE RETRIEVAL EVALUATION")
print("="*60)

# Initialize results storage
all_results = []
detailed_results = {}

# Evaluation settings shared by every method below
FINAL_TOP_K = 10      # size of the result list that gets scored
CANDIDATE_POOL = 20   # first-stage candidates fed to fusion / reranking
HYBRID_ALPHA = 0.7    # weight of the semantic score in the weighted hybrid


def _record_method(query, method_name, method_key, indices, scores, query_results):
    """Score one method's ranked list and store it in both result containers.

    Only the first FINAL_TOP_K results are scored. The metrics dict is
    appended to ``all_results`` and, together with the indices and scores,
    stored under ``query_results['methods'][method_key]``.
    """
    top_indices = np.asarray(indices)[:FINAL_TOP_K]
    top_scores = np.asarray(scores)[:FINAL_TOP_K]

    metrics, _ = calculate_retrieval_metrics(query, top_indices, method_name)
    metrics['diversity'] = evaluate_diversity(top_indices)
    metrics['avg_score'] = np.mean(top_scores)

    all_results.append(metrics)
    query_results['methods'][method_key] = {
        'indices': top_indices.tolist(),
        'scores': top_scores.tolist(),
        'metrics': metrics
    }


print("🔄 Evaluating all retrieval methods...")

for i, query in enumerate(test_queries):
    print(f"\n📝 Query {i+1}/{len(test_queries)}: {query}")

    query_embedding = query_embeddings[i]
    query_tokens = tokenized_queries[i]

    # Dictionary to store results for this query
    query_results = {'query': query, 'methods': {}}

    # 1. Semantic Search
    sem_indices, sem_scores = semantic_search(query_embedding, top_k=CANDIDATE_POOL)
    _record_method(query, "Semantic Search", 'semantic', sem_indices, sem_scores, query_results)

    # 2. Keyword Search (BM25)
    key_indices, key_scores = keyword_search(query_tokens, top_k=CANDIDATE_POOL)
    _record_method(query, "Keyword Search (BM25)", 'keyword', key_indices, key_scores, query_results)

    # 3. Hybrid Search (Weighted)
    hyb_indices, hyb_scores = hybrid_search(
        query_embedding, query_tokens, top_k=FINAL_TOP_K, alpha=HYBRID_ALPHA
    )
    _record_method(query, "Hybrid Search (Weighted)", 'hybrid_weighted', hyb_indices, hyb_scores, query_results)

    # 4. Reciprocal Rank Fusion
    rrf_indices, rrf_scores = reciprocal_rank_fusion(
        sem_indices[:CANDIDATE_POOL], key_indices[:CANDIDATE_POOL], top_k=FINAL_TOP_K
    )
    _record_method(query, "Reciprocal Rank Fusion", 'rrf', rrf_indices, rrf_scores, query_results)

    if reranking_available:
        # 5. Semantic + Reranking
        sem_pool_indices = sem_indices[:CANDIDATE_POOL]
        sem_docs_for_rerank = [documents[idx] for idx in sem_pool_indices]
        rerank_indices, rerank_scores = rerank_results(
            query, sem_docs_for_rerank, sem_pool_indices, top_k=FINAL_TOP_K
        )
        _record_method(query, "Semantic + Reranking", 'semantic_rerank',
                       rerank_indices, rerank_scores, query_results)

        # 6. Hybrid + Reranking
        # Rerank a pool larger than the final list. Reranking the hybrid
        # top-10 down to 10 only reorders the same documents, so every
        # set-based metric would equal "Hybrid Search (Weighted)".
        hyb_pool_indices, _ = hybrid_search(
            query_embedding, query_tokens, top_k=CANDIDATE_POOL, alpha=HYBRID_ALPHA
        )
        hyb_docs_for_rerank = [documents[idx] for idx in hyb_pool_indices]
        hyb_rerank_indices, hyb_rerank_scores = rerank_results(
            query, hyb_docs_for_rerank, hyb_pool_indices, top_k=FINAL_TOP_K
        )
        _record_method(query, "Hybrid + Reranking", 'hybrid_rerank',
                       hyb_rerank_indices, hyb_rerank_scores, query_results)

    detailed_results[f"query_{i}"] = query_results

print("\n✅ Evaluation completed successfully!")

# ==========================================
# SECTION 11: RESULTS ANALYSIS & VISUALIZATION
# ==========================================

print("\n" + "="*60)
print("📈 ANALYZING RESULTS")
print("="*60)

# Convert results to DataFrame
results_df = pd.DataFrame(all_results)

# Calculate average metrics by method
method_summary = results_df.groupby('method').agg({
    'avg_relevance': ['mean', 'std'],
    'max_relevance': ['mean', 'std'],
    'relevant_docs': ['mean', 'std'],
    'top_relevance': ['mean', 'std'],
    'diversity': ['mean', 'std'],
    'avg_score': ['mean', 'std']
}).round(4)

print("📊 Method Performance Summary:")
print(method_summary)

# Create comprehensive visualizations
# Row 3 holds a box plot of per-query average relevance and a violin plot of
# per-query diversity, so its two titles name those distributions.
fig = make_subplots(
    rows=3, cols=2,
    subplot_titles=('Average Relevance by Method', 'Diversity by Method',
                   'Top Document Relevance', 'Number of Relevant Documents',
                   'Average Relevance Distribution', 'Diversity Distribution'),
    specs=[[{"type": "bar"}, {"type": "bar"}],
           [{"type": "bar"}, {"type": "bar"}],
           [{"type": "box"}, {"type": "violin"}]]
)

# 1. Average Relevance
avg_relevance = results_df.groupby('method')['avg_relevance'].mean().sort_values(ascending=False)
fig.add_trace(
    go.Bar(x=avg_relevance.index, y=avg_relevance.values, name="Avg Relevance",
           marker_color='lightblue'),
    row=1, col=1
)

# 2. Diversity
avg_diversity = results_df.groupby('method')['diversity'].mean().sort_values(ascending=False)
fig.add_trace(
    go.Bar(x=avg_diversity.index, y=avg_diversity.values, name="Diversity",
           marker_color='lightgreen'),
    row=1, col=2
)

# 3. Top Document Relevance
top_relevance = results_df.groupby('method')['top_relevance'].mean().sort_values(ascending=False)
fig.add_trace(
    go.Bar(x=top_relevance.index, y=top_relevance.values, name="Top Relevance",
           marker_color='lightcoral'),
    row=2, col=1
)

# 4. Relevant Documents Count
relevant_count = results_df.groupby('method')['relevant_docs'].mean().sort_values(ascending=False)
fig.add_trace(
    go.Bar(x=relevant_count.index, y=relevant_count.values, name="Relevant Docs",
           marker_color='lightyellow'),
    row=2, col=2
)

# 5. Box plot for average relevance distribution
methods = results_df['method'].unique()
for method in methods:
    method_data = results_df[results_df['method'] == method]['avg_relevance']
    fig.add_trace(
        go.Box(y=method_data, name=method, showlegend=False),
        row=3, col=1
    )

# 6. Violin plot for diversity distribution
for method in methods:
    method_data = results_df[results_df['method'] == method]['diversity']
    fig.add_trace(
        go.Violin(y=method_data, name=method, showlegend=False),
        row=3, col=2
    )

fig.update_layout(height=1200, title_text="Retrieval Methods Comprehensive Evaluation")
fig.show()

# ==========================================
# SECTION 12: DETAILED METHOD COMPARISON
# ==========================================

print("\n" + "="*60)
print("🔍 DETAILED METHOD COMPARISON")
print("="*60)

# Create ranking comparison
metrics_for_ranking = ['avg_relevance', 'top_relevance', 'diversity', 'relevant_docs']

print("🏆 Method Rankings:")
print("="*40)

for metric in metrics_for_ranking:
    print(f"\n📊 {metric.replace('_', ' ').title()}:")
    ranking = results_df.groupby('method')[metric].mean().sort_values(ascending=False)
    for i, (method, score) in enumerate(ranking.items()):
        print(f"  {i+1}. {method}: {score:.4f}")

# Statistical significance testing
from scipy import stats

print("\n🔬 Statistical Significance Tests:")
print("="*40)

# Every method is scored on the same queries, so the samples are paired.
# Align them per query and use the paired t-test; an independent-samples test
# ignores the pairing and misstates the evidence.
relevance_by_query = results_df.pivot_table(
    index='query', columns='method', values='avg_relevance', aggfunc='mean'
)

methods = results_df['method'].unique()
for i, method1 in enumerate(methods):
    for method2 in methods[i+1:]:
        # Keep only queries for which both methods produced a result.
        paired = relevance_by_query[[method1, method2]].dropna()

        if len(paired) > 1:
            differences = paired[method1] - paired[method2]
            if (differences == 0).all():
                # ttest_rel returns NaN when every difference is zero;
                # identical samples carry no evidence of a difference.
                p_value = 1.0
            else:
                t_stat, p_value = stats.ttest_rel(paired[method1], paired[method2])
            significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else ""
            print(f"{method1} vs {method2}: p={p_value:.4f} {significance}")

# ==========================================
# SECTION 13: QUERY-SPECIFIC ANALYSIS
# ==========================================

print("\n" + "="*60)
print("🎯 QUERY-SPECIFIC ANALYSIS")
print("="*60)

# Find best method for each query (highest average relevance; first row wins ties)
query_best_methods = []
for query in test_queries:
    query_results = results_df[results_df['query'] == query]
    relevance_column = query_results['avg_relevance']
    winning_row = relevance_column.idxmax()
    query_best_methods.append({
        'query': query,
        'best_method': query_results.loc[winning_row, 'method'],
        'best_score': relevance_column.max()
    })

query_analysis_df = pd.DataFrame(query_best_methods)

# Count wins per method
method_wins = query_analysis_df['best_method'].value_counts()
total_queries = len(test_queries)
print("🏆 Method Wins Count:")
for method, wins in method_wins.items():
    print(f"  {method}: {wins} wins ({wins/total_queries*100:.1f}%)")

# Analyze query characteristics (length in whitespace-separated words)
print("\n📊 Query Analysis:")
query_lengths = [len(text.split()) for text in test_queries]
shortest, longest = min(query_lengths), max(query_lengths)
print(f"Average query length: {np.mean(query_lengths):.1f} words")
print(f"Query length range: {shortest} - {longest} words")

# ==========================================
# SECTION 14: SAMPLE RESULTS INSPECTION
# ==========================================

print("\n" + "="*60)
print("🔍 SAMPLE RESULTS INSPECTION")
print("="*60)

# Show detailed results for a few sample queries
sample_queries = test_queries[:3]

# Display name (as stored in results_df['method']) -> key used under
# detailed_results[...]['methods']. The keys cannot be derived from the display
# names by string munging, so the mapping is spelled out.
METHOD_NAME_TO_KEY = {
    "Semantic Search": 'semantic',
    "Keyword Search (BM25)": 'keyword',
    "Hybrid Search (Weighted)": 'hybrid_weighted',
    "Reciprocal Rank Fusion": 'rrf',
    "Semantic + Reranking": 'semantic_rerank',
    "Hybrid + Reranking": 'hybrid_rerank',
}

# Index the detailed results by query text once (first entry wins on duplicates)
# instead of rescanning the whole dict for every printed row.
detailed_by_query = {}
for entry in detailed_results.values():
    detailed_by_query.setdefault(entry['query'], entry)

for query in sample_queries:
    print(f"\n🔍 Query: '{query}'")
    print("="*50)

    query_results = results_df[results_df['query'] == query]
    stored_methods = detailed_by_query.get(query, {}).get('methods', {})

    for _, result in query_results.iterrows():
        method = result['method']
        relevance = result['avg_relevance']
        diversity = result['diversity']
        relevant_count = result['relevant_docs']

        print(f"\n📊 {method}:")
        print(f"  Avg Relevance: {relevance:.4f}")
        print(f"  Diversity: {diversity:.4f}")
        print(f"  Relevant Docs: {relevant_count}")

        # Show top 3 retrieved documents
        method_key = METHOD_NAME_TO_KEY.get(method)
        if method_key in stored_methods:
            indices = stored_methods[method_key]['indices'][:3]
            print(f"  Top 3 documents:")
            for i, idx in enumerate(indices):
                doc_preview = documents[idx][:100] + "..." if len(documents[idx]) > 100 else documents[idx]
                print(f"    {i+1}. {doc_preview}")

# ==========================================
# SECTION 15: RECOMMENDATIONS & EXPORT
# ==========================================

print("\n" + "="*60)
print("💡 RECOMMENDATIONS & RESULTS EXPORT")
print("="*60)

# Generate recommendations
best_overall_method = results_df.groupby('method')['avg_relevance'].mean().idxmax()
most_diverse_method = results_df.groupby('method')['diversity'].mean().idxmax()
most_consistent_method = results_df.groupby('method')['avg_relevance'].std().idxmin()

print("🎯 Key Recommendations:")
print(f"  🏆 Best Overall Performance: {best_overall_method}")
print(f"  🌈 Most Diverse Results: {most_diverse_method}")
print(f"  📊 Most Consistent: {most_consistent_method}")

# Performance insights
print("\n📈 Performance Insights:")
semantic_performance = results_df[results_df['method'] == 'Semantic Search']['avg_relevance'].mean()
keyword_performance = results_df[results_df['method'] == 'Keyword Search (BM25)']['avg_relevance'].mean()

if semantic_performance > keyword_performance:
    print(f"  📊 Semantic search outperforms keyword search by {((semantic_performance/keyword_performance-1)*100):.1f}%")
else:
    print(f"  📊 Keyword search outperforms semantic search by {((keyword_performance/semantic_performance-1)*100):.1f}%")

# Check if hybrid methods improve performance
hybrid_methods = [method for method in results_df['method'].unique() if 'Hybrid' in method or 'Fusion' in method]
if hybrid_methods:
    hybrid_performance = results_df[results_df['method'].isin(hybrid_methods)]['avg_relevance'].mean()
    base_performance = max(semantic_performance, keyword_performance)
    if hybrid_performance > base_performance:
        print(f"  🔗 Hybrid methods improve performance by {((hybrid_performance/base_performance-1)*100):.1f}%")
    else:
        print(f"  ⚠️  Hybrid methods show {((1-hybrid_performance/base_performance)*100):.1f}% lower performance")

# Export results
results_df.to_csv('retrieval_evaluation_results.csv', index=False)
method_summary.to_csv('method_performance_summary.csv')

# Convert numpy types to Python native types for JSON serialization
def convert_numpy_types(obj):
    """Recursively convert numpy values to native Python types for JSON export.

    Handles dicts (values and numpy-scalar keys), lists, tuples, arrays, and
    numpy boolean / integer / floating scalars of any width. Anything else is
    returned unchanged.

    Args:
        obj: an arbitrarily nested structure.

    Returns:
        A structure of the same shape with numpy scalars replaced by
        ``bool`` / ``int`` / ``float`` and arrays replaced by nested lists.
        Tuples stay tuples.
    """
    if isinstance(obj, dict):
        # json.dump rejects numpy scalars as keys, so convert keys too.
        return {
            convert_numpy_types(key) if isinstance(key, np.generic) else key: convert_numpy_types(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_numpy_types(item) for item in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.bool_):
        return bool(obj)
    # The abstract base classes cover every width (int8..uint64, float16..float64).
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj

# Export detailed results with numpy type conversion
print("🔄 Converting numpy types for JSON export...")
detailed_results_converted = convert_numpy_types(detailed_results)

with open('detailed_retrieval_results.json', 'w') as f:
    json.dump(detailed_results_converted, f, indent=2)

print("\n💾 Results exported successfully!")
print("📁 Files created:")
print("  - retrieval_evaluation_results.csv")
print("  - method_performance_summary.csv")
print("  - detailed_retrieval_results.json")

print("\n✨ Retrieval evaluation completed successfully! ✨")

print("\n🚀 Next Steps:")
print("  1. Fine-tune the best performing method with optimal parameters")
print("  2. Consider domain-specific embedding models for your product data")
print("  3. Experiment with different reranking models")
print("  4. A/B test the top methods in your production environment")
print("  5. Create custom evaluation metrics based on your specific use case")

✅ All packages installed and imported successfully!
Please upload your StarTech dataset (original CSV or chunked data):


Saving startech_fast_20250903_195133.csv to startech_fast_20250903_195133.csv
📁 Loaded file: startech_fast_20250903_195133.csv
📊 Dataset shape: (8462, 10)
🏷️  Columns: ['name', 'price', 'brand', 'category', 'subcategory', 'availability', 'image_url', 'product_url', 'model', 'rating']

📋 First few rows:


Unnamed: 0,name,price,brand,category,subcategory,availability,image_url,product_url,model,rating
0,Intel Core i3-12100 12th Gen Budget Desktop PC,"28,300৳30,120৳",,Intel PC,Star PC,,https://www.startech.com.bd/image/cache/catalog/desktop-pc/desktop-offer/intel-core-i3-12100-12t...,https://www.startech.com.bd/intel-core-i3-12100-12th-gen-budget-desktop-pc,,
1,Intel 10th Gen Core i5-10400 Desktop PC,"32,150৳",,Intel PC,Star PC,,https://www.startech.com.bd/image/cache/catalog/desktop-pc/desktop-offer/intel-10th-gen-core-i5-...,https://www.startech.com.bd/intel-10th-gen-core-i5-10400-desktop-pc,,
2,Intel 12th Gen Core i5-12400 Desktop PC,"33,248৳35,750৳",,Intel PC,Star PC,,https://www.startech.com.bd/image/cache/catalog/desktop-pc/desktop-offer/38909-228x228.webp,https://www.startech.com.bd/intel-12th-gen-core-i5-12400-desktop-pc,,
3,Intel Core i5 14500 14th Gen Desktop PC,"48,500৳51,800৳",,Intel PC,Star PC,,https://www.startech.com.bd/image/cache/catalog/desktop-pc/desktop-offer/intel-core-i5-14500-des...,https://www.startech.com.bd/intel-core-i5-14500-desktop-pc,,
4,Intel 13th Gen Core i5 13400 Budget Desktop PC with Monitor,"58,999৳60,600৳",,Intel PC,Star PC,,https://www.startech.com.bd/image/cache/catalog/desktop-pc/desktop-offer/intel-13th-gen-core-i5-...,https://www.startech.com.bd/intel-13th-gen-core-i5-13400-budget-desktop-pc-with-monitor,,


📄 Detected original product data - creating text representations
📝 Prepared 8462 documents for retrieval evaluation
📊 Average document length: 112.0 characters

🔍 CREATING TEST QUERIES FOR EVALUATION
📋 Created 25 test queries for evaluation
  1. ASUS laptops with good performance
  2. Samsung monitors for gaming
  3. Apple products under 50000 taka
  4. Dell workstation computers
  5. HP printers for office use
  ...

🧠 SETTING UP SEMANTIC SEARCH
📥 Loading embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embedding model loaded successfully!
🔄 Generating document embeddings...


Batches:   0%|          | 0/265 [00:00<?, ?it/s]

✅ Generated embeddings in 157.13 seconds
📊 Embedding shape: (8462, 384)
🔧 Setting up FAISS index...
✅ FAISS index created with 8462 vectors
🔄 Generating query embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Generated 25 query embeddings

🔤 SETTING UP KEYWORD SEARCH (BM25)
🔄 Preprocessing documents for BM25...
🔧 Initializing BM25 index...
✅ BM25 index created for 8462 documents

⚙️ DEFINING RETRIEVAL FUNCTIONS
✅ Retrieval functions defined successfully!

🎯 SETTING UP RERANKING MODELS
📥 Loading reranking model...


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

✅ Reranking model loaded successfully!

📊 DEFINING EVALUATION METRICS
✅ Evaluation metrics defined successfully!

🚀 RUNNING COMPREHENSIVE RETRIEVAL EVALUATION
🔄 Evaluating all retrieval methods...

📝 Query 1/25: ASUS laptops with good performance

📝 Query 2/25: Samsung monitors for gaming

📝 Query 3/25: Apple products under 50000 taka

📝 Query 4/25: Dell workstation computers

📝 Query 5/25: HP printers for office use

📝 Query 6/25: gaming laptops with high refresh rate

📝 Query 7/25: 4K monitors for video editing

📝 Query 8/25: wireless mechanical keyboards

📝 Query 9/25: budget smartphones under 20000

📝 Query 10/25: SSD storage 1TB capacity

📝 Query 11/25: RGB lighting gaming peripherals

📝 Query 12/25: noise cancelling headphones

📝 Query 13/25: fast charging power banks

📝 Query 14/25: waterproof smartwatches

📝 Query 15/25: high DPI gaming mouse

📝 Query 16/25: cheap accessories under 5000 taka

📝 Query 17/25: premium laptops over 100000

📝 Query 18/25: mid-range graphics cards

📝


🔍 DETAILED METHOD COMPARISON
🏆 Method Rankings:

📊 Avg Relevance:
  1. Keyword Search (BM25): 0.1048
  2. Reciprocal Rank Fusion: 0.0891
  3. Hybrid Search (Weighted): 0.0802
  4. Hybrid + Reranking: 0.0802
  5. Semantic + Reranking: 0.0768
  6. Semantic Search: 0.0697

📊 Top Relevance:
  1. Keyword Search (BM25): 0.1195
  2. Reciprocal Rank Fusion: 0.0965
  3. Hybrid + Reranking: 0.0922
  4. Semantic + Reranking: 0.0897
  5. Hybrid Search (Weighted): 0.0895
  6. Semantic Search: 0.0774

📊 Diversity:
  1. Keyword Search (BM25): 0.4306
  2. Reciprocal Rank Fusion: 0.4039
  3. Hybrid + Reranking: 0.3428
  4. Hybrid Search (Weighted): 0.3428
  5. Semantic Search: 0.2971
  6. Semantic + Reranking: 0.2724

📊 Relevant Docs:
  1. Keyword Search (BM25): 4.8000
  2. Reciprocal Rank Fusion: 4.0800
  3. Hybrid Search (Weighted): 3.7600
  4. Hybrid + Reranking: 3.7600
  5. Semantic + Reranking: 3.5600
  6. Semantic Search: 3.0800

🔬 Statistical Significance Tests:
Semantic Search vs Keyword Searc