In [2]:
import requests
import json
import random
from datetime import datetime

# Elasticsearch configuration
ES_HOST = "http://localhost:9200"

# Index configuration
QUERY_INDICES = {
    'public': 'queries',
    'private': 'private_queries', 
    'private_clean': 'private_queries_clean',
    'training': 'queries_training'
}

ARTICLES_INDEX = "articles_content_title"

class NotebookQueryArticleSearcher:
    def __init__(self, es_host=ES_HOST):
        self.es_host = es_host
        self.session = requests.Session()
        self.session.headers.update({'Content-Type': 'application/json'})
        
    def test_connection(self):
        """Test Elasticsearch connection"""
        try:
            response = self.session.get(self.es_host)
            if response.status_code == 200:
                cluster_info = response.json()
                print(f"✅ Connected to Elasticsearch {cluster_info['version']['number']}")
                return True
            return False
        except Exception as e:
            print(f"❌ Connection failed: {e}")
            return False
    
    def check_index_exists(self, index_name):
        """Check if index exists"""
        try:
            response = self.session.get(f"{self.es_host}/{index_name}")
            return response.status_code == 200
        except:
            return False
    
    def get_queries_from_index(self, query_index, search_term=None, size=10, random_sample=False):
        """Lấy queries từ query index"""
        
        if random_sample:
            search_body = {
                "query": {"match_all": {}},
                "sort": [{"_script": {"script": "Math.random()", "type": "number"}}],
                "size": size
            }
        elif search_term:
            search_body = {
                "query": {
                    "match": {
                        "query_text": {
                            "query": search_term,
                            "fuzziness": "AUTO"
                        }
                    }
                },
                "sort": ["_score"],
                "size": size
            }
        else:
            search_body = {
                "query": {"match_all": {}},
                "sort": [{"created_at": {"order": "desc", "missing": "_last"}}],
                "size": size
            }
        
        try:
            response = self.session.post(
                f"{self.es_host}/{query_index}/_search",
                data=json.dumps(search_body),
                timeout=30
            )
            
            if response.status_code == 200:
                results = response.json()
                queries = []
                
                for hit in results['hits']['hits']:
                    source = hit['_source']
                    query_text = source.get('query_text', '').strip()
                    
                    if query_text:
                        queries.append({
                            'query_id': source.get('query_id', hit['_id']),
                            'query_text': query_text,
                            'created_at': source.get('created_at', 'N/A'),
                            'entities': source.get('entities', [])
                        })
                
                return queries
            else:
                print(f"❌ Failed to get queries: {response.status_code}")
                return []
                
        except Exception as e:
            print(f"❌ Error getting queries: {e}")
            return []
    
    def search_articles_with_query(self, query_text, size=10):
        """Dùng query_text để search trong articles_content_title"""
        
        search_body = {
            "query": {
                "multi_match": {
                    "query": query_text,
                    "fields": [
                        "title^3",      # Boost title
                        "content^1"     # Content weight
                    ],
                    "type": "best_fields",
                    "fuzziness": "AUTO"
                }
            },
            "highlight": {
                "fields": {
                    "title": {},
                    "content": {
                        "fragment_size": 200,
                        "number_of_fragments": 2
                    }
                }
            },
            "sort": ["_score"],
            "size": size
        }
        
        try:
            response = self.session.post(
                f"{self.es_host}/{ARTICLES_INDEX}/_search",
                data=json.dumps(search_body),
                timeout=30
            )
            
            if response.status_code == 200:
                return response.json()
            else:
                print(f"❌ Article search failed: {response.status_code}")
                return None
                
        except Exception as e:
            print(f"❌ Article search error: {e}")
            return None
    
    def display_search_results(self, query_info, article_results):
        """Display search results"""
        
        query_text = query_info['query_text']
        query_id = query_info['query_id']
        
        print(f"\n🔍 QUERY: {query_text}")
        print(f"📋 Query ID: {query_id}")
        
        if not article_results or not article_results.get('hits'):
            print(f"❌ No articles found for this query")
            return
        
        total = article_results['hits']['total']['value']
        print(f"📊 Found: {total:,} articles")
        print("=" * 80)
        
        for i, hit in enumerate(article_results['hits']['hits'], 1):
            source = hit['_source']
            score = hit['_score']
            
            title = source.get('title', 'No title')
            url = source.get('url', 'No URL')
            date = source.get('date', 'N/A')
            
            print(f"\n{i}. {title}")
            print(f"   🔗 {url}")
            print(f"   📅 {date} | Score: {score:.2f}")
            
            # Show highlights
            if 'highlight' in hit:
                if 'title' in hit['highlight']:
                    title_highlight = hit['highlight']['title'][0]
                    print(f"   💡 Title: {title_highlight}")
                
                if 'content' in hit['highlight']:
                    content_highlight = hit['highlight']['content'][0]
                    print(f"   📄 Content: {content_highlight}...")
            
            print("-" * 80)

# Initialize searcher
searcher = NotebookQueryArticleSearcher()

# Test connection và check indices
if not searcher.test_connection():
    print(f"❌ Cannot connect to Elasticsearch at {ES_HOST}")
else:
    print("🚀 Query-to-Article searcher ready!")
    
    # Check available indices
    print(f"\n📊 Checking indices...")
    for name, index in QUERY_INDICES.items():
        status = "✅" if searcher.check_index_exists(index) else "❌"
        print(f"  {name:15} | {index:20} | {status}")
    
    articles_status = "✅" if searcher.check_index_exists(ARTICLES_INDEX) else "❌"
    print(f"  {'articles':15} | {ARTICLES_INDEX:20} | {articles_status}")

# =============================================================================
# NOTEBOOK-FRIENDLY FUNCTIONS
# =============================================================================

def test_random_public_queries(count=3, articles_per_query=5):
    """
    Test random queries từ public index
    
    Args:
        count (int): Số lượng queries để test
        articles_per_query (int): Số articles per query
    
    Example:
        test_random_public_queries(5, 3)
    """
    print(f"🎲 Testing {count} random PUBLIC queries")
    print("=" * 50)
    
    queries = searcher.get_queries_from_index(
        QUERY_INDICES['public'], 
        size=count,
        random_sample=True
    )
    
    for i, query_info in enumerate(queries, 1):
        print(f"\n📝 TEST {i}/{len(queries)}")
        print("-" * 30)
        
        article_results = searcher.search_articles_with_query(
            query_info['query_text'], 
            size=articles_per_query
        )
        
        searcher.display_search_results(query_info, article_results)

def test_random_private_queries(count=3, articles_per_query=5):
    """
    Test random queries từ private index
    
    Args:
        count (int): Số lượng queries để test
        articles_per_query (int): Số articles per query
    
    Example:
        test_random_private_queries(5, 3)
    """
    print(f"🔒 Testing {count} random PRIVATE queries")
    print("=" * 50)
    
    queries = searcher.get_queries_from_index(
        QUERY_INDICES['private'], 
        size=count,
        random_sample=True
    )
    
    for i, query_info in enumerate(queries, 1):
        print(f"\n📝 TEST {i}/{len(queries)}")
        print("-" * 30)
        
        article_results = searcher.search_articles_with_query(
            query_info['query_text'], 
            size=articles_per_query
        )
        
        searcher.display_search_results(query_info, article_results)

def test_recent_queries(index_type='public', count=3, articles_per_query=5):
    """
    Test recent queries từ index cụ thể
    
    Args:
        index_type (str): 'public', 'private', 'private_clean', 'training'
        count (int): Số lượng queries
        articles_per_query (int): Số articles per query
    
    Example:
        test_recent_queries('private', 5, 3)
    """
    if index_type not in QUERY_INDICES:
        print(f"❌ Invalid index type. Use: {list(QUERY_INDICES.keys())}")
        return
    
    query_index = QUERY_INDICES[index_type]
    
    print(f"📅 Testing {count} recent queries from {query_index}")
    print("=" * 50)
    
    queries = searcher.get_queries_from_index(
        query_index, 
        size=count,
        random_sample=False
    )
    
    for i, query_info in enumerate(queries, 1):
        print(f"\n📝 TEST {i}/{len(queries)}")
        print("-" * 30)
        
        article_results = searcher.search_articles_with_query(
            query_info['query_text'], 
            size=articles_per_query
        )
        
        searcher.display_search_results(query_info, article_results)

def find_and_test_queries(search_term, index_type='private', count=3, articles_per_query=5):
    """
    Tìm queries chứa term cụ thể rồi test
    
    Args:
        search_term (str): Term để tìm trong query_text
        index_type (str): Index để tìm ('public', 'private', etc.)
        count (int): Số queries để test
        articles_per_query (int): Số articles per query
    
    Example:
        find_and_test_queries("machine learning", "private", 5, 3)
    """
    if index_type not in QUERY_INDICES:
        print(f"❌ Invalid index type. Use: {list(QUERY_INDICES.keys())}")
        return
    
    query_index = QUERY_INDICES[index_type]
    
    print(f"🔍 Finding queries about '{search_term}' in {query_index}")
    print("=" * 50)
    
    queries = searcher.get_queries_from_index(
        query_index, 
        search_term=search_term,
        size=count
    )
    
    if not queries:
        print(f"❌ No queries found containing '{search_term}'")
        return
    
    print(f"Found {len(queries)} relevant queries. Testing them...")
    
    for i, query_info in enumerate(queries, 1):
        print(f"\n📝 TEST {i}/{len(queries)}")
        print("-" * 30)
        
        article_results = searcher.search_articles_with_query(
            query_info['query_text'], 
            size=articles_per_query
        )
        
        searcher.display_search_results(query_info, article_results)

def test_custom_query(query_text, articles_count=5):
    """
    Test một query tự viết
    
    Args:
        query_text (str): Query text để test
        articles_count (int): Số articles để return
    
    Example:
        test_custom_query("autonomous vehicle technology", 10)
    """
    print(f"🔍 Testing custom query: '{query_text}'")
    print("=" * 50)
    
    article_results = searcher.search_articles_with_query(query_text, size=articles_count)
    
    query_info = {
        'query_text': query_text,
        'query_id': 'custom',
        'created_at': 'now'
    }
    
    searcher.display_search_results(query_info, article_results)
    return article_results

def compare_query_performance(query_text, indices=['public', 'private'], articles_per_index=3):
    """
    So sánh cùng 1 query across multiple query indices
    
    Args:
        query_text (str): Query để test
        indices (list): List of index types to compare
        articles_per_index (int): Articles per index
    
    Example:
        compare_query_performance("AI research", ['public', 'private', 'training'])
    """
    print(f"⚖️ Comparing query performance: '{query_text}'")
    print("=" * 60)
    
    # Test the custom query first
    print(f"\n🎯 CUSTOM QUERY RESULTS:")
    print("-" * 30)
    custom_results = test_custom_query(query_text, articles_per_index)
    
    # Then find similar queries in different indices
    for index_type in indices:
        if index_type not in QUERY_INDICES:
            continue
            
        query_index = QUERY_INDICES[index_type]
        
        print(f"\n📚 SIMILAR QUERIES IN {query_index.upper()}:")
        print("-" * 40)
        
        # Find similar queries
        similar_queries = searcher.get_queries_from_index(
            query_index,
            search_term=query_text,
            size=2  # Just get 2 similar queries
        )
        
        if similar_queries:
            for j, query_info in enumerate(similar_queries, 1):
                print(f"\n  Similar Query {j}: {query_info['query_text']}")
                article_results = searcher.search_articles_with_query(
                    query_info['query_text'], 
                    size=2  # Fewer results for comparison
                )
                
                if article_results and article_results['hits']['hits']:
                    for k, hit in enumerate(article_results['hits']['hits'], 1):
                        title = hit['_source'].get('title', 'No title')[:60] + "..."
                        score = hit['_score']
                        print(f"    {k}. {title} (Score: {score:.2f})")
                else:
                    print("    No articles found")
        else:
            print(f"  No similar queries found in {query_index}")

def batch_test_analysis(index_type='private', count=10):
    """
    Batch test và analyze results
    
    Args:
        index_type (str): Index type để test
        count (int): Số queries để test
    
    Example:
        batch_test_analysis('private', 20)
    """
    if index_type not in QUERY_INDICES:
        print(f"❌ Invalid index type. Use: {list(QUERY_INDICES.keys())}")
        return
    
    query_index = QUERY_INDICES[index_type]
    
    print(f"📊 Batch analysis: {count} queries from {query_index}")
    print("=" * 50)
    
    queries = searcher.get_queries_from_index(
        query_index, 
        size=count,
        random_sample=True
    )
    
    results_summary = []
    
    for i, query_info in enumerate(queries, 1):
        print(f"Testing {i}/{len(queries)}: {query_info['query_text'][:50]}...")
        
        article_results = searcher.search_articles_with_query(
            query_info['query_text'], 
            size=5
        )
        
        if article_results:
            total_found = article_results['hits']['total']['value']
            avg_score = sum(hit['_score'] for hit in article_results['hits']['hits']) / len(article_results['hits']['hits']) if article_results['hits']['hits'] else 0
            
            results_summary.append({
                'query': query_info['query_text'],
                'total_found': total_found,
                'avg_score': avg_score,
                'has_results': total_found > 0
            })
    
    # Analysis
    total_queries = len(results_summary)
    queries_with_results = sum(1 for r in results_summary if r['has_results'])
    avg_articles_found = sum(r['total_found'] for r in results_summary) / total_queries
    avg_relevance_score = sum(r['avg_score'] for r in results_summary if r['avg_score'] > 0) / len([r for r in results_summary if r['avg_score'] > 0])
    
    print(f"\n📈 BATCH ANALYSIS RESULTS:")
    print("=" * 40)
    print(f"Total queries tested: {total_queries}")
    print(f"Queries with results: {queries_with_results} ({queries_with_results/total_queries*100:.1f}%)")
    print(f"Average articles found: {avg_articles_found:.1f}")
    print(f"Average relevance score: {avg_relevance_score:.2f}")
    
    # Top and bottom performers
    results_summary.sort(key=lambda x: x['total_found'], reverse=True)
    
    print(f"\n🏆 TOP PERFORMING QUERIES:")
    for i, result in enumerate(results_summary[:3], 1):
        print(f"  {i}. {result['query'][:60]}... ({result['total_found']:,} articles)")
    
    print(f"\n🔍 BOTTOM PERFORMING QUERIES:")
    for i, result in enumerate(results_summary[-3:], 1):
        print(f"  {i}. {result['query'][:60]}... ({result['total_found']:,} articles)")
    
    return results_summary

# =============================================================================
# QUICK START EXAMPLES
# =============================================================================

print("\n🎯 READY TO USE! Try these commands:")
print("=" * 60)
print("# Test random queries")
print("test_random_public_queries(3, 5)")
print("test_random_private_queries(3, 5)    # ← Main use case!")
print()
print("# Test recent queries")
print("test_recent_queries('private', 5, 3)")
print("test_recent_queries('training', 3, 5)")
print()
print("# Find specific queries and test them")
print("find_and_test_queries('machine learning', 'private', 3)")
print("find_and_test_queries('autonomous vehicle', 'public', 5)")
print()
print("# Test custom queries")
print("test_custom_query('artificial intelligence research')")
print()
print("# Advanced analysis")
print("compare_query_performance('blockchain', ['public', 'private'])")
print("batch_test_analysis('private', 20)")
print()
print("💡 The main purpose: Test how well user queries work on your article database!")

✅ Connected to Elasticsearch 9.0.1
🚀 Query-to-Article searcher ready!

📊 Checking indices...
  public          | queries              | ✅
  private         | private_queries      | ✅
  private_clean   | private_queries_clean | ✅
  training        | queries_training     | ✅
  articles        | articles_content_title | ✅

🎯 READY TO USE! Try these commands:
# Test random queries
test_random_public_queries(3, 5)
test_random_private_queries(3, 5)    # ← Main use case!

# Test recent queries
test_recent_queries('private', 5, 3)
test_recent_queries('training', 3, 5)

# Find specific queries and test them
find_and_test_queries('machine learning', 'private', 3)
find_and_test_queries('autonomous vehicle', 'public', 5)

# Test custom queries
test_custom_query('artificial intelligence research')

# Advanced analysis
compare_query_performance('blockchain', ['public', 'private'])
batch_test_analysis('private', 20)

💡 The main purpose: Test how well user queries work on your article database!


In [10]:
test_random_private_queries(3, 5)

🔒 Testing 3 random PRIVATE queries

📝 TEST 1/3
------------------------------

🔍 QUERY: This photograph captures a woman likely named Paulette Leaphart in a moment of contemplation amidst public scrutiny surrounding her online presence and personal journey. Leaphart, known for her Facebook posts detailing her breast cancer experience and plans for a cross-country walk to raise awareness, received both admiration and criticism for her candor and self-promotion. The dimly lit room, the turned-off TV, and the obscured framed pictures suggest a desire for privacy or reflection. Her direct gaze towards the camera invites the viewer to consider the complex emotions she likely experienced while navigating public attention and controversy. The image's pixelated quality adds a layer of mystery and emphasizes the digital nature of her narrative, hinting at the power and intricacies of online storytelling.
📋 Query ID: 9905194501784f80
📊 Found: 10,000 articles

1. Paulette Leaphart and the naked t

In [3]:
import pandas as pd
import csv
import time
from datetime import datetime
import os

# Thêm vào notebook searcher đã có
# =============================================================================
# BATCH EXPORT FUNCTIONS
# =============================================================================

def batch_export_query_results(index_type='private', total_queries=1000, top_k=10, 
                               batch_size=50, output_file=None, search_term=None):
    """
    Chạy full batch test và export ra CSV
    
    Args:
        index_type (str): 'public', 'private', 'private_clean', 'training'
        total_queries (int): Tổng số queries để test (default: 1000)
        top_k (int): Top K articles per query (default: 10)
        batch_size (int): Process theo batch để tránh timeout (default: 50)
        output_file (str): Tên file output (default: auto generate)
        search_term (str): Filter queries containing term (optional)
    
    Returns:
        str: Path to generated CSV file
    
    Example:
        batch_export_query_results('private', 2000, 5, output_file='private_results.csv')
    """
    
    if index_type not in QUERY_INDICES:
        print(f"❌ Invalid index type. Use: {list(QUERY_INDICES.keys())}")
        return None
    
    query_index = QUERY_INDICES[index_type]
    
    # Generate output filename if not provided
    if output_file is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filter_suffix = f"_{search_term.replace(' ', '_')}" if search_term else ""
        output_file = f"query_article_results_{index_type}_{total_queries}q_{top_k}k{filter_suffix}_{timestamp}.csv"
    
    print(f"🚀 BATCH EXPORT STARTING")
    print(f"📊 Query index: {query_index}")
    print(f"🎯 Total queries: {total_queries:,}")
    print(f"📈 Top K articles: {top_k}")
    print(f"🔢 Batch size: {batch_size}")
    print(f"📄 Output file: {output_file}")
    if search_term:
        print(f"🔍 Filter term: '{search_term}'")
    print("=" * 60)
    
    # Prepare CSV header
    headers = ['query_id', 'query_text'] + [f'article_id_{i+1}' for i in range(top_k)]
    
    # Initialize CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
    
    # Process in batches
    processed_count = 0
    successful_count = 0
    start_time = time.time()
    
    while processed_count < total_queries:
        current_batch_size = min(batch_size, total_queries - processed_count)
        
        print(f"\n📦 Processing batch: {processed_count + 1} to {processed_count + current_batch_size}")
        
        # Get queries for this batch
        queries = searcher.get_queries_from_index(
            query_index,
            search_term=search_term,
            size=current_batch_size,
            random_sample=True  # Random sampling để tránh bias
        )
        
        if not queries:
            print(f"⚠️ No more queries available. Stopping at {processed_count}")
            break
        
        # Process each query in batch
        batch_results = []
        
        for query_info in queries:
            try:
                # Search articles
                article_results = searcher.search_articles_with_query(
                    query_info['query_text'], 
                    size=top_k
                )
                
                # Extract article IDs
                article_ids = []
                if article_results and article_results['hits']['hits']:
                    for hit in article_results['hits']['hits']:
                        article_id = hit['_source'].get('article_id', hit['_id'])
                        article_ids.append(article_id)
                
                # Pad with empty strings if not enough articles
                while len(article_ids) < top_k:
                    article_ids.append('')
                
                # Prepare row
                row = [
                    query_info['query_id'],
                    query_info['query_text']
                ] + article_ids[:top_k]  # Ensure exactly top_k articles
                
                batch_results.append(row)
                successful_count += 1
                
            except Exception as e:
                print(f"❌ Error processing query {query_info['query_id']}: {e}")
                continue
        
        # Write batch results to CSV
        with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(batch_results)
        
        processed_count += len(queries)
        
        # Progress update
        elapsed = time.time() - start_time
        speed = processed_count / elapsed if elapsed > 0 else 0
        eta = (total_queries - processed_count) / speed if speed > 0 else 0
        
        print(f"✅ Batch completed: {len(batch_results)} successful")
        print(f"📊 Progress: {processed_count}/{total_queries} ({processed_count/total_queries*100:.1f}%)")
        print(f"⚡ Speed: {speed:.1f} queries/sec")
        print(f"🎯 ETA: {eta/60:.1f} minutes")
        
        # Small delay to avoid overwhelming ES
        time.sleep(1)
    
    # Final summary
    elapsed_total = time.time() - start_time
    avg_speed = successful_count / elapsed_total if elapsed_total > 0 else 0
    
    print(f"\n🎉 BATCH EXPORT COMPLETED!")
    print(f"⏱️ Total time: {elapsed_total/60:.1f} minutes")
    print(f"📊 Total processed: {processed_count:,} queries")
    print(f"✅ Successful: {successful_count:,} queries")
    print(f"⚡ Average speed: {avg_speed:.1f} queries/sec")
    print(f"📄 Output file: {output_file}")
    print(f"💾 File size: {os.path.getsize(output_file) / (1024*1024):.1f} MB")
    
    return output_file

def quick_export_sample(index_type='private', sample_size=100, top_k=5, output_file=None):
    """
    Quick export cho sample nhỏ để test
    
    Args:
        index_type (str): Index type
        sample_size (int): Number of queries to sample
        top_k (int): Top K articles per query
        output_file (str): Output filename
    
    Example:
        quick_export_sample('private', 200, 5, 'test_sample.csv')
    """
    
    if output_file is None:
        timestamp = datetime.now().strftime("%H%M%S")
        output_file = f"sample_{index_type}_{sample_size}q_{top_k}k_{timestamp}.csv"
    
    print(f"🚀 Quick sample export: {sample_size} queries from {index_type}")
    print(f"📄 Output: {output_file}")
    
    result_file = batch_export_query_results(
        index_type=index_type,
        total_queries=sample_size,
        top_k=top_k,
        batch_size=sample_size,  # Single batch
        output_file=output_file
    )
    
    return result_file

def export_filtered_queries(index_type='private', search_terms=None, top_k=10, output_file=None):
    """
    Export queries filtered by specific terms
    
    Args:
        index_type (str): Index type
        search_terms (list): List of terms to search for
        top_k (int): Top K articles per query
        output_file (str): Output filename
    
    Example:
        export_filtered_queries('private', ['AI', 'machine learning', 'blockchain'], 5)
    """
    
    if search_terms is None:
        search_terms = ['AI', 'machine learning', 'blockchain', 'autonomous vehicle']
    
    if output_file is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        terms_str = "_".join(search_terms[:2]).replace(' ', '')  # Use first 2 terms
        output_file = f"filtered_{index_type}_{terms_str}_{timestamp}.csv"
    
    print(f"🔍 Exporting filtered queries from {index_type}")
    print(f"🎯 Search terms: {search_terms}")
    print(f"📄 Output: {output_file}")
    
    # Prepare CSV
    headers = ['query_id', 'query_text', 'filter_term'] + [f'article_id_{i+1}' for i in range(top_k)]
    
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
    
    total_exported = 0
    
    for term in search_terms:
        print(f"\n🔍 Processing term: '{term}'")
        
        # Get queries for this term
        queries = searcher.get_queries_from_index(
            QUERY_INDICES[index_type],
            search_term=term,
            size=50,  # 50 queries per term
            random_sample=False
        )
        
        term_results = []
        
        for query_info in queries:
            try:
                article_results = searcher.search_articles_with_query(
                    query_info['query_text'], 
                    size=top_k
                )
                
                article_ids = []
                if article_results and article_results['hits']['hits']:
                    for hit in article_results['hits']['hits']:
                        article_id = hit['_source'].get('article_id', hit['_id'])
                        article_ids.append(article_id)
                
                while len(article_ids) < top_k:
                    article_ids.append('')
                
                row = [
                    query_info['query_id'],
                    query_info['query_text'],
                    term
                ] + article_ids[:top_k]
                
                term_results.append(row)
                
            except Exception as e:
                print(f"❌ Error: {e}")
                continue
        
        # Write results for this term
        with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(term_results)
        
        total_exported += len(term_results)
        print(f"✅ {len(term_results)} queries exported for '{term}'")
    
    print(f"\n🎉 Filtered export completed!")
    print(f"📊 Total exported: {total_exported} queries")
    print(f"📄 File: {output_file}")
    
    return output_file

def analyze_csv_results(csv_file):
    """
    Analyze exported CSV results
    
    Args:
        csv_file (str): Path to CSV file
    
    Example:
        analyze_csv_results('private_results.csv')
    """
    
    print(f"📊 Analyzing results from: {csv_file}")
    
    try:
        df = pd.read_csv(csv_file)
        
        total_queries = len(df)
        print(f"📄 Total queries: {total_queries:,}")
        
        # Count non-empty article columns
        article_cols = [col for col in df.columns if col.startswith('article_id_')]
        top_k = len(article_cols)
        
        print(f"🎯 Top K: {top_k}")
        
        # Coverage analysis
        coverage_stats = {}
        for i, col in enumerate(article_cols, 1):
            non_empty = df[col].notna().sum()
            coverage_stats[f"Position {i}"] = non_empty
            print(f"📈 Position {i}: {non_empty:,} queries ({non_empty/total_queries*100:.1f}%)")
        
        # Queries with no results
        first_col = article_cols[0]
        no_results = df[first_col].isna().sum()
        print(f"❌ Queries with no results: {no_results:,} ({no_results/total_queries*100:.1f}%)")
        
        # Average results per query
        total_articles = 0
        for col in article_cols:
            total_articles += df[col].notna().sum()
        
        avg_articles = total_articles / total_queries
        print(f"📊 Average articles per query: {avg_articles:.1f}")
        
        # Most common articles
        all_articles = []
        for col in article_cols:
            all_articles.extend(df[col].dropna().tolist())
        
        if all_articles:
            from collections import Counter
            article_counts = Counter(all_articles)
            print(f"\n🏆 TOP 10 MOST RETRIEVED ARTICLES:")
            for article_id, count in article_counts.most_common(10):
                print(f"  {article_id}: {count} times")
        
        return df
        
    except Exception as e:
        print(f"❌ Error analyzing CSV: {e}")
        return None

def resume_export(existing_file, target_total, index_type='private', top_k=10):
    """
    Resume an interrupted export
    
    Args:
        existing_file (str): Path to existing CSV file
        target_total (int): Target total number of queries
        index_type (str): Index type
        top_k (int): Top K articles
    
    Example:
        resume_export('private_results.csv', 5000, 'private', 10)
    """
    
    try:
        # Check current progress
        df = pd.read_csv(existing_file)
        current_count = len(df)
        
        remaining = target_total - current_count
        
        if remaining <= 0:
            print(f"✅ Export already complete: {current_count} queries")
            return existing_file
        
        print(f"🔄 RESUMING EXPORT")
        print(f"📊 Current: {current_count:,} queries")
        print(f"🎯 Target: {target_total:,} queries")
        print(f"⏳ Remaining: {remaining:,} queries")
        
        # Create temporary file for new results
        temp_file = f"temp_resume_{datetime.now().strftime('%H%M%S')}.csv"
        
        # Export remaining queries
        batch_export_query_results(
            index_type=index_type,
            total_queries=remaining,
            top_k=top_k,
            output_file=temp_file
        )
        
        # Merge files
        print(f"\n🔗 Merging results...")
        
        # Read both files
        df_existing = pd.read_csv(existing_file)
        df_new = pd.read_csv(temp_file)
        
        # Combine
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        
        # Save combined results
        final_file = f"resumed_{existing_file}"
        df_combined.to_csv(final_file, index=False)
        
        # Cleanup
        os.remove(temp_file)
        
        print(f"✅ Resume completed!")
        print(f"📄 Final file: {final_file}")
        print(f"📊 Total queries: {len(df_combined):,}")
        
        return final_file
        
    except Exception as e:
        print(f"❌ Error resuming export: {e}")
        return None

# =============================================================================
# QUICK START EXAMPLES FOR BATCH EXPORT
# =============================================================================

print("\n🚀 BATCH EXPORT FUNCTIONS READY!")
print("=" * 50)
print("# Quick samples (for testing)")
print("quick_export_sample('private', 100, 5)          # 100 private queries, top 5 articles")
print("quick_export_sample('public', 200, 10)          # 200 public queries, top 10 articles")
print()
print("# Full batch exports")
print("batch_export_query_results('private', 5000, 10) # 5K private queries, top 10 articles")
print("batch_export_query_results('public', 2000, 5)   # 2K public queries, top 5 articles")
print()
print("# Filtered exports")
print("export_filtered_queries('private', ['AI', 'blockchain'], 5)")
print("export_filtered_queries('training', ['machine learning'], 10)")
print()
print("# Analysis")
print("analyze_csv_results('your_file.csv')")
print()
print("# Resume interrupted export")
print("resume_export('private_results.csv', 10000, 'private', 10)")
print()
print("💡 CSV Format: query_id, query_text, article_id_1, article_id_2, ..., article_id_top_k")


🚀 BATCH EXPORT FUNCTIONS READY!
# Quick samples (for testing)
quick_export_sample('private', 100, 5)          # 100 private queries, top 5 articles
quick_export_sample('public', 200, 10)          # 200 public queries, top 10 articles

# Full batch exports
batch_export_query_results('private', 5000, 10) # 5K private queries, top 10 articles
batch_export_query_results('public', 2000, 5)   # 2K public queries, top 5 articles

# Filtered exports
export_filtered_queries('private', ['AI', 'blockchain'], 5)
export_filtered_queries('training', ['machine learning'], 10)

# Analysis
analyze_csv_results('your_file.csv')

# Resume interrupted export
resume_export('private_results.csv', 10000, 'private', 10)

💡 CSV Format: query_id, query_text, article_id_1, article_id_2, ..., article_id_top_k


In [6]:
def check_and_fix_missing_results(csv_file, top_k=30, backup_original=True):
    """
    Check CSV file for missing results and fix them
    
    Args:
        csv_file (str): Path to CSV file to check and fix
        top_k (int): Number of top articles (should match original export)
        backup_original (bool): Create backup of original file
    
    Returns:
        dict: Summary of fixes applied
    
    Example:
        check_and_fix_missing_results('query_article_results_private_2000q_30k_20250616_201703.csv')
    """
    
    print(f"🔍 CHECKING & FIXING MISSING RESULTS")
    print(f"📄 File: {csv_file}")
    print(f"🎯 Expected top K: {top_k}")
    print("=" * 60)
    
    try:
        # Read the CSV file
        df = pd.read_csv(csv_file)
        total_queries = len(df)
        
        print(f"📊 Total queries in file: {total_queries:,}")
        
        # Find article ID columns
        article_cols = [col for col in df.columns if col.startswith('article_id_')]
        actual_top_k = len(article_cols)
        
        print(f"📈 Article columns found: {actual_top_k}")
        
        if actual_top_k != top_k:
            print(f"⚠️ Warning: Expected {top_k} columns, found {actual_top_k}")
        
        # Find missing results (queries with empty article_id_1)
        first_article_col = article_cols[0] if article_cols else None
        
        if not first_article_col:
            print("❌ No article columns found in CSV!")
            return None
        
        # Find rows with missing results
        missing_mask = df[first_article_col].isna() | (df[first_article_col] == '')
        missing_queries = df[missing_mask].copy()
        missing_count = len(missing_queries)
        
        print(f"❌ Missing results: {missing_count:,} queries ({missing_count/total_queries*100:.1f}%)")
        
        if missing_count == 0:
            print("✅ No missing results found! File is complete.")
            return {"status": "complete", "fixed": 0}
        
        # Create backup if requested
        if backup_original:
            backup_file = f"backup_{datetime.now().strftime('%H%M%S')}_{csv_file}"
            df.to_csv(backup_file, index=False)
            print(f"💾 Backup created: {backup_file}")
        
        print(f"\\n🔄 Starting to fix {missing_count:,} missing queries...")
        
        # Process missing queries in smaller batches to avoid timeout
        batch_size = 10  # Smaller batch size to avoid timeout
        fixed_count = 0
        failed_count = 0
        
        for i in range(0, missing_count, batch_size):
            batch_end = min(i + batch_size, missing_count)
            batch_queries = missing_queries.iloc[i:batch_end]
            
            print(f"\\n📦 Processing batch {i//batch_size + 1}: queries {i+1} to {batch_end}")
            
            for idx, row in batch_queries.iterrows():
                query_id = row['query_id']
                query_text = row['query_text']
                
                try:
                    print(f"  🔍 Fixing: {query_text[:50]}...")
                    
                    # Search for articles
                    article_results = searcher.search_articles_with_query(
                        query_text, 
                        size=actual_top_k
                    )
                    
                    # Extract article IDs
                    article_ids = []
                    if article_results and article_results['hits']['hits']:
                        for hit in article_results['hits']['hits']:
                            article_id = hit['_source'].get('article_id', hit['_id'])
                            article_ids.append(article_id)
                    
                    # Pad with empty strings if not enough articles
                    while len(article_ids) < actual_top_k:
                        article_ids.append('')
                    
                    # Update the dataframe
                    for j, col in enumerate(article_cols):
                        if j < len(article_ids):
                            df.loc[idx, col] = article_ids[j]
                    
                    fixed_count += 1
                    print(f"    ✅ Fixed: {len([aid for aid in article_ids if aid])} articles found")
                    
                except Exception as e:
                    failed_count += 1
                    print(f"    ❌ Failed: {e}")
                    # Add small delay on error
                    time.sleep(2)
                    continue
            
            # Small delay between batches
            time.sleep(1)
            
            # Progress update
            progress = (batch_end / missing_count) * 100
            print(f"  📊 Batch progress: {progress:.1f}% ({fixed_count} fixed, {failed_count} failed)")
        
        # Save the updated file
        output_file = f"fixed_{csv_file}"
        df.to_csv(output_file, index=False)
        
        # Final verification
        verification_df = pd.read_csv(output_file)
        remaining_missing = verification_df[first_article_col].isna().sum()
        
        print(f"\\n🎉 FIXING COMPLETED!")
        print(f"✅ Fixed queries: {fixed_count:,}")
        print(f"❌ Failed queries: {failed_count:,}")
        print(f"📄 Updated file: {output_file}")
        print(f"🔍 Remaining missing: {remaining_missing:,}")
        print(f"📊 Success rate: {(fixed_count/(fixed_count+failed_count)*100):.1f}%")
        
        return {
            "status": "completed",
            "original_missing": missing_count,
            "fixed": fixed_count,
            "failed": failed_count,
            "remaining_missing": remaining_missing,
            "output_file": output_file
        }
        
    except Exception as e:
        print(f"❌ Error during fix process: {e}")
        return {"status": "error", "error": str(e)}

def analyze_missing_patterns(csv_file):
    """
    Analyze patterns in missing results to understand why some queries fail
    
    Args:
        csv_file (str): Path to CSV file to analyze
    
    Example:
        analyze_missing_patterns('query_article_results_private_2000q_30k_20250616_201703.csv')
    """
    
    print(f"📊 ANALYZING MISSING RESULT PATTERNS")
    print(f"📄 File: {csv_file}")
    print("=" * 50)
    
    try:
        df = pd.read_csv(csv_file)
        
        # Find article columns
        article_cols = [col for col in df.columns if col.startswith('article_id_')]
        first_col = article_cols[0] if article_cols else None
        
        if not first_col:
            print("❌ No article columns found!")
            return
        
        # Identify missing queries
        missing_mask = df[first_col].isna() | (df[first_col] == '')
        missing_queries = df[missing_mask]
        
        total_queries = len(df)
        missing_count = len(missing_queries)
        
        print(f"📊 Total queries: {total_queries:,}")
        print(f"❌ Missing results: {missing_count:,} ({missing_count/total_queries*100:.1f}%)")
        
        if missing_count == 0:
            print("✅ No missing results to analyze!")
            return
        
        print(f"\\n📝 MISSING QUERY ANALYSIS:")
        print("-" * 30)
        
        # Query length analysis
        missing_queries['query_length'] = missing_queries['query_text'].str.len()
        avg_missing_length = missing_queries['query_length'].mean()
        
        # Compare with successful queries
        successful_queries = df[~missing_mask]
        if len(successful_queries) > 0:
            successful_queries['query_length'] = successful_queries['query_text'].str.len()
            avg_successful_length = successful_queries['query_length'].mean()
            
            print(f"📏 Average query length:")
            print(f"  Missing results: {avg_missing_length:.0f} characters")
            print(f"  Successful results: {avg_successful_length:.0f} characters")
            print(f"  Difference: {avg_missing_length - avg_successful_length:.0f} characters")
        
        # Show some example missing queries
        print(f"\\n🔍 SAMPLE MISSING QUERIES:")
        print("-" * 40)
        
        sample_missing = missing_queries.head(10)
        for i, (_, row) in enumerate(sample_missing.iterrows(), 1):
            query_text = row['query_text']
            query_length = len(query_text)
            preview = query_text[:100] + "..." if len(query_text) > 100 else query_text
            print(f"{i:2d}. [{query_length:3d} chars] {preview}")
        
        # Query word analysis
        print(f"\\n🔤 QUERY CHARACTERISTICS:")
        print("-" * 30)
        
        # Word count
        missing_queries['word_count'] = missing_queries['query_text'].str.split().str.len()
        avg_missing_words = missing_queries['word_count'].mean()
        
        if len(successful_queries) > 0:
            successful_queries['word_count'] = successful_queries['query_text'].str.split().str.len()
            avg_successful_words = successful_queries['word_count'].mean()
            
            print(f"📊 Average word count:")
            print(f"  Missing results: {avg_missing_words:.1f} words")
            print(f"  Successful results: {avg_successful_words:.1f} words")
        
        # Common patterns in missing queries
        print(f"\\n🔍 COMMON PATTERNS IN MISSING QUERIES:")
        print("-" * 40)
        
        # Check for very long queries
        very_long = missing_queries[missing_queries['query_length'] > 1000]
        print(f"📏 Very long queries (>1000 chars): {len(very_long)}")
        
        # Check for queries with special characters
        special_chars = missing_queries[missing_queries['query_text'].str.contains(r'[^\w\s.,!?-]', regex=True, na=False)]
        print(f"🔤 Queries with special characters: {len(special_chars)}")
        
        # Check for queries that might be image descriptions
        image_keywords = ['photograph', 'image', 'picture', 'shows', 'captures', 'depicts']
        image_queries = missing_queries[missing_queries['query_text'].str.contains('|'.join(image_keywords), case=False, na=False)]
        print(f"🖼️ Image description queries: {len(image_queries)}")
        
        return {
            "total_queries": total_queries,
            "missing_count": missing_count,
            "missing_percentage": missing_count/total_queries*100,
            "avg_missing_length": avg_missing_length,
            "avg_missing_words": avg_missing_words,
            "very_long_count": len(very_long),
            "special_chars_count": len(special_chars),
            "image_queries_count": len(image_queries)
        }
        
    except Exception as e:
        print(f"❌ Error analyzing patterns: {e}")
        return None

def quick_test_missing_queries(csv_file, sample_size=5):
    """
    Quick test a few missing queries to see what's happening
    
    Args:
        csv_file (str): Path to CSV file
        sample_size (int): Number of missing queries to test
    
    Example:
        quick_test_missing_queries('query_article_results_private_2000q_30k_20250616_201703.csv', 3)
    """
    
    print(f"🧪 QUICK TEST OF MISSING QUERIES")
    print(f"📄 File: {csv_file}")
    print(f"🔢 Sample size: {sample_size}")
    print("=" * 50)
    
    try:
        df = pd.read_csv(csv_file)
        
        # Find missing queries
        article_cols = [col for col in df.columns if col.startswith('article_id_')]
        first_col = article_cols[0] if article_cols else None
        
        if not first_col:
            print("❌ No article columns found!")
            return
        
        missing_mask = df[first_col].isna() | (df[first_col] == '')
        missing_queries = df[missing_mask].head(sample_size)
        
        if len(missing_queries) == 0:
            print("✅ No missing queries found!")
            return
        
        print(f"📊 Testing {len(missing_queries)} missing queries...")
        
        for i, (_, row) in enumerate(missing_queries.iterrows(), 1):
            query_id = row['query_id']
            query_text = row['query_text']
            
            print(f"\\n🔍 TEST {i}/{len(missing_queries)}")
            print(f"📋 Query ID: {query_id}")
            print(f"📝 Query: {query_text[:100]}...")
            print(f"📏 Length: {len(query_text)} characters")
            print("-" * 40)
            
            try:
                # Test the search
                start_time = time.time()
                article_results = searcher.search_articles_with_query(query_text, size=5)
                search_time = time.time() - start_time
                
                print(f"⏱️ Search time: {search_time:.2f} seconds")
                
                if article_results:
                    total_found = article_results['hits']['total']['value']
                    results_returned = len(article_results['hits']['hits'])
                    
                    print(f"📊 Total found: {total_found:,}")
                    print(f"📄 Results returned: {results_returned}")
                    
                    if results_returned > 0:
                        print("✅ Query works! Sample results:")
                        for j, hit in enumerate(article_results['hits']['hits'][:3], 1):
                            title = hit['_source'].get('title', 'No title')[:50]
                            score = hit['_score']
                            print(f"  {j}. {title}... (Score: {score:.2f})")
                    else:
                        print("⚠️ Query executed but no results found")
                else:
                    print("❌ Query failed - no response")
                    
            except Exception as e:
                print(f"❌ Query failed with error: {e}")
            
            print("=" * 50)
    
    except Exception as e:
        print(f"❌ Error during quick test: {e}")

# =============================================================================
# QUICK START EXAMPLES FOR FIXING
# =============================================================================

print("\\n🔧 MISSING RESULTS FIXING FUNCTIONS READY!")
print("=" * 50)
print("# Check and fix missing results")
print("check_and_fix_missing_results('your_file.csv', 30)")
print()
print("# Analyze missing patterns")
print("analyze_missing_patterns('your_file.csv')")
print()
print("# Quick test some missing queries")
print("quick_test_missing_queries('your_file.csv', 5)")
print()
print("💡 These functions help you identify and fix timeout/missing results in your CSV exports!")


\n🔧 MISSING RESULTS FIXING FUNCTIONS READY!
# Check and fix missing results
check_and_fix_missing_results('your_file.csv', 30)

# Analyze missing patterns
analyze_missing_patterns('your_file.csv')

# Quick test some missing queries
quick_test_missing_queries('your_file.csv', 5)

💡 These functions help you identify and fix timeout/missing results in your CSV exports!


In [15]:
# Sử dụng với file CSV của bạn
csv_filename = 'query_article_results_private_2000q_30k_20250616_201703.csv'

# Trước tiên, analyze xem có bao nhiêu missing
print("🔍 ANALYZING YOUR CSV FILE...")
analyze_missing_patterns(csv_filename)


🔍 ANALYZING YOUR CSV FILE...
📊 ANALYZING MISSING RESULT PATTERNS
📄 File: query_article_results_private_2000q_30k_20250616_201703.csv
📊 Total queries: 2,000
❌ Missing results: 11 (0.5%)
\n📝 MISSING QUERY ANALYSIS:
------------------------------
📏 Average query length:
  Missing results: 838 characters
  Successful results: 804 characters
  Difference: 33 characters
\n🔍 SAMPLE MISSING QUERIES:
----------------------------------------
 1. [594 chars] The image depicts a unique and stylized interpretation of a golf course scene, possibly reflecting a...
 2. [952 chars] This image captures a thrilling scene from the upcoming Wimbledon tennis tournament, slated to begin...
 3. [1026 chars] Manassero's victory at the PGA Championship, a pinnacle moment in the world of professional golf, is...
 4. [656 chars] The image captures Bernard Tomic mid-match during his first round encounter at Wimbledon against Jo-...
 5. [1572 chars] This image captures a dynamic moment from the Australian Open tenn

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_queries['query_length'] = missing_queries['query_text'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  successful_queries['query_length'] = successful_queries['query_text'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_queries['word_count'] = missing_queries['qu

{'total_queries': 2000,
 'missing_count': 11,
 'missing_percentage': 0.5499999999999999,
 'avg_missing_length': np.float64(837.6363636363636),
 'avg_missing_words': np.float64(128.72727272727272),
 'very_long_count': 3,
 'special_chars_count': 10,
 'image_queries_count': 11}

In [13]:
batch_export_query_results('private', 2000, 30)


🚀 BATCH EXPORT STARTING
📊 Query index: private_queries
🎯 Total queries: 2,000
📈 Top K articles: 30
🔢 Batch size: 50
📄 Output file: query_article_results_private_2000q_30k_20250616_201703.csv

📦 Processing batch: 1 to 50
✅ Batch completed: 50 successful
📊 Progress: 50/2000 (2.5%)
⚡ Speed: 0.1 queries/sec
🎯 ETA: 223.6 minutes

📦 Processing batch: 51 to 100
✅ Batch completed: 50 successful
📊 Progress: 100/2000 (5.0%)
⚡ Speed: 0.1 queries/sec
🎯 ETA: 226.5 minutes

📦 Processing batch: 101 to 150
✅ Batch completed: 50 successful
📊 Progress: 150/2000 (7.5%)
⚡ Speed: 0.1 queries/sec
🎯 ETA: 219.7 minutes

📦 Processing batch: 151 to 200
✅ Batch completed: 50 successful
📊 Progress: 200/2000 (10.0%)
⚡ Speed: 0.1 queries/sec
🎯 ETA: 208.8 minutes

📦 Processing batch: 201 to 250
✅ Batch completed: 50 successful
📊 Progress: 250/2000 (12.5%)
⚡ Speed: 0.1 queries/sec
🎯 ETA: 203.3 minutes

📦 Processing batch: 251 to 300
✅ Batch completed: 50 successful
📊 Progress: 300/2000 (15.0%)
⚡ Speed: 0.1 queries/se

'query_article_results_private_2000q_30k_20250616_201703.csv'

In [16]:
check_and_fix_missing_results('query_article_results_private_2000q_30k_20250616_201703.csv', 30)

🔍 CHECKING & FIXING MISSING RESULTS
📄 File: query_article_results_private_2000q_30k_20250616_201703.csv
🎯 Expected top K: 30
📊 Total queries in file: 2,000
📈 Article columns found: 30
❌ Missing results: 11 queries (0.5%)
💾 Backup created: backup_002353_query_article_results_private_2000q_30k_20250616_201703.csv
\n🔄 Starting to fix 11 missing queries...
\n📦 Processing batch 1: queries 1 to 10
  🔍 Fixing: The image depicts a unique and stylized interpreta...
    ✅ Fixed: 30 articles found
  🔍 Fixing: This image captures a thrilling scene from the upc...
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
    ✅ Fixed: 0 articles found
  🔍 Fixing: Manassero's victory at the PGA Championship, a pin...
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
    ✅ Fixed: 0 articles found
  🔍 Fixing: The image captures Bernard Tomic mid-match during ...
    ✅ Fixed: 30 articles found
  🔍 F

{'status': 'completed',
 'original_missing': 11,
 'fixed': 11,
 'failed': 0,
 'remaining_missing': np.int64(2),
 'output_file': 'fixed_query_article_results_private_2000q_30k_20250616_201703.csv'}

In [4]:
batch_export_query_results('private', 2000, 200)


🚀 BATCH EXPORT STARTING
📊 Query index: private_queries
🎯 Total queries: 2,000
📈 Top K articles: 200
🔢 Batch size: 50
📄 Output file: query_article_results_private_2000q_200k_20250617_103428.csv

📦 Processing batch: 1 to 50
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
❌ Article search error: HTTPC

PermissionError: [Errno 13] Permission denied: 'query_article_results_private_2000q_200k_20250617_103428.csv'

In [7]:
check_and_fix_missing_results('query_article_results_private_2000q_200k_20250617_103428.csv', 200)

🔍 CHECKING & FIXING MISSING RESULTS
📄 File: query_article_results_private_2000q_200k_20250617_103428.csv
🎯 Expected top K: 200
📊 Total queries in file: 600
📈 Article columns found: 200
❌ Missing results: 151 queries (25.2%)
💾 Backup created: backup_160314_query_article_results_private_2000q_200k_20250617_103428.csv
\n🔄 Starting to fix 151 missing queries...
\n📦 Processing batch 1: queries 1 to 10
  🔍 Fixing: In this striking photograph, tennis player Fabio F...
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
    ✅ Fixed: 0 articles found
  🔍 Fixing: This image captures a pivotal moment in golf histo...
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
    ✅ Fixed: 0 articles found
  🔍 Fixing: This image captures a moment during the Masters go...
❌ Article search error: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)
    ✅ Fixed: 0 articl

{'status': 'completed',
 'original_missing': 151,
 'fixed': 151,
 'failed': 0,
 'remaining_missing': np.int64(134),
 'output_file': 'fixed_query_article_results_private_2000q_200k_20250617_103428.csv'}