In [None]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import numpy as np
import urllib3
from elasticsearch import helpers
import json
from typing import Dict, List
import logging

# Disable SSL warnings
# (the client below is created with verify_certs=False, which would otherwise
# make urllib3 emit an InsecureRequestWarning for requests)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Connection (already established)
# NOTE(review): ELASTIC_PASSWORD is not defined anywhere in this view --
# presumably it is set by an earlier cell; confirm before running standalone.
# NOTE(review): verify_certs=False disables TLS certificate verification;
# presumably acceptable for this localhost node only -- confirm.
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', ELASTIC_PASSWORD),
    verify_certs=False,
    ssl_show_warn=False
)

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Test the model
# Encodes a throwaway string and prints the vector length as a sanity check.
test_embedding = model.encode("test text")
print(f"Embedding dimension: {len(test_embedding)}")

# Define index names
# agenda_index is the target that will hold the embedded documents;
# source_agenda_index is the index the documents are read from.
agenda_index = 'dynamic_project_search_v1_0_6_with_embeddings_v1'
source_agenda_index = 'dynamic_project_search_v1_0_6' 

# Mapping for the agenda index with embeddings. Every embedding field shares
# one dense-vector definition, so the fields are produced by small factories
# (each call returns a fresh dict; no definition object is shared).
def _text_field():
    """Return the definition of a plain full-text field."""
    return {"type": "text"}


def _integer_field():
    """Return the definition of an integer field."""
    return {"type": "integer"}


def _dense_vector_field():
    """Return the definition of an indexed 384-dim cosine dense vector."""
    return {
        "type": "dense_vector",
        "dims": 384,
        "index": True,
        "similarity": "cosine",
    }


agenda_mapping = {
    "mappings": {
        "properties": {
            # Original text fields
            "agenda_responses": _text_field(),
            "client_geography": _text_field(),
            "description": _text_field(),
            "expert_geographies": _text_field(),
            "functions": _text_field(),
            "id": _integer_field(),
            "name": _text_field(),
            "target_companies": _text_field(),
            "topic": _text_field(),

            # Individual embeddings
            "topic_embedding": _dense_vector_field(),
            "agenda_responses_embedding": _dense_vector_field(),
            "description_embedding": _dense_vector_field(),

            # Combined agenda questions embedding (all questions together)
            "agenda_questions_combined_embedding": _dense_vector_field(),

            # Agenda questions as nested objects with individual embeddings
            "agenda_questions": {
                "type": "nested",
                "properties": {
                    "question_text": _text_field(),
                    "question_number": _integer_field(),
                    "embedding": _dense_vector_field(),
                },
            },

            # Complete combined embedding for the entire agenda
            "combined_embedding": _dense_vector_field(),
        }
    }
}

In [None]:
def process_agenda_questions(questions):
    """
    Process agenda questions into the nested structure with embeddings.

    Args:
        questions: One of
            * a list or tuple of questions (each item is stringified),
            * a single string -- split on newlines when it contains any,
              otherwise treated as one question,
            * any other truthy value, stringified and treated as one question.
            Falsy input (None, "", []) yields an empty list.

    Returns:
        A list of dicts, one per non-blank question, with keys
        ``question_text`` (whitespace-stripped text), ``question_number``
        (1-based position in the sequence being enumerated, so a blank list
        entry leaves a gap rather than shifting later numbers) and
        ``embedding`` (list of floats for ``question_text``).

    Uses the module-level ``model`` to compute the embeddings.
    """
    if not questions:
        return []

    # Handle different input types
    if isinstance(questions, str):
        # A single string may hold several newline-separated questions.
        if '\n' in questions:
            questions = [q.strip() for q in questions.split('\n') if q.strip()]
        else:
            questions = [questions]
    elif not isinstance(questions, (list, tuple)):
        questions = [str(questions)]

    # Collect (number, stripped text) for every usable question first so the
    # embeddings can be computed in a single batched encode call.
    numbered_texts = []
    for number, question in enumerate(questions, start=1):
        if not question:
            continue
        text = str(question).strip()
        if text:
            numbered_texts.append((number, text))

    if not numbered_texts:
        return []

    # Encode exactly the text that is stored, so text and vector always agree.
    embeddings = model.encode([text for _, text in numbered_texts])

    return [
        {
            "question_text": text,
            "question_number": number,
            "embedding": embedding.tolist(),
        }
        for (number, text), embedding in zip(numbered_texts, embeddings)
    ]

def create_agenda_combined_text(doc: Dict) -> str:
    """
    Build one " | "-separated string describing the whole agenda.

    Segments appear in a fixed order (topic, name, description, questions,
    responses, functions, target companies, client geography, expert
    geographies); a segment is left out when its field is missing, falsy or
    whitespace-only.
    """
    def labelled(field: str, label: str) -> List[str]:
        """Return ["<label>: <value>"] for a usable field, else []."""
        value = doc.get(field)
        if value and str(value).strip():
            return [f"{label}: {value}"]
        return []

    def question_segment() -> List[str]:
        """Return the "Questions: ..." segment, or [] when there is none."""
        raw = doc.get('agenda_questions')
        if not raw:
            return []
        if isinstance(raw, str):
            # A string is passed through verbatim.
            return [f"Questions: {raw}"]
        if isinstance(raw, list):
            # Numbering follows list position, so skipped entries leave gaps.
            numbered = " | ".join(
                f"Q{position}: {item}"
                for position, item in enumerate(raw, start=1)
                if item
            )
            return [f"Questions: {numbered}"] if numbered else []
        # Any other container type is ignored.
        return []

    segments: List[str] = []
    for field, label in (
        ('topic', "Topic"),
        ('name', "Agenda name"),
        ('description', "Description"),
    ):
        segments.extend(labelled(field, label))

    segments.extend(question_segment())

    for field, label in (
        ('agenda_responses', "Responses"),
        ('functions', "Functions"),
        ('target_companies', "Target companies"),
        ('client_geography', "Client geography"),
        ('expert_geographies', "Expert geographies"),
    ):
        segments.extend(labelled(field, label))

    return " | ".join(segments)

# Relative weight of each agenda field in the combined embedding.
_AGENDA_FIELD_WEIGHTS = {
    'topic': 2.0,
    'agenda_questions': 2.5,
    'description': 2.0,
    'agenda_responses': 1.5,
    'functions': 1.0,
    'target_companies': 0.8,
    'geographies': 0.5,
}

# Source field -> name of the field that stores its embedding.
_AGENDA_EMBEDDING_FIELDS = {
    'topic': 'topic_embedding',
    'agenda_responses': 'agenda_responses_embedding',
    'description': 'description_embedding',
}


def _has_text(value) -> bool:
    """Return True when *value* is truthy and not whitespace-only."""
    return bool(value) and bool(str(value).strip())


def _collect_weighted_texts(doc: Dict) -> List[tuple]:
    """Return the ``(text, weight)`` pairs that feed the combined embedding."""
    pairs = []

    if _has_text(doc.get('topic')):
        pairs.append((str(doc['topic']), _AGENDA_FIELD_WEIGHTS['topic']))

    questions = doc.get('agenda_questions')
    if questions:
        questions_text = ""
        if isinstance(questions, str):
            questions_text = questions
        elif isinstance(questions, list):
            questions_text = " | ".join(str(q) for q in questions if q)
        if questions_text.strip():
            pairs.append((questions_text, _AGENDA_FIELD_WEIGHTS['agenda_questions']))

    for field in ('description', 'agenda_responses', 'functions', 'target_companies'):
        if _has_text(doc.get(field)):
            pairs.append((str(doc[field]), _AGENDA_FIELD_WEIGHTS[field]))

    # Both geography fields share a single weighted slot.
    geo_texts = []
    if _has_text(doc.get('client_geography')):
        geo_texts.append(f"Client: {doc['client_geography']}")
    if _has_text(doc.get('expert_geographies')):
        geo_texts.append(f"Expert: {doc['expert_geographies']}")
    if geo_texts:
        pairs.append((" | ".join(geo_texts), _AGENDA_FIELD_WEIGHTS['geographies']))

    return pairs


def generate_agenda_embeddings(doc: Dict, model) -> Dict:
    """Generate embeddings for an agenda document.

    Args:
        doc: Source agenda document. It is not modified; a shallow copy is
            enriched and returned.
        model: Object exposing ``encode(text)`` that returns an array-like
            supporting ``.tolist()`` and scalar multiplication.

    Returns:
        A copy of ``doc`` with, where the corresponding text is present:
        ``topic_embedding``, ``agenda_responses_embedding``,
        ``description_embedding``, ``agenda_questions`` (replaced by the
        output of ``process_agenda_questions``),
        ``agenda_questions_combined_embedding`` and a unit-length
        ``combined_embedding``. Embedding failures are logged as warnings and
        the affected field is simply left out.
    """
    # Same logger object as the module-level ``logger`` (both use __name__).
    log = logging.getLogger(__name__)
    enhanced_doc = doc.copy()

    # The same text (e.g. the topic) feeds both its own embedding and the
    # combined one; cache by text so each distinct string is encoded once.
    encoded = {}

    def encode(text):
        if text not in encoded:
            encoded[text] = model.encode(text)
        return encoded[text]

    # Generate embeddings for individual fields
    for field, embedding_field in _AGENDA_EMBEDDING_FIELDS.items():
        if _has_text(doc.get(field)):
            try:
                enhanced_doc[embedding_field] = encode(str(doc[field])).tolist()
            except Exception as e:
                log.warning("Error generating embedding for %s: %s", field, e)

    # Process agenda questions. Done whenever the key is present so that a
    # falsy non-list value (e.g. "") is normalised to [] instead of being
    # copied as-is into a field the target mapping declares as nested.
    if 'agenda_questions' in doc:
        processed_questions = process_agenda_questions(doc['agenda_questions'])
        enhanced_doc['agenda_questions'] = processed_questions

        # Create combined questions embedding
        if processed_questions:
            all_questions_text = " | ".join(q['question_text'] for q in processed_questions)
            try:
                enhanced_doc['agenda_questions_combined_embedding'] = encode(all_questions_text).tolist()
            except Exception as e:
                log.warning("Error generating combined questions embedding: %s", e)

    # Generate weighted combined embedding
    weighted_texts = _collect_weighted_texts(doc)
    if weighted_texts:
        try:
            weighted_embeddings = [encode(text) * weight for text, weight in weighted_texts]

            # The mean only rescales the weighted sum; that constant factor
            # cancels in the normalisation below.
            combined_embedding = np.mean(weighted_embeddings, axis=0)
            norm = np.linalg.norm(combined_embedding)

            # A zero or non-finite norm would turn the vector into NaNs;
            # leave the field out rather than store an unusable vector.
            if 0 < norm < float('inf'):
                enhanced_doc["combined_embedding"] = (combined_embedding / norm).tolist()
            else:
                log.warning("Skipping combined embedding: norm is %s", norm)
        except Exception as e:
            log.warning("Error generating combined embedding: %s", e)

    return enhanced_doc

# Create the index
# Any existing target index is dropped first, so every run starts from an
# empty index with the current mapping.
if es.indices.exists(index=agenda_index):
    print(f"Index {agenda_index} already exists. Deleting...")
    es.indices.delete(index=agenda_index)

# Pass the top-level sections of agenda_mapping ("mappings") as keyword
# arguments instead of the deprecated catch-all ``body=`` parameter.
es.indices.create(index=agenda_index, **agenda_mapping)

In [None]:
def reindex_agendas_with_embeddings(source_index, target_index, batch_size=500):
    """Reindex agendas with embeddings.

    Scans every document in ``source_index``, enriches it with
    ``generate_agenda_embeddings`` and bulk-indexes the result into
    ``target_index`` under the same ``_id``. Progress and errors are printed;
    nothing is returned.

    Args:
        source_index: Name of the index to read documents from. If it does
            not exist, the available non-system indices are listed and the
            function returns without indexing anything.
        target_index: Name of the index the enriched documents are written to.
        batch_size: Number of documents sent per bulk request.
    """
    total_processed = 0
    total_failed = 0
    actions = []

    def flush(batch):
        """Bulk-index ``batch`` and return ``(indexed, failed)`` counts."""
        # raise_on_error=False: rejected documents come back in ``errors``
        # instead of raising, so one bad document cannot abort the batch.
        indexed, errors = helpers.bulk(es, batch, raise_on_error=False)
        for item in errors:
            # Print only id and reason; the full item may embed the document.
            details = next(iter(item.values()), {}) if isinstance(item, dict) else {}
            if not isinstance(details, dict):
                details = {}
            print(f"Failed to index document {details.get('_id', 'unknown')}: {details.get('error', item)}")
        return indexed, len(errors)

    print(f"\nStarting reindexing from '{source_index}' to '{target_index}'...")

    try:
        # Check if source index exists
        if not es.indices.exists(index=source_index):
            print(f"Source index '{source_index}' not found!")
            # List available indices
            indices = es.indices.get_alias(index="*")
            print("\nAvailable indices:")
            for idx in sorted(indices.keys()):
                if not idx.startswith('.'):  # Skip system indices
                    print(f"  - {idx}")
            return

        # Get document count
        count = es.count(index=source_index)['count']
        print(f"Found {count} documents to process")

        milestones = [500, 1000, 2000, 3000, 5000, 7500, 10000, 15000, 20000]
        next_milestone_idx = 0

        # Process documents using helpers.scan
        for doc in helpers.scan(es, index=source_index, size=100):
            # Only the per-document work is guarded here; the bulk request is
            # issued outside this try so a failed flush can never leave the
            # batch uncleared and re-sent on every following document.
            try:
                enhanced_doc = generate_agenda_embeddings(doc['_source'], model)
            except Exception as e:
                print(f"Error processing document {doc.get('_id', 'unknown')}: {e}")
                continue

            actions.append({
                "_index": target_index,
                "_id": doc['_id'],
                "_source": enhanced_doc
            })

            if len(actions) < batch_size:
                continue

            # Hand the full batch over and start a fresh one before sending.
            batch, actions = actions, []
            indexed, failed = flush(batch)
            total_processed += indexed
            total_failed += failed

            # Print at milestones only; skip past every milestone already
            # reached so a large batch does not report stale numbers.
            reached = None
            while next_milestone_idx < len(milestones) and total_processed >= milestones[next_milestone_idx]:
                reached = milestones[next_milestone_idx]
                next_milestone_idx += 1
            if reached is not None:
                print(f"Indexed {reached} documents")

        # Index remaining documents
        if actions:
            indexed, failed = flush(actions)
            total_processed += indexed
            total_failed += failed

        print(f"\nReindexing complete! Total documents indexed: {total_processed}")
        if total_failed:
            print(f"Documents rejected during bulk indexing: {total_failed}")

        # Refresh the index
        es.indices.refresh(index=target_index)
        print(f"Index '{target_index}' refreshed and ready for search")

    except Exception as e:
        print(f"Error during reindexing: {e}")
        import traceback
        traceback.print_exc()

# # Test the embedding generation
# test_agenda = {
#     "topic": "AI Strategy Implementation for Enterprise",
#     "name": "Q3 2024 AI Roadmap Discussion",
#     "description": "Discuss the implementation of AI initiatives across different business units",
#     "agenda_questions": [
#         "What are the key AI use cases for our customer service department?",
#         "How can we integrate LLMs into our existing workflow?",
#         "What are the budget requirements for AI infrastructure?",
#         "What training programs do we need for our teams?"
#     ],
#     "agenda_responses": "Initial assessment shows high potential for chatbot implementation and process automation",
#     "functions": "Technology, Customer Service, Operations",
#     "target_companies": "Enterprise Software Companies, Financial Services",
#     "client_geography": "North America",
#     "expert_geographies": "North America, Europe"
# }

# print("\n=== Testing agenda embedding generation ===")
# enhanced_test = generate_agenda_embeddings(test_agenda, model)
# embedding_count = len([k for k in enhanced_test.keys() if 'embedding' in k])
# print(f"Test agenda enhanced with {embedding_count} embeddings")
# print(f"Generated embeddings: {[k for k in enhanced_test.keys() if 'embedding' in k]}")
# print(f"Number of questions processed: {len(enhanced_test.get('agenda_questions', []))}")

# Run the reindex: read from the source agenda index and write the
# embedding-enriched documents to the target agenda index.
reindex_agendas_with_embeddings(
    source_index=source_agenda_index,
    target_index=agenda_index,
)