In [None]:
# filepath: notebooks/02_index_creation_tutorial.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tutorial de Criação de Índices\n",
    "\n",
    "Este notebook demonstra como criar e configurar índices inteligentes no Azure Cognitive Search.\n",
    "\n",
    "## Objetivos\n",
    "- Definir schemas de índice otimizados\n",
    "- Configurar skills cognitivas para enriquecimento\n",
    "- Criar indexadores para processamento automático\n",
    "- Monitorar o processo de indexação"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports e configurações\n",
    "import sys\n",
    "import os\n",
    "sys.path.append('..')\n",
    "\n",
    "from src.indexing.index_manager import IndexManager\n",
    "from src.indexing.schema_builder import IndexSchemaBuilder\n",
    "from src.indexing.cognitive_skills import CognitiveSkillsManager\n",
    "from config.azure_config import config\n",
    "\n",
    "import time\n",
    "import json\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from datetime import datetime\n",
    "\n",
    "print(\"Módulos importados com sucesso!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Configuração do Gerenciador de Índices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both managers target the same search service, so the shared\n",
    "# connection settings are gathered once and reused for each of them.\n",
    "search_connection = {\n",
    "    'service_name': config.search_service_name,\n",
    "    'admin_key': config.search_admin_key\n",
    "}\n",
    "\n",
    "index_manager = IndexManager(**search_connection)\n",
    "skills_manager = CognitiveSkillsManager(\n",
    "    cognitive_services_key=config.cognitive_services_key,\n",
    "    **search_connection\n",
    ")\n",
    "\n",
    "print(\"Gerenciadores inicializados!\")\n",
    "\n",
    "# Report the indexes the manager already knows about before creating new ones.\n",
    "existing_indexes = index_manager.list_indexes()\n",
    "print(f\"\\nÍndices existentes: {existing_indexes}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Definição do Schema do Índice"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Declarative description of the index: (field name, EDM type, attributes).\n",
    "# The rows are registered with the builder in exactly this order.\n",
    "schema_builder = IndexSchemaBuilder()\n",
    "\n",
    "field_specs = [\n",
    "    # Basic fields\n",
    "    ('id', 'Edm.String', {'key': True}),\n",
    "    ('title', 'Edm.String', {'searchable': True, 'filterable': True, 'sortable': True}),\n",
    "    ('content', 'Edm.String', {'searchable': True}),\n",
    "    ('summary', 'Edm.String', {'searchable': True}),\n",
    "    # Metadata\n",
    "    ('category', 'Edm.String', {'filterable': True, 'facetable': True}),\n",
    "    ('file_type', 'Edm.String', {'filterable': True, 'facetable': True}),\n",
    "    ('file_size', 'Edm.Int64', {'filterable': True, 'sortable': True}),\n",
    "    ('created_date', 'Edm.DateTimeOffset', {'filterable': True, 'sortable': True}),\n",
    "    ('modified_date', 'Edm.DateTimeOffset', {'filterable': True, 'sortable': True}),\n",
    "    # AI-enriched fields\n",
    "    ('language', 'Edm.String', {'filterable': True, 'facetable': True}),\n",
    "    ('key_phrases', 'Collection(Edm.String)', {'searchable': True, 'filterable': True}),\n",
    "    ('entities', 'Collection(Edm.String)', {'searchable': True, 'filterable': True}),\n",
    "    ('sentiment_score', 'Edm.Double', {'filterable': True, 'sortable': True}),\n",
    "    ('sentiment', 'Edm.String', {'filterable': True, 'facetable': True})\n",
    "]\n",
    "\n",
    "for field_name, edm_type, attributes in field_specs:\n",
    "    schema_builder.add_field(field_name, edm_type, **attributes)\n",
    "\n",
    "# Build the final schema\n",
    "fields = schema_builder.build()\n",
    "\n",
    "# Attribute -> tag printed next to a field when that attribute is truthy.\n",
    "flag_labels = (\n",
    "    ('key', ' [KEY]'),\n",
    "    ('searchable', ' [SEARCHABLE]'),\n",
    "    ('filterable', ' [FILTERABLE]')\n",
    ")\n",
    "\n",
    "print(f\"Schema criado com {len(fields)} campos:\")\n",
    "for field in fields:\n",
    "    description = f\"- {field.name} ({field.type})\"\n",
    "    # getattr with a default tolerates field objects that lack an attribute.\n",
    "    description += ''.join(\n",
    "        label for attribute, label in flag_labels if getattr(field, attribute, False)\n",
    "    )\n",
    "    print(description)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Criação do Skillset de IA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "skillset_name = \"document-processing-skillset\"\n",
    "\n",
    "# Every capability is switched on; dict.fromkeys keeps the listed order.\n",
    "skills_config = dict.fromkeys(\n",
    "    [\n",
    "        'language_detection',\n",
    "        'entity_extraction',\n",
    "        'key_phrase_extraction',\n",
    "        'sentiment_analysis',\n",
    "        'ocr_enabled'\n",
    "    ],\n",
    "    True\n",
    ")\n",
    "\n",
    "print(f\"Criando skillset '{skillset_name}' com as seguintes capabilities:\")\n",
    "for skill_key, is_enabled in skills_config.items():\n",
    "    marker = \"✅\" if is_enabled else \"❌\"\n",
    "    label = skill_key.replace('_', ' ').title()\n",
    "    print(f\"{marker} {label}\")\n",
    "\n",
    "# Create the skillset; the manager reports the outcome as a dict.\n",
    "skillset_result = skills_manager.create_skillset(skillset_name, skills_config)\n",
    "\n",
    "if not skillset_result['success']:\n",
    "    print(f\"\\n❌ Erro ao criar skillset: {skillset_result['error']}\")\n",
    "else:\n",
    "    print(f\"\\n✅ Skillset criado com sucesso! ({skillset_result['skills_count']} skills)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Criação do Índice"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the index from the schema built earlier in the notebook.\n",
    "index_name = \"intelligent-documents-index\"\n",
    "print(f\"Criando índice '{index_name}'...\")\n",
    "\n",
    "index_result = index_manager.create_index(index_name, fields)\n",
    "\n",
    "if not index_result['success']:\n",
    "    print(f\"❌ Erro ao criar índice: {index_result['error']}\")\n",
    "\n",
    "    # An existing index is not fatal: fall back to updating it in place.\n",
    "    already_exists = \"already exists\" in index_result.get('error', '').lower()\n",
    "    if already_exists:\n",
    "        print(\"\\nTentando atualizar o índice existente...\")\n",
    "        update_result = index_manager.update_index(index_name, fields)\n",
    "        outcome = (\n",
    "            \"✅ Índice atualizado com sucesso!\"\n",
    "            if update_result['success']\n",
    "            else f\"❌ Erro ao atualizar índice: {update_result['error']}\"\n",
    "        )\n",
    "        print(outcome)\n",
    "else:\n",
    "    print(\"✅ Índice criado com sucesso!\")\n",
    "    print(f\"   Nome: {index_result['index_name']}\")\n",
    "    print(f\"   Campos: {index_result['field_count']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Configuração da Fonte de Dados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure the Blob Storage data source\n",
    "from azure.search.documents.indexes import SearchIndexerClient\n",
    "from azure.search.documents.indexes.models import (\n",
    "    SearchIndexerDataSourceConnection,\n",
    "    SearchIndexerDataContainer\n",
    ")\n",
    "from azure.core.credentials import AzureKeyCredential\n",
    "\n",
    "# Client used by this and the following cells for data sources and indexers.\n",
    "indexer_client = SearchIndexerClient(\n",
    "    endpoint=f\"https://{config.search_service_name}.search.windows.net\",\n",
    "    credential=AzureKeyCredential(config.search_admin_key)\n",
    ")\n",
    "\n",
    "# Define the data source\n",
    "datasource_name = \"documents-blob-datasource\"\n",
    "\n",
    "datasource = SearchIndexerDataSourceConnection(\n",
    "    name=datasource_name,\n",
    "    type=\"azureblob\",\n",
    "    connection_string=config.storage_connection_string,\n",
    "    container=SearchIndexerDataContainer(name=config.storage_container_name)\n",
    ")\n",
    "\n",
    "try:\n",
    "    # The SDK method is create_data_source_connection. The previous call to\n",
    "    # create_data_source raised AttributeError, which the handler below\n",
    "    # reported as a creation error, so the data source was never created.\n",
    "    indexer_client.create_data_source_connection(datasource)\n",
    "    print(f\"✅ Fonte de dados '{datasource_name}' criada com sucesso!\")\n",
    "except Exception as e:\n",
    "    # Re-running the notebook hits an existing data source; treat that as informational.\n",
    "    if \"already exists\" in str(e).lower():\n",
    "        print(f\"ℹ️ Fonte de dados '{datasource_name}' já existe.\")\n",
    "    else:\n",
    "        print(f\"❌ Erro ao criar fonte de dados: {e}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Criação do Indexador"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the indexer\n",
    "from azure.search.documents.indexes.models import (\n",
    "    SearchIndexer,\n",
    "    FieldMapping,\n",
    "    OutputFieldMappingEntry\n",
    ")\n",
    "\n",
    "indexer_name = \"documents-indexer\"\n",
    "\n",
    "# Source field -> index field, for mappings that need no transformation.\n",
    "direct_mappings = [\n",
    "    (\"metadata_storage_name\", \"title\"),\n",
    "    (\"content\", \"content\"),\n",
    "    (\"metadata_storage_size\", \"file_size\"),\n",
    "    (\"metadata_storage_last_modified\", \"modified_date\"),\n",
    "    (\"metadata_storage_content_type\", \"file_type\")\n",
    "]\n",
    "\n",
    "# The document key comes from the storage path, passed through base64Encode.\n",
    "field_mappings = [\n",
    "    FieldMapping(\n",
    "        source_field_name=\"metadata_storage_path\",\n",
    "        target_field_name=\"id\",\n",
    "        mapping_function={\"name\": \"base64Encode\"}\n",
    "    )\n",
    "]\n",
    "field_mappings.extend(\n",
    "    FieldMapping(source_field_name=source, target_field_name=target)\n",
    "    for source, target in direct_mappings\n",
    ")\n",
    "\n",
    "# Skillset output path -> index field.\n",
    "enrichment_mappings = [\n",
    "    (\"/document/languageCode\", \"language\"),\n",
    "    (\"/document/keyPhrases\", \"key_phrases\"),\n",
    "    (\"/document/entities\", \"entities\"),\n",
    "    (\"/document/sentiment\", \"sentiment\"),\n",
    "    (\"/document/sentimentScore\", \"sentiment_score\")\n",
    "]\n",
    "output_field_mappings = [\n",
    "    OutputFieldMappingEntry(source_field_name=source, target_field_name=target)\n",
    "    for source, target in enrichment_mappings\n",
    "]\n",
    "\n",
    "# Batch size, failure tolerances and extraction settings for the indexer.\n",
    "indexing_parameters = {\n",
    "    \"batchSize\": 50,\n",
    "    \"maxFailedItems\": 10,\n",
    "    \"maxFailedItemsPerBatch\": 5,\n",
    "    \"configuration\": {\n",
    "        \"dataToExtract\": \"contentAndMetadata\",\n",
    "        \"parsingMode\": \"default\",\n",
    "        \"imageAction\": \"generateNormalizedImages\"\n",
    "    }\n",
    "}\n",
    "\n",
    "indexer = SearchIndexer(\n",
    "    name=indexer_name,\n",
    "    data_source_name=datasource_name,\n",
    "    target_index_name=index_name,\n",
    "    skillset_name=skillset_name,\n",
    "    field_mappings=field_mappings,\n",
    "    output_field_mappings=output_field_mappings,\n",
    "    parameters=indexing_parameters\n",
    ")\n",
    "\n",
    "try:\n",
    "    indexer_client.create_indexer(indexer)\n",
    "    print(f\"✅ Indexador '{indexer_name}' criado com sucesso!\")\n",
    "except Exception as create_error:\n",
    "    if \"already exists\" not in str(create_error).lower():\n",
    "        print(f\"❌ Erro ao criar indexador: {create_error}\")\n",
    "    else:\n",
    "        print(f\"ℹ️ Indexador '{indexer_name}' já existe.\")\n",
    "        # Push the current definition over the existing indexer.\n",
    "        try:\n",
    "            indexer_client.create_or_update_indexer(indexer)\n",
    "            print(f\"✅ Indexador '{indexer_name}' atualizado!\")\n",
    "        except Exception as update_error:\n",
    "            print(f\"❌ Erro ao atualizar indexador: {update_error}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Execução e Monitoramento da Indexação"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the indexer and poll until its current execution finishes.\n",
    "POLL_INTERVAL_SECONDS = 10\n",
    "MAX_POLLS = 30  # 30 polls x 10 s = up to 5 minutes of monitoring\n",
    "\n",
    "print(f\"Iniciando execução do indexador '{indexer_name}'...\")\n",
    "\n",
    "try:\n",
    "    indexer_client.run_indexer(indexer_name)\n",
    "    print(\"✅ Indexador iniciado com sucesso!\")\n",
    "\n",
    "    # Monitor progress\n",
    "    print(\"\\nMonitorando progresso...\")\n",
    "\n",
    "    for attempt in range(MAX_POLLS):\n",
    "        status = indexer_client.get_indexer_status(indexer_name)\n",
    "        last_result = status.last_result\n",
    "\n",
    "        if last_result is None:\n",
    "            # No execution result is available yet; show the overall indexer status.\n",
    "            print(f\"Status: {status.status}\")\n",
    "            time.sleep(POLL_INTERVAL_SECONDS)\n",
    "            continue\n",
    "\n",
    "        # status.status is the indexer's overall health: \"running\" means healthy\n",
    "        # and stays that way after a run ends, so polling it never stopped early.\n",
    "        # The state of the execution itself is last_result.status.\n",
    "        execution_status = last_result.status\n",
    "        items_processed = last_result.item_count or 0\n",
    "        items_failed = last_result.failed_item_count or 0\n",
    "\n",
    "        print(f\"Status: {execution_status} | Processados: {items_processed} | Falhas: {items_failed}\")\n",
    "\n",
    "        if execution_status != \"inProgress\":\n",
    "            break\n",
    "        time.sleep(POLL_INTERVAL_SECONDS)\n",
    "\n",
    "    # Final status\n",
    "    final_status = indexer_client.get_indexer_status(indexer_name)\n",
    "    if final_status.last_result:\n",
    "        result = final_status.last_result\n",
    "        print(f\"\\n📊 Resultado Final:\")\n",
    "        print(f\"   Status: {result.status}\")\n",
    "        print(f\"   Itens processados: {result.item_count or 0}\")\n",
    "        print(f\"   Itens com falha: {result.failed_item_count or 0}\")\n",
    "        print(f\"   Início: {result.start_time}\")\n",
    "        print(f\"   Fim: {result.end_time}\")\n",
    "\n",
    "        if result.errors:\n",
    "            print(f\"\\n❌ Erros encontrados:\")\n",
    "            for error in result.errors[:5]:  # Show only the first 5\n",
    "                print(f\"   - {error.error_message}\")\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"❌ Erro ao executar indexador: {e}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Verificação dos Resultados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch index statistics; `stats` is reused when the configuration is saved.\n",
    "stats = index_manager.get_index_statistics(index_name)\n",
    "\n",
    "if stats:\n",
    "    print(f\"📊 Estatísticas do Índice '{index_name}':\")\n",
    "    print(f\"   Documentos indexados: {stats.get('document_count', 0):,}\")\n",
    "    print(f\"   Tamanho do armazenamento: {stats.get('storage_size', 0):,} bytes\")\n",
    "\n",
    "    # Human-readable size: divide by 1024 until the value fits the current unit.\n",
    "    storage_size = stats.get('storage_size', 0)\n",
    "    if storage_size > 0:\n",
    "        for unit in ['bytes', 'KB', 'MB', 'GB']:\n",
    "            if storage_size < 1024.0:\n",
    "                print(f\"   Tamanho formatado: {storage_size:.1f} {unit}\")\n",
    "                break\n",
    "            storage_size /= 1024.0\n",
    "        else:\n",
    "            # Reached only for sizes of 1024 GB or more, which previously printed\n",
    "            # nothing. The value has been divided four times, so it is in TB.\n",
    "            print(f\"   Tamanho formatado: {storage_size:.1f} TB\")\n",
    "else:\n",
    "    print(\"Não foi possível obter estatísticas do índice.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick search test\n",
    "from azure.search.documents import SearchClient\n",
    "from azure.core.credentials import AzureKeyCredential\n",
    "\n",
    "# Use the query key when one is configured, otherwise fall back to the admin key.\n",
    "search_client = SearchClient(\n",
    "    endpoint=f\"https://{config.search_service_name}.search.windows.net\",\n",
    "    index_name=index_name,\n",
    "    credential=AzureKeyCredential(config.search_query_key or config.search_admin_key)\n",
    ")\n",
    "\n",
    "# Run a simple match-all search\n",
    "try:\n",
    "    results = search_client.search(search_text=\"*\", top=5, include_total_count=True)\n",
    "\n",
    "    total_count = results.get_count()\n",
    "    print(f\"\\n🔍 Teste de Busca - Total de documentos encontrados: {total_count}\")\n",
    "\n",
    "    # Truthiness also covers a count of None, which `> 0` would reject with TypeError.\n",
    "    if total_count:\n",
    "        print(\"\\nPrimeiros 5 documentos:\")\n",
    "        for i, doc in enumerate(results, 1):\n",
    "            # A field present with a null value makes dict.get return None instead\n",
    "            # of its default (slicing None raised TypeError); `or` handles both a\n",
    "            # missing key and a null value.\n",
    "            title = (doc.get('title') or 'Sem título')[:50]\n",
    "            file_type = doc.get('file_type') or 'Desconhecido'\n",
    "            language = doc.get('language') or 'Não detectado'\n",
    "\n",
    "            print(f\"{i}. {title}... ({file_type}, {language})\")\n",
    "    else:\n",
    "        print(\"Nenhum documento encontrado. Verifique se a indexação foi concluída.\")\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"❌ Erro no teste de busca: {e}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Salvamento da Configuração"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Snapshot of the resource names and settings used in this notebook.\n",
    "project_config = dict(\n",
    "    timestamp=datetime.now().isoformat(),\n",
    "    index_name=index_name,\n",
    "    skillset_name=skillset_name,\n",
    "    indexer_name=indexer_name,\n",
    "    datasource_name=datasource_name,\n",
    "    schema_fields=len(fields),\n",
    "    skills_enabled=skills_config,\n",
    "    index_statistics=stats\n",
    ")\n",
    "\n",
    "config_file = \"../data/processed/index_configuration.json\"\n",
    "os.makedirs(\"../data/processed\", exist_ok=True)\n",
    "\n",
    "# default=str stringifies any value json cannot serialise natively.\n",
    "with open(config_file, 'w') as config_handle:\n",
    "    json.dump(project_config, config_handle, indent=2, default=str)\n",
    "\n",
    "print(f\"✅ Configuração salva em: {config_file}\")\n",
    "\n",
    "# Final summary\n",
    "created_resources = (\n",
    "    (\"Índice\", index_name),\n",
    "    (\"Skillset\", skillset_name),\n",
    "    (\"Fonte de dados\", datasource_name),\n",
    "    (\"Indexador\", indexer_name)\n",
    ")\n",
    "\n",
    "print(\"\\n🎉 Configuração do Índice Completa!\")\n",
    "print(\"\\nRecursos criados:\")\n",
    "for label, resource_name in created_resources:\n",
    "    print(f\"✅ {label}: {resource_name}\")\n",
    "print(\"\\nPróximo passo: Use o notebook '03_search_exploration.ipynb' para explorar os dados indexados!\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}