In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploração de Pesquisa Inteligente\n",
    "\n",
    "Este notebook demonstra como realizar pesquisas avançadas e análises nos documentos indexados.\n",
    "\n",
    "## Objetivos\n",
    "- Executar diferentes tipos de pesquisa\n",
    "- Analisar e visualizar resultados\n",
    "- Explorar capacidades de facetas e filtros\n",
    "- Gerar insights a partir dos dados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Standard library ---\n",
    "import json\n",
    "import os\n",
    "import sys\n",
    "from collections import Counter\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "# Put the parent directory on the import path before loading the project modules\n",
    "# (presumably where the `src` and `config` packages live - confirm repo layout).\n",
    "sys.path.append('..')\n",
    "\n",
    "# --- Project modules ---\n",
    "from src.search.search_engine import IntelligentSearch\n",
    "from src.search.query_builder import QueryBuilder\n",
    "from src.search.result_processor import SearchResultProcessor\n",
    "from config.azure_config import config\n",
    "\n",
    "# --- Third-party: data handling and plotting ---\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "\n",
    "# Chart styling: matplotlib style sheet first, then the seaborn colour palette\n",
    "plt.style.use('seaborn-v0_8')\n",
    "sns.set_palette('husl')\n",
    "\n",
    "print('Ambiente configurado com sucesso!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Configuração da Engine de Busca"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the index configuration from disk. A missing or unusable file falls back\n",
    "# to the default index name instead of aborting the notebook.\n",
    "DEFAULT_INDEX_NAME = 'intelligent-documents-index'\n",
    "INDEX_CONFIG_PATH = '../data/processed/index_configuration.json'\n",
    "\n",
    "index_name = DEFAULT_INDEX_NAME\n",
    "try:\n",
    "    # Explicit UTF-8 so the file is decoded the same way on every platform\n",
    "    with open(INDEX_CONFIG_PATH, 'r', encoding='utf-8') as f:\n",
    "        index_config = json.load(f)\n",
    "\n",
    "    index_name = index_config['index_name']\n",
    "    print(f\"✅ Configuração carregada - Índice: {index_name}\")\n",
    "    # Informational fields only: show them when present, never fail on them\n",
    "    print(f\"   Criado em: {index_config.get('timestamp', 'N/A')}\")\n",
    "    print(f\"   Campos: {index_config.get('schema_fields', 'N/A')}\")\n",
    "\n",
    "except FileNotFoundError:\n",
    "    print(\"⚠️ Arquivo de configuração não encontrado. Usando configuração padrão.\")\n",
    "except (json.JSONDecodeError, KeyError) as exc:\n",
    "    # Malformed JSON or a file without 'index_name': report it and keep the default\n",
    "    print(f\"⚠️ Arquivo de configuração inválido ({exc!r}). Usando configuração padrão.\")\n",
    "\n",
    "# Create the search client; the query key is preferred and the admin key is\n",
    "# used only when no query key is configured.\n",
    "search_engine = IntelligentSearch(\n",
    "    service_name=config.search_service_name,\n",
    "    query_key=config.search_query_key or config.search_admin_key,\n",
    "    index_name=index_name\n",
    ")\n",
    "\n",
    "result_processor = SearchResultProcessor()\n",
    "\n",
    "print(f\"\\n🔍 Search engine inicializado para índice: {index_name}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Exploração Inicial dos Dados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wildcard search with facets to get an overview of the indexed data\n",
    "overview_results = search_engine.advanced_search(\n",
    "    query=\"*\",\n",
    "    facets=[\"category\", \"file_type\", \"language\", \"sentiment\"],\n",
    "    top=100\n",
    ")\n",
    "\n",
    "# Bind these up front so later cells can read them even when the search fails\n",
    "# (previously `analytics` stayed undefined on the error path).\n",
    "analytics = {}\n",
    "total_docs = 0\n",
    "\n",
    "if overview_results['success']:\n",
    "    total_docs = overview_results['total_count']\n",
    "    retrieved_docs = len(overview_results['documents'])\n",
    "\n",
    "    print(\"📊 Overview do Índice:\")\n",
    "    print(f\"   Total de documentos: {total_docs:,}\")\n",
    "    print(f\"   Documentos recuperados para análise: {retrieved_docs}\")\n",
    "\n",
    "    # Let the result processor derive aggregate analytics from the raw response\n",
    "    processed_results = result_processor.process_results(overview_results)\n",
    "    analytics = processed_results.get('analytics', {})\n",
    "\n",
    "    if analytics:\n",
    "        print(\"\\n📈 Analytics Iniciais:\")\n",
    "\n",
    "        # Category distribution (first five entries only)\n",
    "        categories = analytics.get('category_distribution', {})\n",
    "        if categories:\n",
    "            print(f\"   Categorias encontradas: {len(categories)}\")\n",
    "            for cat, count in list(categories.items())[:5]:\n",
    "                print(f\"     - {cat}: {count} documentos\")\n",
    "\n",
    "        # Language distribution\n",
    "        languages = analytics.get('language_distribution', {})\n",
    "        if languages:\n",
    "            print(f\"   Idiomas detectados: {list(languages.keys())}\")\n",
    "\n",
    "        # Content statistics\n",
    "        content_stats = analytics.get('content_statistics', {})\n",
    "        if content_stats:\n",
    "            file_types = content_stats.get('file_type_distribution', {})\n",
    "            print(f\"   Tipos de arquivo: {list(file_types.keys())}\")\n",
    "\n",
    "else:\n",
    "    print(f\"❌ Erro na busca inicial: {overview_results.get('error')}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Visualizações dos Dados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dashboard with four panels built from the overview analytics\n",
    "if overview_results['success'] and analytics:\n",
    "    # 2x2 grid: pie charts in the left column, bar charts in the right column\n",
    "    fig = make_subplots(\n",
    "        rows=2, cols=2,\n",
    "        subplot_titles=('Distribuição por Categoria', 'Tipos de Arquivo',\n",
    "                        'Distribuição por Idioma', 'Análise de Sentimento'),\n",
    "        specs=[[{\"type\": \"pie\"}, {\"type\": \"bar\"}],\n",
    "               [{\"type\": \"pie\"}, {\"type\": \"bar\"}]]\n",
    "    )\n",
    "\n",
    "    # Panel 1: categories\n",
    "    categories = analytics.get('category_distribution', {})\n",
    "    if categories:\n",
    "        fig.add_trace(\n",
    "            go.Pie(labels=list(categories.keys()),\n",
    "                   values=list(categories.values()),\n",
    "                   name=\"Categorias\"),\n",
    "            row=1, col=1\n",
    "        )\n",
    "\n",
    "    # Panel 2: file types\n",
    "    content_stats = analytics.get('content_statistics', {})\n",
    "    file_types = content_stats.get('file_type_distribution', {})\n",
    "    if file_types:\n",
    "        fig.add_trace(\n",
    "            go.Bar(x=list(file_types.keys()),\n",
    "                   y=list(file_types.values()),\n",
    "                   name=\"Tipos de Arquivo\"),\n",
    "            row=1, col=2\n",
    "        )\n",
    "\n",
    "    # Panel 3: languages\n",
    "    languages = analytics.get('language_distribution', {})\n",
    "    if languages:\n",
    "        fig.add_trace(\n",
    "            go.Pie(labels=list(languages.keys()),\n",
    "                   values=list(languages.values()),\n",
    "                   name=\"Idiomas\"),\n",
    "            row=2, col=1\n",
    "        )\n",
    "\n",
    "    # Panel 4: sentiment, when the facet is available.\n",
    "    # A dedicated facet-only query: top=0 asks for no documents.\n",
    "    sentiment_results = search_engine.advanced_search(\n",
    "        query=\"*\",\n",
    "        facets=[\"sentiment\"],\n",
    "        top=0\n",
    "    )\n",
    "\n",
    "    if sentiment_results['success'] and sentiment_results.get('facets', {}).get('sentiment'):\n",
    "        # Each facet entry is read as a mapping with 'value' and 'count' keys\n",
    "        sentiment_data = sentiment_results['facets']['sentiment']\n",
    "        sentiment_labels = [item['value'] for item in sentiment_data]\n",
    "        sentiment_counts = [item['count'] for item in sentiment_data]\n",
    "\n",
    "        fig.add_trace(\n",
    "            go.Bar(x=sentiment_labels,\n",
    "                   y=sentiment_counts,\n",
    "                   name=\"Sentimentos\"),\n",
    "            row=2, col=2\n",
    "        )\n",
    "\n",
    "    # Overall layout\n",
    "    fig.update_layout(\n",
    "        height=800,\n",
    "        title_text=\"Dashboard de Análise de Documentos\",\n",
    "        showlegend=False\n",
    "    )\n",
    "\n",
    "    fig.show()\n",
    "else:\n",
    "    print(\"Dados insuficientes para gerar visualizações.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Pesquisas Específicas por Domínio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test queries grouped by business domain.\n",
    "# The accented text must be real UTF-8: these strings are sent to the search\n",
    "# service as-is, so corrupted characters would silently match nothing.\n",
    "domain_queries = {\n",
    "    \"Tecnologia\": [\n",
    "        \"inteligência artificial\",\n",
    "        \"machine learning\",\n",
    "        \"cloud computing\",\n",
    "        \"blockchain\",\n",
    "        \"DevOps\"\n",
    "    ],\n",
    "    \"Negócios\": [\n",
    "        \"estratégia empresarial\",\n",
    "        \"marketing digital\",\n",
    "        \"gestão de projetos\",\n",
    "        \"inovação\",\n",
    "        \"transformação digital\"\n",
    "    ],\n",
    "    \"Jurídico\": [\n",
    "        \"contratos\",\n",
    "        \"compliance\",\n",
    "        \"regulamentação\",\n",
    "        \"LGPD\",\n",
    "        \"direitos autorais\"\n",
    "    ]\n",
    "}\n",
    "\n",
    "# Run every query and collect one record per query, keyed by domain\n",
    "domain_results = {}\n",
    "\n",
    "for domain, queries in domain_queries.items():\n",
    "    print(f\"\\n🔍 Testando consultas para domínio: {domain}\")\n",
    "    domain_data = []\n",
    "\n",
    "    for query in queries:\n",
    "        result = search_engine.simple_search(query, top=20)\n",
    "\n",
    "        if result['success']:\n",
    "            count = result['total_count']\n",
    "            hits = result['documents']\n",
    "            print(f\"   '{query}': {count} resultados\")\n",
    "        else:\n",
    "            # A failed query is recorded as zero hits so every domain keeps one\n",
    "            # record per query; the error detail is shown instead of being dropped.\n",
    "            count = 0\n",
    "            hits = []\n",
    "            print(f\"   '{query}': Erro na busca ({result.get('error')})\")\n",
    "\n",
    "        domain_data.append({\n",
    "            'query': query,\n",
    "            'count': count,\n",
    "            'documents': hits\n",
    "        })\n",
    "\n",
    "    domain_results[domain] = domain_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the hit count of every query, one panel per domain\n",
    "n_domains = len(domain_results)\n",
    "\n",
    "if n_domains:\n",
    "    # One panel per domain rather than a fixed three; squeeze=False keeps `axes`\n",
    "    # two-dimensional even when there is a single domain.\n",
    "    fig, axes = plt.subplots(1, n_domains, figsize=(6 * n_domains, 6), squeeze=False)\n",
    "    fig.suptitle('Número de Documentos por Consulta em Diferentes Domínios', fontsize=16)\n",
    "\n",
    "    for idx, (domain, data) in enumerate(domain_results.items()):\n",
    "        ax = axes[0, idx]\n",
    "        queries = [item['query'] for item in data]\n",
    "        counts = [item['count'] for item in data]\n",
    "        positions = range(len(queries))\n",
    "\n",
    "        ax.bar(positions, counts, color=plt.cm.Set3(idx))\n",
    "        ax.set_title(f'Domínio: {domain}')\n",
    "        ax.set_xlabel('Consultas')\n",
    "        ax.set_ylabel('Número de Documentos')\n",
    "        ax.set_xticks(positions)\n",
    "        ax.set_xticklabels(queries, rotation=45, ha='right')\n",
    "\n",
    "        # Value labels sit 1% of the tallest bar above each bar; the offset is\n",
    "        # computed once per panel and default=0 covers a domain with no queries.\n",
    "        label_offset = max(counts, default=0) * 0.01\n",
    "        for position, count in zip(positions, counts):\n",
    "            ax.text(position, count + label_offset, str(count),\n",
    "                    ha='center', va='bottom')\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "else:\n",
    "    print(\"Nenhum resultado de domínio para visualizar.\")\n",
    "\n",
    "# Summary statistics per domain\n",
    "print(\"\\n📊 Resumo por Domínio:\")\n",
    "for domain, data in domain_results.items():\n",
    "    # Deliberately NOT named `total_docs`: that global holds the index-wide\n",
    "    # document count from the overview search and is read again by later cells.\n",
    "    domain_total = sum(item['count'] for item in data)\n",
    "    avg_docs = domain_total / len(data) if data else 0\n",
    "    best_query = max(data, key=lambda item: item['count']) if data else None\n",
    "\n",
    "    print(f\"\\n{domain}:\")\n",
    "    print(f\"   Total de documentos encontrados: {domain_total}\")\n",
    "    print(f\"   Média por consulta: {avg_docs:.1f}\")\n",
    "    if best_query:\n",
    "        print(f\"   Melhor consulta: '{best_query['query']}' ({best_query['count']} docs)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Análise de Frases-chave e Entidades"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _collect_field_values(docs, field):\n",
    "    \"\"\"Flatten the values of `field` across `docs` into a single list.\n",
    "\n",
    "    A list value contributes each of its items, any other truthy value is kept\n",
    "    as one item, and missing or empty values are skipped.\n",
    "    \"\"\"\n",
    "    values = []\n",
    "    for doc in docs:\n",
    "        value = doc.get(field, [])\n",
    "        if isinstance(value, list):\n",
    "            values.extend(value)\n",
    "        elif value:  # single scalar value, e.g. one string\n",
    "            values.append(value)\n",
    "    return values\n",
    "\n",
    "\n",
    "# Bound up front so the plotting and export cells can always read them\n",
    "top_phrases = []\n",
    "top_entities = []\n",
    "\n",
    "# Fetch a sample of documents to inspect their key phrases and entities\n",
    "enriched_search = search_engine.advanced_search(\n",
    "    query=\"*\",\n",
    "    top=50\n",
    ")\n",
    "\n",
    "if enriched_search['success']:\n",
    "    documents = enriched_search['documents']\n",
    "\n",
    "    all_key_phrases = _collect_field_values(documents, 'key_phrases')\n",
    "    all_entities = _collect_field_values(documents, 'entities')\n",
    "\n",
    "    # Frequency tables.\n",
    "    # NOTE(review): Counter needs hashable items - assumes both fields hold\n",
    "    # strings; confirm against the index schema.\n",
    "    phrase_counter = Counter(all_key_phrases)\n",
    "    entity_counter = Counter(all_entities)\n",
    "\n",
    "    # Keep the 20 most frequent of each for the following cells\n",
    "    top_phrases = phrase_counter.most_common(20)\n",
    "    top_entities = entity_counter.most_common(20)\n",
    "\n",
    "    print(\"\\n🔤 Análise de Frases-chave e Entidades:\")\n",
    "    print(f\"   Total de frases-chave únicas: {len(phrase_counter)}\")\n",
    "    print(f\"   Total de entidades únicas: {len(entity_counter)}\")\n",
    "\n",
    "    if top_phrases:\n",
    "        print(\"\\n📝 Top 10 Frases-chave:\")\n",
    "        for phrase, count in top_phrases[:10]:\n",
    "            print(f\"   {count:3d}x - {phrase}\")\n",
    "\n",
    "    if top_entities:\n",
    "        print(\"\\n🏷️ Top 10 Entidades:\")\n",
    "        for entity, count in top_entities[:10]:\n",
    "            print(f\"   {count:3d}x - {entity}\")\n",
    "else:\n",
    "    # Show the reported error instead of a bare failure message\n",
    "    print(f\"❌ Erro ao buscar documentos enriquecidos: {enriched_search.get('error')}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _plot_top_terms(ax, ranked_terms, title, color, limit=15):\n",
    "    \"\"\"Draw the first `limit` (term, count) pairs as horizontal bars on `ax`.\n",
    "\n",
    "    With no pairs to draw the panel is switched off instead of being left as\n",
    "    empty axes.\n",
    "    \"\"\"\n",
    "    if not ranked_terms:\n",
    "        ax.axis('off')\n",
    "        return\n",
    "\n",
    "    terms, counts = zip(*ranked_terms[:limit])\n",
    "    y_pos = np.arange(len(terms))\n",
    "\n",
    "    bars = ax.barh(y_pos, counts, color=color)\n",
    "    ax.set_yticks(y_pos)\n",
    "    ax.set_yticklabels(terms)\n",
    "    ax.set_xlabel('Frequência')\n",
    "    ax.set_title(title)\n",
    "    ax.invert_yaxis()  # first entry of the ranking is drawn at the top\n",
    "\n",
    "    # Print each bar's value at its right-hand end\n",
    "    for bar in bars:\n",
    "        width = bar.get_width()\n",
    "        ax.text(width, bar.get_y() + bar.get_height() / 2,\n",
    "                f'{int(width)}', ha='left', va='center')\n",
    "\n",
    "\n",
    "# Plot the most frequent key phrases and entities, one panel each\n",
    "if top_phrases or top_entities:\n",
    "    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12))\n",
    "\n",
    "    _plot_top_terms(ax1, top_phrases, 'Top 15 Frases-chave Mais Frequentes', 'skyblue')\n",
    "    _plot_top_terms(ax2, top_entities, 'Top 15 Entidades Mais Frequentes', 'lightcoral')\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "else:\n",
    "    print(\"Dados insuficientes para visualização de frases-chave e entidades.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Pesquisas Complexas com Query Builder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Complex query examples assembled with QueryBuilder\n",
    "print(\"🔧 Testando Query Builder para consultas complexas...\\n\")\n",
    "\n",
    "\n",
    "def _result_count(result):\n",
    "    \"\"\"Return the total hit count of a search result, or 'Erro' if it failed.\"\"\"\n",
    "    return result['total_count'] if result['success'] else 'Erro'\n",
    "\n",
    "\n",
    "# Query 1: technology documents with positive sentiment.\n",
    "# The search term must be real UTF-8 text, since it is passed to the service as-is.\n",
    "query_builder = QueryBuilder()\n",
    "tech_positive = (\n",
    "    query_builder.add_term(\"tecnologia OR inteligência artificial\")\n",
    "    .add_filter(\"sentiment\", \"eq\", \"positive\")\n",
    "    .add_facet(\"category\")\n",
    "    .add_sort(\"modified_date\", \"desc\")\n",
    "    .build_search_params()\n",
    ")\n",
    "\n",
    "result1 = search_engine.advanced_search(**tech_positive, top=10)\n",
    "print(\"Consulta 1 - Documentos de tecnologia com sentimento positivo:\")\n",
    "print(f\"   Resultados: {_result_count(result1)}\")\n",
    "\n",
    "# Query 2: documents larger than 1048576 bytes (1 MiB) modified in the last 180 days.\n",
    "# NOTE(review): isoformat() on a naive datetime carries no UTC offset - confirm\n",
    "# that QueryBuilder / the search service accept this in a date filter.\n",
    "six_months_ago = (datetime.now() - timedelta(days=180)).isoformat()\n",
    "query_builder.reset()\n",
    "large_recent = (\n",
    "    query_builder.add_term(\"*\")\n",
    "    .add_filter(\"file_size\", \"gt\", 1048576)\n",
    "    .add_filter(\"modified_date\", \"gt\", six_months_ago)\n",
    "    .add_facet(\"file_type\")\n",
    "    .add_sort(\"file_size\", \"desc\")\n",
    "    .build_search_params()\n",
    ")\n",
    "\n",
    "result2 = search_engine.advanced_search(**large_recent, top=10)\n",
    "print(\"\\nConsulta 2 - Documentos grandes (>1MB) dos últimos 6 meses:\")\n",
    "print(f\"   Resultados: {_result_count(result2)}\")\n",
    "\n",
    "# Query 3: exact phrase combined with several filters\n",
    "query_builder.reset()\n",
    "exact_phrase = (\n",
    "    query_builder.add_phrase(\"machine learning\")\n",
    "    .add_filter(\"language\", \"eq\", \"pt\")\n",
    "    .add_filter(\"file_type\", \"in\", [\"pdf\", \"docx\"])\n",
    "    .add_facet(\"category\")\n",
    "    .add_highlight(\"content\")\n",
    "    .build_search_params()\n",
    ")\n",
    "\n",
    "result3 = search_engine.advanced_search(**exact_phrase, top=10)\n",
    "print(\"\\nConsulta 3 - Frase exata 'machine learning' em português (PDF/DOCX):\")\n",
    "print(f\"   Resultados: {_result_count(result3)}\")\n",
    "\n",
    "# Show the first hit of query 1 in detail\n",
    "if result1['success'] and result1['documents']:\n",
    "    print(\"\\n📄 Exemplo de resultado da Consulta 1:\")\n",
    "    doc = result1['documents'][0]\n",
    "    # The title can be missing or null; normalise it before slicing\n",
    "    title = str(doc.get('title') or 'N/A')\n",
    "    print(f\"   Título: {title[:60]}...\")\n",
    "    print(f\"   Sentimento: {doc.get('sentiment', 'N/A')}\")\n",
    "    print(f\"   Categoria: {doc.get('category', 'N/A')}\")\n",
    "    print(f\"   Score: {doc.get('@search.score', 'N/A')}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Análise de Performance das Consultas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Response-time comparison across different kinds of query\n",
    "import time\n",
    "\n",
    "N_RUNS = 5  # repetitions per query type\n",
    "\n",
    "# Each entry is a zero-argument callable so every run issues a fresh request\n",
    "performance_tests = {\n",
    "    \"Busca simples\": lambda: search_engine.simple_search(\"tecnologia\", top=20),\n",
    "    \"Busca com filtros\": lambda: search_engine.advanced_search(\n",
    "        query=\"inteligência artificial\",\n",
    "        filters=\"category eq 'tecnologia'\",\n",
    "        top=20\n",
    "    ),\n",
    "    \"Busca com facetas\": lambda: search_engine.advanced_search(\n",
    "        query=\"*\",\n",
    "        facets=[\"category\", \"file_type\", \"language\"],\n",
    "        top=20\n",
    "    ),\n",
    "    \"Busca complexa\": lambda: search_engine.advanced_search(\n",
    "        query='\"machine learning\" OR \"artificial intelligence\"',\n",
    "        filters=\"file_size gt 100000 and language eq 'pt'\",\n",
    "        facets=[\"category\", \"sentiment\"],\n",
    "        order_by=[\"@search.score desc\", \"modified_date desc\"],\n",
    "        top=20\n",
    "    )\n",
    "}\n",
    "\n",
    "performance_results = []\n",
    "\n",
    "print(\"⏱️ Testando performance das consultas...\\n\")\n",
    "\n",
    "for test_name, test_func in performance_tests.items():\n",
    "    times = []\n",
    "    last_success = None  # latest successful response; source of the hit count\n",
    "\n",
    "    for _ in range(N_RUNS):\n",
    "        # perf_counter is a monotonic, high-resolution clock meant for intervals\n",
    "        start_time = time.perf_counter()\n",
    "        result = test_func()\n",
    "        elapsed = time.perf_counter() - start_time\n",
    "\n",
    "        if result.get('success'):\n",
    "            times.append(elapsed)\n",
    "            last_success = result\n",
    "        else:\n",
    "            print(f\"   ❌ Erro em {test_name}: {result.get('error')}\")\n",
    "\n",
    "    # Only runs that succeeded are timed; a test with none is left out entirely\n",
    "    if times:\n",
    "        avg_time = np.mean(times)\n",
    "        min_time = np.min(times)\n",
    "        max_time = np.max(times)\n",
    "        std_time = np.std(times)\n",
    "        # Taken from a successful run, never from a failed final attempt\n",
    "        result_count = last_success.get('total_count', 0)\n",
    "\n",
    "        performance_results.append({\n",
    "            'test': test_name,\n",
    "            'avg_time': avg_time,\n",
    "            'min_time': min_time,\n",
    "            'max_time': max_time,\n",
    "            'std_time': std_time,\n",
    "            'result_count': result_count\n",
    "        })\n",
    "\n",
    "        print(f\"{test_name}:\")\n",
    "        print(f\"   Tempo médio: {avg_time*1000:.1f}ms\")\n",
    "        print(f\"   Min/Max: {min_time*1000:.1f}ms / {max_time*1000:.1f}ms\")\n",
    "        print(f\"   Resultados: {result_count}\")\n",
    "        print()\n",
    "\n",
    "# Charts and summary\n",
    "if performance_results:\n",
    "    df_perf = pd.DataFrame(performance_results)\n",
    "    avg_ms = df_perf['avg_time'] * 1000\n",
    "    std_ms = df_perf['std_time'] * 1000\n",
    "\n",
    "    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))\n",
    "\n",
    "    # Left panel: mean response time, error bars show one standard deviation\n",
    "    bars = ax1.bar(df_perf['test'], avg_ms, yerr=std_ms, capsize=5)\n",
    "    ax1.set_title('Tempo Médio de Resposta por Tipo de Consulta')\n",
    "    ax1.set_ylabel('Tempo (ms)')\n",
    "\n",
    "    # Value label 5 ms above the top of each bar\n",
    "    for bar, time_val in zip(bars, avg_ms):\n",
    "        ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,\n",
    "                 f'{time_val:.1f}ms', ha='center', va='bottom')\n",
    "\n",
    "    # Right panel: number of hits per query type\n",
    "    ax2.bar(df_perf['test'], df_perf['result_count'], color='lightgreen')\n",
    "    ax2.set_title('Número de Resultados por Tipo de Consulta')\n",
    "    ax2.set_ylabel('Número de Resultados')\n",
    "\n",
    "    # Restyle the tick labels that bar() already created; set_xticklabels()\n",
    "    # without fixed ticks triggers a matplotlib warning.\n",
    "    for ax in (ax1, ax2):\n",
    "        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    print(\"\\n📊 Resumo de Performance:\")\n",
    "    fastest = df_perf.loc[df_perf['avg_time'].idxmin()]\n",
    "    slowest = df_perf.loc[df_perf['avg_time'].idxmax()]\n",
    "\n",
    "    print(f\"   Consulta mais rápida: {fastest['test']} ({fastest['avg_time']*1000:.1f}ms)\")\n",
    "    print(f\"   Consulta mais lenta: {slowest['test']} ({slowest['avg_time']*1000:.1f}ms)\")\n",
    "    print(f\"   Tempo médio geral: {df_perf['avg_time'].mean()*1000:.1f}ms\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Insights e Recomendações"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the final insights summary from the results of the previous sections\n",
    "\n",
    "# Read the index-wide total straight from the overview search rather than from\n",
    "# the mutable global `total_docs`, which another cell may have rebound.\n",
    "total_docs = overview_results['total_count'] if overview_results.get('success') else 0\n",
    "\n",
    "# `analytics` may be unbound if the overview search did not succeed\n",
    "analytics = globals().get('analytics') or {}\n",
    "\n",
    "insights = {\n",
    "    'total_documents': total_docs,\n",
    "    'analysis_timestamp': datetime.now().isoformat(),\n",
    "    'domain_analysis': {},\n",
    "    'content_insights': {},\n",
    "    'performance_insights': {},\n",
    "    'recommendations': []\n",
    "}\n",
    "\n",
    "# Domain insights.\n",
    "# NOTE(review): the per-domain total adds up the hit counts of separate queries,\n",
    "# so one document can be counted more than once and coverage may exceed 100%.\n",
    "for domain, data in domain_results.items():\n",
    "    total_docs_domain = sum(item['count'] for item in data)\n",
    "    insights['domain_analysis'][domain] = {\n",
    "        'total_documents': total_docs_domain,\n",
    "        'coverage_percentage': (total_docs_domain / total_docs * 100) if total_docs > 0 else 0,\n",
    "        'top_query': max(data, key=lambda item: item['count'])['query'] if data else None\n",
    "    }\n",
    "\n",
    "# Content insights\n",
    "if analytics:\n",
    "    category_distribution = analytics.get('category_distribution', {})\n",
    "    insights['content_insights'] = {\n",
    "        'unique_categories': len(category_distribution),\n",
    "        'languages_detected': len(analytics.get('language_distribution', {})),\n",
    "        'file_types': len(analytics.get('content_statistics', {}).get('file_type_distribution', {})),\n",
    "        'top_category': (max(category_distribution, key=category_distribution.get)\n",
    "                         if category_distribution else None)\n",
    "    }\n",
    "\n",
    "# Performance insights, all expressed in milliseconds\n",
    "if performance_results:\n",
    "    avg_times_ms = [r['avg_time'] * 1000 for r in performance_results]\n",
    "    insights['performance_insights'] = {\n",
    "        'average_response_time_ms': float(np.mean(avg_times_ms)),\n",
    "        'fastest_query_type': min(performance_results, key=lambda r: r['avg_time'])['test'],\n",
    "        # Variance of the per-test means in ms^2, computed on millisecond values\n",
    "        # (scaling a variance in s^2 by 1000 does not convert its unit).\n",
    "        'response_time_variance': float(np.var(avg_times_ms))\n",
    "    }\n",
    "\n",
    "# Recommendations\n",
    "recommendations = []\n",
    "\n",
    "if total_docs < 100:\n",
    "    recommendations.append(\"Considere adicionar mais documentos para melhorar a qualidade das análises\")\n",
    "\n",
    "if performance_results:\n",
    "    avg_time = np.mean([r['avg_time'] for r in performance_results])\n",
    "    if avg_time > 1.0:  # slower than one second on average\n",
    "        recommendations.append(\"Performance das consultas pode ser melhorada com otimização de índices\")\n",
    "\n",
    "    simple_time = next((r['avg_time'] for r in performance_results if r['test'] == 'Busca simples'), None)\n",
    "    complex_time = next((r['avg_time'] for r in performance_results if r['test'] == 'Busca complexa'), None)\n",
    "\n",
    "    # Flag complex queries that take more than three times as long as simple ones\n",
    "    if simple_time and complex_time and complex_time > simple_time * 3:\n",
    "        recommendations.append(\"Consultas complexas estão significativamente mais lentas - considere otimização\")\n",
    "\n",
    "if analytics:\n",
    "    languages = analytics.get('language_distribution', {})\n",
    "    if len(languages) > 3:\n",
    "        recommendations.append(\"Múltiplos idiomas detectados - considere configurar analisadores específicos por idioma\")\n",
    "\n",
    "    categories = analytics.get('category_distribution', {})\n",
    "    if len(categories) > 10:\n",
    "        recommendations.append(\"Muitas categorias detectadas - considere consolidação ou hierarquia de categorias\")\n",
    "\n",
    "if top_phrases:\n",
    "    top_phrase_freq = top_phrases[0][1]\n",
    "    # NOTE(review): phrase counts come from a sample of retrieved documents while\n",
    "    # the threshold uses the index-wide total - confirm this is the intended baseline.\n",
    "    if top_phrase_freq > total_docs * 0.5:\n",
    "        recommendations.append(\"Algumas frases-chave são muito comuns - considere ajustar filtros de extração\")\n",
    "\n",
    "insights['recommendations'] = recommendations\n",
    "\n",
    "# Final report\n",
    "print(\"\\n🎯 INSIGHTS E RECOMENDAÇÕES FINAIS\\n\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "print(\"\\n📊 Resumo Geral:\")\n",
    "print(f\"   Total de documentos analisados: {total_docs:,}\")\n",
    "print(f\"   Categorias identificadas: {insights['content_insights'].get('unique_categories', 'N/A')}\")\n",
    "print(f\"   Idiomas detectados: {insights['content_insights'].get('languages_detected', 'N/A')}\")\n",
    "print(f\"   Tipos de arquivo: {insights['content_insights'].get('file_types', 'N/A')}\")\n",
    "\n",
    "print(\"\\n🏆 Performance:\")\n",
    "if insights['performance_insights']:\n",
    "    perf = insights['performance_insights']\n",
    "    print(f\"   Tempo médio de resposta: {perf['average_response_time_ms']:.1f}ms\")\n",
    "    print(f\"   Tipo de consulta mais rápida: {perf['fastest_query_type']}\")\n",
    "\n",
    "print(\"\\n📈 Cobertura por Domínio:\")\n",
    "for domain, data in insights['domain_analysis'].items():\n",
    "    print(f\"   {domain}: {data['total_documents']} docs ({data['coverage_percentage']:.1f}% do total)\")\n",
    "    if data['top_query']:\n",
    "        print(f\"      Melhor consulta: '{data['top_query']}'\")\n",
    "\n",
    "print(\"\\n💡 Recomendações:\")\n",
    "if recommendations:\n",
    "    for i, rec in enumerate(recommendations, 1):\n",
    "        print(f\"   {i}. {rec}\")\n",
    "else:\n",
    "    print(\"   ✅ Sistema está funcionando adequadamente!\")\n",
    "\n",
    "print(\"\\n\" + \"=\" * 50)\n",
    "print(\"Análise concluída! 🎉\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Exportação dos Resultados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the insights and the supporting tables\n",
    "OUTPUT_DIR = '../data/processed'\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "\n",
    "\n",
    "def _export_csv(frame, filename, message):\n",
    "    \"\"\"Write `frame` to OUTPUT_DIR/filename without its index, then print `message` and the path.\"\"\"\n",
    "    path = f\"{OUTPUT_DIR}/{filename}\"\n",
    "    frame.to_csv(path, index=False)\n",
    "    print(f\"✅ {message}: {path}\")\n",
    "\n",
    "\n",
    "# Full insights as JSON. UTF-8 with ensure_ascii=False keeps the Portuguese text\n",
    "# readable in the file; default=str covers values json cannot encode natively.\n",
    "insights_path = f\"{OUTPUT_DIR}/search_insights.json\"\n",
    "with open(insights_path, 'w', encoding='utf-8') as f:\n",
    "    json.dump(insights, f, indent=2, default=str, ensure_ascii=False)\n",
    "\n",
    "print(f\"✅ Insights salvos em: {insights_path}\")\n",
    "\n",
    "# Performance measurements\n",
    "if performance_results:\n",
    "    df_perf = pd.DataFrame(performance_results)\n",
    "    _export_csv(df_perf, 'performance_analysis.csv', 'Dados de performance salvos em')\n",
    "\n",
    "# Domain analysis: one row per (domain, query) pair\n",
    "domain_summary = [\n",
    "    {\n",
    "        'domain': domain,\n",
    "        'query': item['query'],\n",
    "        'document_count': item['count']\n",
    "    }\n",
    "    for domain, data in domain_results.items()\n",
    "    for item in data\n",
    "]\n",
    "\n",
    "if domain_summary:\n",
    "    df_domains = pd.DataFrame(domain_summary)\n",
    "    _export_csv(df_domains, 'domain_analysis.csv', 'Análise de domínios salva em')\n",
    "\n",
    "# Key phrases and entities, each as (term, frequency) rows\n",
    "if top_phrases:\n",
    "    df_phrases = pd.DataFrame(top_phrases, columns=['phrase', 'frequency'])\n",
    "    _export_csv(df_phrases, 'top_keyphrases.csv', 'Frases-chave salvas em')\n",
    "\n",
    "if top_entities:\n",
    "    df_entities = pd.DataFrame(top_entities, columns=['entity', 'frequency'])\n",
    "    _export_csv(df_entities, 'top_entities.csv', 'Entidades salvas em')\n",
    "\n",
    "print(f\"\\n📁 Todos os resultados foram salvos na pasta: {OUTPUT_DIR}/\")\n",
    "print(f\"   Timestamp da análise: {insights['analysis_timestamp']}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}