In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demonstração de Ingestão de Dados\n",
    "\n",
    "Este notebook demonstra como ingerir documentos no Azure Blob Storage para posterior indexação no Azure Cognitive Search.\n",
    "\n",
    "## Objetivos\n",
    "- Configurar o ambiente de ingestão\n",
    "- Processar documentos locais\n",
    "- Fazer upload para Azure Blob Storage\n",
    "- Analisar resultados da ingestão"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports e configura√ß√µes\n",
    "import json\n",
    "import os\n",
    "import sys\n",
    "sys.path.append('..')\n",
    "\n",
    "from src.ingestion.data_ingestion import DocumentIngestion\n",
    "from src.ingestion.document_processor import DocumentProcessor\n",
    "from src.utils.helpers import setup_logging, get_file_metadata\n",
    "from config.azure_config import config\n",
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "# Configure logging through the project helper from src.utils.helpers, passing \"INFO\"\n",
    "setup_logging(\"INFO\")\n",
    "\n",
    "print(\"Ambiente configurado com sucesso!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Verificação da Configuração"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Validate the Azure configuration and report the status of each entry.\n",
    "# validate_config() is used here as a mapping of setting name -> truthy/falsy flag.\n",
    "config_validation = config.validate_config()\n",
    "\n",
    "print(\"Status da Configuração:\")\n",
    "for key, is_valid in config_validation.items():\n",
    "    status = \"✅\" if is_valid else \"❌\"\n",
    "    print(f\"{status} {key}: {'Configurado' if is_valid else 'Não configurado'}\")\n",
    "\n",
    "# A single falsy entry is enough to ask the reader to fix the .env file first\n",
    "if not all(config_validation.values()):\n",
    "    print(\"\\n⚠️ Configure as variáveis de ambiente no arquivo .env antes de continuar\")\n",
    "else:\n",
    "    print(\"\\n✅ Todas as configurações estão válidas!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Preparação dos Dados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Discover the documents available for ingestion and collect their metadata\n",
    "data_dir = \"../data/raw\"\n",
    "sample_files = []\n",
    "\n",
    "if os.path.isdir(data_dir):\n",
    "    # os.listdir returns entries in arbitrary order; sorting keeps re-runs\n",
    "    # (and any later slice of sample_files) deterministic\n",
    "    for file_name in sorted(os.listdir(data_dir)):\n",
    "        file_path = os.path.join(data_dir, file_name)\n",
    "        # Skip sub-directories; only regular files are described\n",
    "        if os.path.isfile(file_path):\n",
    "            sample_files.append(get_file_metadata(file_path))\n",
    "\n",
    "if sample_files:\n",
    "    df_files = pd.DataFrame(sample_files)\n",
    "    print(f\"Encontrados {len(sample_files)} arquivos para processamento:\")\n",
    "    print(df_files[['filename', 'file_type', 'file_size', 'mime_type']].head(10))\n",
    "else:\n",
    "    # Reuse data_dir so the message cannot drift from the folder actually scanned\n",
    "    print(f\"Nenhum arquivo encontrado em {data_dir}/\")\n",
    "    print(\"Adicione alguns documentos de exemplo nesta pasta para continuar.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Processamento de Documentos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the document processor on a small sample of the discovered files\n",
    "MAX_SAMPLE_DOCS = 3    # only the first few files are processed in this demo\n",
    "PREVIEW_CHARS = 200    # characters of extracted content shown per document\n",
    "\n",
    "# Defined before the branch so the name exists even when Form Recognizer is not configured\n",
    "processed_docs = []\n",
    "\n",
    "if config.form_recognizer_endpoint and config.form_recognizer_key:\n",
    "    processor = DocumentProcessor(\n",
    "        config.form_recognizer_endpoint,\n",
    "        config.form_recognizer_key\n",
    "    )\n",
    "\n",
    "    for file_info in sample_files[:MAX_SAMPLE_DOCS]:\n",
    "        file_path = os.path.join(data_dir, file_info['filename'])\n",
    "        print(f\"Processando: {file_info['filename']}\")\n",
    "\n",
    "        result = processor.process_document(file_path)\n",
    "        processed_docs.append(result)\n",
    "\n",
    "        if result.get('success'):\n",
    "            # Guard against a missing or None 'content' value (the result schema\n",
    "            # is not visible from this notebook) so slicing cannot raise TypeError\n",
    "            content = result.get('content') or ''\n",
    "            content_preview = content[:PREVIEW_CHARS] + \"...\"\n",
    "            print(f\"✅ Sucesso! Preview: {content_preview}\")\n",
    "        else:\n",
    "            print(f\"❌ Erro: {result.get('error')}\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"Form Recognizer não configurado. Pulando processamento avançado.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Ingestão para Azure Blob Storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the ingestion client from the storage connection string and container name in config\n",
    "ingestion = DocumentIngestion(\n",
    "    storage_connection_string=config.storage_connection_string,\n",
    "    container_name=config.storage_container_name\n",
    ")\n",
    "\n",
    "print(f\"Sistema de ingestão inicializado para container: {config.storage_container_name}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload every discovered file and summarize the outcome.\n",
    "# results/stats are initialized before the branch so later cells can test them\n",
    "# even when there was nothing to upload (previously they raised NameError).\n",
    "results = []\n",
    "stats = {}\n",
    "\n",
    "if sample_files:\n",
    "    print(\"Iniciando upload dos documentos...\")\n",
    "\n",
    "    for file_info in tqdm(sample_files, desc=\"Uploading files\"):\n",
    "        file_path = os.path.join(data_dir, file_info['filename'])\n",
    "        results.append(ingestion.upload_document(file_path))\n",
    "\n",
    "    # Aggregate figures are computed by the ingestion helper from the per-file results\n",
    "    stats = ingestion.get_ingestion_stats(results)\n",
    "\n",
    "    print(\"\\n📊 Estatísticas da Ingestão:\")\n",
    "    print(f\"Total de arquivos: {stats['total_files']}\")\n",
    "    print(f\"Sucessos: {stats['successful']} ({stats['success_rate']:.1f}%)\")\n",
    "    print(f\"Falhas: {stats['failed']}\")\n",
    "\n",
    "    # List failures; .get() matches how results are read elsewhere in this notebook\n",
    "    # and tolerates a result that lacks one of the keys\n",
    "    failed_results = [r for r in results if not r.get('success')]\n",
    "    if failed_results:\n",
    "        print(\"\\n❌ Arquivos com falha:\")\n",
    "        for failed in failed_results:\n",
    "            print(f\"- {failed.get('file_path')}: {failed.get('error')}\")\n",
    "else:\n",
    "    print(\"Nenhum arquivo para processar.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Análise dos Resultados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize the ingestion run: file-type distribution, upload status and a summary table\n",
    "if sample_files and results:\n",
    "    URL_PREVIEW_CHARS = 50  # longest URL prefix shown in the summary table\n",
    "\n",
    "    # Distribution by file type\n",
    "    type_counts = pd.Series([f['file_type'] for f in sample_files]).value_counts()\n",
    "\n",
    "    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "    # Chart 1: file types\n",
    "    type_counts.plot(kind='bar', ax=ax1, color='skyblue')\n",
    "    ax1.set_title('Distribuição por Tipo de Arquivo')\n",
    "    ax1.set_ylabel('Quantidade')\n",
    "    ax1.tick_params(axis='x', rotation=45)\n",
    "\n",
    "    # Chart 2: upload status.\n",
    "    # value_counts() orders by frequency, so labels AND colors are both derived\n",
    "    # from the index; a fixed color list would paint failures green whenever\n",
    "    # they outnumber successes.\n",
    "    success_counts = pd.Series([r['success'] for r in results]).value_counts()\n",
    "    success_labels = ['Sucesso' if ok else 'Falha' for ok in success_counts.index]\n",
    "    success_colors = ['lightgreen' if ok else 'lightcoral' for ok in success_counts.index]\n",
    "\n",
    "    ax2.pie(success_counts.values, labels=success_labels, autopct='%1.1f%%',\n",
    "            colors=success_colors)\n",
    "    ax2.set_title('Status dos Uploads')\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    # Summary table: one row per (file, upload result) pair\n",
    "    summary_data = []\n",
    "    for file_info, result in zip(sample_files, results):\n",
    "        url = result.get('url')\n",
    "        if not url:\n",
    "            url_preview = 'N/A'\n",
    "        elif len(url) > URL_PREVIEW_CHARS:\n",
    "            # Only mark the URL as truncated when something was actually cut off\n",
    "            url_preview = url[:URL_PREVIEW_CHARS] + '...'\n",
    "        else:\n",
    "            url_preview = url\n",
    "\n",
    "        summary_data.append({\n",
    "            'Arquivo': file_info['filename'],\n",
    "            'Tipo': file_info['file_type'],\n",
    "            'Tamanho (KB)': round(file_info['file_size'] / 1024, 1),\n",
    "            'Status': '✅ Sucesso' if result['success'] else '❌ Falha',\n",
    "            'URL': url_preview\n",
    "        })\n",
    "\n",
    "    df_summary = pd.DataFrame(summary_data)\n",
    "    print(\"\\n📋 Resumo Detalhado:\")\n",
    "    print(df_summary.to_string(index=False))\n",
    "else:\n",
    "    print(\"Não há dados suficientes para gerar visualizações.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Próximos Passos\n",
    "\n",
    "Com os documentos ingeridos no Azure Blob Storage, você pode agora:\n",
    "\n",
    "1. **Criar índices** usando o notebook `02_index_creation_tutorial.ipynb`\n",
    "2. **Configurar skills cognitivas** para enriquecimento automático\n",
    "3. **Indexar os documentos** para torná-los pesquisáveis\n",
    "4. **Realizar consultas** usando o notebook `03_search_exploration.ipynb`\n",
    "\n",
    "### Comandos úteis para verificar o storage:\n",
    "\n",
    "```python\n",
    "# Listar blobs no container\n",
    "from azure.storage.blob import BlobServiceClient\n",
    "\n",
    "blob_service = BlobServiceClient.from_connection_string(config.storage_connection_string)\n",
    "container_client = blob_service.get_container_client(config.storage_container_name)\n",
    "\n",
    "blobs = list(container_client.list_blobs())\n",
    "print(f\"Total de blobs no container: {len(blobs)}\")\n",
    "for blob in blobs[:5]:  # Mostrar os primeiros 5\n",
    "    print(f\"- {blob.name} ({blob.size} bytes)\")\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the ingestion results for later use.\n",
    "# results/stats are only created by the upload cell when there were files to send,\n",
    "# so look them up defensively: a run with no uploads now reports that there is\n",
    "# nothing to save instead of raising NameError.\n",
    "saved_results = globals().get('results') or []\n",
    "saved_stats = globals().get('stats') or {}\n",
    "\n",
    "if saved_results:\n",
    "    output_file = \"../data/processed/ingestion_results.json\"\n",
    "    # Derive the folder from output_file so the two paths cannot drift apart\n",
    "    os.makedirs(os.path.dirname(output_file), exist_ok=True)\n",
    "\n",
    "    # Explicit UTF-8 plus ensure_ascii=False keeps accented file names readable;\n",
    "    # default=str stringifies values json cannot serialize natively\n",
    "    with open(output_file, 'w', encoding='utf-8') as f:\n",
    "        json.dump({\n",
    "            'timestamp': pd.Timestamp.now().isoformat(),\n",
    "            'statistics': saved_stats,\n",
    "            'results': saved_results\n",
    "        }, f, indent=2, ensure_ascii=False, default=str)\n",
    "\n",
    "    print(f\"✅ Resultados salvos em: {output_file}\")\n",
    "else:\n",
    "    print(\"Nenhum resultado para salvar.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}