In [None]:
# filepath: notebooks/04_advanced_analytics.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Análise Avançada e Machine Learning\n",
    "\n",
    "Este notebook demonstra técnicas avançadas de análise dos dados indexados, incluindo clustering, análise de sentimentos e detecção de padrões.\n",
    "\n",
    "## Objetivos\n",
    "- Aplicar técnicas de machine learning nos dados indexados\n",
    "- Identificar clusters e padrões nos documentos\n",
    "- Análise temporal e de tendências\n",
    "- Geração de recomendações inteligentes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports e configurações\n",
    "import sys\n",
    "import os\n",
    "sys.path.append('..')\n",
    "\n",
    "from src.search.search_engine import IntelligentSearch\n",
    "from src.search.result_processor import SearchResultProcessor\n",
    "from config.azure_config import config\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "\n",
    "# Machine Learning imports\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Text processing\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.stem import SnowballStemmer\n",
    "from wordcloud import WordCloud\n",
    "\n",
    "import json\n",
    "from datetime import datetime, timedelta\n",
    "from collections import Counter, defaultdict\n",
    "import warnings\n",
    "# Suppress all warnings emitted after this point\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "print(\"Ambiente de análise avançada configurado!\")\n",
    "\n",
    "# Check that the required NLTK resources are available locally; if any\n",
    "# lookup fails, both packages are downloaded.\n",
    "try:\n",
    "    for nltk_resource in ('tokenizers/punkt', 'corpora/stopwords'):\n",
    "        nltk.data.find(nltk_resource)\n",
    "except LookupError:\n",
    "    print(\"Baixando dados NLTK...\")\n",
    "    for nltk_package in ('punkt', 'stopwords'):\n",
    "        nltk.download(nltk_package)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Carregamento e Preparação dos Dados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the index name from the saved configuration. Fall back to the\n",
    "# default name when the file is missing, is not valid JSON, or does not\n",
    "# contain an 'index_name' entry.\n",
    "DEFAULT_INDEX_NAME = \"intelligent-documents-index\"\n",
    "INDEX_CONFIG_PATH = '../data/processed/index_configuration.json'\n",
    "\n",
    "try:\n",
    "    # Explicit encoding so the read does not depend on the platform default\n",
    "    with open(INDEX_CONFIG_PATH, 'r', encoding='utf-8') as f:\n",
    "        index_config = json.load(f)\n",
    "    index_name = index_config['index_name']\n",
    "except FileNotFoundError:\n",
    "    index_name = DEFAULT_INDEX_NAME\n",
    "except (json.JSONDecodeError, KeyError, TypeError) as config_error:\n",
    "    # A broken configuration should not abort the notebook; report it and\n",
    "    # continue with the default index.\n",
    "    print(f\"⚠️ Configuração de índice inválida ({config_error!r}); usando '{DEFAULT_INDEX_NAME}'\")\n",
    "    index_name = DEFAULT_INDEX_NAME\n",
    "\n",
    "# The query key is preferred; the admin key is used only when no query key is set\n",
    "search_engine = IntelligentSearch(\n",
    "    service_name=config.search_service_name,\n",
    "    query_key=config.search_query_key or config.search_admin_key,\n",
    "    index_name=index_name\n",
    ")\n",
    "\n",
    "# Fetch the documents to analyse (wildcard query, capped by `top`)\n",
    "print(\"Carregando documentos para análise...\")\n",
    "all_docs_result = search_engine.advanced_search(\n",
    "    query=\"*\",\n",
    "    top=1000,  # Adjust as needed\n",
    "    facets=[\"category\", \"file_type\", \"language\", \"sentiment\"]\n",
    ")\n",
    "\n",
    "# `.get` keeps a response without a 'success' key on the error path instead of\n",
    "# raising KeyError (the error branch below already relies on `.get`)\n",
    "if all_docs_result.get('success'):\n",
    "    documents = all_docs_result['documents']\n",
    "    total_count = all_docs_result['total_count']\n",
    "\n",
    "    print(f\"✅ Carregados {len(documents)} documentos de {total_count} total\")\n",
    "\n",
    "    # Convert to a DataFrame for analysis\n",
    "    df = pd.DataFrame(documents)\n",
    "    print(f\"   Colunas disponíveis: {list(df.columns)}\")\n",
    "\n",
    "    # Basic statistics\n",
    "    print(f\"\\n📊 Estatísticas Básicas:\")\n",
    "    print(f\"   Documentos únicos: {len(df)}\")\n",
    "    print(f\"   Campos com dados: {df.count().sum()}\")\n",
    "    print(f\"   Campos principais:\")\n",
    "    for col in ['title', 'content', 'category', 'language', 'sentiment']:\n",
    "        # Columns absent from the result set are skipped\n",
    "        if col in df.columns:\n",
    "            non_null = df[col].notna().sum()\n",
    "            print(f\"     {col}: {non_null}/{len(df)} ({non_null/len(df)*100:.1f}%)\")\n",
    "else:\n",
    "    print(f\"❌ Erro ao carregar documentos: {all_docs_result.get('error')}\")\n",
    "    # Empty placeholders so later cells can test `df.empty`\n",
    "    documents = []\n",
    "    df = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Análise Temporal e Tendências"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not df.empty and 'modified_date' in df.columns:\n",
    "    # Parse the raw values; anything unparseable becomes NaT (errors='coerce')\n",
    "    df['modified_date_parsed'] = pd.to_datetime(df['modified_date'], errors='coerce')\n",
    "    # Explicit copy: the derived columns below are added to an independent\n",
    "    # frame, which avoids pandas' chained-assignment (SettingWithCopy) ambiguity\n",
    "    df_with_dates = df.dropna(subset=['modified_date_parsed']).copy()\n",
    "\n",
    "    if len(df_with_dates) > 0:\n",
    "        print(f\"\\n📅 Análise Temporal de {len(df_with_dates)} documentos com datas válidas\")\n",
    "\n",
    "        # Derived calendar columns\n",
    "        parsed_dates = df_with_dates['modified_date_parsed']\n",
    "        df_with_dates['year'] = parsed_dates.dt.year\n",
    "        df_with_dates['month'] = parsed_dates.dt.month\n",
    "        df_with_dates['year_month'] = parsed_dates.dt.to_period('M')\n",
    "        df_with_dates['weekday'] = parsed_dates.dt.day_name()\n",
    "\n",
    "        # 2x2 grid: monthly line, yearly bars, weekday bars, daily trend\n",
    "        fig = make_subplots(\n",
    "            rows=2, cols=2,\n",
    "            subplot_titles=('Documentos por Mês', 'Documentos por Ano',\n",
    "                           'Documentos por Dia da Semana', 'Tendência Temporal'),\n",
    "            specs=[[{\"type\": \"scatter\"}, {\"type\": \"bar\"}],\n",
    "                   [{\"type\": \"bar\"}, {\"type\": \"scatter\"}]]\n",
    "        )\n",
    "\n",
    "        # Chart 1: documents per month\n",
    "        monthly_counts = df_with_dates['year_month'].value_counts().sort_index()\n",
    "        fig.add_trace(\n",
    "            go.Scatter(x=monthly_counts.index.astype(str), y=monthly_counts.values,\n",
    "                      mode='lines+markers', name='Por Mês'),\n",
    "            row=1, col=1\n",
    "        )\n",
    "\n",
    "        # Chart 2: documents per year\n",
    "        yearly_counts = df_with_dates['year'].value_counts().sort_index()\n",
    "        fig.add_trace(\n",
    "            go.Bar(x=yearly_counts.index, y=yearly_counts.values, name='Por Ano'),\n",
    "            row=1, col=2\n",
    "        )\n",
    "\n",
    "        # Chart 3: documents per weekday, in calendar order; weekdays with no\n",
    "        # documents are shown as 0\n",
    "        weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
    "        weekday_counts = df_with_dates['weekday'].value_counts().reindex(weekday_order, fill_value=0)\n",
    "        fig.add_trace(\n",
    "            go.Bar(x=weekday_counts.index, y=weekday_counts.values, name='Por Dia'),\n",
    "            row=2, col=1\n",
    "        )\n",
    "\n",
    "        # Chart 4: daily trend with moving average.\n",
    "        # Group on midnight-normalized timestamps so the counts keep a datetime\n",
    "        # index that aligns directly with the pd.date_range used to fill in\n",
    "        # days without documents (no reliance on date-object coercion).\n",
    "        daily_counts = df_with_dates.groupby(parsed_dates.dt.normalize()).size()\n",
    "        full_day_range = pd.date_range(daily_counts.index.min(), daily_counts.index.max())\n",
    "        daily_counts = daily_counts.reindex(full_day_range, fill_value=0)\n",
    "\n",
    "        # Centered 7-day moving average; min_periods=1 keeps the line defined at\n",
    "        # the edges and for ranges shorter than 7 days (otherwise all NaN)\n",
    "        rolling_mean = daily_counts.rolling(window=7, center=True, min_periods=1).mean()\n",
    "\n",
    "        fig.add_trace(\n",
    "            go.Scatter(x=daily_counts.index, y=daily_counts.values,\n",
    "                      mode='markers', name='Diário', opacity=0.6),\n",
    "            row=2, col=2\n",
    "        )\n",
    "        fig.add_trace(\n",
    "            go.Scatter(x=rolling_mean.index, y=rolling_mean.values,\n",
    "                      mode='lines', name='Média Móvel 7d'),\n",
    "            row=2, col=2\n",
    "        )\n",
    "\n",
    "        fig.update_layout(height=800, title_text=\"Análise Temporal dos Documentos\")\n",
    "        fig.show()\n",
    "\n",
    "        # Temporal insights\n",
    "        print(f\"\\n🔍 Insights Temporais:\")\n",
    "        print(f\"   Período analisado: {df_with_dates['modified_date_parsed'].min().date()} a {df_with_dates['modified_date_parsed'].max().date()}\")\n",
    "        print(f\"   Mês com mais documentos: {monthly_counts.idxmax()} ({monthly_counts.max()} docs)\")\n",
    "        print(f\"   Dia da semana mais comum: {weekday_counts.idxmax()} ({weekday_counts.max()} docs)\")\n",
    "        print(f\"   Média de documentos por dia: {daily_counts.mean():.1f}\")\n",
    "\n",
    "    else:\n",
    "        print(\"⚠️ Nenhuma data válida encontrada para análise temporal\")\n",
    "else:\n",
    "    print(\"⚠️ Campo 'modified_date' não disponível para análise temporal\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Análise de Clustering de Documentos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not df.empty and 'content' in df.columns:\n",
    "    # Preparar textos para análise\n",
    "    texts = df['content'].fillna('').astype(str)\n",
    "    valid_texts = [text for text in texts if len(text.strip()) > 50]  # Filtrar textos muito curtos\n",
    "    \n",
    "    print(f\"\\n🔤 Preparando {len(valid_texts)} textos para clustering...\")\n",
    "    \n",
    "    if len(valid_texts) >= 10:  # Precisa de pelo menos 10 documentos\n",
    "        # Configurar TF-IDF\n",
    "        portuguese_stopwords = set(stopwords.words('portuguese'))\n",
    "        english_stopwords = set(stopwords.words('english'))\n",
    "        all_stopwords = portuguese_stopwords.union(english_stopwords)\n# filepath: notebooks/04_advanced_analytics.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Análise Avançada e Machine Learning\n",
    "\n",
    "Este notebook demonstra técnicas avançadas de análise dos dados indexados, incluindo clustering, análise de sentimentos e detecção de padrões.\n",
    "\n",
    "## Objetivos\n",
    "- Aplicar técnicas de machine learning nos dados indexados\n",
    "- Identificar clusters e padrões nos documentos\n",
    "- Análise temporal e de tendências\n",
    "- Geração de recomendações inteligentes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports e configurações\n",
    "import sys\n",
    "import os\n",
    "sys.path.append('..')\n",
    "\n",
    "from src.search.search_engine import IntelligentSearch\n",
    "from src.search.result_processor import SearchResultProcessor\n",
    "from config.azure_config import config\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "\n",
    "# Machine Learning imports\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Text processing\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.stem import SnowballStemmer\n",
    "from wordcloud import WordCloud\n",
    "\n",
    "import json\n",
    "from datetime import datetime, timedelta\n",
    "from collections import Counter, defaultdict\n",
    "import warnings\n",
    "# Suppress all warnings emitted after this point\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "print(\"Ambiente de análise avançada configurado!\")\n",
    "\n",
    "# Check that the required NLTK resources are available locally; if any\n",
    "# lookup fails, both packages are downloaded.\n",
    "try:\n",
    "    for nltk_resource in ('tokenizers/punkt', 'corpora/stopwords'):\n",
    "        nltk.data.find(nltk_resource)\n",
    "except LookupError:\n",
    "    print(\"Baixando dados NLTK...\")\n",
    "    for nltk_package in ('punkt', 'stopwords'):\n",
    "        nltk.download(nltk_package)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Carregamento e Preparação dos Dados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the index name from the saved configuration. Fall back to the\n",
    "# default name when the file is missing, is not valid JSON, or does not\n",
    "# contain an 'index_name' entry.\n",
    "DEFAULT_INDEX_NAME = \"intelligent-documents-index\"\n",
    "INDEX_CONFIG_PATH = '../data/processed/index_configuration.json'\n",
    "\n",
    "try:\n",
    "    # Explicit encoding so the read does not depend on the platform default\n",
    "    with open(INDEX_CONFIG_PATH, 'r', encoding='utf-8') as f:\n",
    "        index_config = json.load(f)\n",
    "    index_name = index_config['index_name']\n",
    "except FileNotFoundError:\n",
    "    index_name = DEFAULT_INDEX_NAME\n",
    "except (json.JSONDecodeError, KeyError, TypeError) as config_error:\n",
    "    # A broken configuration should not abort the notebook; report it and\n",
    "    # continue with the default index.\n",
    "    print(f\"⚠️ Configuração de índice inválida ({config_error!r}); usando '{DEFAULT_INDEX_NAME}'\")\n",
    "    index_name = DEFAULT_INDEX_NAME\n",
    "\n",
    "# The query key is preferred; the admin key is used only when no query key is set\n",
    "search_engine = IntelligentSearch(\n",
    "    service_name=config.search_service_name,\n",
    "    query_key=config.search_query_key or config.search_admin_key,\n",
    "    index_name=index_name\n",
    ")\n",
    "\n",
    "# Fetch the documents to analyse (wildcard query, capped by `top`)\n",
    "print(\"Carregando documentos para análise...\")\n",
    "all_docs_result = search_engine.advanced_search(\n",
    "    query=\"*\",\n",
    "    top=1000,  # Adjust as needed\n",
    "    facets=[\"category\", \"file_type\", \"language\", \"sentiment\"]\n",
    ")\n",
    "\n",
    "# `.get` keeps a response without a 'success' key on the error path instead of\n",
    "# raising KeyError (the error branch below already relies on `.get`)\n",
    "if all_docs_result.get('success'):\n",
    "    documents = all_docs_result['documents']\n",
    "    total_count = all_docs_result['total_count']\n",
    "\n",
    "    print(f\"✅ Carregados {len(documents)} documentos de {total_count} total\")\n",
    "\n",
    "    # Convert to a DataFrame for analysis\n",
    "    df = pd.DataFrame(documents)\n",
    "    print(f\"   Colunas disponíveis: {list(df.columns)}\")\n",
    "\n",
    "    # Basic statistics\n",
    "    print(f\"\\n📊 Estatísticas Básicas:\")\n",
    "    print(f\"   Documentos únicos: {len(df)}\")\n",
    "    print(f\"   Campos com dados: {df.count().sum()}\")\n",
    "    print(f\"   Campos principais:\")\n",
    "    for col in ['title', 'content', 'category', 'language', 'sentiment']:\n",
    "        # Columns absent from the result set are skipped\n",
    "        if col in df.columns:\n",
    "            non_null = df[col].notna().sum()\n",
    "            print(f\"     {col}: {non_null}/{len(df)} ({non_null/len(df)*100:.1f}%)\")\n",
    "else:\n",
    "    print(f\"❌ Erro ao carregar documentos: {all_docs_result.get('error')}\")\n",
    "    # Empty placeholders so later cells can test `df.empty`\n",
    "    documents = []\n",
    "    df = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Análise Temporal e Tendências"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not df.empty and 'modified_date' in df.columns:\n",
    "    # Parse the raw values; anything unparseable becomes NaT (errors='coerce')\n",
    "    df['modified_date_parsed'] = pd.to_datetime(df['modified_date'], errors='coerce')\n",
    "    # Explicit copy: the derived columns below are added to an independent\n",
    "    # frame, which avoids pandas' chained-assignment (SettingWithCopy) ambiguity\n",
    "    df_with_dates = df.dropna(subset=['modified_date_parsed']).copy()\n",
    "\n",
    "    if len(df_with_dates) > 0:\n",
    "        print(f\"\\n📅 Análise Temporal de {len(df_with_dates)} documentos com datas válidas\")\n",
    "\n",
    "        # Derived calendar columns\n",
    "        parsed_dates = df_with_dates['modified_date_parsed']\n",
    "        df_with_dates['year'] = parsed_dates.dt.year\n",
    "        df_with_dates['month'] = parsed_dates.dt.month\n",
    "        df_with_dates['year_month'] = parsed_dates.dt.to_period('M')\n",
    "        df_with_dates['weekday'] = parsed_dates.dt.day_name()\n",
    "\n",
    "        # 2x2 grid: monthly line, yearly bars, weekday bars, daily trend\n",
    "        fig = make_subplots(\n",
    "            rows=2, cols=2,\n",
    "            subplot_titles=('Documentos por Mês', 'Documentos por Ano',\n",
    "                           'Documentos por Dia da Semana', 'Tendência Temporal'),\n",
    "            specs=[[{\"type\": \"scatter\"}, {\"type\": \"bar\"}],\n",
    "                   [{\"type\": \"bar\"}, {\"type\": \"scatter\"}]]\n",
    "        )\n",
    "\n",
    "        # Chart 1: documents per month\n",
    "        monthly_counts = df_with_dates['year_month'].value_counts().sort_index()\n",
    "        fig.add_trace(\n",
    "            go.Scatter(x=monthly_counts.index.astype(str), y=monthly_counts.values,\n",
    "                      mode='lines+markers', name='Por Mês'),\n",
    "            row=1, col=1\n",
    "        )\n",
    "\n",
    "        # Chart 2: documents per year\n",
    "        yearly_counts = df_with_dates['year'].value_counts().sort_index()\n",
    "        fig.add_trace(\n",
    "            go.Bar(x=yearly_counts.index, y=yearly_counts.values, name='Por Ano'),\n",
    "            row=1, col=2\n",
    "        )\n",
    "\n",
    "        # Chart 3: documents per weekday, in calendar order; weekdays with no\n",
    "        # documents are shown as 0\n",
    "        weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
    "        weekday_counts = df_with_dates['weekday'].value_counts().reindex(weekday_order, fill_value=0)\n",
    "        fig.add_trace(\n",
    "            go.Bar(x=weekday_counts.index, y=weekday_counts.values, name='Por Dia'),\n",
    "            row=2, col=1\n",
    "        )\n",
    "\n",
    "        # Chart 4: daily trend with moving average.\n",
    "        # Group on midnight-normalized timestamps so the counts keep a datetime\n",
    "        # index that aligns directly with the pd.date_range used to fill in\n",
    "        # days without documents (no reliance on date-object coercion).\n",
    "        daily_counts = df_with_dates.groupby(parsed_dates.dt.normalize()).size()\n",
    "        full_day_range = pd.date_range(daily_counts.index.min(), daily_counts.index.max())\n",
    "        daily_counts = daily_counts.reindex(full_day_range, fill_value=0)\n",
    "\n",
    "        # Centered 7-day moving average; min_periods=1 keeps the line defined at\n",
    "        # the edges and for ranges shorter than 7 days (otherwise all NaN)\n",
    "        rolling_mean = daily_counts.rolling(window=7, center=True, min_periods=1).mean()\n",
    "\n",
    "        fig.add_trace(\n",
    "            go.Scatter(x=daily_counts.index, y=daily_counts.values,\n",
    "                      mode='markers', name='Diário', opacity=0.6),\n",
    "            row=2, col=2\n",
    "        )\n",
    "        fig.add_trace(\n",
    "            go.Scatter(x=rolling_mean.index, y=rolling_mean.values,\n",
    "                      mode='lines', name='Média Móvel 7d'),\n",
    "            row=2, col=2\n",
    "        )\n",
    "\n",
    "        fig.update_layout(height=800, title_text=\"Análise Temporal dos Documentos\")\n",
    "        fig.show()\n",
    "\n",
    "        # Temporal insights\n",
    "        print(f\"\\n🔍 Insights Temporais:\")\n",
    "        print(f\"   Período analisado: {df_with_dates['modified_date_parsed'].min().date()} a {df_with_dates['modified_date_parsed'].max().date()}\")\n",
    "        print(f\"   Mês com mais documentos: {monthly_counts.idxmax()} ({monthly_counts.max()} docs)\")\n",
    "        print(f\"   Dia da semana mais comum: {weekday_counts.idxmax()} ({weekday_counts.max()} docs)\")\n",
    "        print(f\"   Média de documentos por dia: {daily_counts.mean():.1f}\")\n",
    "\n",
    "    else:\n",
    "        print(\"⚠️ Nenhuma data válida encontrada para análise temporal\")\n",
    "else:\n",
    "    print(\"⚠️ Campo 'modified_date' não disponível para análise temporal\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Análise de Clustering de Documentos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not df.empty and 'content' in df.columns:\n",
    "    # Preparar textos para análise\n",
    "    texts = df['content'].fillna('').astype(str)\n",
    "    valid_texts = [text for text in texts if len(text.strip()) > 50]  # Filtrar textos muito curtos\n",
    "    \n",
    "    print(f\"\\n🔤 Preparando {len(valid_texts)} textos para clustering...\")\n",
    "    \n",
    "    if len(valid_texts) >= 10:  # Precisa de pelo menos 10 documentos\n",
    "        # Configurar TF-IDF\n",
    "        portuguese_stopwords = set(stopwords.words('portuguese'))\n",
    "        english_stopwords = set(stopwords.words('english'))\n",
    "        all_stopwords = portuguese_stopwords.union(english_stopwords)\n