In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Configuração Inicial"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import warnings\n",
    "\n",
    "# Third-party analysis and plotting stack\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "# Render figures inline and apply the ggplot style to every plot\n",
    "%matplotlib inline\n",
    "plt.style.use('ggplot')\n",
    "\n",
    "# Show up to 50 columns when a DataFrame is displayed\n",
    "pd.options.display.max_columns = 50\n",
    "\n",
    "# Silence all warnings for the rest of the session; comment this out\n",
    "# while debugging so library warnings become visible again\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Análise Exploratória Inicial"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the raw records and the ID mapping table from ../data/raw\n",
    "df = pd.read_csv('../data/raw/diabetic_data.csv')\n",
    "mapping = pd.read_csv(\n",
    "    '../data/raw/IDS_mapping.csv',\n",
    "    sep=',',\n",
    "    skip_blank_lines=True,\n",
    ")\n",
    "\n",
    "# Shape followed by a three-row preview\n",
    "print(\"Dimensões do dataset:\", df.shape)\n",
    "print(\"\\nPrimeiras linhas:\")\n",
    "preview = df.head(3)\n",
    "display(preview)\n",
    "\n",
    "# Column dtypes and non-null counts\n",
    "print(\"\\nInformações do dataset:\")\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Visualizações dos Dados Brutos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bar charts of value counts for three categorical columns, side by side\n",
    "fig, axes = plt.subplots(1, 3, figsize=(15, 5))\n",
    "\n",
    "race_counts = df['race'].value_counts()\n",
    "race_counts.plot.bar(ax=axes[0], title='Distribuição por Raça')\n",
    "\n",
    "gender_counts = df['gender'].value_counts()\n",
    "gender_counts.plot.bar(ax=axes[1], title='Distribuição por Gênero')\n",
    "\n",
    "# Age counts are ordered by bracket label rather than by frequency\n",
    "age_counts = df['age'].value_counts().sort_index()\n",
    "age_counts.plot.bar(ax=axes[2], title='Distribuição por Faixa Etária')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Histograms (20 bins, KDE overlay) for three numeric columns\n",
    "numeric_panels = [\n",
    "    ('time_in_hospital', 'Tempo de Hospitalização'),\n",
    "    ('num_lab_procedures', 'Número de Procedimentos Laboratoriais'),\n",
    "    ('num_medications', 'Número de Medicações'),\n",
    "]\n",
    "\n",
    "fig, axes = plt.subplots(1, len(numeric_panels), figsize=(15, 5))\n",
    "for ax, (column, title) in zip(axes, numeric_panels):\n",
    "    sns.histplot(df[column], bins=20, kde=True, ax=ax)\n",
    "    ax.set_title(title)\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Testes das Funções Modularizadas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make the project package importable from this notebook.\n",
    "# The data paths used in this notebook are relative to the parent directory\n",
    "# ('../data/...'), so `src` is assumed to live there too - TODO confirm layout.\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "project_root = str(Path.cwd().parent)\n",
    "if project_root not in sys.path:\n",
    "    # Appended (not prepended) so an already-importable `src` keeps priority\n",
    "    sys.path.append(project_root)\n",
    "\n",
    "# Functions from the project modules exercised in the cells below\n",
    "from src.preprocessing import load_and_preprocess_data\n",
    "from src.clustering import find_optimal_clusters, train_kmeans_model\n",
    "from src.visualization import plot_elbow_method, plot_cluster_analysis\n",
    "from src.inference import predict_cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the preprocessing function on the raw CSV and inspect what it returns\n",
    "raw_csv_path = '../data/raw/diabetic_data.csv'\n",
    "scaled_data, processed_df, scaler = load_and_preprocess_data(raw_csv_path)\n",
    "\n",
    "# Three-row preview of the processed frame\n",
    "print(\"Dados após pré-processamento:\")\n",
    "display(processed_df.head(3))\n",
    "\n",
    "# Shape of the scaled output\n",
    "print(\"\\nDimensões dos dados escalados:\", scaled_data.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Experimentos com Números de Clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Elbow method: call the project helper with max_k=10 and plot its result\n",
    "max_clusters = 10\n",
    "wcss = find_optimal_clusters(scaled_data, max_k=max_clusters)\n",
    "\n",
    "plot_elbow_method(wcss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit and inspect one clustering per candidate number of clusters\n",
    "candidate_ks = (3, 4, 5)\n",
    "\n",
    "for k in candidate_ks:\n",
    "    print(f\"\\n=== Clusterização com k={k} ===\")\n",
    "    model, clustered_df = train_kmeans_model(scaled_data, k)\n",
    "    labels = clustered_df['cluster']\n",
    "\n",
    "    # Scatter of columns 0 and 1 of the scaled data, coloured by the 'cluster' column\n",
    "    fig, ax = plt.subplots(figsize=(8, 5))\n",
    "    sns.scatterplot(\n",
    "        x=scaled_data[:, 0],\n",
    "        y=scaled_data[:, 1],\n",
    "        hue=labels,\n",
    "        palette='viridis',\n",
    "        ax=ax,\n",
    "    )\n",
    "    ax.set_title(f'Visualização dos Clusters (k={k})')\n",
    "    plt.show()\n",
    "\n",
    "    # Row count per cluster label\n",
    "    print(\"Tamanho dos clusters:\")\n",
    "    print(labels.value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Análise dos Clusters Selecionados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final clustering, trained with 4 clusters\n",
    "final_k = 4\n",
    "final_model, final_clustered_df = train_kmeans_model(scaled_data, final_k)\n",
    "\n",
    "# Detailed plots produced by the project visualization helper\n",
    "plot_cluster_analysis(final_clustered_df)\n",
    "\n",
    "# Descriptive statistics per cluster, transposed; only the first 10 rows are shown\n",
    "print(\"\\nEstatísticas descritivas por cluster:\")\n",
    "per_cluster = final_clustered_df.groupby('cluster')\n",
    "cluster_stats = per_cluster.describe().transpose()\n",
    "display(cluster_stats.head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Teste do Módulo de Inferência"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A single made-up record used to exercise the inference function\n",
    "sample_record = {\n",
    "    'race': 'Caucasian',\n",
    "    'gender': 'Female',\n",
    "    'age': '[50-60)',\n",
    "    'admission_type_id': 1,\n",
    "    'discharge_disposition_id': 1,\n",
    "    'admission_source_id': 7,\n",
    "    'time_in_hospital': 4,\n",
    "    'num_lab_procedures': 45,\n",
    "    'num_procedures': 2,\n",
    "    'num_medications': 15,\n",
    "    'number_outpatient': 0,\n",
    "    'number_emergency': 0,\n",
    "    'number_inpatient': 0,\n",
    "    'number_diagnoses': 7,\n",
    "    'max_glu_serum': 'None',\n",
    "    'A1Cresult': 'None',\n",
    "    'metformin': 'No',\n",
    "    'change': 'No',\n",
    "    'diabetesMed': 'Yes',\n",
    "    'readmitted': 'NO',\n",
    "}\n",
    "\n",
    "# One-row DataFrame built from the record\n",
    "sample_data = pd.DataFrame([sample_record])\n",
    "\n",
    "# Predict the cluster using the final model, the scaler and the processed frame\n",
    "cluster, stats = predict_cluster(sample_data, final_model, scaler, processed_df)\n",
    "\n",
    "print(f\"\\nO exemplo foi classificado no Cluster {cluster}\")\n",
    "print(\"\\nEstatísticas deste cluster:\")\n",
    "display(stats)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Conclusões e Próximos Passos"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Principais descobertas:**\n",
    "- [Descreva os padrões encontrados nos clusters]\n",
    "- [Comente sobre as diferenças entre os grupos]\n",
    "- [Destaque insights relevantes]\n",
    "\n",
    "**Próximos passos:**\n",
    "- [Sugira melhorias para o modelo]\n",
    "- [Indique análises adicionais que poderiam ser feitas]\n",
    "- [Mencione limitações do estudo]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}