In [1]:
import os
import sqlite3

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

pd.set_option("display.max_columns", None)

In [2]:
srag_dtypes = {
    # --- Dtypes para carregar de CSV ou para uso geral em Pandas ---
    "csv_pandas": {
        # Identificadores (tratar como string para evitar perda de zeros ou convers√£o)
        "NU_NOTIFIC": "string",
        "CO_MUN_NOT": "string",
        "CO_UNI_NOT": "string",
        "NU_CPF": "string",
        "NU_CNS": "string",
        "CO_MUN_RES": "string",
        "CO_RG_RESI": "string",
        "NM_LOGRADO": "string",
        "NU_NUMERO": "string",
        "CO_PAIS": "string",
        "CO_LAB_AN": "string",
        "REQUI_GAL": "string",
        "CO_LAB_PCR": "string",
        "VG_CODLAB": "string",
        "VG_CODDEST": "string",
        "REG_PROF": "string",
        # Datas (converter para o tipo datetime do pandas)
        "DT_NOTIFIC": "datetime64[ns]",
        "DT_SIN_PRI": "datetime64[ns]",
        "DT_NASC": "datetime64[ns]",
        "DOSE_1_COV": "datetime64[ns]",
        "DOSE_2_COV": "datetime64[ns]",
        "DOSE_REF": "datetime64[ns]",
        "DOSE_2REF": "datetime64[ns]",
        "DOSE_ADIC": "datetime64[ns]",
        "DOS_RE_BI": "datetime64[ns]",
        "DT_UT_DOSE": "datetime64[ns]",
        "DT_VAC_MAE": "datetime64[ns]",
        "DT_DOSEUNI": "datetime64[ns]",
        "DT_1_DOSE": "datetime64[ns]",
        "DT_2_DOSE": "datetime64[ns]",
        "DT_ANTIVIR": "datetime64[ns]",
        "DT_TRT_COV": "datetime64[ns]",
        "DT_INTERNA": "datetime64[ns]",
        "DT_ENTUTI": "datetime64[ns]",
        "DT_SAIDUTI": "datetime64[ns]",
        "DT_RAIOX": "datetime64[ns]",
        "DT_TOMO": "datetime64[ns]",
        "DT_COLETA": "datetime64[ns]",
        "DT_RES_AN": "datetime64[ns]",
        "DT_PCR": "datetime64[ns]",
        "DT_CO_SOR": "datetime64[ns]",
        "DT_RES": "datetime64[ns]",
        "DT_EVOLUCA": "datetime64[ns]",
        "DT_ENCERRA": "datetime64[ns]",
        "DT_DIGITA": "datetime64[ns]",
        "VG_DTRES": "datetime64[ns]",
        # Campos Categ√≥ricos (c√≥digos num√©ricos ou de texto com poucas varia√ß√µes)
        "SEM_NOT": "string",  # Ex: 202301
        "SEM_PRI": "string",
        "SG_UF_NOT": "category",
        "CS_SEXO": "category",
        "TP_IDADE": "category",
        "CS_GESTANT": "category",
        "CS_RACA": "category",
        "CS_ETINIA": "category",
        "CS_ESCOL_N": "category",
        "SG_UF": "category",
        "CS_ZONA": "category",
        "NOSOCOMIAL": "category",
        "FEBRE": "category",
        "TOSSE": "category",
        "GARGANTA": "category",
        "DISPNEIA": "category",
        "DESC_RESP": "category",
        "SATURACAO": "category",
        "DIARREIA": "category",
        "VOMITO": "category",
        "DOR_ABD": "category",
        "FADIGA": "category",
        "PERD_OLFT": "category",
        "PERD_PALA": "category",
        "FATOR_RISC": "category",
        "PUERPERA": "category",
        "CARDIOPATI": "category",
        "HEMATOLOGI": "category",
        "SIND_DOWN": "category",
        "HEPATICA": "category",
        "ASMA": "category",
        "DIABETES": "category",
        "NEUROLOGIC": "category",
        "PNEUMOPATI": "category",
        "IMUNODEPRE": "category",
        "RENAL": "category",
        "OBESIDADE": "category",
        "TABAG": "category",
        "VACINA_COV": "category",
        "VACINA": "category",
        "ANTIVIRAL": "category",
        "TP_ANTIVIR": "category",
        "HOSPITAL": "category",
        "UTI": "category",
        "SUPORT_VEN": "category",
        "RAIOX_RES": "category",
        "TOMO_RES": "category",
        "AMOSTRA": "category",
        "TP_AMOSTRA": "category",
        "TP_TES_AN": "category",
        "RES_AN": "category",
        "PCR_RESUL": "category",
        "CLASSI_FIN": "category",
        "CRITERIO": "category",
        "EVOLUCAO": "category",
        "VG_OMS": "category",
        "VG_ENC": "category",
        "VG_REINF": "category",
        # N√∫meros Inteiros (usar tipo nullable Int64 para suportar valores ausentes)
        "NU_IDADE_N": "Int64",
        # N√∫meros de Ponto Flutuante (decimais)
        "OBES_IMC": "float64",
        # Texto livre
        "NM_PACIENT": "string",
        "NM_MAE_PAC": "string",
        "OUTRO_DES": "string",
        "MORB_DESC": "string",
        "OUT_ANIM": "string",
        "RAIOX_OUT": "string",
        "TOMO_OUT": "string",
        "OUT_AMOST": "string",
        "DS_AN_OUT": "string",
        "CLASSI_OUT": "string",
        "OBSERVA": "string",
        "NOME_PROF": "string",
        "VG_OMSOUT": "string",
        "VG_LIN": "string",
        "VG_METOUT": "string",
        "VG_LAB": "string",
        "VG_PROF": "string",
        "VG_EST": "string",
    },
    # --- Tipos para criar tabela em SQLite ---
    "sqlite": {
        # Identificadores e texto
        "NU_NOTIFIC": "TEXT",
        "NM_PACIENT": "TEXT",
        "DT_NOTIFIC": "TEXT",
        "CO_MUN_NOT": "TEXT",
        "CO_UNI_NOT": "TEXT",
        "NU_CPF": "TEXT",
        "NU_CNS": "TEXT",
        "NM_MAE_PAC": "TEXT",
        "NM_LOGRADO": "TEXT",
        "NU_NUMERO": "TEXT",
        "CO_PAIS": "TEXT",
        "OUTRO_DES": "TEXT",
        "MORB_DESC": "TEXT",
        "OUT_ANIM": "TEXT",
        "RAIOX_OUT": "TEXT",
        "TOMO_OUT": "TEXT",
        "OUT_AMOST": "TEXT",
        "DS_AN_OUT": "TEXT",
        "CLASSI_OUT": "TEXT",
        "OBSERVA": "TEXT",
        "NOME_PROF": "TEXT",
        "REG_PROF": "TEXT",
        "VG_OMSOUT": "TEXT",
        "VG_LIN": "TEXT",
        "VG_METOUT": "TEXT",
        "VG_LAB": "TEXT",
        "VG_PROF": "TEXT",
        "VG_EST": "TEXT",
        "CO_LAB_AN": "TEXT",
        "REQUI_GAL": "TEXT",
        "CO_LAB_PCR": "TEXT",
        "VG_CODLAB": "TEXT",
        "VG_CODDEST": "TEXT",
        "SEM_NOT": "TEXT",
        "SEM_PRI": "TEXT",
        # Datas (armazenar como TEXT no formato ISO 8601 'YYYY-MM-DD')
        "DT_SIN_PRI": "TEXT",
        "DT_NASC": "TEXT",
        "DOSE_1_COV": "TEXT",
        "DOSE_2_COV": "TEXT",
        "DOSE_REF": "TEXT",
        "DOSE_2REF": "TEXT",
        "DOSE_ADIC": "TEXT",
        "DOS_RE_BI": "TEXT",
        "DT_UT_DOSE": "TEXT",
        "DT_VAC_MAE": "TEXT",
        "DT_DOSEUNI": "TEXT",
        "DT_1_DOSE": "TEXT",
        "DT_2_DOSE": "TEXT",
        "DT_ANTIVIR": "TEXT",
        "DT_TRT_COV": "TEXT",
        "DT_INTERNA": "TEXT",
        "DT_ENTUTI": "TEXT",
        "DT_SAIDUTI": "TEXT",
        "DT_RAIOX": "TEXT",
        "DT_TOMO": "TEXT",
        "DT_COLETA": "TEXT",
        "DT_RES_AN": "TEXT",
        "DT_PCR": "TEXT",
        "DT_CO_SOR": "TEXT",
        "DT_RES": "TEXT",
        "DT_EVOLUCA": "TEXT",
        "DT_ENCERRA": "TEXT",
        "DT_DIGITA": "TEXT",
        "VG_DTRES": "TEXT",
        # C√≥digos e categorias (armazenar como INTEGER √© mais eficiente)
        "SG_UF_NOT": "INTEGER",
        "CS_SEXO": "INTEGER",
        "TP_IDADE": "INTEGER",
        "CS_GESTANT": "INTEGER",
        "CS_RACA": "INTEGER",
        "CS_ETINIA": "INTEGER",
        "CS_ESCOL_N": "INTEGER",
        "SG_UF": "INTEGER",
        "CS_ZONA": "INTEGER",
        "NOSOCOMIAL": "INTEGER",
        "FEBRE": "INTEGER",
        "TOSSE": "INTEGER",
        "GARGANTA": "INTEGER",
        "DISPNEIA": "INTEGER",
        "DESC_RESP": "INTEGER",
        "SATURACAO": "INTEGER",
        "DIARREIA": "INTEGER",
        "VOMITO": "INTEGER",
        "DOR_ABD": "INTEGER",
        "FADIGA": "INTEGER",
        "PERD_OLFT": "INTEGER",
        "PERD_PALA": "INTEGER",
        "FATOR_RISC": "INTEGER",
        "PUERPERA": "INTEGER",
        "CARDIOPATI": "INTEGER",
        "HEMATOLOGI": "INTEGER",
        "SIND_DOWN": "INTEGER",
        "HEPATICA": "INTEGER",
        "ASMA": "INTEGER",
        "DIABETES": "INTEGER",
        "NEUROLOGIC": "INTEGER",
        "PNEUMOPATI": "INTEGER",
        "IMUNODEPRE": "INTEGER",
        "RENAL": "INTEGER",
        "OBESIDADE": "INTEGER",
        "TABAG": "INTEGER",
        "VACINA_COV": "INTEGER",
        "VACINA": "INTEGER",
        "ANTIVIRAL": "INTEGER",
        "TP_ANTIVIR": "INTEGER",
        "HOSPITAL": "INTEGER",
        "UTI": "INTEGER",
        "SUPORT_VEN": "INTEGER",
        "RAIOX_RES": "INTEGER",
        "TOMO_RES": "INTEGER",
        "AMOSTRA": "INTEGER",
        "TP_AMOSTRA": "INTEGER",
        "TP_TES_AN": "INTEGER",
        "RES_AN": "INTEGER",
        "PCR_RESUL": "INTEGER",
        "CLASSI_FIN": "INTEGER",
        "CRITERIO": "INTEGER",
        "EVOLUCAO": "INTEGER",
        "VG_OMS": "INTEGER",
        "VG_ENC": "INTEGER",
        "VG_REINF": "INTEGER",
        # N√∫meros
        "NU_IDADE_N": "INTEGER",
        "OBES_IMC": "REAL",
    },
    # --- Dtypes para salvar em Parquet (via Pandas) ---
    # √â muito similar ao de CSV, mas refor√ßa o uso de 'category' e 'boolean'
    # que s√£o tipos nativos e eficientes no formato Parquet.
    "parquet_pandas": {
        # Identificadores
        "NU_NOTIFIC": "string",
        "CO_MUN_NOT": "string",
        "CO_UNI_NOT": "string",
        "NU_CPF": "string",
        "NU_CNS": "string",
        "CO_MUN_RES": "string",
        "CO_RG_RESI": "string",
        "NM_LOGRADO": "string",
        "NU_NUMERO": "string",
        "CO_PAIS": "string",
        "CO_LAB_AN": "string",
        "REQUI_GAL": "string",
        "CO_LAB_PCR": "string",
        "VG_CODLAB": "string",
        "VG_CODDEST": "string",
        "REG_PROF": "string",
        "SEM_NOT": "string",
        "SEM_PRI": "string",
        # Datas
        "DT_NOTIFIC": "datetime64[ns]",
        "DT_SIN_PRI": "datetime64[ns]",
        "DT_NASC": "datetime64[ns]",
        "DOSE_1_COV": "datetime64[ns]",
        "DOSE_2_COV": "datetime64[ns]",
        "DOSE_REF": "datetime64[ns]",
        "DOSE_2REF": "datetime64[ns]",
        "DOSE_ADIC": "datetime64[ns]",
        "DOS_RE_BI": "datetime64[ns]",
        "DT_UT_DOSE": "datetime64[ns]",
        "DT_VAC_MAE": "datetime64[ns]",
        "DT_DOSEUNI": "datetime64[ns]",
        "DT_1_DOSE": "datetime64[ns]",
        "DT_2_DOSE": "datetime64[ns]",
        "DT_ANTIVIR": "datetime64[ns]",
        "DT_TRT_COV": "datetime64[ns]",
        "DT_INTERNA": "datetime64[ns]",
        "DT_ENTUTI": "datetime64[ns]",
        "DT_SAIDUTI": "datetime64[ns]",
        "DT_RAIOX": "datetime64[ns]",
        "DT_TOMO": "datetime64[ns]",
        "DT_COLETA": "datetime64[ns]",
        "DT_RES_AN": "datetime64[ns]",
        "DT_PCR": "datetime64[ns]",
        "DT_CO_SOR": "datetime64[ns]",
        "DT_RES": "datetime64[ns]",
        "DT_EVOLUCA": "datetime64[ns]",
        "DT_ENCERRA": "datetime64[ns]",
        "DT_DIGITA": "datetime64[ns]",
        "VG_DTRES": "datetime64[ns]",
        # Campos Categ√≥ricos (muito eficiente em Parquet)
        "SG_UF_NOT": "category",
        "CS_SEXO": "category",
        "TP_IDADE": "category",
        "CS_GESTANT": "category",
        "CS_RACA": "category",
        "CS_ETINIA": "category",
        "CS_ESCOL_N": "category",
        "SG_UF": "category",
        "CS_ZONA": "category",
        "NOSOCOMIAL": "category",
        "FATOR_RISC": "category",
        "PUERPERA": "category",
        "CARDIOPATI": "category",
        "HEMATOLOGI": "category",
        "SIND_DOWN": "category",
        "HEPATICA": "category",
        "ASMA": "category",
        "DIABETES": "category",
        "NEUROLOGIC": "category",
        "PNEUMOPATI": "category",
        "IMUNODEPRE": "category",
        "RENAL": "category",
        "OBESIDADE": "category",
        "TABAG": "category",
        "VACINA_COV": "category",
        "VACINA": "category",
        "ANTIVIRAL": "category",
        "TP_ANTIVIR": "category",
        "HOSPITAL": "category",
        "UTI": "category",
        "SUPORT_VEN": "category",
        "RAIOX_RES": "category",
        "TOMO_RES": "category",
        "AMOSTRA": "category",
        "TP_AMOSTRA": "category",
        "TP_TES_AN": "category",
        "RES_AN": "category",
        "PCR_RESUL": "category",
        "CLASSI_FIN": "category",
        "CRITERIO": "category",
        "EVOLUCAO": "category",
        "VG_OMS": "category",
        "VG_ENC": "category",
        "VG_REINF": "category",
        # Booleanos (Sim/N√£o pode ser convertido para booleano)
        "FEBRE": "boolean",
        "TOSSE": "boolean",
        "GARGANTA": "boolean",
        "DISPNEIA": "boolean",
        "DESC_RESP": "boolean",
        "SATURACAO": "boolean",
        "DIARREIA": "boolean",
        "VOMITO": "boolean",
        "DOR_ABD": "boolean",
        "FADIGA": "boolean",
        "PERD_OLFT": "boolean",
        "PERD_PALA": "boolean",
        # N√∫meros Inteiros (nullable)
        "NU_IDADE_N": "Int64",
        # N√∫meros de Ponto Flutuante
        "OBES_IMC": "float64",
        # Texto
        "NM_PACIENT": "string",
        "NM_MAE_PAC": "string",
        "OUTRO_DES": "string",
        "MORB_DESC": "string",
        "OUT_ANIM": "string",
        "RAIOX_OUT": "string",
        "TOMO_OUT": "string",
        "OUT_AMOST": "string",
        "DS_AN_OUT": "string",
        "CLASSI_OUT": "string",
        "OBSERVA": "string",
        "NOME_PROF": "string",
        "VG_OMSOUT": "string",
        "VG_LIN": "string",
        "VG_METOUT": "string",
        "VG_LAB": "string",
        "VG_PROF": "string",
        "VG_EST": "string",
    },
}

In [6]:
input_csv = "../data/interim/srag_2019_2024.csv"
base_path, _ = os.path.splitext(input_csv)
output_csv = base_path + ".csv"
output_db = base_path + ".db"
output_parquet = base_path + ".parquet"

print(f"Iniciando o processamento do arquivo: {input_csv}")

Iniciando o processamento do arquivo: ../data/interim/srag_2019_2024.csv


In [7]:
print("üîç Analisando colunas do CSV...")
csv_cols = pd.read_csv(input_csv, nrows=0, encoding="ISO-8859-1").columns.tolist()

# Filtra o dicion√°rio de dtypes para conter apenas colunas existentes
dtypes_para_leitura = {
    col: dtype
    for col, dtype in srag_dtypes["csv_pandas"].items()
    if col in csv_cols and not str(dtype).startswith("datetime")
}
date_cols = [
    col
    for col, dtype in srag_dtypes["csv_pandas"].items()
    if col in csv_cols and str(dtype).startswith("datetime")
]

print("üîÑ Carregando e convertendo dados (isso pode levar alguns minutos)...")
df = pd.read_csv(
    input_csv,
    dtype=dtypes_para_leitura,
    parse_dates=date_cols,
    encoding="ISO-8859-1",
    low_memory=False,
)
# Garante que datas inv√°lidas se tornem NaT (Not a Time)
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

print(
    f"‚úÖ Arquivo carregado com sucesso. {df.shape[0]} linhas e {df.shape[1]} colunas."
)
print(
    "Mem√≥ria usada pelo DataFrame:",
    f"{df.memory_usage(deep=True).sum() / 1e9:.2f} GB",
)

üîç Analisando colunas do CSV...
üîÑ Carregando e convertendo dados (isso pode levar alguns minutos)...
‚úÖ Arquivo carregado com sucesso. 633398 linhas e 194 colunas.
Mem√≥ria usada pelo DataFrame: 1.97 GB


In [8]:
print("\n--- Processando CSV ---")
df.to_csv(output_csv, index=False, encoding="utf-8")
print(f"‚úÖ Arquivo CSV otimizado salvo em: {output_csv}")

print("\n--- Processando SQLite ---")
with sqlite3.connect(output_db) as conn:
    # O to_sql faz um bom trabalho em mapear os dtypes do pandas para os do SQLite
    df.to_sql("srag", conn, if_exists="replace", index=False, chunksize=50000)
print(f"‚úÖ Banco de dados SQLite salvo em: {output_db}")

print("\n--- Processando Parquet ---")
df_parquet = df.copy()

# Converte colunas Sim/N√£o (1/2) para booleano (True/False)
# que √© um tipo nativo e muito eficiente em Parquet.
if boolean_cols := [
    col for col in srag_dtypes["parquet_pandas"] if col in df_parquet.columns
]:
    print(f"üîÑ Convertendo {len(boolean_cols)} colunas para booleano...")
    for col in boolean_cols:
        # Usa .map que lida bem com valores ausentes e inesperados
        df_parquet[col] = (
            df_parquet[col]
            .map(
                {  # Suporta tanto texto quanto n√∫mero
                    "1": True,
                    "2": False,
                    1: True,
                    2: False,
                }
            )
            .astype("boolean")
        )

df_parquet.to_parquet(
    output_parquet, index=False, engine="pyarrow", compression="snappy"
)
print(f"‚úÖ Arquivo Parquet salvo em: {output_parquet}")


--- Processando CSV ---
‚úÖ Arquivo CSV otimizado salvo em: ../data/interim/srag_2019_2024.csv

--- Processando SQLite ---
‚úÖ Banco de dados SQLite salvo em: ../data/interim/srag_2019_2024.db

--- Processando Parquet ---
üîÑ Convertendo 106 colunas para booleano...
‚úÖ Arquivo Parquet salvo em: ../data/interim/srag_2019_2024.parquet


In [9]:
colunas_necessarias = [
    # M√©tricas principais
    "DT_SIN_PRI",  # Data do primeiro sintoma
    "DT_NOTIFIC",  # Data da notifica√ß√£o (apoio)
    "CLASSI_FIN",  # Classifica√ß√£o final
    "EVOLUCAO",  # Evolu√ß√£o do caso
    "DT_EVOLUCA",  # Data da alta/√≥bito
    "UTI",  # Internado em UTI
    "DT_ENTUTI",  # Data entrada UTI
    "DT_SAIDUTI",  # Data sa√≠da UTI
    "VACINA_COV",  # Vacina COVID-19
    "DOSE_1_COV",  # Data 1¬™ dose
    "DOSE_2_COV",  # Data 2¬™ dose
    "DOSE_REF",  # Data dose refor√ßo
    "DOSE_2REF",  # Data 2¬∫ refor√ßo
    "DOSE_ADIC",  # Data dose adicional
    "VACINA",  # Vacina influenza
    "DT_UT_DOSE",  # Data √∫ltima dose influenza
    # Dados demogr√°ficos
    "CS_SEXO",  # Sexo
    "DT_NASC",  # Data nascimento
    "NU_IDADE_N",  # Idade
    "CS_RACA",  # Ra√ßa/cor
    "CS_ESCOL_N",  # Escolaridade
    # Comorbidades
    "OBESIDADE",  # Obesidade
    "RENAL",  # Doen√ßa renal
    "IMUNODEPRE",  # Imunodepress√£o
    "TABAG",  # Tabagismo
    "PNEUMOPATI",  # Pneumopatia
    # Gesta√ß√£o
    "CS_GESTANT",  # Gestante
    # Testes laboratoriais
    "PCR_RESUL",  # Resultado PCR
    "RES_AN",  # Resultado ant√≠geno
    "RES_IGG",  # Resultado IgG
    "RES_IGM",  # Resultado IgM
    # Localiza√ß√£o
    "CO_MU_INTE",  # C√≥digo munic√≠pio interna√ß√£o
    # "CO_UN_INTE",  # C√≥digo unidade interna√ß√£o
]
if colunas_faltantes := [col for col in colunas_necessarias if col not in df.columns]:
    print(f"Colunas n√£o encontradas no dataset: {colunas_faltantes}")

df_filtrado = df[colunas_necessarias].copy()

print("\n=== Dataset filtrado ===")
print(f"Dimens√µes: {df_filtrado.shape[0]} linhas e {df_filtrado.shape[1]} colunas")
df_filtrado.head()


=== Dataset filtrado ===
Dimens√µes: 633398 linhas e 32 colunas


Unnamed: 0,DT_SIN_PRI,DT_NOTIFIC,CLASSI_FIN,EVOLUCAO,DT_EVOLUCA,UTI,DT_ENTUTI,DT_SAIDUTI,VACINA_COV,DOSE_1_COV,DOSE_2_COV,DOSE_REF,DOSE_2REF,DOSE_ADIC,VACINA,DT_UT_DOSE,CS_SEXO,DT_NASC,NU_IDADE_N,CS_RACA,CS_ESCOL_N,OBESIDADE,RENAL,IMUNODEPRE,TABAG,PNEUMOPATI,CS_GESTANT,PCR_RESUL,RES_AN,RES_IGG,RES_IGM,CO_MU_INTE
0,NaT,NaT,4.0,1.0,NaT,2.0,NaT,NaT,,NaT,NaT,NaT,NaT,NaT,2.0,NaT,M,NaT,30,1.0,2.0,,,,,,6,4.0,4.0,,,310620.0
1,NaT,NaT,4.0,1.0,NaT,1.0,NaT,NaT,,NaT,NaT,NaT,NaT,NaT,2.0,NaT,F,NaT,7,1.0,5.0,,,,,,6,2.0,4.0,,,355030.0
2,NaT,NaT,4.0,1.0,NaT,2.0,NaT,NaT,,NaT,NaT,NaT,NaT,NaT,2.0,NaT,M,NaT,1,4.0,5.0,,,,,,6,2.0,2.0,,,261160.0
3,NaT,NaT,2.0,1.0,NaT,1.0,NaT,NaT,,NaT,NaT,NaT,NaT,NaT,2.0,NaT,F,NaT,5,1.0,5.0,,,,,,6,2.0,1.0,,,350950.0
4,NaT,NaT,4.0,2.0,NaT,9.0,NaT,NaT,,NaT,NaT,NaT,NaT,NaT,2.0,NaT,F,NaT,3,4.0,5.0,,,,,,6,2.0,4.0,,,260410.0


In [None]:
output_db = "../data/processed/srag_analytics.db"
# Salvar o DataFrame filtrado no novo banco SQLite
print(f"\nSalvando dados filtrados em {output_db}")
with sqlite3.connect(output_db) as conn_output:
    df_filtrado.to_sql("srag_analytics", conn_output, if_exists="replace", index=False)

print("‚úÖ Processo conclu√≠do com sucesso!")

# Informa√ß√µes adicionais sobre o dataset
print("\n=== Resumo final ===")
print(
    f"Redu√ß√£o de colunas: {df.shape[1]} ‚Üí {df_filtrado.shape[1]} ({df.shape[1] - df_filtrado.shape[1]} colunas removidas)"
)
print(
    f"Porcentagem de colunas mantidas: {(df_filtrado.shape[1] / df.shape[1]) * 100:.1f}%"
)


Salvando dados filtrados em ../data/processed/srag_analytics.db
‚úÖ Processo conclu√≠do com sucesso!

=== Resumo final ===
Redu√ß√£o de colunas: 194 ‚Üí 32 (162 colunas removidas)
Porcentagem de colunas mantidas: 16.5%


In [26]:
def load_data(db_path: str) -> pd.DataFrame:
    """
    Carrega os dados do banco de dados SQLite e os prepara para an√°lise.
    """
    print(f"üîÑ Carregando dados de '{db_path}'...")
    with sqlite3.connect(db_path) as conn:
        df = pd.read_sql_query("SELECT * FROM srag_analytics", conn)

    # --- Pr√©-processamento Essencial ---
    # Converte a coluna de data para o formato datetime do pandas
    # 'coerce' transforma datas inv√°lidas em NaT (Not a Time)
    df["DT_SIN_PRI"] = pd.to_datetime(df["DT_SIN_PRI"], errors="coerce")

    # Remove linhas onde a data do primeiro sintoma √© nula
    # Define a data como o √≠ndice do DataFrame para facilitar an√°lises de tempo
    df.dropna(subset=["DT_SIN_PRI"]).set_index("DT_SIN_PRI", inplace=True)

    print(f"‚úÖ Dados carregados com sucesso. {df.shape[0]} linhas.")
    return df


def calc_metric(df: pd.DataFrame, col: str, value: int | str) -> float:
    total_covid_cases = len(df)
    return (
        (df[df[col] == value].shape[0] / total_covid_cases) * 100
        if total_covid_cases > 0
        else 0
    )


def calculate_main_metrics(df: pd.DataFrame):
    """
    Calcula as m√©tricas chave com base nos casos confirmados de COVID-19.
    """
    print("\n--- üìä Calculando M√©tricas Principais ---")
    return {
        "total_casos": len(df),
        # EVOLUCAO = 2 corresponde a "√ìbito"
        "taxa_mortalidade": calc_metric(df, "EVOLUCAO", 2),
        # UTI = 1 corresponde a "Sim"
        "taxa_ocupacao_uti": calc_metric(df, "UTI", 1),
        # VACINA_COV = 1 corresponde a "Sim"
        "taxa_vacinacao_casos": calc_metric(df, "VACINA_COV", 1),
    }

In [27]:
def plot_time_series_graphs(df):
    """
    Gera e exibe gr√°ficos de s√©ries temporais para casos di√°rios e mensais.
    """
    print("\n--- üìà Gerando Gr√°ficos ---")

    covid_cases = df[df["CLASSI_FIN"] == 5].copy()
    if covid_cases.empty:
        print("‚ö†Ô∏è N√£o foram encontrados casos para gerar os gr√°ficos.")
        return

    sns.set_theme(style="whitegrid")

    # --- Gr√°fico Di√°rio (√öltimos 30 dias) ---
    last_date = covid_cases.index.max()
    daily_counts = (
        covid_cases.loc[last_date - pd.Timedelta(days=30) : last_date]
        .resample("D")
        .size()
    )
    plt.figure(figsize=(15, 6))
    ax1 = sns.lineplot(
        x=daily_counts.index, y=daily_counts.values, marker="o", label="N¬∫ de Casos"
    )
    ax1.set_title("Casos Di√°rios de SRAG por COVID-19 (√öltimos 30 Dias)", fontsize=16)
    ax1.set_xlabel("Data", fontsize=12)
    ax1.set_ylabel("N√∫mero de Casos", fontsize=12)
    plt.xticks(rotation=45)

    # --- Gr√°fico Mensal (√öltimos 12 meses) ---
    # Resample por m√™s e pega os √∫ltimos 12 meses de dados
    monthly_counts = covid_cases.resample("M").size().last("12M")

    plt.figure(figsize=(15, 6))
    # Formata o √≠ndice para mostrar 'Ano-M√™s'
    bar_labels = monthly_counts.index.strftime("%Y-%m")
    ax2 = sns.barplot(x=bar_labels, y=monthly_counts.values, palette="viridis")

    ax2.set_title("Casos Mensais de SRAG por COVID-19 (√öltimos 12 Meses)", fontsize=16)
    ax2.set_xlabel("M√™s", fontsize=12)
    ax2.set_ylabel("N√∫mero de Casos", fontsize=12)
    plt.xticks(rotation=45)

    # Exibe os gr√°ficos
    print("‚úÖ Gr√°ficos gerados. Exibindo agora...")
    plt.tight_layout()  # Ajusta o layout para evitar sobreposi√ß√£o
    plt.show()

In [28]:
if (srag_df := load_data(output_db)) is not None:
    if main_metrics := calculate_main_metrics(srag_df):
        print("\n--- Resultados das M√©tricas ---")
        print(
            f"\t- Taxa de Mortalidade (entre casos confirmados): {main_metrics['taxa_mortalidade']:.2f}%"
        )
        print(
            f"\t- Taxa de Interna√ß√£o em UTI (entre casos confirmados): {main_metrics['taxa_ocupacao_uti']:.2f}%"
        )
        print(
            f"\t- Taxa de Vacinados (entre casos confirmados): {main_metrics['taxa_vacinacao_casos']:.2f}%"
        )
    plot_time_series_graphs(srag_df)

üîÑ Carregando dados de '../data/processed/srag_analytics.db'...
‚úÖ Dados carregados com sucesso. 633398 linhas.

--- üìä Calculando M√©tricas Principais ---

--- Resultados das M√©tricas ---
	- Taxa de Mortalidade (entre casos confirmados): 0.00%
	- Taxa de Interna√ß√£o em UTI (entre casos confirmados): 0.00%
	- Taxa de Vacinados (entre casos confirmados): 0.00%

--- üìà Gerando Gr√°ficos ---
‚ö†Ô∏è N√£o foram encontrados casos para gerar os gr√°ficos.
