# Dependências

In [None]:
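# One-time setup: install the `gcloud` package and authenticate with Google Cloud.
# These two commands run every time the cell is executed; comment them out after the first successful run.
# NOTE(review): `pip install gcloud` installs the PyPI package named gcloud (0.18.3 in the log below), which looks
# separate from the `gcloud` command-line tool invoked on the next line - confirm the pip install is actually needed.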
!pip install gcloud
!gcloud auth application-default login

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

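# NOTE(review): numpy, time, os, glob, openpyxl, csv and re are not referenced in the cells shown below;
# confirm whether they are still needed before removing any of them.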

Collecting gcloud
  Downloading gcloud-0.18.3.tar.gz (454 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/454.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.4/454.4 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gcloud
  Building wheel for gcloud (setup.py) ... [?25l[?25hdone
  Created wheel for gcloud: filename=gcloud-0.18.3-py3-none-any.whl size=602927 sha256=cde60d93e38233b1b82072240f5bad316bc9361ad09901503db961b724ba0037
  Stored in directory: /root/.cache/pip/wheels/2a/62/75/3d74209bfebb8805823ae74afa28653aa1ea76d8b5a9d741ff
Successfully built gcloud
Installing collected packages: gcloud
Successfully installed gcloud-0.18.3
Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6h

# Tratamento

In [None]:
# Import necessary libraries
import pandas as pd
import pandas_gbq

# Build the SQL text that will be sent to Google BigQuery.
# One SELECT is generated per source table listed below (the 2020-2025 military
# pensioner "cadastro" tables) and the SELECTs are stacked with UNION ALL.
# Every SELECT exposes the year, the month and the pension type, plus a constant
# column `total` equal to 1 so that rows can later be counted by summing it.
# The pieces are joined so that the resulting string keeps the exact layout
# (line breaks and indentation) of a hand-written triple-quoted query.
query = "\n" + "\n            UNION ALL\n".join(
    "        SELECT ANO as ano, MES as mes, TIPO_PENSAO as tipo_pensao,\n"
    + "            1 as total FROM `repositoriodedadosgpsp.portal_transparencia_cgu."
    + tabela
    + "`"
    for tabela in (
        "2020_abr_militares_pensionistas_cadastro",
        "2021_abr_militares_pensionistas_cadastro",
        "2022_abr_militares_pensionistas_cadastro",
        "2023_abr_militares_pensionistas_cadastro",
        "2024_abr_militares_pensionistas_cadastro",
        "2025_4_militares_pensionistas_cadastro",
    )
) + "\n        "
# Send the SQL in `query` to BigQuery under project 'repositoriodedadosgpsp'
# and collect the returned rows into the pandas DataFrame `df`.
df = pandas_gbq.read_gbq(
    query,
    project_id='repositoriodedadosgpsp',
)

# Lookup table: raw pension type (key) -> broader grouped category (value).
# It is written as (category, raw labels) pairs and flattened by the
# comprehension, so each category is spelled only once.
# Several raw labels are listed twice: once correctly accented and once in the
# variant where a blank stands in place of each accented character
# (e.g. "C njuge" for "Cônjuge").
pensoes = {
    tipo: grupo
    for grupo, tipos in (
        (
            "Filhos, netos, enteados ou menores em tutela",
            (
                "Filho",
                "Filha",
                "Menor sob guarda ou tutela",
                "Neto (a)",
                "Filho(a) adotivo ou Enteado(a)",
            ),
        ),
        (
            "Cônjuge ou ex-cônjuge",
            (
                "Cônjuge / Viúva (o)",
                "C njuge / Vi va (o)",
                "Companheiro (a)",
                "Pessoa desquitada, separada judicialmente, divorciada do instituidor ou ex-convivente",
            ),
        ),
        (
            "Pais",
            (
                "Mãe",
                "M e",
                "Pai",
            ),
        ),
        (
            "Irmão (ã)",
            (
                "Irmão (ã)",
                "Irm o ( )",
            ),
        ),
        (
            "Ex-combatente (o próprio)",
            (
                "Ex-combatente (o próprio)",
                "Ex-combatente (o pr prio)",
            ),
        ),
        (
            "Pessoa designada (Beneficiário instituído)",
            (
                "Pessoa designada (Beneficiário instituído)",
                "Pessoa designada (Benefici rio institu do)",
            ),
        ),
        (
            "Não informado",
            (
                "Não informado",
                "N o informado",
            ),
        ),
        (
            "Outros (Pessoas sem Vínculo Militar)",
            (
                "Outros (Pessoas sem Vínculo Militar)",
                "Outros (Pessoas sem V nculo Militar)",
            ),
        ),
    )
    for tipo in tipos
}

# Translate one raw pension type into its grouped category.
def categorizando(x):
    """Return the grouped category for pension type ``x``.

    The category is looked up in the module-level ``pensoes`` mapping.
    A value that is not a key of that mapping yields ``None``.
    """
    return pensoes.get(x)

# Add the column 'tipo_pensao_agrupado': the value of 'tipo_pensao' on each row
# is passed through `categorizando` to obtain its grouped category.
df['tipo_pensao_agrupado'] = df['tipo_pensao'].apply(categorizando)

# Keep exactly these five columns, in this order, as the final layout.
df = df.loc[:, ['ano', 'mes', 'tipo_pensao', 'tipo_pensao_agrupado', 'total']]

In [None]:
# Print a summary of `df`: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1402272 entries, 0 to 1402271
Data columns (total 5 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   ano                   1402272 non-null  Int64 
 1   mes                   1402272 non-null  Int64 
 2   tipo_pensao           1402272 non-null  object
 3   tipo_pensao_agrupado  1402272 non-null  object
 4   total                 1402272 non-null  Int64 
dtypes: Int64(3), object(2)
memory usage: 57.5+ MB


# Upload

In [None]:
# Import the bigquery library from google.cloud
from google.cloud import bigquery

# Upload the prepared DataFrame `df` to the BigQuery table
# repositoriodedadosgpsp.perfil_remuneracao.MILITARES_pensionistas_total_v1.

# BigQuery client bound to the Google Cloud project that holds the destination dataset.
# This object is the entry point for every BigQuery API call made below.
client = bigquery.Client(project='repositoriodedadosgpsp')

# Reference to the destination dataset 'perfil_remuneracao' in the client's project.
# It is built with the DatasetReference constructor because Client.dataset() is
# deprecated in google-cloud-bigquery; the resulting reference is the same.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Schema of the destination table: one SchemaField per column, giving its
# name, BigQuery data type and a human-readable description.
schema = [
    bigquery.SchemaField('ano', 'INTEGER', description='Ano de referência da observação'),
    bigquery.SchemaField('mes', 'INTEGER', description='Mês de referência da observação'),
    bigquery.SchemaField('tipo_pensao', 'STRING', description='Tipo de pensão'),
    bigquery.SchemaField('tipo_pensao_agrupado', 'STRING', description='Categorização das pensões'),
    bigquery.SchemaField('total', 'INTEGER', description='Quantidade total da observação'),
]

# Reference to the target table 'MILITARES_pensionistas_total_v1' inside that dataset.
table_ref = dataset_ref.table('MILITARES_pensionistas_total_v1')

# Load-job configuration carrying the explicit schema, so the column types and
# descriptions are the ones declared above.
# NOTE(review): no write_disposition is set, so the load job falls back to its
# default (WRITE_APPEND according to the BigQuery documentation). Re-running this
# cell would therefore append the same rows again. If the table is meant to be
# replaced on every run, set write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
# - confirm the intended behaviour before changing it.
job_config = bigquery.LoadJobConfig(schema=schema)

# Start the load job that sends `df` to the target table.
job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)

# Block until the load job finishes; an unsuccessful job raises here instead of
# failing silently.
job.result()