# Dependências

In [None]:
# One-time environment setup, executed as notebook shell commands (`!` prefix).
# These lines are active (not commented out), so they run on every execution of
# this cell; comment them out once the environment is configured.
# NOTE(review): only the PyPI package `gcloud` is installed here, while the imports
# below use `google.cloud.bigquery` and `pandas_gbq` — confirm those packages are
# already available in the target environment.
!pip install gcloud
# Starts the gcloud login flow for Application Default Credentials.
!gcloud auth application-default login

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

# Note: The actual imports remain exactly as in the original code

# Tratamento

In [None]:
# Import the pandas library for data manipulation.
import pandas as pd

# Load the wide-format source table (semicolon-separated). The code below relies
# on an 'ano' (year) column; every other column is treated as a count whose name
# is expected to look like 'vinculos_<poder>_<esfera>'.
df = pd.read_csv('5680-085totalvinculospodernivelbrgrufdadosbrasil.csv', sep=';')

# Reshape wide -> long: 'ano' stays as the identifier and each remaining column
# becomes one row, with its name in 'variavel' and its value in
# 'quantidade_vinculos'.
id_vars = ['ano']
df_long = pd.melt(
    df,
    id_vars=id_vars,
    var_name='variavel',
    value_name='quantidade_vinculos',
)

# 'poderes' (branch of government): the text between 'vinculos_' and the next
# underscore, with the first letter upper-cased.
df_long['poderes'] = (
    df_long['variavel']
    .str.extract(r'vinculos_(.*?)_')[0]
    .str.capitalize()
)

# 'esfera' (level of government): the last underscore-separated token of the
# column name. The pattern requires at least two underscores, so a name without
# a middle segment yields NaN instead of a bogus label. Extracting the last token
# directly avoids producing intermediate values such as 'Executivo_federal' that
# would need a second clean-up pass.
df_long['esfera'] = (
    df_long['variavel']
    .str.extract(r'_.*_([^_]+)$')[0]
    .str.capitalize()
)

# Share of each row in the total number of links of its year. `transform('sum')`
# returns the yearly total aligned row-by-row with df_long.
total_por_ano = df_long.groupby('ano')['quantidade_vinculos'].transform('sum')
df_long['prop_ano'] = df_long['quantidade_vinculos'] / total_por_ano

# Keep only the output columns, ordered by year, sphere and power.
colunas_finais = ['ano', 'quantidade_vinculos', 'prop_ano', 'esfera', 'poderes']
df_final = (
    df_long[colunas_finais]
    .sort_values(['ano', 'esfera', 'poderes'])
    .reset_index(drop=True)
)

# Custom (non-alphabetical) ordering used for the final sort.
ordem_poderes = ['Executivo', 'Legislativo', 'Judiciário']
ordem_esfera = ['Federal', 'Estadual', 'Municipal']

# Convert both label columns to ordered categoricals so that sorting follows the
# lists above. The source column names carry no accents, so 'Judiciario' is
# mapped to its accented spelling first.
# Any label that is not in the category lists becomes NaN in this conversion.
df_final['poderes'] = pd.Categorical(
    df_final['poderes'].replace({'Judiciario': 'Judiciário'}),
    categories=ordem_poderes,
    ordered=True,
)
df_final['esfera'] = pd.Categorical(
    df_final['esfera'],
    categories=ordem_esfera,
    ordered=True,
)

# Final ordering: power, then sphere (both in the custom order), then year ascending.
df_ordenado = (
    df_final
    .sort_values(['poderes', 'esfera', 'ano'])
    .reset_index(drop=True)
)

# Display the first 15 rows of the sorted result.
df_ordenado.head(15)

# Upload

In [None]:
# Import the bigquery library from google.cloud
from google.cloud import bigquery

# Schema of the destination table: one SchemaField per column, giving its name,
# BigQuery type and a human-readable description.
schema = [
    bigquery.SchemaField('ano', 'INTEGER', description='Ano de referência da observação'),
    bigquery.SchemaField('quantidade_vinculos', 'INTEGER', description='Número total de vinculos observados'),
    bigquery.SchemaField('prop_ano', 'FLOAT', description='proporção de vínculo em relação ao total naquele ano'),
    bigquery.SchemaField('esfera', 'STRING', description='Nível da esfera do governo referente da observação'),
    bigquery.SchemaField('poderes', 'STRING', description='Poder abrangente ao nível de esfera referente a observação'),
]

## Uploading to datalake

# BigQuery client bound to the target Google Cloud project.
client = bigquery.Client(project='repositoriodedadosgpsp')

# Reference to the 'perfil_remuneracao' dataset in the client's project.
# Built with the DatasetReference constructor because Client.dataset() is
# deprecated in google-cloud-bigquery; the resulting reference is the same.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Reference to the destination table inside that dataset.
table_ref = dataset_ref.table('RAIS_poder_nivel_vinculos_v4')  # table name in the format SOURCE_something_intuitive_data

# Load-job configuration: only the explicit schema is set.
# NOTE(review): no write_disposition is given; if BigQuery's default for load
# jobs (append) applies, re-running this cell adds the same rows again — confirm
# this is intended.
job_config = bigquery.LoadJobConfig(schema=schema)

# Start loading the DataFrame 'df_ordenado' (built in the treatment cell) into the table.
job = client.load_table_from_dataframe(df_ordenado, table_ref, job_config=job_config)

# Block until the load job finishes.
job.result()