# Dependências

In [None]:
# One-time environment setup: install the Google Cloud package and authenticate.
# These commands are NOT commented out, so they run every time this cell is
# executed; comment them out after the first successful setup if desired.
# NOTE(review): the `gcloud` CLI invoked below normally ships with the Google
# Cloud SDK rather than with the PyPI package installed here — confirm that
# this install step is the intended one.
!pip install gcloud
# Opens the interactive login flow that stores Application Default Credentials.
!gcloud auth application-default login

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

# Note: The actual imports remain exactly as in the original code

# Tratamento

In [None]:
# Google Cloud project ID handed to the BigQuery read below.
project_id = "repositoriodedadosgpsp"

# SQL that pulls every column and row of the RAIS public-sector remuneration table.
query = (
    "\n"
    "SELECT * FROM `repositoriodedadosgpsp.Datalake.RAIS_remuneracao_vinculos_publicos_v4`\n"
)

# Run the query through pandas-gbq; the result set comes back as a DataFrame.
df = pandas_gbq.read_gbq(query, project_id=project_id)

# Quick visual check: show the first five rows.
print(df.head(5))

# 1. Load the original IPCA (inflation index) data from a CSV file.
df_ipca = pd.read_csv('it-ipca-formatado.csv')

# 2. Pull the four-digit year out of every column header. A raw string is
#    used for the pattern so that `\d` is passed to the regex engine as-is
#    instead of being treated as an (invalid) string escape sequence.
#    Headers without a four-digit run yield NaN in this list.
anos = df_ipca.columns.str.extract(r'(\d{4})')[0].tolist()
# The IPCA values are taken from the first data row, one value per column,
# so `anos` and `valores` are aligned position by position.
valores = df_ipca.iloc[0].tolist()

# 3. Reshape from wide (one column per year) to long (one row per year).
df_ipca_transformado = pd.DataFrame({
    'Ano': anos,
    'IPCA_acumulado': valores,
})

# 4. Define a function to clean and convert IPCA values.
def converter_valor(valor):
    """Convert a raw IPCA cell into a float, or None when it is a placeholder.

    Args:
        valor: The raw cell value. Strings are parsed as numbers written in
            Brazilian notation (comma as decimal separator, optional dots as
            thousands separators). Non-string values are passed through.

    Returns:
        A float for numeric strings, None for the '...' placeholder or a
        blank string, and the unchanged input for non-string values.

    Raises:
        ValueError: If the string is neither a placeholder nor a number.
    """
    # Values already parsed by pandas (floats, ints, NaN) need no cleaning.
    if not isinstance(valor, str):
        return valor

    # Ignore surrounding whitespace so ' ... ' and ' 1,5 ' are handled too.
    texto = valor.strip()
    if texto in ('', '...'):
        return None

    # When a decimal comma is present, any dots are thousands separators
    # ('7.100,50') and must be dropped before swapping the comma for a dot.
    # Without a comma the string is left alone, so '6120.04' keeps its
    # decimal point.
    if ',' in texto:
        texto = texto.replace('.', '').replace(',', '.')
    return float(texto)

# 5-6. Clean the IPCA table in one pipeline:
#   - parse every raw 'IPCA_acumulado' cell with converter_valor,
#   - discard rows that have a missing value in any column,
#   - cast the 'Ano' column to integers.
df_ipca_transformado = (
    df_ipca_transformado
    .assign(IPCA_acumulado=lambda tabela: tabela['IPCA_acumulado'].apply(converter_valor))
    .dropna()
    .astype({'Ano': int})
)

# 7. Show the cleaned table for a visual check.
print(df_ipca_transformado)

# 8. Persist the cleaned table as CSV; the DataFrame index is not written.
df_ipca_transformado.to_csv('ipca_simplificado.csv', index=False)

# Reference value of the accumulated IPCA index that every year is rescaled to.
# NOTE(review): presumably the accumulated index for December 2022, as the
# upload schema describes the correction factor as based on that month — confirm.
IPCA_REFERENCIA = 7100.5

# Align the join key: the IPCA table uses 'Ano', the main DataFrame uses 'ano'.
df_ipca_transformado = df_ipca_transformado.rename(columns={'Ano': 'ano'})
# Make sure both sides of the join carry the year as an integer, otherwise
# the keys would not match.
df_ipca_transformado['ano'] = df_ipca_transformado['ano'].astype(int)
df['ano'] = df['ano'].astype(int)

# Left join keeps every record of the main DataFrame. `validate` makes pandas
# raise a MergeError if the IPCA table holds more than one row for a year,
# which would otherwise silently duplicate remuneration rows.
df_completo = pd.merge(
    df,
    df_ipca_transformado,
    on='ano',
    how='left',
    validate='many_to_one',
)

# Correction factor: reference index divided by the index of the row's year.
df_completo['fator_correcao'] = IPCA_REFERENCIA / df_completo['IPCA_acumulado']

# Remuneration expressed at the reference price level.
df_completo['media_remuneracao_ajustada'] = (
    df_completo['media_remuneracao'] * df_completo['fator_correcao']
)

# Keep only the columns that are uploaded, in their final order.
df_completo = df_completo[[
    'ano',
    'variavel',
    'categoria',
    'media_remuneracao',
    'fator_correcao',
    'media_remuneracao_ajustada',
]]

In [None]:
# Print a summary of the final DataFrame (columns, non-null counts, dtypes and
# memory usage) as a sanity check on the merged result.
df_completo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519 entries, 0 to 518
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ano                         519 non-null    int64  
 1   variavel                    519 non-null    object 
 2   categoria                   519 non-null    object 
 3   media_remuneracao           519 non-null    float64
 4   fator_correcao              519 non-null    float64
 5   media_remuneracao_ajustada  519 non-null    float64
dtypes: float64(3), int64(1), object(2)
memory usage: 24.5+ KB


# Upload

In [None]:
# Define the schema for the BigQuery table, specifying column names, data types, and descriptions.
schema = [
    bigquery.SchemaField('ano', 'INTEGER', description='Ano de referência'),
    bigquery.SchemaField('variavel', 'STRING', description='De qual variável aquela remuneração se refere'),
    bigquery.SchemaField('categoria', 'STRING', description='Categoria dentro daquela variável'),
    bigquery.SchemaField('media_remuneracao', 'FLOAT', description='Valor nominal da média de remuneração daquele ano'),
    bigquery.SchemaField('fator_correcao', 'FLOAT', description='índice para correção monetária baseando-se no IPCA para dezembro de 2022'),
    bigquery.SchemaField('media_remuneracao_ajustada', 'FLOAT', description='Valor nominal da média de remuneração daquele ano ajustado para dezembro de 2022'),
]

## Uploading to the datalake
# Initialize the BigQuery client with the specified project ID.
client = bigquery.Client(project='repositoriodedadosgpsp')

# Reference to the 'perfil_remuneracao' dataset in the client's project.
# Built with DatasetReference because Client.dataset() is deprecated in
# google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Reference to the target table. Table name follows the pattern SOURCE_intuitive_name_data.
# NOTE(review): the data is read from a table suffixed `_v4` but written to
# one suffixed `_v3` — confirm the target name is intended.
table_ref = dataset_ref.table('RAIS_remuneracao_vinculos_publicos_v3')

# Configure the load job, applying the predefined schema.
# NOTE(review): no write_disposition is set, so BigQuery's default for load
# jobs applies (append, per the BigQuery documentation); re-running this cell
# may therefore duplicate rows — confirm this is intended.
job_config = bigquery.LoadJobConfig(schema=schema)

# Start the job to load the DataFrame into the specified BigQuery table.
job = client.load_table_from_dataframe(df_completo, table_ref, job_config=job_config)
# Block until the load job finishes; raises if the job failed.
job.result()