In [None]:
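# One-time environment setup (kept commented out; run manually when needed).
# The libraries imported below come from the `google-cloud-bigquery` and `pandas-gbq`
# packages; the `gcloud` command is part of the Google Cloud SDK, not a pip package.
# %pip install google-cloud-bigquery pandas-gbq
# !gcloud auth application-default login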

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

# NOTE(review): the cells below only use pandas, pandas_gbq and bigquery; the other
# imports may be leftovers from earlier versions — confirm before removing them.

# Tratamento

In [None]:
# Google Cloud project that hosts the source table (and is billed for the query)
project_id = "repositoriodedadosgpsp"

# Fetch the complete ILOSTAT table (every country, sector and year)
query = """
SELECT * FROM `repositoriodedadosgpsp.perfil_remuneracao.ILOSTAT_todos_paises_v3`
"""

# Run the query and load the result into a DataFrame
df = pandas_gbq.read_gbq(query, project_id=project_id)

# ILOSTAT country name -> Portuguese display name.
# This mapping is the single source of truth for which countries are analysed:
# the selection list below is derived from its keys, so the two cannot drift apart.
mapeamento_paises = {
    'South Africa': 'África do Sul',
    'Argentina': 'Argentina',
    'Bolivia (Plurinational State of)': 'Bolívia',
    'Brazil': 'Brasil',
    'Chile': 'Chile',
    'Colombia': 'Colômbia',
    'United States of America': 'Estados Unidos',
    'France': 'França',
    'Mexico': 'México',
    'Peru': 'Peru',
    'Uruguay': 'Uruguai',
}

# Countries selected for the analysis (ILOSTAT spelling)
paises_selecionados = list(mapeamento_paises)

# Step 1: restrict to public-sector rows up front, so every later step
# (including the merge) only ever sees public-sector data
df_publico = df[df['setor'] == 'Public']

# Step 2: most recent year available for each country
ano_max_por_pais = df_publico.groupby('pais')['ano'].max().reset_index()

# Step 3: keep only the selected countries
ano_max_selecionados = ano_max_por_pais[ano_max_por_pais['pais'].isin(paises_selecionados)]

# `isin` silently ignores names that do not match the data exactly, so report any
# selected country that ended up with no public-sector rows instead of dropping it quietly
paises_sem_dados = sorted(set(paises_selecionados) - set(ano_max_selecionados['pais']))
if paises_sem_dados:
    print(f"WARNING: no public-sector rows found for: {paises_sem_dados}")

# Step 4: inner-join back on (country, year) to recover the full rows that belong
# to each selected country's most recent year
df_final = pd.merge(
    df_publico,
    ano_max_selecionados,
    on=['pais', 'ano'],
    how='inner'  # Keep only matching rows
)

# Step 5: translate country names, keep the output columns and give them
# self-explanatory names
df_final = (
    df_final
    .assign(pais=lambda tabela: tabela['pais'].replace(mapeamento_paises))
    [['pais', 'fonte_pesquisa', 'ano', 'prop']]
    .rename(columns={
        'ano': 'ano_pesquisa',
        'prop': 'prop_vinculos_publicos'
    })
)

# Last expression of the cell, so Jupyter renders the result for a visual check
df_final

# Upload

In [None]:
# Destination table schema; the Portuguese descriptions are sent to BigQuery
# as part of the schema definition
# NOTE(review): 'ano_pesquisa' is declared FLOAT although it holds a year; INTEGER may be
# more appropriate — confirm with downstream consumers before changing the table schema.
schema = [
    bigquery.SchemaField('pais', 'STRING', description='Nome do país selecionado'),
    bigquery.SchemaField('fonte_pesquisa', 'STRING',
                       description='De qual pesquisa nacional foi extraído aquele dado.'),
    bigquery.SchemaField('ano_pesquisa', 'FLOAT',
                       description='Ano de coleta da informação'),
    bigquery.SchemaField('prop_vinculos_publicos', 'FLOAT',
                       description='Proporção de vínculos públicos daquele país')
]

# Initialize the BigQuery client for the same project the data was read from
client = bigquery.Client(project=project_id)

# Reference to the target dataset. Built with the DatasetReference constructor
# because `Client.dataset()` is deprecated in google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Reference to the target table, following the naming convention
# FONTE_algo_intuitivo_dado (ILOSTAT_proporcao_publicos_selecionados_v2)
table_ref = dataset_ref.table('ILOSTAT_proporcao_publicos_selecionados_v2')

# Configure the load job. The table is rebuilt from scratch on every run, so it is
# overwritten (WRITE_TRUNCATE); the default for load jobs is to append, which would
# duplicate every row each time this cell is re-executed.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Upload the DataFrame to BigQuery
job = client.load_table_from_dataframe(
    dataframe=df_final,
    destination=table_ref,
    job_config=job_config
)

# Block until the load job finishes; raises if the job failed
job.result()