In [None]:
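# One-time environment setup (left commented out so "Run All" does not repeat it).
# NOTE(review): the `gcloud` command used for authentication is the Google Cloud CLI, which
# ships with the Google Cloud SDK; the PyPI package named `gcloud` appears to be an old,
# deprecated client library and does not install that command — confirm before relying on it.
# !pip install gcloud
# !gcloud auth application-default login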

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

# NOTE(review): only pandas and bigquery are referenced by the cells shown below; the other
# imports look unused in this notebook — confirm before pruning them.

# Tratamento

In [None]:
# Load the ILOSTAT extract and reshape it into the working frame `df`:
# six columns, renamed to the Portuguese names used by the rest of the notebook.
# pandas is already imported in the first cell; the import is repeated here so
# this cell also runs on its own.
import pandas as pd

# Source column -> working name. The key order is also the column order of `df`.
colunas_renomeadas = {
    'ref_area.label': 'pais',
    'source.label': 'fonte_pesquisa',
    'time': 'ano_pesquisa',
    'sex.label': 'genero',
    'classif2.label': 'setor',
    'obs_value': 'quantidade',
}

df = (
    pd.read_csv("ilostat_todos_paises_genero.csv")
    # Keep only the columns of interest (raises KeyError if one is missing from the file).
    .loc[:, list(colunas_renomeadas)]
    .rename(columns=colunas_renomeadas)
    .assign(
        # Strip the literal label prefix. regex=False makes the match a plain
        # substring replacement regardless of the pandas version's default.
        setor=lambda tabela: tabela['setor'].str.replace(
            'Institutional sector: ', '', regex=False
        )
    )
)
# `quantidade` is kept exactly as read from the file at this point.

# Build `df_final`: for each selected country, the Private/Public rows of the most
# recent year that actually has such rows with a value.

# Countries kept in the final dataset, spelled as they appear in the `pais` column.
paises_selecionados = [
    'South Africa',
    'Argentina',
    'Bolivia (Plurinational State of)',
    'Brazil',
    'Chile',
    'Colombia',
    'United States of America',
    'France',
    'Mexico',
    'Peru',
    'Uruguay',
]

# Sectors published in the final table.
setores_selecionados = ['Private', 'Public']

# Step 1: keep only rows that can end up in the output — one of the wanted sectors
# and a non-missing value. Doing this BEFORE looking for the latest year matters:
# otherwise a country whose newest year has no Private/Public breakdown (or only
# missing values) would match that year in the merge, lose every row in the sector
# filter, and silently vanish from the result.
linhas_validas = df[
    df['setor'].isin(setores_selecionados) & df['quantidade'].notna()
]

# Step 2: most recent year per country among the usable rows.
ano_max_por_pais = linhas_validas.groupby('pais')['ano_pesquisa'].max().reset_index()

# Step 3: restrict the (pais, ano_pesquisa) pairs to the selected countries.
ano_max_selecionados = ano_max_por_pais[ano_max_por_pais['pais'].isin(paises_selecionados)]

# Step 4: the inner join keeps only rows whose (pais, ano_pesquisa) pair is the
# latest usable year of a selected country; the index is made sequential.
df_final = pd.merge(
    linhas_validas,
    ano_max_selecionados,
    on=['pais', 'ano_pesquisa'],
    how='inner'
).reset_index(drop=True)

# Make a missing country visible instead of letting it drop out unnoticed
# (e.g. a name spelled differently in the source file).
paises_sem_dados = sorted(set(paises_selecionados) - set(df_final['pais']))
if paises_sem_dados:
    print(f"Aviso: sem dados de setor público/privado para: {paises_sem_dados}")

# Translate the labels of `df_final` to Portuguese and convert `quantidade`
# into an integer count.

# Country name as found in `pais` -> Portuguese name.
mapeamento_paises = {
    'Argentina': 'Argentina',
    'Bolivia (Plurinational State of)': 'Bolívia',
    'Brazil': 'Brasil',
    'Chile': 'Chile',
    'Colombia': 'Colômbia',
    'France': 'França',
    'Mexico': 'México',
    'Peru': 'Peru',
    'Uruguay': 'Uruguai',
    'United States of America': 'Estados Unidos',
    'South Africa': 'África do Sul'
}

# Label translations for the two categorical columns.
traducao_genero = {
    'Female': 'Mulher',
    'Male': 'Homem'
}
traducao_setor = {
    'Public': 'Público',
    'Private': 'Privado'
}

# `.replace` leaves any value that is not a key of the mapping untouched.
df_final = df_final.assign(
    pais=df_final['pais'].replace(mapeamento_paises),
    genero=df_final['genero'].replace(traducao_genero),
    setor=df_final['setor'].replace(traducao_setor),
    # Scale by 1000 (presumably the source reports thousands — confirm against the
    # dataset documentation), then ROUND before casting: a bare integer cast
    # truncates, so a float product such as 1.005 * 1000 == 1004.9999999999999
    # would be stored as 1004 instead of 1005. int64 is spelled out so the width
    # does not depend on the platform's default integer.
    quantidade=(df_final['quantidade'] * 1000).round().astype('int64'),
)

In [None]:
# Sanity check before the upload: print the index range, column names,
# non-null counts, dtypes and memory usage of df_final.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   pais            44 non-null     object
 1   fonte_pesquisa  44 non-null     object
 2   ano_pesquisa    44 non-null     int64 
 3   genero          44 non-null     object
 4   setor           44 non-null     object
 5   quantidade      44 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 2.2+ KB


# Upload

In [None]:
# Upload `df_final` to BigQuery.

# Table schema: column name, BigQuery type and the description attached to each column.
schema = [
    bigquery.SchemaField('pais', 'STRING', description='Nome do país selecionado'),
    bigquery.SchemaField('fonte_pesquisa', 'STRING', description='De qual pesquisa nacional foi extraído aquele dado.'),
    bigquery.SchemaField('genero', 'STRING', description='Gênero da pessoa'),
    bigquery.SchemaField('quantidade', 'INTEGER', description='total de pessoas que trabalham'),
    bigquery.SchemaField('setor', 'STRING', description='se o setor é público ou privado'),
    bigquery.SchemaField('ano_pesquisa', 'INTEGER', description='Ano de coleta da informação'),
]

## Upload to the data lake

# Client bound to the destination Google Cloud project.
client = bigquery.Client(project='repositoriodedadosgpsp')

# Reference to the destination dataset. Built with DatasetReference because
# Client.dataset() is deprecated; client.project is the project the deprecated
# helper would have used as well.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Reference to the destination table inside that dataset.
table_ref = dataset_ref.table('ILOSTAT_paises_selecionados_genero_v4')

# Load configuration. The schema supplies the column types and descriptions.
# The write disposition is spelled out instead of relying on the load-job default.
# NOTE(review): with WRITE_APPEND every re-run of this cell adds the same rows to
# the table again. If the table is meant to hold a single snapshot, switch to
# bigquery.WriteDisposition.WRITE_TRUNCATE — confirm with the table owners first,
# since that replaces the existing contents.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)

# Start the load job that sends df_final to the destination table.
job = client.load_table_from_dataframe(df_final, table_ref, job_config=job_config)

# Block until the job finishes; raises if the load failed.
job.result()