# Dependências

In [None]:
# Install required Google Cloud packages (commented out as these are typically one-time setup commands)
# !pip install gcloud
# !gcloud auth application-default login

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

# Note: The actual imports remain exactly as in the original code

# Tratamento

In [None]:
# Build `df1`: one row per (country, source, year, sector) from the ILOSTAT
# extract, with the country-year total and each row's share of that total.

# Load the ILOSTAT extract (path is relative to the working directory).
df = pd.read_csv("ilostat_todos_paises4.csv")

# Keep only the columns used below and give them readable names.
df = df[['ref_area.label', 'source.label', 'time', 'classif2.label', 'obs_value']]
df = df.rename(columns={
    'ref_area.label': 'Reference area',
    'source.label': 'Source',
    'time': 'Time',
    'classif2.label': 'Setor',
    'obs_value': 'Total'
})

# Strip the literal prefix from the sector labels. regex=False pins literal
# matching, because the default of `regex` differs between pandas versions.
df['Setor'] = df['Setor'].str.replace('Institutional sector: ', '', regex=False)

# Sum `Total` over every row of each (country, year) pair.
# The string 'sum' is used instead of np.sum: passing the NumPy callable emits
# a FutureWarning on recent pandas and is slated to change behaviour.
x = df.pivot_table(
    index=['Reference area', 'Time'],
    values='Total',
    aggfunc='sum'
)
y = pd.DataFrame(x)
total = y.reset_index()

# Rename the per-row value so it does not clash with the aggregated `Total`
# column brought in by the merge below.
df = df.rename(columns={"Total": "quantidade_vinculos"})

# Attach the country-year total to every row (default inner join on the keys).
df1 = df.merge(total, on=['Reference area', 'Time'])

# Share of the country-year total represented by each row.
df1['prop'] = df1['quantidade_vinculos'] / df1['Total']

# Blank out shares of exactly 1, i.e. rows that account for the whole
# country-year total on their own.
# NOTE(review): the original comment called these "total rows" -- confirm that
# discarding the share for single-sector country-years is intended.
df1['prop'] = np.where(df1['prop'] == 1, np.nan, df1['prop'])

# Subset for a fixed list of countries.
# NOTE(review): `selecionados` is not used again in the visible cells and still
# carries the pre-rename column names.
paises = ["Japan", "United Kingdom", "Canada", "Australia", "Germany"]
selecionados = df1[df1['Reference area'].isin(paises)]

# Rename columns to their final names.
df1 = df1.rename(columns={
    'Time': 'ano',
    'Reference area': 'pais',
    'Source': 'fonte_pesquisa',
    'Total': 'total_ano',
    'Setor': 'setor'
})

# Show the rows left without a proportion (blanked above or undefined division).
linhas_sem_prop = df1[df1['prop'].isna()]
print(linhas_sem_prop)

In [None]:
# Earliest year present in the data. Series.min() is vectorised and skips NaN,
# unlike the Python builtin min(), which iterates element by element.
df1['ano'].min()

2010

In [None]:
# Print df1's column names, dtypes and non-null counts.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105 entries, 0 to 2104
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pais                 2105 non-null   object 
 1   fonte_pesquisa       2105 non-null   object 
 2   ano                  2105 non-null   int64  
 3   setor                2105 non-null   object 
 4   quantidade_vinculos  2105 non-null   float64
 5   total_ano            2105 non-null   float64
 6   prop                 2104 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 115.2+ KB


In [None]:
# List the distinct values present in the 'setor' column.
df1['setor'].unique()

array(['Public', 'Private'], dtype=object)

# Upload

In [None]:
# Upload `df1` to BigQuery with an explicit schema.

# Table schema: one field per df1 column. The descriptions are stored in
# BigQuery as written (Portuguese), so they must not be altered here.
schema = [
    bigquery.SchemaField('pais', 'STRING', description='Nome do país selecionado'),
    bigquery.SchemaField('fonte_pesquisa', 'STRING',
                         description='De qual pesquisa nacional foi extraído aquele dado.'),
    bigquery.SchemaField('ano', 'INTEGER',
                         description='Ano de coleta da informação'),
    bigquery.SchemaField('setor', 'STRING',
                         description='se o setor é público ou privado'),
    bigquery.SchemaField('quantidade_vinculos', 'FLOAT',
                         description='total de pessoas que trabalham naquele setor'),
    bigquery.SchemaField('total_ano', 'FLOAT',
                         description='total de pessoas que trabalham naquele ano'),
    bigquery.SchemaField('prop', 'FLOAT',
                         description='Proporção de vínculos naquele setor'),
]

# BigQuery client bound to the target project.
client = bigquery.Client(project='repositoriodedadosgpsp')

# Reference to the target dataset. Built with the DatasetReference constructor
# because Client.dataset() is deprecated; this yields the same reference.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Reference to the target table. The table name follows the pattern
# FONTE_algo_intuitivo_dado (source, then an intuitive description of the data).
table_ref = dataset_ref.table('ILOSTAT_todos_paises_v3')

# Load-job configuration carrying the schema defined above.
# NOTE(review): no write_disposition is set, so the library/service default
# applies. If that default appends, re-running this cell duplicates the rows --
# confirm, and consider an explicit WRITE_TRUNCATE.
job_config = bigquery.LoadJobConfig(schema=schema)

# Start the load job that sends the DataFrame to the target table.
job = client.load_table_from_dataframe(
    dataframe=df1,
    destination=table_ref,
    job_config=job_config
)

# Block until the job finishes; a failed job raises here.
job.result()