# Dependências

In [None]:
# Install required Google Cloud packages (commented out as these are typically one-time setup commands)
# !pip install gcloud
# !gcloud auth application-default login

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

# Note: time, os, glob, openpyxl, csv and re are imported above but are not referenced by any cell shown in this notebook.

# Tratamento

In [None]:
# Google Cloud project that is billed for the query.
project_id = "repositoriodedadosgpsp"

# SQL query fetching every row of the MUNIC_quantidade_vinculos_v1 table.
query = """
SELECT * FROM `repositoriodedadosgpsp.perfil_remuneracao.MUNIC_quantidade_vinculos_v1`
"""

# Run the query and load the result into a DataFrame.
df = pandas_gbq.read_gbq(query, project_id=project_id)

# Sum of quantidade_vinculos per id_municipio, as a two-column frame
# (id_municipio, total_municipio).
# - The key is passed as a plain string: a set literal is unordered and is not
#   one of the key types documented for pivot_table.
# - The aggregation is passed as the string 'sum': handing np.sum to pandas
#   aggregations emits a FutureWarning on pandas >= 2.1.
# - pivot_table already returns a DataFrame, so no pd.DataFrame(...) wrapper
#   is needed before reset_index().
total = (
    df.pivot_table(
        index='id_municipio',
        values='quantidade_vinculos',
        aggfunc='sum',
    )
    .reset_index()
    .rename(columns={'quantidade_vinculos': 'total_municipio'})
)

# Attach the municipality total to every original row (default inner join on
# the shared key). `validate` makes the merge raise if `total` ever contains
# duplicate id_municipio values, which would silently multiply rows.
df1 = df.merge(total, on='id_municipio', validate='many_to_one')

# The upload cell reads `df`, so point it at the enriched frame.
df = df1

# Spot check for a single municipality. Kept as the last expression of the
# cell so the notebook actually renders it (mid-cell it was evaluated and
# discarded).
# NOTE(review): the original comment labelled this code as Vitória-ES, but
# IBGE code 3200102 appears to be Afonso Cláudio-ES (Vitória is 3205309) —
# confirm which municipality was intended.
df1[df1['id_municipio'] == "3200102"]

In [None]:
# Summarize the merged frame: column names, non-null counts, dtypes and memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27850 entries, 0 to 27849
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ano                  27850 non-null  Int64 
 1   tipo_vinculo         27850 non-null  object
 2   sigla_uf             27850 non-null  object
 3   id_municipio         27850 non-null  object
 4   quantidade_vinculos  27850 non-null  Int64 
 5   total_municipio      27850 non-null  Int64 
dtypes: Int64(3), object(3)
memory usage: 1.4+ MB


# Upload

In [None]:
# Schema of the destination BigQuery table. Field descriptions are stored in
# BigQuery as written (Portuguese).
schema = [
    bigquery.SchemaField('ano', 'INTEGER', description='Ano de referencia da informacao'),
    bigquery.SchemaField('id_municipio', 'STRING', description='Identificador de município'),
    bigquery.SchemaField('sigla_uf', 'STRING', description='Sigla da Unidade da Federação.'),
    bigquery.SchemaField('tipo_vinculo', 'STRING', description='Tipo de vinculo.'),
    bigquery.SchemaField('quantidade_vinculos', 'INTEGER', description='Quantidade de vinculos'),
    bigquery.SchemaField('total_municipio', 'INTEGER', description='Total de vínculos daquele municipio')
]

# BigQuery client bound to the target project.
client = bigquery.Client(project='repositoriodedadosgpsp')

# Reference to the target dataset. Built with DatasetReference directly
# because Client.dataset() is deprecated in google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Reference to the target table, following the naming convention
# FONTE_algo_intuitivo_dado (here: MUNIC_quantidade_vinculos_mapa_v1).
table_ref = dataset_ref.table('MUNIC_quantidade_vinculos_mapa_v1')

# Load-job configuration carrying the explicit schema.
# NOTE(review): no write_disposition is set, so the library default applies
# (documented as append for load jobs — confirm). Re-running this cell would
# then add the same rows again; pass write_disposition="WRITE_TRUNCATE" if the
# table is meant to be replaced on every run.
job_config = bigquery.LoadJobConfig(schema=schema)

# Upload the DataFrame produced by the transformation cell.
job = client.load_table_from_dataframe(
    dataframe=df,
    destination=table_ref,
    job_config=job_config,
)

# Block until the load job finishes; raises if the job failed.
job.result()