# Dependência

In [None]:
# Install Google Cloud SDK components (typically run once per environment)
# Note: These are shell commands, not Python code, hence the '!' prefix in Jupyter notebooks
!pip install gcloud  # Installs the Google Cloud CLI tools
!gcloud auth application-default login  # Sets up application default credentials for GCP access

# Import required Python packages for data processing and Google Cloud operations
import pandas as pd         # Primary data analysis library (DataFrames, Series)
import numpy as np          # Numerical computing (arrays, math operations)
import time                 # Time measurement and delays
import os                   # Operating system interactions (files, paths)
import pandas_gbq           # Pandas-BigQuery integration (direct DataFrame transfers)
from google.cloud import bigquery  # Official BigQuery client library (detailed operations)
import glob                 # File path pattern matching (finding files by patterns)
import openpyxl             # Excel file reading/writing (.xlsx format support)
import csv                  # CSV file reading/writing
import re                   # Regular expressions (text pattern matching)

# Tratamento

In [None]:
# --- Configuration ---------------------------------------------------------------------------
# Workbook and sheet of the MUNIC 2023 survey (Human Resources sheet).
SOURCE_WORKBOOK = "Base_MUNIC_2023(1).xlsx"
SOURCE_SHEET = 'Recursos humanos'

# Reference year written to the `ano` column of the final table.
REFERENCE_YEAR = 2023

# Identifier columns in the sheet and the names they take in the output.
ID_COLUMNS = {'Sigla UF': 'sigla_uf', 'CodMun': 'id_municipio'}

# Employment-type labels, listed in the same order as the question codes of each group below.
TIPOS_VINCULO = [
    'Estatutários',
    'Celetistas',
    'Somente comissionados',
    'Estagiários',
    'Sem vínculo permanente',
]

# Question codes of the two question groups (MREH01* and MREH03*); position i maps to TIPOS_VINCULO[i].
QUESTION_GROUPS = [
    ['MREH0111', 'MREH0112', 'MREH0113', 'MREH0114', 'MREH0115'],
    ['MREH0311', 'MREH0312', 'MREH0313', 'MREH0314', 'MREH0315'],
]


def _melt_question_group(frame, question_codes):
    """Return one question group of ``frame`` in long format.

    Parameters
    ----------
    frame : pandas.DataFrame
        Sheet data containing the identifier columns in ``ID_COLUMNS`` and every
        column named in ``question_codes``.
    question_codes : list of str
        Question columns to keep, ordered like ``TIPOS_VINCULO``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sigla_uf``, ``id_municipio``, ``tipo_vinculo`` and
        ``quantidade_vinculos``; one row per (input row, employment type).

    Raises
    ------
    KeyError
        If any expected column is missing from ``frame``.
    """
    renames = {**ID_COLUMNS, **dict(zip(question_codes, TIPOS_VINCULO))}
    selected = frame[[*ID_COLUMNS, *question_codes]].rename(columns=renames)
    return selected.melt(
        id_vars=list(ID_COLUMNS.values()),
        value_vars=TIPOS_VINCULO,
        var_name='tipo_vinculo',
        value_name='quantidade_vinculos',
    )


# --- Load ------------------------------------------------------------------------------------
munic_raw = pd.read_excel(SOURCE_WORKBOOK, sheet_name=SOURCE_SHEET)

# --- Transform -------------------------------------------------------------------------------
# One long-format frame per question group (same shape and column names for both).
df1, df2 = (_melt_question_group(munic_raw, codes) for codes in QUESTION_GROUPS)

df = (
    pd.concat([df1, df2])
    # Cells that cannot be parsed as numbers become NaN and are then counted as 0.
    .assign(
        quantidade_vinculos=lambda d: pd.to_numeric(d['quantidade_vinculos'], errors='coerce').fillna(0)
    )
    # Add up both question groups per UF, employment type and municipality.
    .groupby(['sigla_uf', 'tipo_vinculo', 'id_municipio'], as_index=False)['quantidade_vinculos']
    .sum()
    .assign(ano=REFERENCE_YEAR)
    # Municipality codes are identifiers, not quantities: store them as strings.
    .astype({'id_municipio': str})
    .loc[:, ['ano', 'tipo_vinculo', 'sigla_uf', 'id_municipio', 'quantidade_vinculos']]
)

In [None]:
# Sanity check before upload: print the column dtypes, non-null counts and memory usage of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27850 entries, 0 to 27849
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ano                  27850 non-null  int64  
 1   tipo_vinculo         27850 non-null  object 
 2   sigla_uf             27850 non-null  object 
 3   id_municipio         27850 non-null  object 
 4   quantidade_vinculos  27850 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.1+ MB


# Upload

In [None]:
# Define the BigQuery table schema with field types and descriptions
schema = [
    bigquery.SchemaField('ano', 'INTEGER', description='Reference year of the information'),
    bigquery.SchemaField('tipo_vinculo', 'STRING', description='Type of employment contract'),
    bigquery.SchemaField('sigla_uf', 'STRING', description='State abbreviation code'),
    bigquery.SchemaField('id_municipio', 'STRING', description='Municipality identifier'),
    bigquery.SchemaField('quantidade_vinculos', 'INTEGER', description='Number of employment contracts')
]

# Initialize BigQuery client connection
client = bigquery.Client(project='repositoriodedadosgpsp')

# Reference the target dataset in the client's project.
# Built with the DatasetReference constructor because Client.dataset() is deprecated.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Create reference to target table with standardized naming convention (SOURCE_descriptive_name_version)
table_ref = dataset_ref.table('MUNIC_quantidade_vinculos_v1')

# Configure the load job with our schema definition.
# WRITE_APPEND is the load-job default and is spelled out here on purpose:
# every re-run of this cell appends the same rows again, so run it once per dataset version.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)

# The schema declares quantidade_vinculos as INTEGER, so send it as int64 rather than
# relying on an implicit float -> integer conversion during serialization.
# Refuse to upload if a value is not a whole number instead of silently truncating it.
if not df['quantidade_vinculos'].mod(1).eq(0).all():
    raise ValueError("quantidade_vinculos must contain only whole numbers to be loaded as INTEGER")
upload_df = df.astype({'quantidade_vinculos': 'int64'})  # new frame; `df` itself is left untouched

# Execute the load job to transfer DataFrame to BigQuery
job = client.load_table_from_dataframe(
    dataframe=upload_df,
    destination=table_ref,
    job_config=job_config
)

# Wait for the job to complete (raises if the load job failed)
job.result()