In [6]:
# Install required Google Cloud packages (commented out as these are typically one-time setup commands)
!pip install gcloud
!gcloud auth application-default login

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

# Note: The actual imports remain exactly as in the original code

Collecting gcloud
  Downloading gcloud-0.18.3.tar.gz (454 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.4/454.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gcloud
  Building wheel for gcloud (setup.py) ... [?25l[?25hdone
  Created wheel for gcloud: filename=gcloud-0.18.3-py3-none-any.whl size=602927 sha256=780f4b1b515041506acd7a202d86f758235f7d85e1698d59958bd69f94929ee6
  Stored in directory: /root/.cache/pip/wheels/3c/e8/d1/cb82a63f69083ea485de71d14248b8d145f1af46a41578be9c
Successfully built gcloud
Installing collected packages: gcloud
Successfully installed gcloud-0.18.3
Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefau

# Tratamento

In [None]:
# Set Google Cloud project ID
project_id = "repositoriodedadosgpsp"

# SQL query to fetch all data from the ESTADIC_quantidade_vinculos_v1 table.
# The project is interpolated from project_id so the table's project and the
# billing project passed to read_gbq cannot drift apart.
query = f"""
SELECT * FROM `{project_id}.perfil_remuneracao.ESTADIC_quantidade_vinculos_v1`
"""

# Execute query and load results into DataFrame
df = pandas_gbq.read_gbq(query, project_id=project_id)

# Display first few rows of the data
print(df.head())

# Sum quantidade_vinculos per state (sigla_uf).
# The key is passed as a plain column name (not a set, which has no defined order)
# and the aggregation as the string "sum", which is the spelling pandas supports
# going forward (passing np.sum emits a FutureWarning on recent versions).
y = df.pivot_table(
    index='sigla_uf',
    values='quantidade_vinculos',
    aggfunc='sum',
)

# pivot_table already returns a DataFrame; turn the state index back into a
# column and rename the summed column for clarity.
total = y.reset_index().rename(columns={'quantidade_vinculos': 'total_estado'})

# Attach each state's total to every original row of that state (inner join on sigla_uf).
df1 = df.merge(total, on='sigla_uf')

# Example check for Espírito Santo state data.
# Printed explicitly: a bare expression in the middle of a cell displays nothing.
print(df1[df1['sigla_uf'] == "ES"])

# Special handling for Rondônia state: overwrite the computed total with 1.
# NOTE(review): the reason for this override is not visible in this notebook — confirm
# it is still intended, since it replaces the real sum for every RO row.
df1.loc[df1['sigla_uf'] == 'RO', 'total_estado'] = 1

# Verify Rondônia data after the override
print(df1[df1['sigla_uf'] == 'RO'])

# Update main DataFrame with merged data (the upload cell reads `df`)
df = df1

In [35]:
# Print the column names, dtypes and non-null counts of df as a sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ano                  135 non-null    Int64 
 1   tipo_vinculo         135 non-null    object
 2   sigla_uf             135 non-null    object
 3   quantidade_vinculos  135 non-null    Int64 
 4   total_estado         135 non-null    Int64 
dtypes: Int64(3), object(2)
memory usage: 5.8+ KB


# Upload

In [None]:
# Define the BigQuery table schema with Portuguese descriptions
schema = [
    bigquery.SchemaField('ano', 'INTEGER', description='Ano de referencia da informacao'),
    bigquery.SchemaField('tipo_vinculo', 'STRING', description='Tipo de vinculo.'),
    bigquery.SchemaField('sigla_uf', 'STRING', description='Sigla da Unidade da Federação.'),
    bigquery.SchemaField('quantidade_vinculos', 'INTEGER', description='Quantidade de vinculos'),
    bigquery.SchemaField('total_estado', 'INTEGER', description='Total de vínculos daquela UF')
]

# Initialize BigQuery client connection
client = bigquery.Client(project='repositoriodedadosgpsp')

# Create reference to target dataset.
# Client.dataset() is deprecated in google-cloud-bigquery; constructing the
# DatasetReference directly with the client's project yields the same reference.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Create reference to target table with standardized naming convention:
# FONTE_algo_intuitivo_dado (ESTADIC_quantidade_vinculos_mapa_v2)
table_ref = dataset_ref.table('ESTADIC_quantidade_vinculos_mapa_v2')

# Configure the load job with our schema definition.
# NOTE(review): no write_disposition is set, so BigQuery's load-job default applies
# (documented as WRITE_APPEND) — re-running this cell would then add the same rows
# again. Pass write_disposition="WRITE_TRUNCATE" if the table should be replaced on
# each run; confirm the intended behavior before changing it.
job_config = bigquery.LoadJobConfig(schema=schema)

# Execute the load job to upload DataFrame to BigQuery
job = client.load_table_from_dataframe(
    dataframe=df,
    destination=table_ref,
    job_config=job_config
)

# Block until the load job finishes; raises if the job failed
job.result()