# Dependências

In [None]:
# Install the Google Cloud package and authenticate (one-time setup: comment these two lines out after the first successful run)
!pip install gcloud
!gcloud auth application-default login

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

# Note: The actual imports remain exactly as in the original code

Collecting gcloud
  Downloading gcloud-0.18.3.tar.gz (454 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/454.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m450.6/454.4 kB[0m [31m29.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.4/454.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gcloud
  Building wheel for gcloud (setup.py) ... [?25l[?25hdone
  Created wheel for gcloud: filename=gcloud-0.18.3-py3-none-any.whl size=602927 sha256=99308a741158f5b4bea73286760bf59a13b95fe4561de7898962f8bffcfffa3b
  Stored in directory: /root/.cache/pip/wheels/2a/62/75/3d74209bfebb8805823ae74afa28653aa1ea76d8b5a9d741ff
Successfully built gcloud
Installing collected packages: gcloud
Successfully installed gcloud-0.18.3
Go to the following link in your browser, and 

# Tratamento

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pandas_gbq

# Pull every column of the `CNES_profissionais_saude_ocupacao_publico` table from
# BigQuery, ordered by 'ano' descending, into the DataFrame 'df'.
# 'project_id' is the Google Cloud project the query is run under.
query = """
  SELECT * FROM `repositoriodedadosgpsp.Datalake.CNES_profissionais_saude_ocupacao_publico` order by ano desc
        """
df = pandas_gbq.read_gbq(query, project_id='repositoriodedadosgpsp')

# Unify every occupation whose label starts with 'Médicos' under the single label
# 'Médicos' (the original author's note: this merges the three types of doctors --
# clinic, surgery and therapeutic). All other rows keep their 'tipo_ocupacao'.
# na=False matters: without it a missing 'tipo_ocupacao' yields NaN in the mask,
# and np.where treats NaN as truthy, silently relabelling those rows as 'Médicos'.
is_medico = df['tipo_ocupacao'].str.startswith('Médicos', na=False)
df['tipo_2'] = np.where(is_medico, 'Médicos', df['tipo_ocupacao'])

# Occupation labels that are reported individually; everything else is pooled.
ocupacao_interesse = ['Técnicos e auxiliares de enfermagem',
 'Médicos',
 'Trabalhadores em serviços de promoção e apoio à saúde',
 'Enfermeiros e afins',
 'Cirurgiões-dentistas',
 'Fisioterapeutas',
 'Psicólogos e psicanalistas',
 'Farmacêuticos',
 'Técnicos de odontologia',
 'Tecnólogos e técnicos em métodos de diagnósticos e terapêutica',
 'Nutricionistas',
 'Assistentes sociais e economistas domésticos','Outros']

# 'tipo_3' keeps the labels listed above and replaces any other value
# (including missing ones) with 'Outros'.
df['tipo_3'] = np.where(df['tipo_2'].isin(ocupacao_interesse), df['tipo_2'], 'Outros')

# Sum 'quantidade_vinculos' for each ('ano', 'sigla_uf', 'tipo_3') combination.
# The string 'sum' is used instead of np.sum: recent pandas versions warn when a
# NumPy callable is passed as aggfunc and recommend the string alias.
x = df.pivot_table(index=['ano', 'sigla_uf', 'tipo_3'], values='quantidade_vinculos', aggfunc='sum')

# Turn the three index levels back into ordinary columns.
# (pivot_table already returns a DataFrame, so no extra conversion is needed.)
y = x.reset_index()

# Adjusting for BigQuery: the aggregated frame replaces 'df', and 'tipo_3' is
# renamed to 'tipo_ocupacao' to match the destination column name.
df = y.rename(columns={'tipo_3': 'tipo_ocupacao'})

# Keep only rows whose 'ano' is greater than 2012.
df = df[df['ano'] > 2012]

# Print a summary of the final frame: columns, dtypes, non-null counts, memory usage.
df.info()

In [None]:
# Print the summary of `df` (column names, dtypes, non-null counts, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4211 entries, 1755 to 5965
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ano                  4211 non-null   Int64 
 1   sigla_uf             4211 non-null   object
 2   tipo_ocupacao        4211 non-null   object
 3   quantidade_vinculos  4211 non-null   Int64 
dtypes: Int64(2), object(2)
memory usage: 172.7+ KB


# Upload

In [None]:
# BigQuery client library, used here to load the DataFrame 'df' into a table.
from google.cloud import bigquery

# Client bound to the Google Cloud project; every API call below goes through it.
client = bigquery.Client(project='repositoriodedadosgpsp')

# Reference to the 'perfil_remuneracao' dataset in the client's project.
# Built with DatasetReference directly because Client.dataset() is deprecated
# in google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Reference to the destination table inside that dataset.
table_ref = dataset_ref.table('CNES_total_profissionais_saude_v1')

# Destination schema: one SchemaField per column, giving its name, BigQuery
# type and a human-readable description.
schema = [
    bigquery.SchemaField('ano', 'INTEGER', description='Ano de referencia da informacao'),
    bigquery.SchemaField('sigla_uf', 'STRING', description='Sigla da Unidade da Federação.'),
    bigquery.SchemaField('tipo_ocupacao', 'STRING', description='Qual a ocupação daquele vínculo'),
    bigquery.SchemaField('quantidade_vinculos', 'INTEGER', description='Quantidade de vinculos'),
]

# Load-job configuration. The write disposition is spelled out so the behaviour
# is visible: WRITE_APPEND is what the load job already did implicitly.
# NOTE(review): appending means re-running this cell adds the same rows again.
# If the table is meant to be rebuilt on every run, switch this to
# bigquery.WriteDisposition.WRITE_TRUNCATE -- confirm with the table owner first.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)

# Start loading the DataFrame 'df' into the destination table.
job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)

# Block until the load job finishes; raises if the job failed.
job.result()