# Dependências

In [None]:
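# One-time setup: install the `gcloud` package from PyPI and sign in with Application Default Credentials (interactive browser login).
# NOTE(review): the `!gcloud auth ...` line calls the Cloud SDK command-line tool, which the pip package presumably does not provide - confirm the install is needed.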
!pip install gcloud
!gcloud auth application-default login

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

# NOTE(review): time, os, glob, openpyxl, csv and re are not referenced in the cells shown below - confirm before removing.

Collecting gcloud
  Downloading gcloud-0.18.3.tar.gz (454 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/454.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.4/454.4 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gcloud
  Building wheel for gcloud (setup.py) ... [?25l[?25hdone
  Created wheel for gcloud: filename=gcloud-0.18.3-py3-none-any.whl size=602927 sha256=1c209251ed1966c764f15d41c497f65cdc072627fe8f3141a9035e42cc27933a
  Stored in directory: /root/.cache/pip/wheels/2a/62/75/3d74209bfebb8805823ae74afa28653aa1ea76d8b5a9d741ff
Successfully built gcloud
Installing collected packages: gcloud
Successfully installed gcloud-0.18.3
Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6h

# Tratamento

In [None]:
# SQL sent to BigQuery: every column of the
# `CNES_profissionais_saude_ocupacao_publico` table, newest 'ano' first.
query = """
  SELECT * FROM `repositoriodedadosgpsp.Datalake.CNES_profissionais_saude_ocupacao_publico` order by ano desc
        """
# Run the query under the 'repositoriodedadosgpsp' project and keep the
# result as the working DataFrame.
df = pandas_gbq.read_gbq(query, project_id='repositoriodedadosgpsp')

# Flag the rows whose occupation label begins with 'Médicos'; the mask is
# reused by the two statements below.
is_medico = df['tipo_ocupacao'].str.startswith('Médicos')

# Inspection only: the distinct occupation labels that matched the prefix.
# The result is not stored and does not modify the DataFrame.
df.loc[is_medico, 'tipo_ocupacao'].unique()

# 'tipo_2' collapses every matching label into the single value 'Médicos'
# and leaves the remaining labels untouched. Per the original author's note,
# this unifies the three kinds of doctors (clinic, surgery and therapeutic).
df['tipo_2'] = np.where(is_medico, 'Médicos', df['tipo_ocupacao'])

# Keep only the unified doctor rows for years after 2017.
recent_medicos = (df['tipo_2'] == 'Médicos') & (df['ano'] > 2017)
df = df[recent_medicos]

## Population figures.
# Read the comma-separated file "Censo_previa_pop.csv" from the working
# directory and rename its columns in the same step; 'CodMun' becomes
# 'id_municipio', the key shared with 'df'.
pop = pd.read_csv("Censo_previa_pop.csv", sep=',').rename(
    columns={
        'CodMun': 'id_municipio',
        'Sigla UF': 'UF',
        'PopMun': 'POPULAÇÃO',
        'Mun': 'NOME DO MUNICÍPIO',
    }
)

# Make the municipality id numeric on the 'df' side before it is used as a
# merge key against 'pop'.
df['id_municipio'] = pd.to_numeric(df['id_municipio'])

# Total 'quantidade_vinculos' (number of employment links) for each
# combination of year, state, municipality and unified occupation.
# The aggregation is given as the string 'sum' rather than np.sum: recent
# pandas versions emit a FutureWarning for the NumPy callable and recommend
# the string to keep the current behaviour.
# pivot_table already returns a DataFrame indexed by the grouping keys, so
# reset_index() is applied directly to turn those keys back into columns.
group_keys = ['ano', 'sigla_uf', 'id_municipio', 'tipo_2']
df = df.pivot_table(
    index=group_keys,
    values='quantidade_vinculos',
    aggfunc='sum',
).reset_index()

# Attach the population columns to the aggregated counts by joining on
# 'id_municipio' (pandas' default inner join), then order the rows by the
# number of links, largest first.
pop_cols = ['UF', 'NOME DO MUNICÍPIO', 'id_municipio', 'POPULAÇÃO']
df1 = pd.merge(df, pop[pop_cols], on='id_municipio')
df1 = df1.sort_values('quantidade_vinculos', ascending=False)

# Doctors per 1,000 inhabitants: links divided by population, times 1000.
df1['taxa'] = df1['quantidade_vinculos'].div(df1['POPULAÇÃO']).mul(1000)

## Final column names and order.
# Pick the columns already in their output order (still under their current
# names) and rename them in one pass. 'UF' is not among the selected columns,
# so its entry in the mapping has no effect here.
final_names = {
    'tipo_2': 'tipo_ocupacao',
    'UF': 'sigla_uf',
    'NOME DO MUNICÍPIO': 'nome_municipio',
    'POPULAÇÃO': 'populacao_domiciliada',
}
source_order = [
    'ano',
    'sigla_uf',
    'NOME DO MUNICÍPIO',
    'id_municipio',
    'POPULAÇÃO',
    'tipo_2',
    'quantidade_vinculos',
    'taxa',
]
df = df1[source_order].rename(columns=final_names)

In [None]:
# Print a summary of the final DataFrame (columns, non-null counts, dtypes,
# memory usage) as a sanity check before the upload.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38981 entries, 38759 to 16600
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ano                    38981 non-null  Int64  
 1   sigla_uf               38981 non-null  object 
 2   nome_municipio         38981 non-null  object 
 3   id_municipio           38981 non-null  int64  
 4   populacao_domiciliada  38981 non-null  int64  
 5   tipo_ocupacao          38981 non-null  object 
 6   quantidade_vinculos    38981 non-null  Int64  
 7   taxa                   38981 non-null  Float64
dtypes: Float64(1), Int64(2), int64(2), object(3)
memory usage: 2.8+ MB


# Upload

In [None]:
# BigQuery client library (re-imported so this cell can run on its own).
from google.cloud import bigquery

# Client bound to the Google Cloud project that owns the destination table.
client = bigquery.Client(project='repositoriodedadosgpsp')

# Reference to the 'perfil_remuneracao' dataset in the client's project.
# Built with the DatasetReference constructor because Client.dataset() is
# deprecated in google-cloud-bigquery; the resulting reference is the same.
dataset_ref = bigquery.DatasetReference(client.project, 'perfil_remuneracao')

# Destination schema: one SchemaField (name, type, description) per column,
# in the same order as the DataFrame columns.
# NOTE(review): 'id_municipio' and 'populacao_domiciliada' are declared FLOAT
# although the DataFrame holds them as integers; left unchanged here because
# an existing table presumably already uses these types - confirm before
# switching them to INTEGER.
schema = [
    bigquery.SchemaField('ano', 'INTEGER', description='Ano de referencia da informacao'),
    bigquery.SchemaField('sigla_uf', 'STRING', description='Sigla da Unidade da Federação.'),
    bigquery.SchemaField('nome_municipio', 'STRING', description='Nome do município da observação'),
    bigquery.SchemaField('id_municipio', 'FLOAT', description='Identificador do município pelo IBGE'),
    bigquery.SchemaField('populacao_domiciliada', 'FLOAT', description='População domiciliada (prévia do Censo 2022)'),
    bigquery.SchemaField('tipo_ocupacao', 'STRING', description='Qual a ocupação daquele vínculo'),
    bigquery.SchemaField('quantidade_vinculos', 'INTEGER', description='Quantidade de vinculos'),
    # Type name written in upper case, matching the other fields.
    bigquery.SchemaField('taxa', 'FLOAT', description='Taxa de médicos por 1000 habitantes'),
]

# Target table inside the dataset; name follows SOURCE_intuitive_data_name.
table_ref = dataset_ref.table('CNES_medicos_mil_habitantes_v1')

# Load-job configuration carrying the explicit schema above.
# NOTE(review): no write_disposition is set, so re-running this cell
# presumably appends the same rows again - confirm whether WRITE_TRUNCATE is
# the intended behaviour.
job_config = bigquery.LoadJobConfig(schema=schema)

# Start the load of 'df' into the target table.
job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)

# Block until the load job finishes; a failed job raises here.
job.result()