# Dependências

In [None]:
#Prepare credentials to upload table after treatment
!pip install gcloud
!gcloud auth application-default login

# Basic dependencies
import pandas as pd
import numpy as np
import time
import os
import pandas_gbq
from google.cloud import bigquery
import glob
import openpyxl
import csv
import re

Collecting gcloud
  Downloading gcloud-0.18.3.tar.gz (454 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/454.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/454.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.4/454.4 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gcloud
  Building wheel for gcloud (setup.py) ... [?25l[?25hdone
  Created wheel for gcloud: filename=gcloud-0.18.3-py3-none-any.whl size=602927 sha256=2cb9ae7dcb2a200bf0a7ec9c40838bb59e5652d4abb1cbf3b5eb544afe25fddb
  Stored in directory: /root/.cache/pip/wheels/3c/e8/d1/cb82a63f69083ea485de71d14248b8d145f1af46a41578be9c
Successfully built gcloud
Installing collected packages: gcloud
Successfully installed gcloud-0.18.3
Go to the following link in your brow

# Tratamento

In [None]:
# Load the records from the XML file into a pandas DataFrame.
df = pd.read_xml('acoes_afirmativas.xml')

# Convert the camelCase column names to snake_case so that every column
# follows the same naming convention. `rename` returns a new DataFrame, which
# is reassigned here instead of mutating in place (keeps the cell re-runnable).
df = df.rename(columns={
    "tipoCota": "tipo_cota",
    "pubAlvo": "pub_alvo",
    "comissVer": "comiss_ver"
})

# Drop the 'Unnamed: 0' COLUMN (not a row) — presumably a leftover index
# column from an earlier export; TODO confirm. `errors='ignore'` makes the
# step a no-op when the column is absent instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Save the treated DataFrame as CSV. `index=False` prevents pandas from
# writing the DataFrame index as an extra column.
df.to_csv('FLACSO_acoes_afirmativas.csv', index=False)


# Tratamento 2

In [None]:
# Load the CSV written by the previous step and discard the columns that are
# not needed downstream.
#   encoding='utf8' - keeps accented/special characters intact
#   decimal=","     - tells pandas that the comma is the decimal mark
#   columns=[...]   - 'id' and 'marker_id' are removed from the result
df = (
    pd.read_csv('FLACSO_acoes_afirmativas.csv', encoding='utf8', decimal=",")
    .drop(columns=['id', 'marker_id'])
)

# Find the first DD.MM.YYYY date in each description.
# The pattern has two capture groups, so `str.extract` returns two columns:
#   0 -> the full date (DD.MM.YYYY)
#   1 -> the four-digit year only
date_matches = df['descricao'].str.extract(r'\b(\d{2}\.\d{2}\.(\d{4}))\b')

# Keep only the year component as the new 'ano' column.
df['ano'] = date_matches[1]

# Write the result to a new CSV file that is then treated manually.
# `index=False` leaves the row index out of the file.
df.to_csv('FLACSO_acoes_afirmativas_v1.csv', index=False)

# Tratamento 2.1

In [None]:
# Read the CSV file with UTF-8 encoding and comma as decimal separator.
# NOTE(review): this reads the *_v2 file, presumably the manually treated
# successor of the *_v1 file written by the previous step — confirm.
df = pd.read_csv('FLACSO_acoes_afirmativas_v2.csv', encoding='utf8', decimal=",")

# Rename columns for clarity and standardization. `rename` returns a new
# DataFrame, which is reassigned instead of mutating in place.
df = df.rename(columns={
    'descricao': 'legislacao',
    'regiao': 'nome_regiao',
    'estado': 'sigla_uf',
    'identificacao': 'forma_identificacao',
    'pub_alvo': 'nomenclatura_legislacao',
    'Ano': 'ano',
    'Legislação': 'legislacao',
    'comissionado': 'flag_comissionado',
    'comiss_ver': 'flag_comiss_verificacao',
    'cidade': 'nome_municipio'
})

# Two source names ('descricao' and 'Legislação') map to 'legislacao'. If the
# file contains both, the rename silently produces duplicate column names,
# which corrupts the column selection below. Fail early with a clear message.
duplicated_cols = df.columns[df.columns.duplicated()].unique().tolist()
if duplicated_cols:
    raise ValueError(
        f"Duplicate column names after renaming: {duplicated_cols}"
    )

# Reorder columns: move the 5th column (position 4) to the front and keep the
# relative order of all the others.
temp_cols = df.columns.tolist()
new_cols = temp_cols[4:5] + temp_cols[0:4] + temp_cols[5:]
df = df[new_cols]

# Standardize region names with proper capitalization.
df['nome_regiao'] = df['nome_regiao'].replace({
    'sul': 'Sul',
    'norte': 'Norte',
    'nordeste': 'Nordeste',
    'centro-oeste': 'Centro-oeste',
    'sudeste': 'Sudeste'
})

# Standardize 'tipo_cota' values by consolidating similar entries.
# (The identity mapping 'Concurso público' -> 'Concurso público' was a no-op
# and has been removed.)
df['tipo_cota'] = df['tipo_cota'].replace({
    'concurso público': 'Concurso público',
    'Concurso Público e estagiário': 'Concurso público e estágio profissional',
    'Concurso público e contratação temporária.': 'Concurso público e contratação temporária',
    'Sistema de pontuação diferenciado em concurso público': 'Concurso público'
})

NameError: name 'pd' is not defined

# Upload

In [None]:
# Define the schema for the BigQuery table.
# Each SchemaField specifies the column name, its type and a description.
schema = [
    bigquery.SchemaField('ano', 'INTEGER', description='Ano de implementação da legislação.'),
    bigquery.SchemaField('abrangencia', 'STRING', description='Se a abrangência é municipal, estadual, federal, nacional ou distrital.'),
    bigquery.SchemaField('nome_regiao', 'STRING', description='Nome da Região.'),
    bigquery.SchemaField('sigla_uf', 'STRING', description='Sigla da Unidade da Federação.'),
    bigquery.SchemaField('nome_municipio', 'STRING', description='Município.'),
    bigquery.SchemaField('legislacao', 'STRING', description='Número da legislação e detalhes.'),
    bigquery.SchemaField('regulamentacao', 'STRING', description='Detalhes sobre a regulamentação'),
    bigquery.SchemaField('tipo_cota', 'STRING', description='A cota vale para quais formas de ingresso.'),
    bigquery.SchemaField('flag_comissionado', 'INTEGER', description='A ação abrange cargos comissionados?'),
    bigquery.SchemaField('percentual', 'STRING', description='Percentual de vagas reservadas'),
    # NOTE(review): this description looks copied from 'abrangencia'; the column
    # is renamed from 'pub_alvo' in the treatment step — confirm the intended text.
    bigquery.SchemaField('nomenclatura_legislacao', 'STRING', description='Se a abrangência é municipal, estadual ou federal'),
    bigquery.SchemaField('forma_identificacao', 'STRING', description='Forma de identificação do público alvo na legislação.'),
    bigquery.SchemaField('flag_comiss_verificacao', 'INTEGER', description='Se há comissão de verificação.'),
    bigquery.SchemaField('vigencia', 'STRING', description='Se está vigente ou não.'),
    bigquery.SchemaField('lat', 'STRING', description='Latitude'),
    bigquery.SchemaField('lng', 'STRING', description='Longitude')
]

# Initialize the BigQuery client for the target project.
client = bigquery.Client(project='repositoriodedadosgpsp')

# Reference to the target dataset. `Client.dataset()` is deprecated, so the
# DatasetReference is built with its constructor instead.
dataset_ref = bigquery.DatasetReference(client.project, 'acoes_afirmativas')

# Configure the table reference with standardized naming (FONTE_algo_intuitivo_dado).
table_ref = dataset_ref.table('FLACSO_acoes_afirmativas_v1')

# Set up the job configuration with the predefined schema.
# NOTE(review): no write_disposition is set, so re-running this cell is
# expected to append the same rows again — confirm whether the table should
# be replaced (WRITE_TRUNCATE) on each run.
job_config = bigquery.LoadJobConfig(schema=schema)

# Upload the DataFrame `df` produced by the treatment step and wait for the
# load job to finish; `result()` raises if the job failed.
job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
job.result()