# Dependências

In [None]:
# One-time setup, kept commented out: install the Google Cloud package and authenticate with application-default credentials
# !pip install gcloud
# !gcloud auth application-default login

# Import necessary Python libraries
import pandas as pd                # Data manipulation and analysis
import numpy as np                 # Numerical computing
import time                        # Time-related functions
import os                          # Operating system interfaces
import pandas_gbq                  # Pandas integration with BigQuery
from google.cloud import bigquery  # BigQuery client library
import glob                        # File path pattern matching
import openpyxl                    # Excel file handling
import csv                         # CSV file handling
import re                          # Regular expressions

# Note: The actual imports remain exactly as in the original code

# Tratamento

In [None]:
# Treatment cell: load the 'Direitos Humanos' sheet, keep the manager-profile
# columns, normalise the categorical answers and derive an age-group column.

# Load data from Excel file, specifically the 'Direitos Humanos' sheet
df = pd.read_excel("Base_Estadic_2023(2).xlsx", sheet_name='Direitos Humanos')

# Source column -> output column. The key order also drives which columns are
# kept (and in which order) by the selection below.
colunas = {
    'Sigla UF': 'sigla_uf',
    'Cod UF': 'cod_uf',
    'Nome UF': 'uf',
    'EDHU01': 'caracterizacao_orgao_gestor',
    'EDHU04': 'genero',
    'EDHU05': 'idade',
    'EDHU06': 'cor_raca',
    'EDHU07': 'grau_instrucao'
}

# Select, tag with the survey year and rename in one chain so each step
# returns a fresh frame instead of mutating a slice of the raw sheet.
df = df[list(colunas)].assign(ano=2023).rename(columns=colunas)

# Answers treated as "no usable response". Both capitalisations of
# 'Não informou' are listed because .str.title() below turns the lower-case
# spelling into the capitalised one. The same list is used for the text columns
# and for the age column so that neither spelling can reach pd.to_numeric.
respostas_ausentes = ['Recusa', 'Não Informou', 'Não informou']

# Standardize text data (title case and specific value replacements)
df['caracterizacao_orgao_gestor'] = df['caracterizacao_orgao_gestor'].str.title()
df['cor_raca'] = df['cor_raca'].mask(df['cor_raca'] == 'Pardo', 'Parda')

# Replace refusals / non-answers in the text columns with a single label
for col in ['caracterizacao_orgao_gestor', 'genero', 'cor_raca', 'grau_instrucao']:
    df[col] = df[col].mask(df[col].isin(respostas_ausentes), 'Sem dados')

# Age is numeric: refusals / non-answers become NaN before the conversion
df['idade'] = pd.to_numeric(df['idade'].mask(df['idade'].isin(respostas_ausentes), np.nan))

# Create age groups. Bins are left-closed (right=False): [18, 30), [30, 50),
# [50, 65), [65, inf). The last edge is open-ended so that an age of 100 or
# more still lands in 'Acima de 65' instead of falling outside every bin and
# becoming NaN. Ages below 18 and missing ages remain NaN.
limites = [18, 30, 50, 65, np.inf]
categorias = ['Entre 18-29', 'Entre 30-49', 'Entre 50-64', 'Acima de 65']
df['faixa_etaria'] = pd.cut(df['idade'], bins=limites, labels=categorias, right=False)

# Reorder columns (the raw 'idade' column is dropped here; only the age group is kept)
df = df[['ano', 'sigla_uf', 'cod_uf', 'uf', 'caracterizacao_orgao_gestor',
        'genero', 'faixa_etaria', 'cor_raca', 'grau_instrucao']]

# Standardize education levels. Values that are not keys of this mapping
# (including 'Sem dados') are left unchanged by replace().
dict_esco = {
    'Ensino superior completo': 'Até Ensino Superior Completo',
    'Especialização': 'Até Pós Graduação ou Mestrado',
    'Mestrado': 'Até Pós Graduação ou Mestrado',
    'Doutorado': 'Até Doutorado'
}
df = df.replace({'grau_instrucao': dict_esco})

In [None]:
# Print df's column names, dtypes and non-null counts (shows which columns
# still contain missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   ano                          27 non-null     int64   
 1   sigla_uf                     27 non-null     object  
 2   cod_uf                       27 non-null     int64   
 3   uf                           27 non-null     object  
 4   caracterizacao_orgao_gestor  27 non-null     object  
 5   genero                       27 non-null     object  
 6   faixa_etaria                 26 non-null     category
 7   cor_raca                     27 non-null     object  
 8   grau_instrucao               27 non-null     object  
dtypes: category(1), int64(2), object(6)
memory usage: 2.0+ KB


# Upload

In [None]:
# Upload cell: load the DataFrame `df` into a BigQuery table using an explicit schema.

# (column name, BigQuery type, description) for every column of the target table
campos = [
    ('ano', 'INTEGER', 'Year of data collection'),
    ('sigla_uf', 'STRING', 'State abbreviation'),
    ('cod_uf', 'INTEGER', 'IBGE state code'),
    ('uf', 'STRING', 'State name'),
    ('caracterizacao_orgao_gestor', 'STRING', 'Characterization of the management body'),
    ('genero', 'STRING', 'Self-declared gender'),
    ('faixa_etaria', 'STRING', 'Age group of the observation'),
    ('cor_raca', 'STRING', 'Race/color of observed person'),
    ('grau_instrucao', 'STRING', 'Education level with post-graduate detail'),
]

# Define the BigQuery table schema with field types and descriptions
schema = [
    bigquery.SchemaField(nome, tipo, description=descricao)
    for nome, tipo, descricao in campos
]

# Initialize BigQuery client connecting to specific project
client = bigquery.Client(project='repositoriodedadosgpsp')

# Create reference to target dataset. The reference is built directly because
# Client.dataset() is deprecated in google-cloud-bigquery; this resolves to the
# same project and dataset.
dataset_ref = bigquery.DatasetReference(client.project, 'cargos_lideranca')

# Create reference to target table with standardized naming (format: SOURCE_descriptive_name_data)
table_ref = dataset_ref.table('ESTADIC_perfil_gestor_direitos_humanos_tipo_orgao_v1')

# Configure load job specifying the defined schema.
# NOTE(review): no write_disposition is set, so the load job's default applies.
# Per the BigQuery documentation that default appends, so re-running this cell
# would add the same rows again; confirm whether WRITE_TRUNCATE is intended.
job_config = bigquery.LoadJobConfig(schema=schema)

# Execute DataFrame load job to BigQuery
job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)

# Block until the load job finishes; result() raises if the job failed
job.result()