### Importações

In [2]:
import pandas as pd
import numpy as np

### Constantes

In [3]:
ABA_ANALISE = "MEXER_NESSA"
DATA_PATH   = "/Users/gabrielribeirobizerril/Documents/GitHub/llm/editai_extractor_llm_based/tratamento/_data/output_capitais_pt2.xlsx"
NAME_NAN    = ["não previu", "indisponível", "none", "ok", "nan", "na"]

### Carregamento

In [4]:
df = pd.read_excel(io=DATA_PATH, sheet_name=ABA_ANALISE)

In [5]:
df.columns

Index(['uf', 'path_pdf', 'pdf', 'document', 'chunks_relevantes',
       'texto_completo', 'valor_total', 'cotas_negras', 'cotas_indigenas',
       'cotas_pcd', 'vagas_totais', 'observações', 'VALOR', 'PN', 'PI', 'PCD',
       'PN2', 'PI2', 'PCD2'],
      dtype='object')

In [6]:
df.shape

(144, 19)

### Validação de Valores

In [7]:
# Excluir última linha (no excel era uma linha de valores totais)
df = df.iloc[:-1]

In [8]:
# Corrige os valores que estão vazios
df["cotas_negras"]    = df["cotas_negras"].apply(
    lambda x: np.nan if str(x).strip().lower() in NAME_NAN else x
)

df["cotas_indigenas"] = df["cotas_indigenas"].apply(
    lambda x: np.nan if str(x).strip().lower() in NAME_NAN else x
)

df["cotas_pcd"]       = df["cotas_pcd"].apply(
    lambda x: np.nan if str(x).strip().lower() in NAME_NAN else x
)

df["cotas_negras"] = df["cotas_negras"].astype(str).str.replace(",", ".", regex=False)
df["cotas_indigenas"] = df["cotas_indigenas"].astype(str).str.replace(",", ".", regex=False)
df["cotas_pcd"] = df["cotas_pcd"].astype(str).str.replace(",", ".", regex=False)

In [9]:
# Passa coluna de cotas para decimais 
df["cotas_negras"]    = df["cotas_negras"].astype(float) / 100
df["cotas_indigenas"] = df["cotas_indigenas"].astype(float) / 100
df["cotas_pcd"]       = df["cotas_pcd"].astype(float) / 100

### Filtros

In [10]:
df = df[["uf", "pdf", "cotas_negras", "cotas_indigenas", "cotas_pcd", "vagas_totais", "valor_total"]]

### Instrumento de Fomento

In [11]:
df["instrumento_fomento"] = df["pdf"].apply(
    lambda x: x.split("_")[2].split(".")[0]
)

### Calculo de Valor e Vagas

In [12]:
for categoria in ["negras", "indigenas", "pcd"]:
    df[f"valor_{categoria}"] = df[f"cotas_{categoria}"] * df["valor_total"]
    df[f"vagas_{categoria}"] = df[f"cotas_{categoria}"] * df["vagas_totais"]

In [13]:
df.describe()

Unnamed: 0,cotas_negras,cotas_indigenas,cotas_pcd,vagas_totais,valor_total,valor_negras,vagas_negras,valor_indigenas,vagas_indigenas,valor_pcd,vagas_pcd
count,125.0,119.0,123.0,132.0,143.0,125.0,116.0,119.0,113.0,123.0,114.0
mean,0.26928,0.112381,0.058257,47.69697,1776856.0,493171.8,12.813534,207703.5,5.429104,99919.63,2.721768
std,0.068288,0.039696,0.021869,65.167423,3183502.0,809097.6,16.618149,352284.6,7.298662,168102.1,3.372575
min,0.2,0.0833,0.04,1.0,100000.0,25000.0,0.25,10000.0,0.1,5000.0,0.05
25%,0.25,0.1,0.05,13.75,476434.0,125000.0,3.75,50000.0,1.8,26412.5,1.0
50%,0.25,0.1,0.05,29.0,800000.0,225000.0,7.5,90000.0,3.0,48039.21,1.825
75%,0.25,0.1,0.05,52.0,2000000.0,600000.0,13.875,211036.7,6.0,120000.0,3.0
max,0.5,0.25,0.2,510.0,25000000.0,6075000.0,127.5,2500000.0,51.0,1250000.0,25.5


### Rank dos Editais

In [30]:
agrupado = df.groupby("uf").agg(
    qtd_pdfs=("pdf", "nunique"),
    valor_total=("valor_total", "sum")
).reset_index()

In [31]:
agrupado = agrupado.sort_values(by="qtd_pdfs", ascending=False)

In [32]:
agrupado["valor_per_edital"] = agrupado["valor_total"]/agrupado["qtd_pdfs"]

In [33]:
agrupado["valor_per_edital"] = agrupado["valor_per_edital"].apply(lambda x: f"{x:,.2f}")
agrupado["valor_total"]      = agrupado["valor_total"].apply(lambda x: f"{x:,.2f}")


In [34]:
agrupado

Unnamed: 0,uf,qtd_pdfs,valor_total,valor_per_edital
8,FORTALEZA,12,9051094.08,754257.84
13,MANAUS,9,14686857.14,1631873.02
10,JOÃO PESSOA,9,5290000.0,587777.78
23,SÃO PAULO,9,48985200.0,5442800.0
21,SALVADOR,9,18210000.0,2023333.33
12,MACEIÓ,8,6619233.4,827404.18
19,RIO BRANCO,7,4088000.0,584000.0
14,NATAL,7,5208000.0,744000.0
20,RIO DE JANEIRO,6,36740000.0,6123333.33
25,VITÓRIA,6,2340558.7,390093.12


### Matriz das Cotas

In [36]:
df.columns

Index(['uf', 'pdf', 'cotas_negras', 'cotas_indigenas', 'cotas_pcd',
       'vagas_totais', 'valor_total', 'instrumento_fomento', 'valor_negras',
       'vagas_negras', 'valor_indigenas', 'vagas_indigenas', 'valor_pcd',
       'vagas_pcd'],
      dtype='object')