<a href="https://colab.research.google.com/github/TiagoIesbick/dashboard-etl/blob/main/dashboard_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [97]:
import pandas as pd
import pathlib

In [98]:
# Load the auxiliary tables stored in the sheets n_uni, n_proj, n_elem, n_vinc
# and n_rub of a single workbook.
file_path = r'/content/drive/MyDrive/Dashboard_data/utils/auxiliary_tables.xlsx'

# Open the workbook once and parse every sheet from the same handle, instead of
# re-opening and re-parsing the file for each of the five sheets.
with pd.ExcelFile(file_path) as aux_book:
    # header=None + names=...: the leading rows are skipped and the columns are
    # named explicitly, so no sheet row is interpreted as a header.
    df_uni = pd.read_excel(aux_book, sheet_name='n_uni', skiprows=2, header=None, names=['uni'])
    df_proj = pd.read_excel(aux_book, sheet_name='n_proj', skiprows=3, header=None, names=['proj', 'nome_proj'])
    df_elem = pd.read_excel(aux_book, sheet_name='n_elem', skiprows=3, header=None, names=['elem', 'nome_elem'])
    df_vinc = pd.read_excel(aux_book, sheet_name='n_vinc', skiprows=3, header=None, names=['vinc', 'nome_vinc'])
    df_rub = pd.read_excel(aux_book, sheet_name='n_rub', skiprows=3, header=None, names=['rub', 'nome_rub'])

# For the unit table the text from position 5 onward is kept as the name.
df_uni['nome_uni'] = df_uni['uni'].str[5:]

# Integer codes are taken from the leading characters of each label column.
df_uni['uni_int'] = df_uni['uni'].str[:4].astype(int)
df_proj['proj_int'] = df_proj['proj'].str[:4].astype(int)
df_elem['elem_int'] = df_elem['elem'].str[:6].astype(int)
# Up to 12 digits do not fit in 32 bits, hence the explicit int64.
df_rub['rub_int'] = df_rub['rub'].str[:12].astype('int64')
df_vinc['vinc_int'] = df_vinc['vinc'].str[:4].astype(int)

In [99]:
# Build one dataframe with paid-expense data from 2002 to 2023.

exp_folder_2002_2023 = pathlib.Path(r'/content/drive/MyDrive/Dashboard_data/Despesas/2002_2023')

# Read every workbook in the folder, dropping columns that are entirely empty
# in that file. Entries are sorted so the row order of the concatenated frame
# is reproducible (iterdir() yields entries in arbitrary order), and non-file
# entries such as an `.ipynb_checkpoints` directory are skipped because
# read_excel cannot open them.
df_list = [
    pd.read_excel(expense_file).dropna(axis=1, how='all')
    for expense_file in sorted(exp_folder_2002_2023.iterdir())
    if expense_file.is_file()
]

# Source columns that are not carried forward.
unused_columns = [
    'VLCANCEL', 'DSFORMAPAG', 'DTPROG', 'BANCOCREDOR', 'AGENCIACREDOR', 'CONTA', 'NRCHEQUE',
    'IDENTPESSOA', 'NOMEPESSOA', 'BANCOVF', 'AGENCIAVF', 'CONTAVF', 'ORGREQ', 'CODCOMPL',
    'PERCENTUAL', 'VLRESULTADO', 'IRPF_CODCOMPL', 'IRPF_PERCENTUAL', 'IRPF_VLRESULTADO', 'NRPROCCONTR',
    'NRNOTAFISCAL', 'DTENTREMP', 'DTPREVENT', 'LOCALENT', 'DTBASE', 'CODRECFIN', 'CODTIPOPESSOA',
]

# Mapping from the source column codes to the labels used from here on.
column_labels = {
    'ANO': 'Ano Emp.', 'EMPENHO': 'Num. Empenho', 'NRPARCLIQ': 'Num.Parc.Liq.',
    'NRPARCPAG': 'Num.Parc.Pag.', 'DSSITPROG': 'Situação Pag.', 'DSTPEMP': 'Tipo Emp.',
    'CPTPAG': 'Comp.pagto.', 'CPTPAGEST': 'Comp.estorno', 'VLPAG': 'Val.Pago',
    'VLPAGEST': 'Val.Pago Estorn.', 'VLRETIDO': 'Val. Retido', 'RESUL_PAGO': 'Result. pago',
    'UNIDORC': 'Unid. Orçam.', 'PROJATIV': 'Proj/Ativ', 'RUBRICA': 'Rubrica',
    'DSELEDES': 'Descrição da Rubrica', 'VINCORC': 'Vinc. Orçam.', 'CPTEMP': 'Comp.Empenho',
    'GORANOPL': 'Ano PL', 'GORNRPL': 'Num. PL', 'GORNRPARCPL': 'Parc. PL',
    'GORANOOBRAACAO': 'Ano Obra/Ação', 'GOROBRAACAO': 'Num.Obra/Ação',
    'NOMERECFIN': 'Nome Recurso Financeiro', 'HISTORICO': 'Histórico do Empenho',
    'TPPROCCONTR': 'Tipo Processo',
}

df_exp_2002_2023 = (
    pd.concat(df_list, ignore_index=True)
    # errors='ignore': a column that is empty in every source file was already
    # removed by the per-file dropna above, so a strict drop would raise
    # KeyError for it even though the desired end state is already reached.
    .drop(columns=unused_columns, errors='ignore')
    .rename(columns=column_labels)
    # Rows without a payment competence value are discarded and the index is
    # renumbered from 0.
    .dropna(subset=['Comp.pagto.'], ignore_index=True)
)

In [100]:
# Build one dataframe with paid-expense data from 2024 onwards.

exp_folder_2024_onwards = pathlib.Path(r'/content/drive/MyDrive/Dashboard_data/Despesas/2024+')

# Read every CSV in the folder, dropping columns that are entirely empty in
# that file. Entries are sorted so the concatenated row order is reproducible
# (iterdir() yields entries in arbitrary order), and non-file entries such as
# an `.ipynb_checkpoints` directory are skipped because read_csv cannot open
# them.
df_list = [
    pd.read_csv(expense_file).dropna(axis=1, how='all')
    for expense_file in sorted(exp_folder_2024_onwards.iterdir())
    if expense_file.is_file()
]

df_exp_2024_onwards = pd.concat(df_list, ignore_index=True)

cols_to_convert = ['valorpago', 'restospagarnaoprocessadospagos', 'restospagarprocessadospagos']

# The amount columns are text: strip the 'R$ ' prefix, turn the decimal comma
# into a dot and cast to float.
# NOTE(review): a thousands separator (e.g. '1.234,56') would make the float
# cast fail -- confirm the export never emits one.
df_exp_2024_onwards[cols_to_convert] = (
    df_exp_2024_onwards[cols_to_convert]
    .apply(lambda col: col.str.removeprefix('R$ ').str.replace(',', '.', regex=False).astype(float))
)

# Row-wise total of the three converted amount columns.
df_exp_2024_onwards['Result. pago'] = df_exp_2024_onwards[cols_to_convert].sum(axis=1, numeric_only=True)

# Month values 0 and 13 are folded into 1 and 12 so that a valid calendar date
# can be assembled below.
# NOTE(review): presumably opening/closing accounting periods -- confirm.
df_exp_2024_onwards['mes'] = df_exp_2024_onwards['mes'].replace({0: 1, 13: 12})
df_exp_2024_onwards = df_exp_2024_onwards.rename(columns={'exercicio': 'YEAR', 'mes': 'MONTH'})

# pd.to_datetime assembles a date from year/month/day columns; the day is
# fixed to the first of the month.
df_exp_2024_onwards['Comp.pagto.'] = pd.to_datetime(df_exp_2024_onwards[['YEAR', 'MONTH']].assign(DAY=1))

In [101]:
# Build a dataframe with revenue collected from 2004 to 2017.

revenue_file_2004_2017 = r'/content/drive/MyDrive/Dashboard_data/Receitas/2004_2017/2004_2017_Cubo_Antigo.xlsx'

df_rev_2004_2017 = (
    pd.read_excel(revenue_file_2004_2017)
    # The row labelled 0 (first row after the header) is discarded.
    .drop(index=0)
    .rename(columns={'Exercício': 'YEAR',
                     'Mês': 'MONTH',
                     'Rótulos de Linha': 'orgao',
                     'Vínculo Cod': 'vinculo',
                     'N6 Subalinea': 'desdobramento6',
                     'Valor Arrecadado': 'valor_arrecadado',
                     'Valor Orçado': 'valor_orcado'})
)

# pd.to_datetime assembles a date from year/month/day columns; the day is
# fixed to the first of the month.
df_rev_2004_2017['Data'] = pd.to_datetime(df_rev_2004_2017[['YEAR', 'MONTH']].assign(DAY=1))

int_columns = ['YEAR', 'orgao', 'MONTH', 'vinculo']
df_rev_2004_2017[int_columns] = df_rev_2004_2017[int_columns].astype('int64')

# Zero-pad the two codes that lose their leading zeros as integers.
# Series.replace compares whole values (not substrings), so '1' can only match
# a value that is exactly '1'; codes such as '6051' are never touched and need
# no temporary placeholder to protect them.
df_rev_2004_2017['vinculo'] = (
    df_rev_2004_2017['vinculo'].astype('str').replace({'400': '0400', '1': '0001'})
)

In [102]:
# Build a dataframe with revenue collected from 2018 to 2023.

rev_folder_2018_2023 = pathlib.Path(r'/content/drive/MyDrive/Dashboard_data/Receitas/2018_2023')

# Read every workbook in the folder, dropping columns that are entirely empty
# in that file. Entries are sorted so the concatenated row order is
# reproducible (iterdir() yields entries in arbitrary order), and non-file
# entries such as an `.ipynb_checkpoints` directory are skipped because
# read_excel cannot open them.
df_list = [
    pd.read_excel(revenue_file).dropna(axis=1, how='all')
    for revenue_file in sorted(rev_folder_2018_2023.iterdir())
    if revenue_file.is_file()
]

df_rev_2018_2023 = (
    pd.concat(df_list, ignore_index=True)
    # errors='ignore': a column that is empty in every source file was already
    # removed by the per-file dropna above, so a strict drop would raise
    # KeyError for it even though the desired end state is already reached.
    .drop(columns=['valor_cancelado', 'valor_lancado', 'valor_meta', 'informacao_complementar'],
          errors='ignore')
    # Keep only the rows whose 'orgao_raiz' equals 7000.
    .loc[lambda frame: frame['orgao_raiz'] == 7000]
    .reset_index(drop=True)
    .rename(columns={'ano': 'YEAR', 'mes': 'MONTH'})
)

# pd.to_datetime assembles a date from year/month/day columns; the day is
# fixed to the first of the month.
df_rev_2018_2023['Data'] = pd.to_datetime(df_rev_2018_2023[['YEAR', 'MONTH']].assign(DAY=1))

# Whole-value replacement: only a code that is exactly '400' gets zero-padded.
df_rev_2018_2023['vinculo'] = df_rev_2018_2023['vinculo'].astype(str).replace({'400': '0400'})
df_rev_2018_2023['rubrica'] = df_rev_2018_2023['rubrica'].astype(str)

# Left-pad the rubric to 14 characters with zeros, split it into ten
# dot-separated groups (2-1-1-1-2-1-1-2-2-rest), then append the vinculo code
# and a trailing '.0'.
df_rev_2018_2023['Natureza da Receita Antiga'] = (
    df_rev_2018_2023['rubrica']
    .str.pad(14, fillchar='0')
    .str.replace(
        r'(\d{2})(\d)(\d)(\d)(\d{2})(\d)(\d)(\d{2})(\d{2})(\d+)',
        r'\1.\2.\3.\4.\5.\6.\7.\8.\9.\10',
        regex=True,
    )
    + '.' + df_rev_2018_2023['vinculo'] + '.0'
)

In [103]:
# Build a dataframe with revenue collected from 2024 onwards.

rev_folder_2024_onwards = pathlib.Path(r'/content/drive/MyDrive/Dashboard_data/Receitas/2024+')

# Read every CSV in the folder, dropping columns that are entirely empty in
# that file. Entries are sorted so the concatenated row order is reproducible
# (iterdir() yields entries in arbitrary order), and non-file entries such as
# an `.ipynb_checkpoints` directory are skipped because read_csv cannot open
# them.
df_list = [
    pd.read_csv(revenue_file).dropna(axis=1, how='all')
    for revenue_file in sorted(rev_folder_2024_onwards.iterdir())
    if revenue_file.is_file()
]

df_rev_2024_onwards = (
    pd.concat(df_list, ignore_index=True)
    .rename(columns={'exercicio': 'YEAR', 'mes': 'MONTH'})
)

# pd.to_datetime assembles a date from year/month/day columns; the day is
# fixed to the first of the month.
df_rev_2024_onwards['Data'] = pd.to_datetime(df_rev_2024_onwards[['YEAR', 'MONTH']].assign(DAY=1))