<a href="https://colab.research.google.com/github/TiagoIesbick/dashboard-etl/blob/main/dashboard_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [176]:
import pandas as pd
import pathlib

In [196]:
# Workbook that stores the auxiliary (lookup) tables, one table per sheet.
file_path = r'/content/drive/MyDrive/Dashboard_data/utils/auxiliary_tables.xlsx'

# For every sheet: how many leading rows to skip and which column names to assign.
sheets_info = {
    'n_uni': dict(skiprows=2, cols=['uni']),
    'n_proj': dict(skiprows=3, cols=['proj', 'nome_proj']),
    'n_elem': dict(skiprows=3, cols=['elem', 'nome_elem']),
    'n_vinc': dict(skiprows=3, cols=['vinc', 'nome_vinc']),
    'n_rub': dict(skiprows=3, cols=['rub', 'nome_rub']),
}

# Read each sheet without a header row, naming the columns ourselves.
dfs = {}
for sheet, info in sheets_info.items():
    dfs[sheet] = pd.read_excel(
        file_path,
        sheet_name=sheet,
        skiprows=info['skiprows'],
        header=None,
        names=info['cols'],
    )

# label column -> (number of leading characters that hold the code, integer dtype)
code_prefixes = {
    'uni': (4, int),
    'proj': (4, int),
    'elem': (6, int),
    'rub': (12, 'int64'),
    'vinc': (4, int),
}

for sheet, df in dfs.items():
    # The 'uni' sheet is read with a single column, so its name is sliced
    # out of the label itself (everything from the 6th character on).
    if 'uni' in df.columns:
        df['nome_uni'] = df['uni'].str[5:]
    # Derive the integer code column '<label>_int' from the label's prefix.
    for label_col, (width, dtype) in code_prefixes.items():
        if label_col in df.columns:
            df[label_col + '_int'] = df[label_col].str[:width].astype(dtype)

# Bind each lookup table to its own name, picking the sheets explicitly.
df_uni, df_proj, df_elem, df_vinc, df_rub = (
    dfs[name] for name in ('n_uni', 'n_proj', 'n_elem', 'n_vinc', 'n_rub')
)

In [178]:
# Generating a dataframe with paid expenses data from 2002 to 2023

exp_folder_2002_2023 = pathlib.Path(r'/content/drive/MyDrive/Dashboard_data/Despesas/2002_2023')

# Path.iterdir() yields entries in arbitrary order; sorting the paths makes the
# row order of the concatenated frame reproducible from one run to the next.
# Columns that are completely empty in a given file are dropped before concatenating.
df_list = [
    pd.read_excel(expense_file).dropna(axis=1, how='all')
    for expense_file in sorted(exp_folder_2002_2023.iterdir())
]

# Source columns that are discarded from the combined frame.
cols_to_drop = [
    'VLCANCEL', 'DSFORMAPAG', 'DTPROG', 'BANCOCREDOR', 'AGENCIACREDOR', 'CONTA', 'NRCHEQUE',
    'IDENTPESSOA', 'NOMEPESSOA', 'BANCOVF', 'AGENCIAVF', 'CONTAVF', 'ORGREQ', 'CODCOMPL',
    'PERCENTUAL', 'VLRESULTADO', 'IRPF_CODCOMPL', 'IRPF_PERCENTUAL', 'IRPF_VLRESULTADO', 'NRPROCCONTR',
    'NRNOTAFISCAL', 'DTENTREMP', 'DTPREVENT', 'LOCALENT', 'DTBASE', 'CODRECFIN', 'CODTIPOPESSOA',
]

# Source column name -> display name used by the rest of the notebook.
col_renames = {
    'ANO': 'Ano Emp.', 'EMPENHO': 'Num. Empenho', 'NRPARCLIQ': 'Num.Parc.Liq.',
    'NRPARCPAG': 'Num.Parc.Pag.', 'DSSITPROG': 'Situação Pag.', 'DSTPEMP': 'Tipo Emp.',
    'CPTPAG': 'Comp.pagto.', 'CPTPAGEST': 'Comp.estorno', 'VLPAG': 'Val.Pago',
    'VLPAGEST': 'Val.Pago Estorn.', 'VLRETIDO': 'Val. Retido', 'RESUL_PAGO': 'Result. pago',
    'UNIDORC': 'Unid. Orçam.', 'PROJATIV': 'Proj/Ativ', 'RUBRICA': 'Rubrica',
    'DSELEDES': 'Descrição da Rubrica', 'VINCORC': 'Vinc. Orçam.', 'CPTEMP': 'Comp.Empenho',
    'GORANOPL': 'Ano PL', 'GORNRPL': 'Num. PL', 'GORNRPARCPL': 'Parc. PL',
    'GORANOOBRAACAO': 'Ano Obra/Ação', 'GOROBRAACAO': 'Num.Obra/Ação',
    'NOMERECFIN': 'Nome Recurso Financeiro', 'HISTORICO': 'Histórico do Empenho',
    'TPPROCCONTR': 'Tipo Processo',
}

# Single non-mutating chain (no inplace=True), so re-running the cell always
# rebuilds the frame from the freshly read files.
df_exp_2002_2023 = (
    pd.concat(df_list, ignore_index=True)
    .drop(columns=cols_to_drop)
    .rename(columns=col_renames)
    # keep only rows that have both a 'Comp.pagto.' and a 'Result. pago' value
    .dropna(subset=['Comp.pagto.', 'Result. pago'], ignore_index=True)
)

In [190]:
# Generating a dataframe with paid expenses data from 2024 onwards

exp_folder_2024_onwards = pathlib.Path(r'/content/drive/MyDrive/Dashboard_data/Despesas/2024+')

# Path.iterdir() yields entries in arbitrary order; sorting the paths makes the
# row order of the concatenated frame reproducible from one run to the next.
df_list = [
    pd.read_csv(expense_file).dropna(axis=1, how='all')
    for expense_file in sorted(exp_folder_2024_onwards.iterdir())
]

df_exp_2024_onwards = pd.concat(df_list, ignore_index=True)

cols_to_convert = ['valorpago', 'restospagarnaoprocessadospagos', 'restospagarprocessadospagos']

# The amount columns are handled as text: strip the 'R$ ' prefix, turn the decimal
# comma into a dot and parse as float.
df_exp_2024_onwards[cols_to_convert] = (
    df_exp_2024_onwards[cols_to_convert]
    .apply(lambda col: col.str.removeprefix('R$ ').str.replace(',', '.', regex=False).astype(float))
)

# Total paid per row = sum of the three converted amount columns.
df_exp_2024_onwards['Result. pago'] = df_exp_2024_onwards[cols_to_convert].sum(axis=1, numeric_only=True)

# Month values 0 and 13 are folded into 1 and 12 so that a valid date can be assembled below.
df_exp_2024_onwards['mes'] = df_exp_2024_onwards['mes'].replace({0: 1, 13: 12})
df_exp_2024_onwards = df_exp_2024_onwards.rename(columns={
    'exercicio': 'YEAR', 'mes': 'MONTH', 'subacao': 'Proj/Ativ', 'unidadeorcamentaria': 'Unid. Orçam.'
})
# First day of the (YEAR, MONTH) pair.
df_exp_2024_onwards['Comp.pagto.'] = pd.to_datetime(df_exp_2024_onwards[['YEAR', 'MONTH']].assign(DAY=1))

# Integer codes: take the dotted prefix of the full label and remove the dots.
df_exp_2024_onwards['Elemento'] = (
    df_exp_2024_onwards['elementocompleto'].str[:8].str.replace('.', '', regex=False).astype(int)
)
# 'int64' (not the platform-dependent `int`) to match the dtype used for `rub_int`
# in the auxiliary rubric table this column is later merged into.
df_exp_2024_onwards['Rubrica'] = (
    df_exp_2024_onwards['desdobramentocompleto'].str[:12].str.replace('.', '', regex=False).astype('int64')
)

# Discard rows whose total paid is zero and order the rest chronologically.
df_exp_2024_onwards = (
    df_exp_2024_onwards.loc[df_exp_2024_onwards['Result. pago'] != 0]
    .sort_values('Comp.pagto.')
    .reset_index(drop=True)
)
# The first four characters of 'defonterecursos' are parsed as the integer code.
df_exp_2024_onwards['Vinc. Orçam.'] = df_exp_2024_onwards['defonterecursos'].str[:4].astype(int)

In [123]:
# Updating the elements dataframe

# Source column in the 2024+ expenses -> column name used by the lookup table.
elem_renames = {'Elemento': 'elem_int', 'deelemento': 'nome_elem'}

# Distinct (code, name) pairs seen in the 2024+ expenses.
df_new_elem = (
    df_exp_2024_onwards[list(elem_renames)]
    .drop_duplicates(keep='last')
    .rename(columns=elem_renames)
)

# Append them to the existing table; when a code appears more than once the
# last occurrence (i.e. the 2024+ one) wins.
df_elem = pd.concat([df_elem, df_new_elem], ignore_index=True)
df_elem = df_elem.drop_duplicates(subset='elem_int', keep='last').reset_index(drop=True)

# Rebuild the display label as '<code> - <Title-Cased Name>' for every row.
elem_code_txt = df_elem['elem_int'].astype(str)
elem_name_txt = df_elem['nome_elem'].str.title()
df_elem['elem'] = elem_code_txt + ' - ' + elem_name_txt

In [124]:
# Updating the rubric dataframe

# Distinct (code, description) pairs found in the 2024+ expenses, renamed to
# the lookup table's column names.
rub_pairs = df_exp_2024_onwards[['Rubrica', 'dedesdobramento']].drop_duplicates(keep='last')
df_new_rub = rub_pairs.rename(columns={'Rubrica': 'rub_int', 'dedesdobramento': 'nome_rub'})

# Stack old and new rows; for a repeated code keep the last row, which comes
# from the 2024+ data when the code exists there.
rub_stacked = pd.concat([df_rub, df_new_rub], ignore_index=True)
rub_unique = rub_stacked.drop_duplicates(subset='rub_int', keep='last')
df_rub = rub_unique.reset_index(drop=True)

# Display label: '<code> - <Title-Cased Name>'.
df_rub['rub'] = df_rub['rub_int'].astype(str) + ' - ' + df_rub['nome_rub'].str.title()

In [132]:
# Updating the project dataframe

# Distinct (code, name) pairs from the 2024+ expenses, using the lookup table's column names.
df_new_proj = df_exp_2024_onwards[['Proj/Ativ', 'desubacao']].drop_duplicates(keep='last')
df_new_proj = df_new_proj.rename(columns={'Proj/Ativ': 'proj_int', 'desubacao': 'nome_proj'})

# Merge into the existing table (last occurrence of each code wins) and then
# regenerate the '<code> - <Title-Cased Name>' label for all rows.
df_proj = (
    pd.concat([df_proj, df_new_proj], ignore_index=True)
    .drop_duplicates(subset='proj_int', keep='last')
    .reset_index(drop=True)
)
proj_label = df_proj['proj_int'].astype(str) + ' - ' + df_proj['nome_proj'].str.title()
df_proj['proj'] = proj_label

In [143]:
# Updating the budget units dataframe

# Source column in the 2024+ expenses -> column name used by the lookup table.
uni_renames = {
    'Unid. Orçam.': 'uni_int',
    'deunidadeorcamentária': 'nome_uni',
}

# Distinct (code, name) pairs present in the 2024+ expenses.
uni_pairs = df_exp_2024_onwards[list(uni_renames)].drop_duplicates(keep='last')
df_new_uni = uni_pairs.rename(columns=uni_renames)

# Append to the existing table; on a repeated code the last row is kept.
df_uni = pd.concat([df_uni, df_new_uni], ignore_index=True)
df_uni = df_uni.drop_duplicates(subset='uni_int', keep='last')
df_uni = df_uni.reset_index(drop=True)

# Display label rebuilt for every row as '<code> - <Title-Cased Name>'.
df_uni['uni'] = df_uni['uni_int'].astype(str) + ' - ' + df_uni['nome_uni'].str.title()

In [197]:
# Updating the 'Vinc. Orçam.' lookup dataframe (df_vinc)

# Distinct (code, full label) pairs found in the 2024+ expenses.
df_new_vinc = df_exp_2024_onwards[['Vinc. Orçam.', 'defonterecursos']].drop_duplicates(keep='last').rename(columns={
    'Vinc. Orçam.': 'vinc_int', 'defonterecursos': 'nome_vinc'
})

# The label is handled as '<code> - <name>'. Split only on the FIRST hyphen
# (n=1): an unlimited split followed by .get(1) would cut off any name that
# itself contains a hyphen and keep just its first fragment.
df_new_vinc['nome_vinc'] = (
    df_new_vinc['nome_vinc'].str.split('-', n=1).str.get(1).str.strip()
)

# Append to the existing table; on a repeated code the last row is kept.
df_vinc = (
    pd.concat([df_vinc, df_new_vinc], ignore_index=True)
    .drop_duplicates(subset='vinc_int', keep='last')
    .reset_index(drop=True)
)
# Display label rebuilt for every row as '<code> - <Title-Cased Name>'.
df_vinc['vinc'] = df_vinc['vinc_int'].astype(str) + ' - ' + df_vinc['nome_vinc'].str.title()

In [5]:
# Generating a dataframe with revenue collected from 2004 to 2017

revenue_file_2004_2017 = r'/content/drive/MyDrive/Dashboard_data/Receitas/2004_2017/2004_2017_Cubo_Antigo.xlsx'

# Read the workbook, discard the row labelled 0 and normalise the column names.
# Done as one non-mutating chain (no inplace=True) so the cell can be re-run safely.
df_rev_2004_2017 = (
    pd.read_excel(revenue_file_2004_2017)
    .drop(index=0)
    .rename(columns={'Exercício': 'YEAR',
                     'Mês': 'MONTH',
                     'Rótulos de Linha': 'orgao',
                     'Vínculo Cod': 'vinculo',
                     'N6 Subalinea': 'desdobramento6',
                     'Valor Arrecadado': 'valor_arrecadado',
                     'Valor Orçado': 'valor_orcado'})
)

# First day of the (YEAR, MONTH) pair.
df_rev_2004_2017['Data'] = pd.to_datetime(df_rev_2004_2017[['YEAR', 'MONTH']].assign(DAY=1))

int_cols = ['YEAR', 'orgao', 'MONTH', 'vinculo']
df_rev_2004_2017[int_cols] = df_rev_2004_2017[int_cols].astype('int64')

# Zero-pad the two short codes. Series.replace matches WHOLE values (not
# substrings), so '6051' is never touched by the '1' -> '0001' rule and needs
# no temporary placeholder to protect it; one dict replace gives the same result.
df_rev_2004_2017['vinculo'] = (
    df_rev_2004_2017['vinculo'].astype('str').replace({'400': '0400', '1': '0001'})
)

In [6]:
# Generating a dataframe with revenue collected from 2018 to 2023

rev_folder_2018_2023 = pathlib.Path(r'/content/drive/MyDrive/Dashboard_data/Receitas/2018_2023')

# Path.iterdir() yields entries in arbitrary order; sorting the paths makes the
# row order of the concatenated frame reproducible from one run to the next.
df_list = [
    pd.read_excel(revenue_file).dropna(axis=1, how='all')
    for revenue_file in sorted(rev_folder_2018_2023.iterdir())
]

# Non-mutating steps (no inplace=True) so the cell can be re-run safely.
df_rev_2018_2023 = (
    pd.concat(df_list, ignore_index=True)
    .drop(columns=['valor_cancelado', 'valor_lancado', 'valor_meta', 'informacao_complementar'])
)
# Keep only the rows whose 'orgao_raiz' equals 7000.
df_rev_2018_2023 = (
    df_rev_2018_2023.loc[df_rev_2018_2023['orgao_raiz'] == 7000]
    .reset_index(drop=True)
    .rename(columns={'ano': 'YEAR', 'mes': 'MONTH'})
)

# First day of the (YEAR, MONTH) pair.
df_rev_2018_2023['Data'] = pd.to_datetime(df_rev_2018_2023[['YEAR', 'MONTH']].assign(DAY=1))
df_rev_2018_2023['vinculo'] = df_rev_2018_2023['vinculo'].astype(str).replace({'400': '0400'})
df_rev_2018_2023['rubrica'] = df_rev_2018_2023['rubrica'].astype(str)

# Build the dotted code: left-pad 'rubrica' with zeros to 14 characters, split the
# digits into groups of 2.1.1.1.2.1.1.2.2.<rest>, then append '.<vinculo>.0'.
rubrica_dotted = df_rev_2018_2023['rubrica'].str.pad(14, fillchar='0').str.replace(
    r'(\d{2})(\d)(\d)(\d)(\d{2})(\d)(\d)(\d{2})(\d{2})(\d+)', r'\1.\2.\3.\4.\5.\6.\7.\8.\9.\10', regex=True
)
df_rev_2018_2023['Natureza da Receita Antiga'] = rubrica_dotted + '.' + df_rev_2018_2023['vinculo'] + '.0'

In [7]:
# Generating a dataframe with revenue collected from 2024 onwards

rev_folder_2024_onwards = pathlib.Path(r'/content/drive/MyDrive/Dashboard_data/Receitas/2024+')

# Path.iterdir() yields entries in arbitrary order; sorting the paths makes the
# row order of the concatenated frame reproducible from one run to the next.
df_list = [
    pd.read_csv(revenue_file).dropna(axis=1, how='all')
    for revenue_file in sorted(rev_folder_2024_onwards.iterdir())
]

# Concatenate and normalise the year/month column names in one non-mutating
# chain (no inplace=True) so the cell can be re-run safely.
df_rev_2024_onwards = (
    pd.concat(df_list, ignore_index=True)
    .rename(columns={'exercicio': 'YEAR', 'mes': 'MONTH'})
)
# First day of the (YEAR, MONTH) pair.
df_rev_2024_onwards['Data'] = pd.to_datetime(df_rev_2024_onwards[['YEAR', 'MONTH']].assign(DAY=1))

In [38]:
# Sanity check: total of 'Result. pago' over the rows whose 'Comp.pagto.' falls in 2023.
is_2023 = df_exp_2002_2023['Comp.pagto.'].dt.year == 2023
df_exp_2002_2023.loc[is_2023, 'Result. pago'].sum()

1850336804.21