### Importing libraries

In [29]:
# Importing libraries

# Data treatment
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Path
import sys
sys.path.append('../')

# Config
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames

### Loading data

In [30]:
# Build one path per yearly file, read each CSV and collect the dataframes in a list
years = list(range(2013, 2022))  # 2013..2021 inclusive
paths = [f"../data/raw/datos-{year}.csv" for year in years]

# The raw files are semicolon-separated
data = [pd.read_csv(csv_path, sep=';') for csv_path in paths]

### Inspecting datasets

In [None]:
# Verify that every yearly dataframe has the same columns, in the same order,
# as the first one; this is the precondition for stacking them later.

cols = data[0].columns
print(cols)

for df in data:
    # Index.equals returns False on any mismatch, including a different number
    # of columns, whereas an element-wise `==` raises when the lengths differ.
    print(df.columns.equals(cols))

Renaming columns to understandable English

In [None]:
# Mapping from the original Portuguese column names to English names
new_columns = {
    'CÓDIGO ÓRGÃO SUPERIOR': 'Superior Agency Code',
    'NOME ÓRGÃO SUPERIOR': 'Superior Agency Name',
    'CÓDIGO ÓRGÃO': 'Agency Code',
    'NOME ÓRGÃO': 'Agency Name',
    'CÓDIGO UNIDADE GESTORA': 'Managing Unit Code',
    'NOME UNIDADE GESTORA': 'Managing Unit Name',
    'CATEGORIA ECONÔMICA': 'Economic Category',
    'ORIGEM RECEITA': 'Revenue Source',
    'ESPÉCIE RECEITA': 'Revenue Type',
    'DETALHAMENTO': 'Detailing',
    'VALOR PREVISTO ATUALIZADO': 'Updated Budgeted Amount',
    'VALOR LANÇADO': 'Posted Amount',
    'VALOR REALIZADO': 'Actual Amount',
    'PERCENTUAL REALIZADO': 'Realization Percentage',
    'DATA LANÇAMENTO': 'Posting Date',
    'ANO EXERCÍCIO': 'Fiscal Year'
}

# Rename the columns of every dataframe. The list is rebound to the renamed
# frames instead of running a list comprehension purely for its `inplace`
# side effect, which displayed a list of None values as the cell output.
data = [df.rename(columns=new_columns) for df in data]

# Check new column names for all dataframes
#print([df.columns for df in data])

### Year inspection

We can concatenate the datasets since they share the same structure

Before concatenating the datasets, we check the years present in both the Fiscal Year and Posting Date columns

In [None]:
# List the distinct Fiscal Year values found in each yearly dataframe
for frame in data:
    fiscal_years = frame['Fiscal Year'].unique()
    print(fiscal_years)

In [None]:
# List the distinct years found in Posting Date for each yearly dataframe

# Dates are expected as DD/MM/YYYY, so the year is the four-digit group that follows a slash
pattern = r'/(\d{4})'

for frame in data:
    # extract() returns one column per capture group; column 0 holds the year
    extracted = frame['Posting Date'].str.extract(pattern)
    posting_years = extracted[0].unique()
    print(posting_years)

In [None]:
# Count, per yearly dataframe, the rows where the year cannot be recovered from
# either column, i.e. both Posting Date and Fiscal Year are missing.
for df in data:
    # Both masks must come from the same dataframe so they align row by row;
    # taking the Fiscal Year mask from data[0] misaligns it for every other frame.
    missing_year = df['Posting Date'].isna() & df['Fiscal Year'].isna()
    # Computing the count outside the f-string avoids nesting single quotes inside
    # a single-quoted f-string, which is a SyntaxError before Python 3.12.
    print(f'Missing {missing_year.sum()} out of {df.shape[0]}')

We see some NaN values, but no dataset mixes data from different years, so we can safely set Fiscal Year to the year of the file each dataset was loaded from, which also fills the NaN values

In [36]:
# Stamp every dataframe with the year of the file it was loaded from.
# `data` and `years` are in the same order, so they are paired with zip()
# instead of tracking the position with a manual counter.
for df, year in zip(data, years):
    # Scalar assignment overwrites the whole column, which also fills the NaN values
    df['Fiscal Year'] = year

### Currency inspection

In [37]:
# TODO: currency inspection has not been implemented yet

### Join the dataframes

We can concatenate the list of dataframes to stack them

In [None]:
# Stack the yearly dataframes; pd.concat() works directly because they share the same columns

df_full = pd.concat(data, ignore_index = True)

# Preview one random row; the fixed seed makes the displayed row the same on every run
df_full.sample(random_state=42)

In [None]:
# (rows, columns) of the stacked dataframe
df_full.shape

We see that we have 1026299 entries over the 2013-2021 period

In [40]:
# Persist the stacked dataframe as CSV, leaving out the index column
output_path = "../data/output/data_full.csv"
df_full.to_csv(output_path, index=False)