![PPGI_UFRJ](https://github.com/zavaleta/Fundamentos_DS/blob/main/imagens/ppgi-ufrj.png?raw=1)
# **DRECVM**

### **Providing Data on Financial Results of Public Companies Enriched with Provenance for [OBInvest](https://obinvest.org/)**

---

**Authors:** Gilberto Gil | Saulo Almeida | Valquire Jesus | Sergio Serra | Jorge Zavaleta



## **FAIRification**

In [None]:
# Check the machine architecture, OS, Python version and the versions of all libraries used in this notebook
import platform
import os
import numpy
import pandas
import conda
import pydot
import prov
import seaborn
import matplotlib
import plotly

def verify_libs_environment_versions(details=False):
    """Check that the runtime matches the environment this notebook was built on.

    Compares the host architecture, OS, conda version, active conda
    environment, Python version and the version of every library used by the
    notebook against the pinned expected values.

    Args:
        details: When true, print every detected value, each mismatch found
            and a final summary message.

    Returns:
        True when every detected value equals its expected value; False
        otherwise, including when a value cannot be read at all.
    """
    # (label, expected value, callable that reads the detected value).
    # The reads are lazy so that a failing lookup (e.g. CONDA_DEFAULT_ENV not
    # set) is reported as a mismatch instead of raising out of this function.
    checks = [
        ('Host Machine Architecture', 'x86_64', platform.machine),
        ('Host Machine OS', 'Linux', platform.system),
        ('Conda Version', '22.9.0', lambda: conda.__version__),
        ('Conda default env', 'drecvmenv', lambda: os.environ['CONDA_DEFAULT_ENV']),
        ('Python Version', '3.9.12', platform.python_version),
        ('NumPy Lib Version', '1.21.5', lambda: numpy.__version__),
        ('Pandas Lib Version', '1.4.2', lambda: pandas.__version__),
        ('PyDot Lib Version', '1.4.2', lambda: pydot.__version__),
        ('Prov Lib Version', '2.0.0', lambda: prov.__version__),
        ('Seaborn Lib Version', '0.11.2', lambda: seaborn.__version__),
        ('Matplotlib Lib Version', '3.5.1', lambda: matplotlib.__version__),
        ('Plotly Lib Version', '5.6.0', lambda: plotly.__version__),
    ]

    mismatches = []
    for label, expected, read_detected in checks:
        try:
            detected = read_detected()
        except Exception:
            # Anything going wrong while reading a value counts as a failed
            # verification, matching the "return False on any problem" contract.
            detected = None

        if details:
            print(label + ':', detected)

        # Explicit comparison instead of `assert`, which is removed under -O.
        if detected != expected:
            mismatches.append((label, expected, detected))

    if mismatches:
        if details:
            for label, expected, detected in mismatches:
                print('Mismatch - ' + label + ':', 'expected', expected, 'but found', detected)
            print('Somethings is wrong. Verify environment and libs versions!')
        return False

    if details:
        print('All environment and libs versions are correct!')
    return True

## **Data processing**

In [None]:
#Using python 3.9
import pandas as pd
import numpy as np
import sys, subprocess, datetime

def load_csv(path, date_columns_names=None):
    """Read a ';'-separated, ISO-8859-1 encoded CSV file into a DataFrame.

    Args:
        path: Path (or file-like object) handed to ``pandas.read_csv``.
        date_columns_names: Names of the columns to parse as dates. ``None``
            (the default) parses no date columns.

    Returns:
        The loaded ``pandas.DataFrame``; the first line is used as header.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    parse_dates = [] if date_columns_names is None else date_columns_names
    return pd.read_csv(path, sep=';', header=0, encoding="ISO-8859-1", parse_dates=parse_dates)

def load_csvs_dre(template_path_file, years):
    """Load one DRE CSV per year and stack them into a single DataFrame.

    Args:
        template_path_file: Path template with one ``{0}`` placeholder that is
            filled with each year.
        years: Iterable of years to load.

    Returns:
        A DataFrame with the rows of every yearly file and a fresh 0..n-1
        index, or an empty DataFrame when ``years`` is empty.
    """
    date_columns = ('DT_REFER', 'DT_INI_EXERC', 'DT_FIM_EXERC')

    # Collect first and concatenate once: concatenating inside the loop copies
    # the accumulated data again on every iteration.
    yearly_frames = [
        load_csv(template_path_file.format(year), list(date_columns))
        for year in years
    ]

    # pd.concat raises on an empty list, so keep the "no years" result explicit.
    if not yearly_frames:
        return pd.DataFrame()

    return pd.concat(yearly_frames, ignore_index=True)
    

def load_companies_data():
    """Load the CVM public-company registry CSV into a DataFrame."""
    companies_csv_path = "data/cad-emp/cad_cia_aberta.csv"
    return load_csv(companies_csv_path)

def list_all_years():
    """Return the years covered by the DRE datasets: 2011 through 2021."""
    first_year, last_year = 2011, 2021
    return list(range(first_year, last_year + 1))

def load_all_dre_itr_years_data():
    """Load the ITR DRE CSVs of every year from list_all_years() into one DataFrame."""
    itr_path_template = 'data/dre-itr/itr_cia_aberta_DRE_con_{0}.csv'
    years = list_all_years()
    return load_csvs_dre(itr_path_template, years)

def load_all_dre_dfp_years_data():
    """Load the DFP DRE CSVs of every year from list_all_years() into one DataFrame."""
    dfp_path_template = 'data/dre-dfp/dfp_cia_aberta_DRE_con_{0}.csv'
    years = list_all_years()
    return load_csvs_dre(dfp_path_template, years)

def load_all_datasets():
    """Load the company, ITR and DFP datasets and record the load in the provenance document.

    Registers the activity ``ufrj:load_all_datasets`` in the module-level
    ``doc_prov``, links it to the notebook agent and to every source dataset
    entity, and registers the three resulting dataframe entities.

    Returns:
        Tuple ``(companyInfoDF, dreItrDF, dreDfpDF)``.

    Raises:
        KeyError: If an expected agent or dataset entity has not been
            registered in ``dict_agents`` / ``dict_entities`` beforehand.
    """
    # save execution start time
    execStartTime = datetime.datetime.now()

    companyInfoDF = load_companies_data()
    dreItrDF = load_all_dre_itr_years_data()
    dreDfpDF = load_all_dre_dfp_years_data()

    # create activity with final time execution
    execEndTime = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:load_all_datasets", execStartTime, execEndTime)
    dict_activities["act-load-ds"] = activity

    # associate activity with agent
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])

    # associate activity with the datasets that were read; the yearly keys are
    # derived from list_all_years() so the provenance always matches the years
    # that the loaders above actually read
    doc_prov.used(activity, dict_entities["ent-cademp-ds"])
    years = list_all_years()
    for key_prefix in ("ent-dreitr", "ent-dredfp"):
        for year in years:
            doc_prov.used(activity, dict_entities[f"{key_prefix}{year}-ds"])

    # generate the dataframe entities and link them to the activity
    generated = (
        ("ent-cademp-df", 'obinvest:df_cia', 'Dataframe com dados das empresas'),
        ("ent-dreitr-df", 'obinvest:df_itr', 'Dataframe com dados das DREs trimestrais'),
        ("ent-dredfp-df", 'obinvest:df_dfp', 'Dataframe com dados das DREs anuais'),
    )
    for entity_key, entity_id, label in generated:
        dict_entities[entity_key] = doc_prov.entity(
            entity_id, {'prov:generatedAtTime': str(execEndTime), 'prov:label': label})
    for entity_key, _, _ in generated:
        doc_prov.wasGeneratedBy(dict_entities[entity_key], activity)

    return companyInfoDF, dreItrDF, dreDfpDF

def sanitaze_company_data(dataframe_cia):
    """Keep only active companies and the columns needed downstream.

    Filters rows whose ``SIT`` is ``'ATIVO'``, drops the unused registry
    columns, removes duplicates on (``CNPJ_CIA``, ``CD_CVM``) keeping the last
    occurrence, and records the step in the provenance document.

    Args:
        dataframe_cia: Company registry dataframe.

    Returns:
        The sanitized company dataframe.
    """
    started_at = datetime.datetime.now()

    unused_columns = [
        'DENOM_SOCIAL', 'DENOM_COMERC', 'DT_REG', 'DT_CONST', 'DT_CANCEL', 'MOTIVO_CANCEL', 'SIT',
        'DT_INI_SIT', 'TP_MERC', 'CATEG_REG', 'DT_INI_CATEG', 'SIT_EMISSOR', 'DT_INI_SIT_EMISSOR',
        'CONTROLE_ACIONARIO', 'TP_ENDER', 'LOGRADOURO', 'COMPL', 'BAIRRO', 'MUN', 'UF', 'PAIS',
        'CEP', 'DDD_TEL', 'TEL', 'DDD_FAX', 'FAX', 'EMAIL', 'TP_RESP', 'RESP', 'DT_INI_RESP',
        'LOGRADOURO_RESP', 'COMPL_RESP', 'BAIRRO_RESP', 'MUN_RESP', 'UF_RESP', 'PAIS_RESP',
        'CEP_RESP', 'DDD_TEL_RESP', 'TEL_RESP', 'DDD_FAX_RESP', 'FAX_RESP', 'EMAIL_RESP',
        'CNPJ_AUDITOR', 'AUDITOR',
    ]

    # active companies only, then trim columns, then de-duplicate
    is_active = dataframe_cia['SIT'] == 'ATIVO'
    active_companies = dataframe_cia[is_active]
    trimmed = active_companies.drop(unused_columns, axis=1)
    sanitized = trimmed.drop_duplicates(subset=['CNPJ_CIA', 'CD_CVM'], keep='last')

    # register the activity with its start and end times
    finished_at = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:sanitaze_company_data", started_at, finished_at)
    dict_activities["act-sanatize-company-data"] = activity

    # link the activity to the notebook agent and to its input dataframe
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])
    doc_prov.used(activity, dict_entities["ent-cademp-df"])

    # register the output dataframe entity
    output_entity = doc_prov.entity(
        'obinvest:df_cia_tratado',
        {'prov:generatedAtTime': str(finished_at),
         'prov:label': 'Dataframe com dados das empresas ativas, apenas com os campos necessarios'})
    dict_entities["ent-cademp-tratado-df"] = output_entity
    doc_prov.wasGeneratedBy(output_entity, activity)

    return sanitized

def add_year(dataframe_dre):
    """Add an ``ANO`` column holding the year of each row's ``DT_REFER``.

    The dataframe is modified in place and also returned.

    Args:
        dataframe_dre: DRE dataframe whose ``DT_REFER`` column has a datetime
            dtype.

    Returns:
        The same dataframe object, now with the ``ANO`` column.
    """
    # Vectorized datetime accessor instead of a Python-level loop over rows.
    dataframe_dre['ANO'] = dataframe_dre['DT_REFER'].dt.year

    return dataframe_dre

def adding_year_field_to_dre(dataframe_dre_itr, dataframe_dre_dfp):
    """Add the year column to the ITR and DFP dataframes and record the step.

    Args:
        dataframe_dre_itr: Quarterly (ITR) DRE dataframe.
        dataframe_dre_dfp: Annual (DFP) DRE dataframe.

    Returns:
        Tuple ``(itr_with_year, dfp_with_year)`` as returned by ``add_year``.
    """
    started_at = datetime.datetime.now()

    itr_with_year = add_year(dataframe_dre_itr)
    dfp_with_year = add_year(dataframe_dre_dfp)

    # register the activity with its start and end times
    finished_at = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:adding_year_field_to_dre", started_at, finished_at)
    dict_activities["act-adicionar-ano"] = activity

    # link the activity to the notebook agent and to both input dataframes
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])
    doc_prov.used(activity, dict_entities["ent-dreitr-df"])
    doc_prov.used(activity, dict_entities["ent-dredfp-df"])

    # register both output entities, then link each one to the activity
    generated_at = str(finished_at)
    itr_entity = doc_prov.entity(
        'obinvest:df_itr_ano',
        {'prov:generatedAtTime': generated_at,
         'prov:label': 'Dataframe com dados das DREs trimestrais, adicionada a coluna ano'})
    dict_entities["ent-dreitr-ano-df"] = itr_entity
    dfp_entity = doc_prov.entity(
        'obinvest:df_dfp_ano',
        {'prov:generatedAtTime': generated_at,
         'prov:label': 'Dataframe com dados das DREs anuais, adicionada a coluna ano'})
    dict_entities["ent-dredfp-ano-df"] = dfp_entity
    doc_prov.wasGeneratedBy(itr_entity, activity)
    doc_prov.wasGeneratedBy(dfp_entity, activity)

    return itr_with_year, dfp_with_year

def merge_dre_itr_sector(dataframe_dre, dataframe_cia):
    """Merge the ITR DRE dataframe with company data and record the step.

    Args:
        dataframe_dre: Quarterly (ITR) DRE dataframe.
        dataframe_cia: Sanitized company dataframe.

    Returns:
        The merged dataframe produced by ``merge_dre_sector``.
    """
    started_at = datetime.datetime.now()

    merged = merge_dre_sector(dataframe_dre, dataframe_cia)

    # register the activity with its start and end times
    finished_at = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:merge_dre_itr_sector", started_at, finished_at)
    dict_activities["act-merge-dreitr-sector"] = activity

    # link the activity to the notebook agent and to its input dataframes
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])
    doc_prov.used(activity, dict_entities["ent-dreitr-ano-df"])
    doc_prov.used(activity, dict_entities["ent-cademp-tratado-df"])

    # register the output dataframe entity
    output_entity = doc_prov.entity(
        'obinvest:df_itr_sector',
        {'prov:generatedAtTime': str(finished_at),
         'prov:label': 'Dataframe com dados das DREs trimestrais, adicionado o campo de sector da empresa'})
    dict_entities["ent-dreitf-sector-df"] = output_entity
    doc_prov.wasGeneratedBy(output_entity, activity)

    return merged

def merge_dre_dfp_sector(dataframe_dre, dataframe_cia):
    """Merge the DFP DRE dataframe with company data and record the step.

    Args:
        dataframe_dre: Annual (DFP) DRE dataframe.
        dataframe_cia: Sanitized company dataframe.

    Returns:
        The merged dataframe produced by ``merge_dre_sector``.
    """
    started_at = datetime.datetime.now()

    merged = merge_dre_sector(dataframe_dre, dataframe_cia)

    # register the activity with its start and end times
    finished_at = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:merge_dre_dfp_sector", started_at, finished_at)
    dict_activities["act-merge-dredfp-sector"] = activity

    # link the activity to the notebook agent and to its input dataframes
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])
    doc_prov.used(activity, dict_entities["ent-dredfp-exerc-df"])
    doc_prov.used(activity, dict_entities["ent-cademp-tratado-df"])

    # register the output dataframe entity
    output_entity = doc_prov.entity(
        'obinvest:df_dfp_sector',
        {'prov:generatedAtTime': str(finished_at),
         'prov:label': 'Dataframe com dados das DREs anuais, adicionado o campo de sector da empresa'})
    dict_entities["ent-dredfp-sector-df"] = output_entity
    doc_prov.wasGeneratedBy(output_entity, activity)

    return merged

def merge_dre_sector(dataframe_dre, dataframe_cia):
    """Inner-join DRE rows with company rows and drop the ``VERSAO`` column.

    The join keys are ``CNPJ_CIA`` and ``CD_CVM``, so only DRE rows with a
    matching company row are kept. The result gets a fresh 0..n-1 index.
    """
    join_keys = ['CNPJ_CIA', 'CD_CVM']
    joined = dataframe_dre.merge(dataframe_cia, how='inner', on=join_keys)
    joined = joined.reset_index(drop=True)
    # VERSAO is not needed downstream
    return joined.drop(columns=['VERSAO'])

def filter_last_year(dre):
    """Return only the rows whose ``ORDEM_EXERC`` equals ``'ÚLTIMO'``."""
    is_last_exercise = dre['ORDEM_EXERC'].eq('ÚLTIMO')
    return dre.loc[is_last_exercise]

def filter_last_dfp_year(dre):
    """Apply ``filter_last_year`` to the DFP dataframe and record the step.

    Args:
        dre: Annual (DFP) DRE dataframe, already carrying the year column.

    Returns:
        The filtered dataframe.
    """
    started_at = datetime.datetime.now()

    last_exercise_rows = filter_last_year(dre)

    # register the activity with its start and end times
    finished_at = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:filter_last_dfp_year", started_at, finished_at)
    dict_activities["act-filtrar-exerc-dfp"] = activity

    # link the activity to the notebook agent and to its input dataframe
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])
    doc_prov.used(activity, dict_entities["ent-dredfp-ano-df"])

    # register the output dataframe entity
    output_entity = doc_prov.entity(
        'obinvest:df_dfp_ultimo_exerc',
        {'prov:generatedAtTime': str(finished_at),
         'prov:label': 'Dataframe com dados das DREs anuais apenas do ano relativo ao dataset'})
    dict_entities["ent-dredfp-exerc-df"] = output_entity
    doc_prov.wasGeneratedBy(output_entity, activity)

    return last_exercise_rows
    
def filter_quarters_123(dataframe_itr):
    """Select the non-cumulative rows of quarters 1, 2 and 3 and record the step.

    After keeping only the last-exercise rows, a row is selected when its
    ``DT_REFER`` and ``DT_INI_EXERC`` fall in the same year and one of these holds:
      * ``DT_REFER`` month is 3;
      * ``DT_REFER`` month is 6 and ``DT_INI_EXERC`` month is after 3;
      * ``DT_REFER`` month is 9 and ``DT_INI_EXERC`` month is after 6.

    Args:
        dataframe_itr: Quarterly (ITR) DRE dataframe merged with company data.

    Returns:
        The filtered dataframe.
    """
    started_at = datetime.datetime.now()

    last_exercise = filter_last_year(dataframe_itr)

    # build the quarter masks from the reference and exercise-start dates
    refer_month = last_exercise.DT_REFER.dt.month
    start_month = last_exercise.DT_INI_EXERC.dt.month
    is_quarter_1 = refer_month == 3
    is_quarter_2 = (refer_month == 6) & (start_month > 3)
    is_quarter_3 = (refer_month == 9) & (start_month > 6)
    same_year = last_exercise.DT_REFER.dt.year == last_exercise.DT_INI_EXERC.dt.year
    quarters_123 = last_exercise.loc[(is_quarter_1 | is_quarter_2 | is_quarter_3) & same_year]

    # register the activity with its start and end times
    finished_at = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:filter_quarters_123", started_at, finished_at)
    dict_activities["act-filter-quarters123"] = activity

    # link the activity to the notebook agent and to its input dataframe
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])
    doc_prov.used(activity, dict_entities["ent-dreitf-sector-df"])

    # register the output dataframe entity
    output_entity = doc_prov.entity(
        'obinvest:df_trim123',
        {'prov:generatedAtTime': str(finished_at),
         'prov:label': 'Dataframe com dados das DREs trimestrais dos trimestres 1, 2 e 3'})
    dict_entities["ent-dreitf-trim123-df"] = output_entity
    doc_prov.wasGeneratedBy(output_entity, activity)

    return quarters_123

def filter_quarter3_cumulative(dataframe_itr):
    """Select the cumulative third-quarter rows and record the step.

    A row is selected when ``DT_REFER`` is in month 9, ``DT_INI_EXERC`` is in
    month 6 or earlier, and both dates fall in the same year.

    Args:
        dataframe_itr: Quarterly (ITR) DRE dataframe merged with company data.

    Returns:
        The filtered dataframe.
    """
    started_at = datetime.datetime.now()

    # reference date in September with an exercise start no later than June
    ends_in_september = dataframe_itr.DT_REFER.dt.month == 9
    starts_by_june = dataframe_itr.DT_INI_EXERC.dt.month <= 6
    same_year = dataframe_itr.DT_REFER.dt.year == dataframe_itr.DT_INI_EXERC.dt.year
    cumulative_q3 = dataframe_itr.loc[(ends_in_september & starts_by_june) & same_year]

    # register the activity with its start and end times
    finished_at = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:filter_quarter3_cumulative", started_at, finished_at)
    dict_activities["act-obter-acm3"] = activity

    # link the activity to the notebook agent and to its input dataframe
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])
    doc_prov.used(activity, dict_entities["ent-dreitf-sector-df"])

    # register the output dataframe entity
    output_entity = doc_prov.entity(
        'obinvest:df_acm3',
        {'prov:generatedAtTime': str(finished_at),
         'prov:label': 'Dataframe com valores das DREs acumuladas dos tres primeiros trimestres'})
    dict_entities["ent-dreitf-acm3-df"] = output_entity
    doc_prov.wasGeneratedBy(output_entity, activity)

    return cumulative_q3
    
def filter_quarter_4(dataframe_dfp, dataframe_acm3):
    """Derive fourth-quarter values as annual value minus cumulative Q3 value.

    Selects the DFP rows whose ``DT_REFER`` is in December of the same year as
    ``DT_INI_EXERC``, left-joins them with the cumulative third-quarter rows
    on (``CD_CVM``, ``CD_CONTA``, ``ANO``) and stores the difference of the
    two ``VL_CONTA`` values as the new ``VL_CONTA``. Accounts without a
    cumulative Q3 row are treated as having a Q3 value of 0. The step is
    recorded in the provenance document.

    Args:
        dataframe_dfp: Annual (DFP) DRE dataframe merged with company data.
        dataframe_acm3: Cumulative third-quarter dataframe.

    Returns:
        Dataframe with the fourth-quarter rows, using the same column names as
        the quarters 1-3 dataframe so both can be concatenated.
    """
    # save execution start time
    execStartTime = datetime.datetime.now()

    # filter records whose reference month is December - df acm4
    is_december = dataframe_dfp.DT_REFER.dt.month == 12
    same_year = dataframe_dfp.DT_REFER.dt.year == dataframe_dfp.DT_INI_EXERC.dt.year
    df_acm4 = dataframe_dfp.loc[is_december & same_year]

    # left merge keeps every annual row even without a cumulative Q3 match - df trim4
    df_trim4 = pd.merge(df_acm4, dataframe_acm3, how='left', on=['CD_CVM', 'CD_CONTA', 'ANO'],
                        suffixes=('_acm4', '_acm3'))

    # missing cumulative Q3 values count as 0; assign the result back instead
    # of calling fillna(inplace=True) on the attribute-accessed column, which
    # may act on a temporary and is not guaranteed to update the frame
    df_trim4['VL_CONTA_acm3'] = df_trim4['VL_CONTA_acm3'].fillna(0)

    # fourth-quarter value = annual cumulative - cumulative up to the third quarter
    df_trim4['RESULTADO'] = df_trim4['VL_CONTA_acm4'] - df_trim4['VL_CONTA_acm3']

    # drop unnecessary columns
    df_trim4 = df_trim4.drop(['VL_CONTA_acm4','CNPJ_CIA_acm3', 'DT_REFER_acm3', 'DENOM_CIA_acm3', 'GRUPO_DFP_acm3',
                 'MOEDA_acm3', 'ESCALA_MOEDA_acm3', 'ORDEM_EXERC_acm3', 'DT_INI_EXERC_acm3', 'DT_FIM_EXERC_acm3',
                 'DS_CONTA_acm3', 'VL_CONTA_acm3', 'ST_CONTA_FIXA_acm3','SETOR_ATIV_acm3'], axis=1)

    # rename columns so the result can be concatenated with df trim123
    dtTemp = df_trim4.rename(columns = {'CNPJ_CIA_acm4':'CNPJ_CIA', 'DT_REFER_acm4':'DT_REFER',
                             'DENOM_CIA_acm4':'DENOM_CIA', 'GRUPO_DFP_acm4':'GRUPO_DFP', 'MOEDA_acm4':'MOEDA',
                             'ESCALA_MOEDA_acm4':'ESCALA_MOEDA', 'ORDEM_EXERC_acm4':'ORDEM_EXERC',
                             'DT_INI_EXERC_acm4':'DT_INI_EXERC', 'DT_FIM_EXERC_acm4':'DT_FIM_EXERC',
                             'DS_CONTA_acm4':'DS_CONTA', 'ST_CONTA_FIXA_acm4':'ST_CONTA_FIXA',
                             'SETOR_ATIV_acm4':'SETOR_ATIV', 'RESULTADO':'VL_CONTA'})

    # create activity with final time execution
    execEndTime = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:filter_quarter_4", execStartTime, execEndTime)
    dict_activities["act-obter-trim4"] = activity

    # associate THIS activity with the agent (the original associated the
    # unrelated "act-merge-dredfp-sector" activity a second time)
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])

    # associate activity with read datasets
    doc_prov.used(activity, dict_entities["ent-dredfp-sector-df"])
    doc_prov.used(activity, dict_entities["ent-dreitf-acm3-df"])

    # generate dataframe entity and associate with the activity
    dict_entities["ent-dreirt-trim4-df"] = doc_prov.entity('obinvest:df_trim4', {'prov:generatedAtTime': str(execEndTime), 'prov:label': 'Dataframe com dados das DREs do 4 trimestre, subtraindo DFP anual pelo acumulado dos tres primeiros'})
    doc_prov.wasGeneratedBy(dict_entities["ent-dreirt-trim4-df"], activity)

    return dtTemp

def concatenate_quarters(df_trim123, df_trim4):
    """Stack the quarters 1-3 rows and the quarter 4 rows and record the step.

    Args:
        df_trim123: Dataframe with quarters 1, 2 and 3.
        df_trim4: Dataframe with quarter 4.

    Returns:
        The concatenated dataframe (original row indexes are kept).
    """
    started_at = datetime.datetime.now()

    quarter_frames = [df_trim123, df_trim4]
    all_quarters = pd.concat(quarter_frames)

    # register the activity with its start and end times
    finished_at = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:concatenate_quarters", started_at, finished_at)
    dict_activities["act-concact-trim"] = activity

    # link the activity to the notebook agent and to its input dataframes
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])
    doc_prov.used(activity, dict_entities["ent-dreirt-trim4-df"])
    doc_prov.used(activity, dict_entities["ent-dreitf-trim123-df"])

    # register the output dataframe entity
    output_entity = doc_prov.entity(
        'obinvest:df_trim1234',
        {'prov:generatedAtTime': str(finished_at),
         'prov:label': 'Dataframe com dados das DREs de todos os trimestres'})
    dict_entities["ent-dreirt-trim1234-df"] = output_entity
    doc_prov.wasGeneratedBy(output_entity, activity)

    return all_quarters

def remove_duplicate_records(df_obinvest_trim1234):
    """Drop duplicated account rows, keeping the last occurrence, and record the step.

    Rows are duplicates when they share ``CNPJ_CIA``, ``CD_CVM``,
    ``CD_CONTA``, ``DT_INI_EXERC``, ``DT_FIM_EXERC`` and ``ANO``.

    Args:
        df_obinvest_trim1234: Dataframe with all four quarters.

    Returns:
        The de-duplicated dataframe.
    """
    # save execution start time
    execStartTime = datetime.datetime.now()

    duplicate_keys = ['CNPJ_CIA', 'CD_CVM', 'CD_CONTA', 'DT_INI_EXERC', 'DT_FIM_EXERC', 'ANO']
    dfTemp = df_obinvest_trim1234.drop_duplicates(subset=duplicate_keys, keep='last')

    # create activity with final time execution
    execEndTime = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:remove_duplicate_records", execStartTime, execEndTime)
    dict_activities["act-remove-dup"] = activity

    # associate activity with agent
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])

    # associate activity with read datasets
    doc_prov.used(activity, dict_entities["ent-dreirt-trim1234-df"])

    # generate dataframe entity and associate with the activity; the label now
    # describes this de-duplicated dataframe (the original reused the label of
    # the cumulative third-quarter entity)
    dict_entities["ent-dreitf-tratado-df"] = doc_prov.entity(
        'obinvest:df_obinvest_tratado',
        {'prov:generatedAtTime': str(execEndTime),
         'prov:label': 'Dataframe com dados das DREs de todos os trimestres, sem registros duplicados'})
    doc_prov.wasGeneratedBy(dict_entities["ent-dreitf-tratado-df"], activity)

    return dfTemp

def adding_quarter_field(dataframe_dre):
    """Add a ``TRIMESTRE`` column derived from the month of ``DT_REFER`` and record the step.

    Months 3, 6 and 9 map to quarters 1, 2 and 3; every other value maps to
    quarter 4. The input dataframe is not modified.

    Args:
        dataframe_dre: DRE dataframe whose ``DT_REFER`` column has a datetime
            dtype.

    Returns:
        A new dataframe with the ``TRIMESTRE`` column and a fresh 0..n-1 index.
    """
    # save execution start time
    execStartTime = datetime.datetime.now()

    # months without an entry (including missing dates) fall back to quarter 4
    quarter_by_month = {3: 1, 6: 2, 9: 3}
    trimestre = (dataframe_dre['DT_REFER'].dt.month
                 .map(quarter_by_month)
                 .fillna(4)
                 .astype('int64'))

    # to_numpy() assigns by position, so duplicated index labels left over
    # from the concatenation cannot interfere with the assignment
    dataframe_dre = dataframe_dre.assign(TRIMESTRE=trimestre.to_numpy())

    # reset_index returns a new frame; the original discarded that result,
    # which made the call a no-op
    dataframe_dre = dataframe_dre.reset_index(drop=True)

    # create activity with final time execution
    execEndTime = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:adding_quarter_field", execStartTime, execEndTime)
    dict_activities["act-adicionar-trimestre"] = activity

    # associate activity with agent
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])

    # associate activity with read datasets
    doc_prov.used(activity, dict_entities["ent-dreitf-tratado-df"])

    # generate dataframe entity and associate with the activity
    dict_entities["ent-dreitr-trimestre-df"] = doc_prov.entity('obinvest:df_obinvest_campo_trimestre', {'prov:generatedAtTime': str(execEndTime), 'prov:label': 'Dataframe com dados das DREs trimestrais, adicionada a coluna trimestre'})
    doc_prov.wasGeneratedBy(dict_entities["ent-dreitr-trimestre-df"], activity)

    return dataframe_dre


def standardization_account_value(dataframe_dre):
    """Express every ``VL_CONTA`` in thousands and record the step.

    Rows whose ``ESCALA_MOEDA`` is ``'UNIDADE'`` have ``VL_CONTA`` divided by
    1000 and their ``ESCALA_MOEDA`` replaced with ``'MIL'``; other rows are
    left unchanged. The dataframe is modified in place and also returned.

    Args:
        dataframe_dre: DRE dataframe with numeric ``VL_CONTA`` and an
            ``ESCALA_MOEDA`` column.

    Returns:
        The same dataframe object with standardized values.
    """
    # save execution start time
    execStartTime = datetime.datetime.now()

    # Vectorized replacement for the row-wise apply: faster, and it also works
    # on an empty dataframe. Plain arrays are used so the assignment is by
    # position and does not depend on index labels being unique.
    is_unit_scale = (dataframe_dre['ESCALA_MOEDA'] == 'UNIDADE').to_numpy()
    account_values = dataframe_dre['VL_CONTA'].to_numpy()
    dataframe_dre['VL_CONTA'] = np.where(is_unit_scale, account_values / 1000, account_values)

    # the values are now in thousands, so relabel the scale accordingly
    dataframe_dre['ESCALA_MOEDA'] = dataframe_dre.ESCALA_MOEDA.replace('UNIDADE', 'MIL')

    # create activity with final time execution
    execEndTime = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:standardization_account_value", execStartTime, execEndTime)
    dict_activities["act-padr-valor"] = activity

    # associate activity with agent
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])

    # associate activity with read datasets
    doc_prov.used(activity, dict_entities["ent-dreitr-trimestre-df"])

    # generate dataframe entity and associate with the activity
    dict_entities["ent-obinvest-df"] = doc_prov.entity('obinvest:df_obinvest', {'prov:generatedAtTime': str(execEndTime), 'prov:label': 'Dataframe final com dados das DREs trimestrais'})
    doc_prov.wasGeneratedBy(dict_entities["ent-obinvest-df"], activity)

    return dataframe_dre

def create_dataset_obinvest(df_obinvest):
    """Write the final dataframe to ``obinvest-dre-historico.csv`` and record the step.

    The file is written with ';' as separator, ISO-8859-1 encoding and no
    index column.

    Args:
        df_obinvest: Final OBInvest dataframe.
    """
    # save execution start time
    execStartTime = datetime.datetime.now()

    # The encoding name previously carried a stray leading double quote
    # ('"ISO-8859-1'); use the proper codec name.
    df_obinvest.to_csv('obinvest-dre-historico.csv', sep=';', encoding='ISO-8859-1', index=False)

    # create activity with final time execution
    execEndTime = datetime.datetime.now()
    activity = doc_prov.activity("ufrj:create_dataset_obinvest", execStartTime, execEndTime)
    dict_activities["act-cria-ds"] = activity

    # associate activity with agent
    doc_prov.wasAssociatedWith(activity, dict_agents["ag-drecvm-ipynb"])

    # associate activity with read datasets
    doc_prov.used(activity, dict_entities["ent-obinvest-df"])

    # generate the dataset entity and associate with the activity
    dict_entities["ent-obinvest-ds"] = doc_prov.entity('obinvest:ds_obinvest', {'prov:generatedAtTime': str(execEndTime), 'prov:label': 'Dataset final com historico de dados das DREs trimestrais', 'prov:type': 'void:Dataset'})
    doc_prov.wasGeneratedBy(dict_entities["ent-obinvest-ds"], activity)


# Process the data and create the OBInvest dataset
def create_dataframe_obinvest():
    """Run the whole processing pipeline and return the final OBInvest dataframe.

    The steps are: load the datasets, add the year column, sanitize the
    company data, merge ITR with company data, extract quarters 1-3 and the
    cumulative third quarter, filter and merge the DFP data, derive quarter 4,
    concatenate all quarters, remove duplicates, add the quarter column,
    standardize the account values and finally write the dataset to disk via
    ``create_dataset_obinvest``.

    NOTE: the call order matters beyond the data flow. Each step looks up, in
    ``dict_entities``, provenance entities registered by earlier steps (e.g.
    ``merge_dre_dfp_sector`` reads "ent-dredfp-exerc-df", which is created by
    ``filter_last_dfp_year``), so reordering calls can raise ``KeyError``.

    Returns:
        The final dataframe, as returned by ``standardization_account_value``.
    """
    
    # load the company, ITR and DFP datasets
    df_cia, df_itr, df_dfp = load_all_datasets()

    # add the year column to both DRE dataframes
    df_itr_ano, df_dfp_ano = adding_year_field_to_dre(df_itr, df_dfp)
    
    # sanitize company data
    df_cia_tratado = sanitaze_company_data(df_cia)
    
    # merge company info into the ITR dataframe - dfs cad_cia and itr
    df_itr_sector = merge_dre_itr_sector(df_itr_ano, df_cia_tratado)

    # filter quarters 1, 2 and 3
    df_trim123 = filter_quarters_123(df_itr_sector)

    # filter the cumulative third quarter
    df_acm3 = filter_quarter3_cumulative(df_itr_sector)
    
    # slice the DFP dataframe:
    # keep only the last-exercise rows - df dfp
    df_dfp_ultimo_exerc = filter_last_dfp_year(df_dfp_ano)
    
    # merge company info into the DFP dataframe - dfs cad_cia and dfp
    df_dfp_sector = merge_dre_dfp_sector(df_dfp_ultimo_exerc, df_cia_tratado)
    
    # derive quarter 4 from the annual values and the cumulative third quarter
    df_trim4 = filter_quarter_4(df_dfp_sector, df_acm3)
    
    # create the OBInvest dataframe by concatenating quarters 1-3 and quarter 4
    df_obinvest_trim1234 = concatenate_quarters(df_trim123, df_trim4)
    
    # remove duplicated records
    df_obinvest_tratado = remove_duplicate_records(df_obinvest_trim1234)
    
    # add the quarter column
    df_obinvest_campo_trimestre = adding_quarter_field(df_obinvest_tratado)
    
    # standardize account values
    df_obinvest = standardization_account_value(df_obinvest_campo_trimestre)
    
    # write the OBInvest dataset to disk
    create_dataset_obinvest(df_obinvest)
    
    return df_obinvest

## **Provenance**

In [None]:
import sys, subprocess, datetime
from prov.model import ProvDocument, Namespace
from prov.dot import prov_to_dot
from IPython.display import Image

def generate_provenance_outputs():
    """Export the provenance document as a PNG graph, an XML file and a Turtle file.

    All three files share the base name ``DRE-CVM-PROV`` and are produced
    from the module-level ``doc_prov``.
    """
    base_name = "DRE-CVM-PROV"

    # provenance graph rendered as an image
    provenance_graph = prov_to_dot(doc_prov)
    provenance_graph.write_png(base_name + ".png")

    # XML serialization
    xml_path = base_name + ".xml"
    doc_prov.serialize(xml_path, format='xml')

    # Turtle (RDF) serialization
    turtle_path = base_name + ".ttl"
    doc_prov.serialize(turtle_path, format='rdf', rdf_format='ttl')

def adding_namespaces(document_prov):
    """Register the namespace prefixes used by this notebook's provenance records.

    Args:
        document_prov: Provenance document exposing ``add_namespace(prefix, uri)``.

    Returns:
        The same ``document_prov`` object, after the namespaces were added.
    """
    # (prefix, URI) pairs, registered in this order
    namespaces = (
        ('foaf', 'http://xmlns.com/foaf/0.1/'),
        ('prov', 'http://www.w3.org/ns/prov#'),
        ('void', 'http://vocab.deri.ie/void#'),
        ('ufrj', 'https://www.ufrj.br'),
        ('cvm', 'https://www.gov.br/cvm/pt-br'),
        ('cvm-cademp', 'https://dados.cvm.gov.br/dados/CIA_ABERTA/CAD/DADOS/'),
        ('cvm-dre-itr', 'https://dados.cvm.gov.br/dados/CIA_ABERTA/DOC/ITR/DADOS/'),
        ('cvm-dre-dfp', 'https://dados.cvm.gov.br/dados/CIA_ABERTA/DOC/DFP/DADOS/'),
        ('obinvest', 'https://www.obinvest.org'),
    )
    for prefix, uri in namespaces:
        document_prov.add_namespace(prefix, uri)
    return document_prov

def create_agents(document_prov):
    """Declare the provenance agents and return them keyed by a short alias.

    Args:
        document_prov: Document whose ``agent(identifier, attributes)``
            method is called once per agent, in the order listed below.

    Returns:
        dict: Alias (e.g. ``"ag-cvm"``) mapped to whatever
        ``document_prov.agent`` returned for that agent.
    """
    organization = "prov:Organization"
    person = "prov:Person"

    # (dictionary alias, qualified identifier, attributes), in declaration order
    agent_specs = (
        ("ag-cvm", "cvm:CVM",
         {"prov:type": organization, "foaf:name": "Comissão de Valores Mobiliários"}),
        ("ag-ufrj", "ufrj:UFRJ",
         {"prov:type": organization, "foaf:name": "Universidade Federal do Rio de Janeiro"}),
        ("ag-ppgi", "ufrj:PPGI",
         {"prov:type": organization, "foaf:name": "Programa de Pós Graduação em Informática"}),
        ("ag-greco", "ufrj:GRECO",
         {"prov:type": organization, "foaf:name": "Greco Group"}),
        ("ag-author-gil", "ufrj:Gil",
         {"prov:type": person, "foaf:name": "Gilberto Gil Fidelis Gomes Passos",
          "foaf:mbox": "gilberto.passos@cefet-rj.br"}),
        ("ag-author-saulo", "ufrj:Saulo",
         {"prov:type": person, "foaf:name": "Saulo Andrade Almeida",
          "foaf:mbox": "sauloandrade@gmail.com"}),
        ("ag-author-valquire", "ufrj:Valquire",
         {"prov:type": person, "foaf:name": "Valquire da Silva de Jesus",
          "foaf:mbox": "valquire@ufrj.br"}),
        ("ag-author-sergio", "ufrj:Sergio",
         {"prov:type": person, "foaf:name": "Sergio Serra",
          "foaf:mbox": "serra@ppgi.ufrj.br"}),
        ("ag-author-jorge", "ufrj:Jorge",
         {"prov:type": person, "foaf:name": "Jorge Zavaleta",
          "foaf:mbox": "zavaleta@pet-si.ufrrj.br"}),
        ("ag-drecvm-ipynb", "ufrj:drecvm.ipynb",
         {"prov:type": "prov:SoftwareAgent", "foaf:name": "drecvm.ipynb",
          "prov:label": "Notebook Python utilizado no trabalho"}),
    )

    return {
        alias: document_prov.agent(identifier, attributes)
        for alias, identifier, attributes in agent_specs
    }

def associate_ufrj_agents(agents_dictionary):
    """Record which agent acted on behalf of which other agent.

    PPGI is linked to UFRJ, GRECO to PPGI, and the notebook agent plus every
    author to GRECO. Each delegation is recorded exactly once; the previous
    version recorded the notebook -> GRECO delegation twice.

    Args:
        agents_dictionary: Mapping of agent alias to agent object, as built
            by ``create_agents``. Each delegate must expose
            ``actedOnBehalfOf``.

    Returns:
        The same ``agents_dictionary`` that was passed in.
    """
    # (delegate alias, responsible alias)
    delegations = (
        ("ag-ppgi", "ag-ufrj"),
        ("ag-greco", "ag-ppgi"),
        ("ag-drecvm-ipynb", "ag-greco"),
        ("ag-author-gil", "ag-greco"),
        ("ag-author-saulo", "ag-greco"),
        ("ag-author-valquire", "ag-greco"),
        ("ag-author-sergio", "ag-greco"),
        ("ag-author-jorge", "ag-greco"),
    )
    for delegate, responsible in delegations:
        agents_dictionary[delegate].actedOnBehalfOf(agents_dictionary[responsible])
    return agents_dictionary

def create_initial_activities(document_prov):
    """Declare the three initial activities and return them keyed by alias.

    Args:
        document_prov: Document whose ``activity`` method is called once per
            activity.

    Returns:
        dict: Aliases ``"act-create-ds"``, ``"act-create-ds-obinvest"`` and
        ``"act-save-datasets"`` mapped to the values returned by
        ``document_prov.activity``.
    """
    # 15 September 2022, passed as the second argument of the
    # "save-datasets" activity (the third argument is left as None).
    download_date = datetime.datetime(2022, 9, 15)

    return {
        "act-create-ds": document_prov.activity("cvm:create-dataset"),
        "act-create-ds-obinvest": document_prov.activity("ufrj:create-ds-obinvest"),
        "act-save-datasets": document_prov.activity("ufrj:save-datasets", download_date, None),
    }
    
def create_initial_entities(document_prov):
    """Declare the initial provenance entities and return them keyed by alias.

    The entities are, in declaration order: the company registration CSV,
    then for each report kind (ITR, then DFP) one concept entity followed by
    a ZIP entity and a CSV entity per year from 2011 to 2021, and finally
    the OBInvest GitHub repository entity.

    Compared with the previous hand-written list, the DFP concept entity is
    now labelled "Anual" instead of the copy-pasted "Trimestral", and the
    per-year CSV labels use one consistent wording (a few years carried the
    typo ",  do anos de").

    Args:
        document_prov: Document whose ``entity(identifier, attributes)``
            method is called once per entity.

    Returns:
        dict: Alias (e.g. ``"ent-dreitr2011-zip"``) mapped to the value
        returned by ``document_prov.entity``.
    """
    years = range(2011, 2022)  # 2011..2021 inclusive
    document_type = 'foaf:Document'
    dataset_type = 'void:Dataset'

    def declare(identifier, label, prov_type):
        # Every entity carries exactly a label and a type.
        return document_prov.entity(identifier, {'prov:label': label, 'prov:type': prov_type})

    dents = {}
    dents["ent-cademp-ds"] = declare(
        'cvm-cademp:cad_cia_aberta.csv',
        'Dataset com dados cadastrais das empresas listadas na CVM',
        dataset_type,
    )

    # (report kind, periodicity used in the concept label,
    #  ZIP label template, CSV label template)
    report_kinds = (
        ('itr', 'Trimestral',
         'ZIP com Dataset com DRE trimestrais, do ano de {year}',
         'Dataset com dados de DRE trimestrais, do ano de {year}'),
        ('dfp', 'Anual',
         'ZIP com Dataset com DRE anual de {year}',
         'Dataset com dados de DRE anual de {year}'),
    )
    for kind, periodicity, zip_label, csv_label in report_kinds:
        dents[f"ent-dre{kind}"] = declare(
            f'cvm:dre-{kind}',
            f'Documento que representa o conceito de DREs do tipo {periodicity}',
            document_type,
        )
        for year in years:
            dents[f"ent-dre{kind}{year}-zip"] = declare(
                f'cvm-dre-{kind}:{kind}_cia_aberta_{year}.zip',
                zip_label.format(year=year),
                document_type,
            )
            dents[f"ent-dre{kind}{year}-ds"] = declare(
                f'cvm-dre-{kind}:{kind}_cia_aberta_DRE_con_{year}.csv',
                csv_label.format(year=year),
                dataset_type,
            )

    dents["ent-git-obinvest"] = declare(
        'obinvest:github-drecvm',
        'Repositorio DRECVM da OBInvest',
        'prov:Collection',
    )

    return dents
    
def initial_association_agents_activities_entities(document_prov, dictionary_agents,
                                                   dictionary_activities, dictionary_entities):
    """Record the initial relations between agents, activities and entities.

    Relations are added to ``document_prov`` in this order:

    1. the CVM dataset-creation activity is associated with the CVM agent
       and generates the registration CSV and the two DRE concept entities;
    2. every yearly ZIP (ITR 2011-2021, then DFP 2011-2021) is derived from
       its concept entity;
    3. every yearly CSV is derived from its ZIP;
    4. the OBInvest dataset activity is associated with the GRECO and
       notebook agents;
    5. the save-datasets activity is associated with GRECO and uses the
       registration CSV plus every yearly CSV;
    6. the GitHub repository entity is generated by the save-datasets
       activity;
    7. the registration CSV and every yearly CSV are derived from the
       GitHub repository entity.

    Args:
        document_prov: Document receiving the relations.
        dictionary_agents: Alias -> agent, as built by ``create_agents``.
        dictionary_activities: Alias -> activity, as built by
            ``create_initial_activities``.
        dictionary_entities: Alias -> entity, as built by
            ``create_initial_entities``.
    """
    # All (report kind, year) pairs: ITR 2011..2021 first, then DFP 2011..2021
    kind_year_pairs = [(kind, year)
                       for kind in ("dreitr", "dredfp")
                       for year in range(2011, 2022)]

    # Aliases of every CSV handled by the save-datasets activity
    stored_csv_aliases = ["ent-cademp-ds"]
    stored_csv_aliases += [f"ent-{kind}{year}-ds" for kind, year in kind_year_pairs]

    # CVM creates the source datasets
    document_prov.wasAssociatedWith(dictionary_activities["act-create-ds"],
                                    dictionary_agents["ag-cvm"])
    for alias in ("ent-cademp-ds", "ent-dreitr", "ent-dredfp"):
        document_prov.wasGeneratedBy(dictionary_entities[alias],
                                     dictionary_activities["act-create-ds"])

    # Yearly ZIPs derive from the generic ITR / DFP concept entities
    for kind, year in kind_year_pairs:
        document_prov.wasDerivedFrom(dictionary_entities[f"ent-{kind}{year}-zip"],
                                     dictionary_entities[f"ent-{kind}"])

    # Yearly CSVs derive from their ZIPs
    for kind, year in kind_year_pairs:
        document_prov.wasDerivedFrom(dictionary_entities[f"ent-{kind}{year}-ds"],
                                     dictionary_entities[f"ent-{kind}{year}-zip"])

    # OBInvest dataset activity: GRECO group first, then the notebook agent
    for agent_alias in ("ag-greco", "ag-drecvm-ipynb"):
        document_prov.wasAssociatedWith(dictionary_activities["act-create-ds-obinvest"],
                                        dictionary_agents[agent_alias])

    # Dataset storing activity, run by GRECO over every CSV
    document_prov.wasAssociatedWith(dictionary_activities["act-save-datasets"],
                                    dictionary_agents["ag-greco"])
    for alias in stored_csv_aliases:
        document_prov.used(dictionary_activities["act-save-datasets"],
                           dictionary_entities[alias])

    # The OBInvest GitHub repository results from the storing activity
    document_prov.wasGeneratedBy(dictionary_entities["ent-git-obinvest"],
                                 dictionary_activities["act-save-datasets"])

    # Detailing the CSVs held in GitHub
    for alias in stored_csv_aliases:
        document_prov.wasDerivedFrom(dictionary_entities[alias],
                                     dictionary_entities["ent-git-obinvest"])
    
def initProvenance():
    """Build the initial provenance document together with its lookup tables.

    Creates an empty ``ProvDocument``, registers the namespaces, declares
    agents (and their delegation hierarchy), activities and entities, then
    records the initial relations among them.

    Returns:
        tuple: ``(document, agents, activities, entities)`` where the last
        three are the dictionaries returned by the respective ``create_*``
        helpers.
    """
    # Empty document with every namespace prefix registered
    document = adding_namespaces(ProvDocument())

    # Agents, followed by their hierarchy
    agents = associate_ufrj_agents(create_agents(document))

    # Initial activities and entities
    activities = create_initial_activities(document)
    entities = create_initial_entities(document)

    # Initial provenance relations
    initial_association_agents_activities_entities(document, agents, activities, entities)

    return document, agents, activities, entities

## **Example of Data Analytics**

In [None]:
import plotly.express as px

def sector_margin_analytic_example(dataFrameObinvest, sectors=None):
    """Build a line chart of the per-sector margin over time.

    For every (``SETOR_ATIV``, ``DT_REFER``) group the margin is
    ``sum(VL_CONTA of account 3.09) / sum(VL_CONTA of account 3.01) * 100``,
    or 0 when the 3.01 total is 0.

    The 3.09 totals are matched to the 3.01 totals by their
    (sector, reference date) index. The previous version copied them
    positionally, which raised a length error or silently paired the wrong
    groups whenever the two accounts did not cover exactly the same groups.
    Groups that have a 3.01 total but no 3.09 total now get a NaN margin.

    Args:
        dataFrameObinvest: DataFrame with at least the columns ``CD_CONTA``,
            ``SETOR_ATIV``, ``DT_REFER`` and ``VL_CONTA``.
        sectors: Optional iterable of sector names to plot. Defaults to the
            five sectors used by the original example.

    Returns:
        The figure returned by ``px.line``.
    """
    if sectors is None:
        sectors = (
            "Energia Elétrica",
            "Bancos",
            "Serviços Transporte e Logística",
            "Comércio (Atacado e Varejo)",
            "Educação",
        )

    def _total_by_sector_and_date(account_code):
        # Sum of VL_CONTA for one account, grouped by sector and reference date
        rows = dataFrameObinvest[dataFrameObinvest['CD_CONTA'] == account_code]
        return rows.groupby(['SETOR_ATIV', 'DT_REFER'])['VL_CONTA'].sum()

    df_margem_setor = _total_by_sector_and_date('3.01').to_frame('SOMA_CONTA_3_01')
    # Assigning the Series itself (not its .values) aligns on the group index.
    df_margem_setor['SOMA_CONTA_3_09'] = _total_by_sector_and_date('3.09')

    total_301 = df_margem_setor['SOMA_CONTA_3_01']
    margin = df_margem_setor['SOMA_CONTA_3_09'] / total_301 * 100
    # A zero 3.01 total yields a margin of 0 rather than inf/NaN.
    df_margem_setor['MARGEM_SETOR'] = margin.where(total_301 != 0, 0)

    # Expose both index levels as regular columns for filtering and plotting
    df_margem_setor['DT_REFER'] = df_margem_setor.index.get_level_values('DT_REFER')
    df_margem_setor['SETOR'] = df_margem_setor.index.get_level_values('SETOR_ATIV')

    df_margem_setor = df_margem_setor[df_margem_setor['SETOR'].isin(list(sectors))]

    fig = px.line(df_margem_setor, x="DT_REFER", y="MARGEM_SETOR",
                  title='Margem Setor', color='SETOR', markers=True, symbol="SETOR")
    return fig

## **Main Execution**

In [None]:
def executeNotebook():
    """Run the notebook pipeline and return the sector-margin figure.

    Builds the OBInvest DataFrame, creates the example figure from it and
    then writes the provenance outputs.

    Returns:
        The figure returned by ``sector_margin_analytic_example``.
    """
    sector_margin_figure = sector_margin_analytic_example(create_dataframe_obinvest())
    # Provenance files are written only after the figure has been built.
    generate_provenance_outputs()
    return sector_margin_figure

In [None]:
# The provenance objects are declared in the global scope so that they do not
# have to be passed as parameters to every function that uses them.
doc_prov, dict_agents , dict_activities, dict_entities = initProvenance()

def main():
    """Run the notebook pipeline only when the environment check passes.

    Returns:
        The value returned by ``executeNotebook()``, or ``None`` (after
        printing a notice) when ``verify_libs_environment_versions()``
        returns a falsy value.
    """
    if not verify_libs_environment_versions():
        print('Notebook was not executed. Verify all environment and libs versions!')
        return None
    return executeNotebook()

# Run the pipeline; main() returns None when the environment check fails.
plotSectorMargin = main()

In [None]:
# Show the figure returned by main().
plotSectorMargin.show()

In [None]:
# Display the provenance graph; "DRE-CVM-PROV.png" is the PNG file name that
# generate_provenance_outputs() writes.
Image("DRE-CVM-PROV.png")