In [21]:
from datetime import datetime
import pandas as pd
import cloudscraper
from bs4 import BeautifulSoup
import base64
import os

In [22]:
# Module-level HTTP client, created once; get_structured_reports and
# extract_links_from_select both issue their GET requests through it.
scraper = cloudscraper.create_scraper()

In [23]:
def get_company_info(company_name):
    """Return the ``codeCVM`` of the first ``companies.csv`` row for a company.

    The CSV is read from the current working directory on every call.

    Args:
        company_name: Value compared for equality against the
            ``issuingCompany`` column.

    Returns:
        The ``codeCVM`` cell of the first matching row.

    Raises:
        ValueError: If no row has ``issuingCompany`` equal to ``company_name``.
    """
    companies = pd.read_csv('companies.csv')
    is_requested = companies["issuingCompany"] == company_name
    matches = companies[is_requested]
    if not matches.empty:
        # Several rows may match; only the first one is used.
        return matches.iloc[0]['codeCVM']
    raise ValueError(f"Empresa '{company_name}' não encontrada.")

In [24]:
# Endpoint prefix for the structured-reports listing; get_structured_reports
# appends a base64-encoded JSON payload directly after the trailing slash.
STRUCTURED_REPORTS_URL = "https://sistemaswebb3-listados.b3.com.br/listedCompaniesProxy/CompanyCall/GetListStructuredReports/"
# Prefix that extract_links_from_select joins with each <option> value to
# build the link of an individual statement page.
CVM_BASE_URL = "https://www.rad.cvm.gov.br/ENET/"

In [25]:
def get_structured_reports(code_cvm, year):
    """Request the structured-reports listing for one company and year.

    A compact JSON payload is base64-encoded and appended to
    ``STRUCTURED_REPORTS_URL``; the request goes through the shared ``scraper``.

    Args:
        code_cvm: Company code interpolated unquoted into the payload.
        year: Year interpolated unquoted into the payload.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the response status code is not 200.
    """
    # The payload is hand-built so the exact bytes (no spaces) are preserved.
    payload = f'{{"codeCVM":{code_cvm},"language":"pt-br","status":true,"year":{year}}}'
    token = base64.b64encode(payload.encode()).decode()
    response = scraper.get(f"{STRUCTURED_REPORTS_URL}{token}")
    if response.status_code == 200:
        return response.json()
    raise Exception(f"Erro na requisição. Status code: {response.status_code}")

In [26]:
def extract_url_search(data_json):
    """Return the ``urlSearch`` of the first ``dfp`` entry, or ``None``.

    Args:
        data_json: Mapping expected to hold a non-empty ``"dfp"`` list whose
            first element has a ``"urlSearch"`` key.

    Returns:
        The ``urlSearch`` value, or ``None`` (after printing the error) when
        any lookup fails.
    """
    try:
        dfp_reports = data_json["dfp"]
        first_report = dfp_reports[0]
        return first_report["urlSearch"]
    except Exception as e:
        # Best effort: report the problem and let the caller handle None.
        print(f"Erro ao extrair URL: {e}")
    return None

In [27]:
def extract_links_from_select(url_report):
    """Download every statement page listed in a report's ``cmbQuadro`` select.

    The report page is fetched through the shared ``scraper``. Each
    ``<option>`` of the ``<select id="cmbQuadro">`` element is turned into a
    link made of ``CVM_BASE_URL``, the option value (spaces replaced by
    ``%20``) and the text that follows ``Versao=`` in the first single-quoted
    string after ``location=`` in the page's last ``<script>``.

    Args:
        url_report: URL of the report page.

    Returns:
        A dict mapping each option's text to the base64-encoded (UTF-8 text)
        body of its page, with ``None`` for links whose download failed.
        Returns ``None`` (after printing the error) if the page itself cannot
        be fetched or parsed.
    """
    try:
        page = scraper.get(url_report)
        soup = BeautifulSoup(page.content, "html.parser")
        statement_options = soup.find("select", {"id": "cmbQuadro"}).find_all("option")

        # The suffix shared by every link is taken from the last <script>.
        last_script = soup.find_all("script")[-1]
        after_location = last_script.string.split("location=")[1]
        quoted_target = after_location.split("'")[1]
        link_id = quoted_target.split("Versao=")[1]

        # Build every link before downloading anything, so a malformed option
        # aborts the whole extraction without issuing partial requests.
        links = {}
        for option in statement_options:
            path = option['value'].replace(' ', '%20')
            links[option.text] = f"{CVM_BASE_URL}{path}{link_id}"

        responses = {}
        for title, link in links.items():
            try:
                body = scraper.get(link).content
                responses[title] = base64.b64encode(body).decode('utf-8')
            except Exception as e:
                # One failing statement must not discard the others.
                print(f"Erro ao extrair link: {e}")
                responses[title] = None

        return responses
    except Exception as e:
        print(f"Erro ao extrair links: {e}")
        return None

In [28]:
def get_table_data(content):
    """Parse the ``ctl00_cphPopUp_tbDados`` table of a base64-encoded HTML page.

    Args:
        content: Base64-encoded, UTF-8 HTML text.

    Returns:
        A DataFrame whose column labels are the stripped ``<td>`` texts of the
        first ``<tr>`` and whose rows are the stripped ``<td>`` texts of the
        remaining ``<tr>`` elements.

    Raises:
        Exception: If the page has no table with that id.
    """
    html = base64.b64decode(content).decode('utf-8')
    table = BeautifulSoup(html, "html.parser").find("table", {"id": "ctl00_cphPopUp_tbDados"})

    if not table:
        raise Exception("Tabela não encontrada no link fornecido.")

    rows = []
    for tr in table.find_all("tr"):
        rows.append([td.text.strip() for td in tr.find_all("td")])

    # First row is the header, everything after it is data.
    body = rows[1:]
    header = rows[0]
    return pd.DataFrame(body, columns=header)

In [29]:
def reorganize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Index a statement table by description and convert its values to numbers.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Table with a ``'Descrição'`` column (becomes the index), a
            ``'Conta'`` column (dropped) and value columns holding text that
            uses ``.`` as thousands separator and ``,`` as decimal separator.

    Returns:
        A new DataFrame indexed by ``'Descrição'``. Every text column is
        converted to numeric, with unparseable or empty cells set to 0.
        Repeated index labels get their occurrence number appended from the
        second occurrence on (``'Caixa'``, ``'Caixa1'``, ``'Caixa2'``, ...).

    Raises:
        KeyError: If ``'Descrição'`` or ``'Conta'`` is missing.
    """
    # set_index/drop both return new frames, so the caller's df is not mutated.
    df = df.set_index('Descrição').drop(columns=['Conta'])

    for column in df.columns:
        values = df[column]
        # Accept object columns as well as the dedicated pandas string dtypes.
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            # regex=False: '.' must be removed literally, never treated as
            # the "any character" pattern.
            normalized = (
                values.str.replace('.', '', regex=False)
                      .str.replace(',', '.', regex=False)
            )
            df[column] = pd.to_numeric(normalized, errors='coerce').fillna(0)

    # If the index has duplicated labels, append the occurrence number to
    # every duplicate so that labels become distinguishable.
    if df.index.duplicated().any():
        occurrence = df.groupby(level=0).cumcount()
        df.index = pd.Index(
            [f"{label}{count if count else ''}" for label, count in zip(df.index, occurrence)],
            name=df.index.name,
        )

    return df

In [30]:
def fetch_anual_data(url_report, links, company_name, data_type, year):
    """Build the cleaned DataFrame for one statement of a report.

    Args:
        url_report: Report URL; only checked for truthiness here.
        links: Mapping from statement title to base64-encoded page content.
        company_name: Used only in error messages.
        data_type: Statement title to look up in ``links``.
        year: Used only in error messages.

    Returns:
        The result of ``reorganize_df(get_table_data(links[data_type]))``.

    Raises:
        ValueError: If ``url_report`` is falsy, or if ``links`` is empty/None
            or does not contain ``data_type``.
    """
    if not url_report:
        raise ValueError(f"Não foi possível localizar relatórios para '{company_name}' no ano {year}.")

    has_statement = links and data_type in links
    if not has_statement:
        raise ValueError(f"Tipo de dado '{data_type}' não disponível para '{company_name}'.")

    encoded_page = links[data_type]
    return reorganize_df(get_table_data(encoded_page))

In [31]:
def fetch_all_anual_data(url_report, links, company_name, year):
    """Collect the six standard statements of a report into a dict.

    Each statement is obtained through ``fetch_anual_data``. Any exception
    raised for a statement is printed and that statement is stored as
    ``None``, so the returned dict always has all six keys.

    Args:
        url_report: Report URL, forwarded to ``fetch_anual_data``.
        links: Mapping from statement title to encoded page content.
        company_name: Forwarded to ``fetch_anual_data``.
        year: Forwarded to ``fetch_anual_data``.

    Returns:
        Dict mapping statement title to a DataFrame or ``None``.
    """
    statement_titles = (
        "Balanço Patrimonial Ativo",
        "Balanço Patrimonial Passivo",
        "Demonstração do Resultado",
        "Demonstração do Resultado Abrangente",
        "Demonstração do Fluxo de Caixa",
        "Demonstração de Valor Adicionado",
    )
    statements = {}
    for title in statement_titles:
        try:
            frame = fetch_anual_data(url_report, links, company_name, title, year)
        except Exception as e:
            print(f"Erro ao extrair dados de '{title}': {e}")
            frame = None
        statements[title] = frame
    return statements

In [32]:
def get_all_itrs_links(data_json):
    """Map each ITR reference date to its ``urlSearch`` link.

    Args:
        data_json: Mapping expected to hold an ``"itr"`` list of entries with
            ``"dateTimeReference"`` and ``"urlSearch"`` keys.

    Returns:
        Dict keyed by the part of ``dateTimeReference`` before the first
        ``"T"``. If a lookup fails, a message is printed and the links
        collected up to that point are returned (possibly an empty dict).
    """
    links_by_date = {}
    try:
        for entry in data_json['itr']:
            reference_date = entry['dateTimeReference'].split('T')[0]
            links_by_date[reference_date] = entry['urlSearch']
    except Exception:
        print("Erro ao extrair links de ITR.")
    return links_by_date

In [33]:
def transform_data(data: dict, year_rec) -> dict:
    """Relabel balance-sheet columns from a single date to a period string.

    Only the ``'Balanço Patrimonial Ativo'`` and
    ``'Balanço Patrimonial Passivo'`` entries are touched. Each of their
    columns is expected to be a date string whose last ``/``-separated part
    is the year:

    * year equal to ``year_rec``  -> ``'01/01/<year>  a  <column>'``
    * any other year              -> ``'01/01/<year>  a  31/12/<year>'``

    Args:
        data: Dict mapping statement title to a DataFrame or ``None``.
            Frames are renamed in place; ``None`` or missing entries are
            skipped.
        year_rec: Reference year, as ``int`` or ``str``.

    Returns:
        The same ``data`` dict that was passed in.
    """
    balance_sheets = ('Balanço Patrimonial Ativo', 'Balanço Patrimonial Passivo')
    # The year parsed from a column label is text, so the reference year must
    # be text too; comparing against an int would never match.
    reference_year = str(year_rec)

    for title in balance_sheets:
        frame = data.get(title)
        if frame is None:
            # Statements that failed to download are stored as None upstream.
            continue

        # Collect all new labels first and rename once, so that an already
        # renamed column can never be picked up by a later rename.
        new_labels = {}
        for column in frame.columns:
            year = column.split('/')[-1]
            if year != reference_year:
                new_labels[column] = f'01/01/{year}  a  31/12/{year}'
            else:
                new_labels[column] = f'01/01/{year}  a  {column}'
        frame.rename(columns=new_labels, inplace=True)

    return data

In [34]:
def fetch_itr_data(url_report, dfp_links, itr_links, company_name, year):
    """Gather the statements of every ITR report plus the DFP report.

    Args:
        url_report: URL of the DFP report.
        dfp_links: Statement pages of the DFP report, as returned by
            ``extract_links_from_select``.
        itr_links: Dict mapping a reference date to an ITR report URL.
        company_name: Forwarded to ``fetch_all_anual_data``.
        year: Used for the ``'<year>-12-31'`` key of the DFP entry and passed
            to ``transform_data``; ``year + 1`` is forwarded for the DFP fetch.

    Returns:
        Dict keyed by date string. Each value is the statements dict produced
        by ``fetch_all_anual_data`` after going through ``transform_data``.

    Raises:
        ValueError: If ``itr_links`` is empty or ``None``.
    """
    if not itr_links:
        raise ValueError("Não foi possível localizar relatórios ITR.")

    # One entry per ITR report; its statement pages are downloaded on the fly.
    statements_by_date = {
        date: fetch_all_anual_data(link, extract_links_from_select(link), company_name, year)
        for date, link in itr_links.items()
    }

    # The DFP report is stored under the year-end date of `year`.
    statements_by_date[f'{year}-12-31'] = fetch_all_anual_data(
        url_report, dfp_links, company_name, year + 1
    )

    for date, statements in statements_by_date.items():
        statements_by_date[date] = transform_data(statements, year)

    return statements_by_date
    

In [35]:
# Look up the codeCVM of the companies.csv row whose issuingCompany is "PETR".
code_cvm = get_company_info("PETR")

In [36]:
# Request the structured-reports listing for that company and the year 2023.
# NOTE(review): "strcuted" looks like a typo of "structured"; a rename has to
# be applied to every cell that reads this variable.
strcuted_reports_response = get_structured_reports(code_cvm, 2023)

In [37]:
# URL of the first "dfp" entry in the listing; None if it could not be read.
url_report = extract_url_search(strcuted_reports_response)

In [38]:
# Download every statement page of the DFP report (title -> base64 content);
# None if the report page could not be fetched or parsed.
dfp_links = extract_links_from_select(url_report)

In [39]:
# Reference date -> report URL for every "itr" entry in the listing.
itr_links = get_all_itrs_links(strcuted_reports_response)

In [40]:
# Fetch and relabel the statements of every ITR report plus the DFP report.
# NOTE(review): the listing above was requested for 2023, but 2022 is passed
# here and fetch_itr_data stores the DFP entry under f'{year}-12-31' —
# confirm that 2022 is the intended year.
test = fetch_itr_data(url_report, dfp_links, itr_links, "PETR", 2022)

In [44]:
def convert_df_to_dict(data: dict) -> dict:
    """Return a copy of a two-level statements dict with frames as plain dicts.

    The input is not modified: new outer and inner dicts are built, so the
    function can be called again on the same input.

    Args:
        data: Dict mapping a date to either ``None`` or an inner dict that
            maps a statement title to a DataFrame or ``None``.

    Returns:
        A new dict with the same keys. Every inner value that offers a
        ``to_dict`` method is replaced by the result of calling it; ``None``
        and already-converted values are kept as they are.
    """
    converted = {}
    for date, statements in data.items():
        if statements is None:
            converted[date] = None
            continue
        # Build a fresh inner dict instead of overwriting the caller's one.
        converted[date] = {
            title: frame.to_dict() if hasattr(frame, "to_dict") else frame
            for title, frame in statements.items()
        }
    return converted

In [45]:
# Plain-dict version of the collected statements.
# NOTE(review): dict.copy() is shallow — only the outer date-keyed dict is
# duplicated; the inner per-date dicts are still shared with `test`.
m = convert_df_to_dict(test.copy())

{'2023-09-30': {'Balanço Patrimonial Ativo': {'01/01/2023  a  31/12/2023': {'Ativo Total': 1025496000.0,
    'Ativo Circulante': 147311000.0,
    'Caixa e Equivalentes de Caixa': 60642000.0,
    'Aplicações Financeiras': 6505000.0,
    'Aplicações Financeiras Avaliadas a Valor Justo através do Resultado': 0.0,
    'Títulos para Negociação': 0.0,
    'Títulos Designados a Valor Justo': 0.0,
    'Aplicações Financeiras Avaliadas a Valor Justo através de Outros Resultados Abrangentes': 0.0,
    'Aplicações Financeiras Avaliadas ao Custo Amortizado': 0.0,
    'Contas a Receber': 25502000.0,
    'Clientes': 0.0,
    'Outras Contas a Receber': 0.0,
    'Estoques': 39510000.0,
    'Ativos Biológicos': 0.0,
    'Tributos a Recuperar': 5965000.0,
    'Tributos Correntes a Recuperar': 5965000.0,
    'Imposto de renda e contribuição social': 1034000.0,
    'Impostos e contribuições': 4931000.0,
    'Despesas Antecipadas': 0.0,
    'Outros Ativos Circulantes': 9187000.0,
    'Ativos Não-Correntes 