In [1]:
from datetime import datetime

def format_name(data : dict) -> str:
    """Build the PDF file name for one document record.

    The result has the form ``<nomePregao>_<YYYY-MM-DD>_<id>.pdf``. Spaces in
    the name become underscores, and so do path separators and characters that
    are not allowed in Windows file names, so the result is always a single
    path component that can be joined to a folder and opened.

    Args:
        data: Document record containing the keys ``"nomePregao"``,
            ``"dataReferencia"`` (a ``dd/mm/YYYY`` string) and ``"id"``.

    Returns:
        The file name, without any directory part.

    Raises:
        KeyError: If one of the required keys is missing.
        ValueError: If ``dataReferencia`` does not match ``dd/mm/YYYY``.
    """
    # A null name is handled like an empty one instead of raising AttributeError.
    raw_name = data["nomePregao"] or ""
    # Space plus the characters that would break open(): \ / : * ? " < > |
    unsafe_chars = ' \\/:*?"<>|'
    to_underscore = str.maketrans(unsafe_chars, "_" * len(unsafe_chars))
    nome = raw_name.strip().translate(to_underscore)

    ref_date = datetime.strptime(data["dataReferencia"], "%d/%m/%Y")
    date_part = ref_date.strftime("%Y-%m-%d")
    doc_id = data["id"]  # not called `id`, which would shadow the builtin
    return f"{nome}_{date_part}_{doc_id}.pdf"

In [2]:
# Search constants

# Listing endpoint; placeholders are filled, in order, with the category id,
# the type id and a millisecond timestamp. idTipoFundo is fixed at 0 in the URL.
URL_LISTAR = (
    "https://fnet.bmfbovespa.com.br/fnet/publico/"
    "listarTodasEspeciePorTipoECategoriaETipoFundo"
    "?idTipoFundo=0&idCategoria={}&idTipo={}&_={}"
)

# Paged search endpoint, newest delivery date first. Placeholders, in order:
# d, s, l, category id, type id, species id and a millisecond timestamp.
URL = (
    "https://fnet.bmfbovespa.com.br/fnet/publico/pesquisarGerenciadorDocumentosDados"
    "?d={}&s={}&l={}"
    "&o%5B0%5D%5BdataEntrega%5D=desc"
    "&idCategoriaDocumento={}&idTipoDocumento={}&idEspecieDocumento={}"
    "&_={}"
)

# Download endpoint; the placeholder is the document id.
DOC_URL = "https://fnet.bmfbovespa.com.br/fnet/publico/downloadDocumento?id={}"

# Folder (relative to the notebook) where the PDFs are written.
BASE_FOLDER = "../Relatorios"


# Search filters.
id_tipo_fundo = 0  # defined for reference; URL_LISTAR already hard-codes idTipoFundo=0
id_cat = 7
id_tipo = 9
ID_ESPECIE_DOCUMENTO = 0

In [3]:
# Logger config


import logging

# Root logger: records at INFO and above are written to execution.log.
logging.basicConfig(level=logging.INFO, filename="execution.log")
logger = logging.getLogger("fnet")

# Also echo "fnet" records to the notebook output. Loggers live for the whole
# kernel session, so re-running this cell must not attach a second console
# handler: every extra handler would print each message one more time.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

In [4]:
# Iteration constants

NUM_DOCS = 10_000  # total number of documents to walk through
LIST_LEN = 50      # documents requested per results page

In [5]:
import os
import time
import urllib3
from fake_useragent import UserAgent
from threading import Thread
import requests
import pandas as pd

# NOTE(review): every request below is sent with verify=False (no TLS
# certificate validation) and the matching warnings are silenced here.
# Confirm this is really needed for this host before reusing the code.
urllib3.disable_warnings()

# Seconds to wait on a stalled connection before giving up on a request.
REQUEST_TIMEOUT = 60

ua = UserAgent()
iterations = NUM_DOCS // LIST_LEN

# Millisecond timestamp sent as the "_" query parameter.
ctime = int(time.time() * 1000)
session = requests.session()
session.headers.update({
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
})

# The target folder must exist, otherwise every open() below fails and the
# failure would only show up as one logged error per document.
os.makedirs(BASE_FOLDER, exist_ok=True)

# Initial request to the listing endpoint, made with the same session that is
# used for the searches and downloads below.
session.get(URL_LISTAR.format(id_cat, id_tipo, ctime), verify=False, timeout=REQUEST_TIMEOUT)

list_dict = []
for i in range(iterations):
    # Page counter (1-based) and offset of the first record of this page.
    draw = 1 + i
    s = LIST_LEN * i

    url_format = URL.format(draw, s, LIST_LEN, id_cat, id_tipo, ID_ESPECIE_DOCUMENTO, ctime)

    response = session.get(url_format, verify=False, timeout=REQUEST_TIMEOUT)
    # Fail with the HTTP status instead of an opaque JSON decoding error.
    response.raise_for_status()
    data = response.json()["data"]
    for j, doc in enumerate(data):
        # Best effort per document: a failure is logged and the crawl goes on.
        try:
            # Use a different user agent for each download.
            session.headers.update({"User-Agent": ua.chrome})

            iteration_num = i * LIST_LEN + j

            doc_id = doc["id"]
            doc_url = DOC_URL.format(doc_id)
            doc_name = format_name(doc)

            logger.info(f"({iteration_num} / {NUM_DOCS}) - {doc_name}")
            doc_path = os.path.join(BASE_FOLDER, doc_name)

            # The record is kept even if the download below fails, so that it
            # still appears in the CSV with its URL and target path.
            doc.update({"url": doc_url, "doc_path": doc_path})
            list_dict.append(doc)

            # Fetch first and check the status, then write: a failed request
            # no longer leaves an empty file or an error page saved as a PDF.
            doc_response = session.get(doc_url, verify=False, timeout=REQUEST_TIMEOUT)
            doc_response.raise_for_status()
            with open(doc_path, "wb") as f:
                f.write(doc_response.content)
        except Exception:
            # logger.exception records the traceback along with the message.
            logger.exception("Failed to process document %d of page %d", j, i)
    # Pause between result pages.
    time.sleep(0.7)

df = pd.DataFrame(list_dict)
df.to_csv("fnet_docs.csv", index=False)

(0 / 10000) - FII_XP_MALLS_2024-06-28_693217.pdf
(1 / 10000) - FII_VINCI_IU_2024-06-28_693211.pdf
(2 / 10000) - FII_VINCI_OF_2024-06-28_693210.pdf
(3 / 10000) - FII_VINCI_LG_2024-06-28_693209.pdf
(4 / 10000) - FII_VINCI_IF_2024-06-28_693208.pdf
(5 / 10000) - FII_VINCI_CR_2024-06-28_693207.pdf
(6 / 10000) - FII_HSI_CRI_2024-06-28_693155.pdf
(7 / 10000) - FII_HSIRENDA_2024-06-28_693140.pdf
(8 / 10000) - FII_KINEA_UN_2024-06-30_693139.pdf
(9 / 10000) - FII_KINEA_SC_2024-06-30_693135.pdf
(10 / 10000) - FII_KINEA_IP_2024-06-30_693133.pdf
(11 / 10000) - FII_KINEA_HY_2024-06-30_693132.pdf
(12 / 10000) - FII_VINCI_IU_2024-06-28_693131.pdf
(13 / 10000) - FII_KINEA_RI_2024-06-30_693130.pdf
(14 / 10000) - FII_KINEA_CR_2024-06-30_693129.pdf
(15 / 10000) - FII_VINCI_SC_2024-06-28_693128.pdf
(16 / 10000) - FII_VINCI_OF_2024-06-28_693126.pdf
(17 / 10000) - FII_VINCI_LG_2024-06-28_693121.pdf
(18 / 10000) - FII_VINCI_IF_2024-06-28_693115.pdf
(19 / 10000) - _2024-06-28_693113.pdf
(20 / 10000) - _2024-05

In [18]:
# Add (or reset) the "downloaded" flag column: False for every row.
# make_request sets it to True for a row once that row's file has been written.
df["downloaded"] = False

In [20]:
from tqdm import tqdm
from threading import Thread

def make_request(idx, row):
    """Download one document and flag its row in ``df`` as downloaded.

    Args:
        idx: Index label of the row in the module-level ``df``.
        row: Row providing ``"url"`` (what to fetch) and ``"doc_path"``
            (where to write the bytes).

    Returns:
        None. Any exception is logged and swallowed so that one failed
        download does not stop the others; in that case the row keeps its
        current ``"downloaded"`` value.
    """
    try:
        doc_name = row["doc_path"]
        doc_url = row["url"]
        logger.info(f"Downloading {doc_name}")
        # Fetch and validate before touching the disk, so a failed request
        # neither leaves an empty file nor stores an error page as a PDF.
        response = session.get(doc_url, verify=False, timeout=60)
        response.raise_for_status()
        with open(doc_name, "wb") as f:
            f.write(response.content)
        # Only reached once the file has been written and closed.
        df.at[idx, "downloaded"] = True
    except Exception:
        # logger.exception keeps the traceback, unlike logger.error(e).
        logger.exception("Download failed for row %s", idx)

# Start one download thread per row that is not flagged as downloaded yet,
# spacing the thread starts by 0.1 s.
pending_threads = []
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    if not row["downloaded"]:
        t = Thread(target=make_request, args=(idx, row))
        t.start()
        pending_threads.append(t)
        time.sleep(0.1)
    else:
        logger.info(f"Skipping {row['doc_path']}")

# Wait for every download to finish. Without this the cell returns while
# threads are still writing files and updating df["downloaded"], so the cells
# that follow would read an incomplete DataFrame.
for t in pending_threads:
    t.join()

  0%|          | 0/10000 [00:00<?, ?it/s]Skipping ../Relatorios\FII_XP_MALLS_2024-06-28_693217.pdf
Skipping ../Relatorios\FII_VINCI_IU_2024-06-28_693211.pdf
Skipping ../Relatorios\FII_VINCI_OF_2024-06-28_693210.pdf
Skipping ../Relatorios\FII_VINCI_LG_2024-06-28_693209.pdf
Skipping ../Relatorios\FII_VINCI_IF_2024-06-28_693208.pdf
Skipping ../Relatorios\FII_VINCI_CR_2024-06-28_693207.pdf
Skipping ../Relatorios\FII_HSI_CRI_2024-06-28_693155.pdf
Skipping ../Relatorios\FII_HSIRENDA_2024-06-28_693140.pdf
Skipping ../Relatorios\FII_KINEA_UN_2024-06-30_693139.pdf
Skipping ../Relatorios\FII_KINEA_SC_2024-06-30_693135.pdf
Skipping ../Relatorios\FII_KINEA_IP_2024-06-30_693133.pdf
Downloading ../Relatorios\FII_KINEA_HY_2024-06-30_693132.pdf
  0%|          | 12/10000 [00:00<01:49, 91.31it/s]Downloading ../Relatorios\FII_VINCI_IU_2024-06-28_693131.pdf
Downloading ../Relatorios\FII_KINEA_RI_2024-06-30_693130.pdf
Downloading ../Relatorios\FII_KINEA_CR_2024-06-30_693129.pdf
Downloading ../Relatorios\FI

In [None]:
# Manual check: fetch one document by id directly, without the shared session.
probe_url = "https://fnet.bmfbovespa.com.br/fnet/publico/downloadDocumento?id=689718"
response = requests.get(probe_url, verify=False)

In [None]:
# Bare expression: the notebook echoes the response object, whose repr shows the HTTP status.
response

<Response [200]>

In [None]:
# Save the fetched bytes to disk. The context manager closes the handle, which
# guarantees the buffered bytes are flushed to file.pdf.
with open("file.pdf", "wb") as file:
    bytes_written = file.write(response.content)

# Last expression of the cell: displays the number of bytes written.
bytes_written

2644386

In [6]:
# Display the column labels of the scraped-documents DataFrame.
df.columns

Index(['id', 'descricaoFundo', 'categoriaDocumento', 'tipoDocumento',
       'especieDocumento', 'dataReferencia', 'dataEntrega', 'status',
       'descricaoStatus', 'analisado', 'situacaoDocumento', 'assuntos',
       'altaPrioridade', 'formatoDataReferencia', 'versao', 'modalidade',
       'descricaoModalidade', 'nomePregao', 'informacoesAdicionais',
       'arquivoEstruturado', 'formatoEstruturaDocumento', 'nomeAdministrador',
       'cnpjAdministrador', 'cnpjFundo', 'idTemplate',
       'idSelectNotificacaoConvenio', 'idSelectItemConvenio',
       'indicadorFundoAtivoB3', 'idEntidadeGerenciadora', 'ofertaPublica',
       'numeroEmissao', 'tipoPedido', 'dda', 'codSegNegociacao', 'url',
       'doc_path'],
      dtype='object')

In [21]:
# Copy of df whose "doc_path" uses forward slashes instead of backslashes.
# regex=False makes the backslash a plain literal on every pandas version
# (the default of `regex` changed from True to False in pandas 2.0).
df_repl = df.copy()
df_repl["doc_path"] = df["doc_path"].str.replace("\\", "/", regex=False)

# Display the normalised paths.
df_repl["doc_path"]

0       ../Relatorios/FII_XP_MALLS_2024-06-28_693217.pdf
1       ../Relatorios/FII_VINCI_IU_2024-06-28_693211.pdf
2       ../Relatorios/FII_VINCI_OF_2024-06-28_693210.pdf
3       ../Relatorios/FII_VINCI_LG_2024-06-28_693209.pdf
4       ../Relatorios/FII_VINCI_IF_2024-06-28_693208.pdf
                              ...                       
9995    ../Relatorios/FII_AFHI_CRI_2022-01-31_272273.pdf
9996     ../Relatorios/FII_GENERAL_2022-01-31_272269.pdf
9997     ../Relatorios/FII_HGI_CRI_2022-01-31_272210.pdf
9998    ../Relatorios/FII_RBR_PROP_2022-01-31_272179.pdf
9999     ../Relatorios/FII_MAUA_HY_2022-02-24_272176.pdf
Name: doc_path, Length: 10000, dtype: object

In [11]:
# Write the normalised frame as a semicolon-separated, Latin-1 encoded CSV.
# This targets the same fnet_docs.csv file name used by the earlier export.
csv_options = {"sep": ";", "encoding": "latin-1"}
df_repl.to_csv("fnet_docs.csv", **csv_options)

In [22]:
# NOTE(review): machine-specific absolute path. Adjust it (or derive it from
# the notebook location) before running on another computer.
PROJECT_ROOT = r"C:\Users\Paulo\OneDrive\Documentos\Estudos\PUC-Rio\Projeto Final"

df_path = df_repl.copy()
# Turn the relative paths into absolute Windows paths:
#  1. swap the first ".." for the project root. regex=False keeps ".." a
#     literal (as a regular expression it would match any two characters) and
#     n=1 leaves any later ".." inside a file name untouched;
#  2. convert the forward slashes to backslashes.
df_path["doc_path"] = (
    df_path["doc_path"]
    .str.replace("..", PROJECT_ROOT, n=1, regex=False)
    .str.replace("/", "\\", regex=False)
)


In [25]:
# Drop the columns in which every value is missing.
df_filter = df_path.dropna(axis="columns", how="all")

In [26]:
# Write the filtered frame as a semicolon-separated, Latin-1 encoded CSV.
df_filter.to_csv(
    "fnet_docs.csv",
    sep=";",
    encoding="latin-1",
)