# Setup

Monta o Drive

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the final cells of this notebook
# read and write under /content/drive/MyDrive.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Variaveis Globais

In [2]:
# Destination folder on Google Drive, written with shell-style "\ " escapes for
# the spaces. A raw string keeps those backslashes literally (same value as
# before) without relying on Python's deprecated handling of the invalid
# escape sequence "\ ".
caminho_drive_relatorios = r"/content/drive/MyDrive/Financeiro/Investimentos/Relatorios\ Gerenciais\ para\ Ler"

## Bibliotecas

In [3]:
import glob
import os
import shutil
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

## Selenium Setup

In [4]:
%%capture
%%shell

# %%capture suppresses this cell's output; %%shell runs everything below as a
# shell script. Overall flow: register Debian buster apt sources, install their
# signing keys, pin chromium* packages to those sources, then install Chromium,
# its driver and Selenium.

# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
# Each entry names, via signed-by=, the keyring file that is created further below.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
# Fetch the three signing keys from keyserver.ubuntu.com ...
# NOTE(review): apt-key is reportedly deprecated on newer Debian/Ubuntu
# releases — confirm it is still available on the Colab image.
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# ... then export each one (by the last 8 hex digits of its id) into the
# keyring file referenced by the matching signed-by= entry above.
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# NOTE(review): no -y flag is passed — confirm the install does not stall on a
# confirmation prompt, since the cell output is hidden by %%capture.
apt-get update
apt-get install chromium chromium-driver

# Install selenium
# NOTE(review): version is not pinned, so a fresh runtime may pull a newer
# Selenium than the one this notebook was written against.
pip install selenium

# Refresh package lists and upgrade installed packages.
# NOTE(review): also runs without -y, and via sudo unlike the commands above —
# confirm this step is needed at all after the install.
sudo apt update; sudo apt upgrade

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Build the Chrome options first: each flag below is passed to the browser as a
# command-line argument.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# The driver binary is expected at /usr/bin/chromedriver; the resulting
# `driver` is the shared browser session used by the Selenium-based cells.
service = Service(executable_path=r'/usr/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=options)
# ...

### Criando a pasta onde os arquivos serão armazenados

In [6]:
# Local staging folder where every downloaded report is stored before being
# copied to Google Drive. The trailing slash matters: baixarPDFporURL builds
# file paths by plain string concatenation with this value.
colabPath_relatorios = "/content/relatorios/"

# exist_ok=True makes the cell safe to re-run and avoids the separate
# "check, then create" step.
os.makedirs(colabPath_relatorios, exist_ok=True)

## Métodos

In [7]:
def capturaHTMLdaPagina(url, timeout=30):
    """Fetch a page over HTTP and return it parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block forever on an unresponsive host.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, so an error page is never parsed as if it were the
            expected listing.
        requests.exceptions.RequestException: On connection problems or
            timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    return BeautifulSoup(response.text, "html.parser")


def geraNomeDoPDF(nome_tarefa):
    """Build a PDF file name from a task name.

    Every ``#`` character is dropped from ``nome_tarefa`` and the ``.pdf``
    extension is appended to what remains.
    """
    sem_cerquilha = "".join(nome_tarefa.split("#"))
    return f"{sem_cerquilha}.pdf"


def baixarPDFporURL(pdf_url, file_name, timeout=60):
    """Download ``pdf_url`` and store it in the reports folder as ``file_name``.

    The destination path is ``colabPath_relatorios + file_name``. The file is
    written only when the server answers with HTTP 200; any other status just
    prints a failure message and leaves the disk untouched.

    Args:
        pdf_url: Address of the file to download.
        file_name: Name (not path) of the file to create in the reports folder.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block forever on an unresponsive host.

    Raises:
        requests.exceptions.RequestException: On connection problems or
            timeouts.
    """
    # Use a separate name for the full path instead of rebinding the parameter.
    destino = colabPath_relatorios + file_name
    response = requests.get(pdf_url, timeout=timeout)

    if response.status_code == 200:
        with open(destino, 'wb') as f:
            f.write(response.content)
        print(f'Successfully downloaded {destino}.')
    else:
        print('Failed to download the PDF.')

# XPML11

In [8]:
# Fetch the XPML11 listing page and pick the anchor of its first entry.
soup = capturaHTMLdaPagina("https://xpmalls.riweb.com.br/list.aspx?idCanal=kvNzmBoSOV6K7Q9BieMORw==")
linkPrimeiroDownloadLink = soup.find("a", {"id":"ContentInternal_ContentPlaceHolderConteudo_rptListaCanal_linkListaTituloChamada_0"})

# soup.find returns None when no element matches; fail with a clear message
# instead of an opaque TypeError on the subscript below.
if linkPrimeiroDownloadLink is None or not linkPrimeiroDownloadLink.get("href"):
    raise RuntimeError("XPML11: report link not found on the listing page.")

url_parcial_relatorio = linkPrimeiroDownloadLink["href"]
# urljoin resolves relative, root-relative and absolute hrefs against the site
# root, where plain concatenation is only right for the relative case.
link_download_relatorio = urljoin("https://xpmalls.riweb.com.br/", url_parcial_relatorio)

In [9]:
# Save the report located in the previous cell as XPML11.pdf.
baixarPDFporURL(link_download_relatorio, "XPML11.pdf")

# Fixed 10-second pause before moving on to the next fund.
# NOTE(review): presumably meant to space out requests — confirm it is still needed.
time.sleep(10)

Successfully downloaded /content/relatorios/XPML11.pdf.


# MXRF11

In [10]:
# Fetch the MXRF11 listing page and pick the anchor of its first entry.
soup = capturaHTMLdaPagina("https://xpmaxirenda.riweb.com.br/list.aspx?idCanal=dFDM2o4LcyHK171fQJvzkQ==")
linkPrimeiroDownloadLink = soup.find("a", {"id":"ContentInternal_ContentPlaceHolderConteudo_rptListaCanal_linkListaTituloChamada_0"})

# soup.find returns None when no element matches; fail with a clear message
# instead of an opaque TypeError on the subscript below.
if linkPrimeiroDownloadLink is None or not linkPrimeiroDownloadLink.get("href"):
    raise RuntimeError("MXRF11: report link not found on the listing page.")

url_parcial_relatorio = linkPrimeiroDownloadLink["href"]
# urljoin resolves relative, root-relative and absolute hrefs against the site
# root, where plain concatenation is only right for the relative case.
link_download_relatorio = urljoin("https://xpmaxirenda.riweb.com.br/", url_parcial_relatorio)

In [11]:
# Save the report located in the previous cell as MXRF11.pdf.
baixarPDFporURL(link_download_relatorio, "MXRF11.pdf")

# Fixed 10-second pause before moving on to the next fund.
# NOTE(review): presumably meant to space out requests — confirm it is still needed.
time.sleep(10)

Successfully downloaded /content/relatorios/MXRF11.pdf.


# KNCR11

In [12]:
# Fetch the KNCR11 fund page and look for the anchor labelled "Carta do Gestor".
soup = capturaHTMLdaPagina("https://www.kinea.com.br/fundos/fundo-imobiliario-kinea-rendimentos-kncr11/")

# Only anchors that actually carry an href can be downloaded.
tagsA = soup.find_all("a", href=True)

# Always rebind link_download_relatorio: if nothing matches it must not keep
# the URL left over from the previous fund's cell, which would silently save
# the wrong report as KNCR11.pdf.
link_download_relatorio = next(
    (a["href"] for a in tagsA if a.text.strip() == "Carta do Gestor"),
    None,
)

if link_download_relatorio is None:
    raise RuntimeError('KNCR11: no "Carta do Gestor" link found on the fund page.')

In [13]:
# Save the report located in the previous cell as KNCR11.pdf.
baixarPDFporURL(link_download_relatorio, "KNCR11.pdf")

# Fixed 10-second pause before moving on to the next fund.
# NOTE(review): presumably meant to space out requests — confirm it is still needed.
time.sleep(10)

Successfully downloaded /content/relatorios/KNCR11.pdf.


# VINO11

In [14]:
# Open the VINO11 download centre in the shared Selenium browser session.
driver.get("https://www.vincifundoslistados.com/nossos-fundos/vinci-offices-fii-vino11/informacoes-aos-investidores/central-de-downloads/" )

In [15]:
# First row of the table with id="data" on the page currently loaded.
linha = driver.find_element(By.XPATH, "//*[@id='data']/tbody/tr[1]")
links = linha.find_elements(By.TAG_NAME, "td")

# Capture the link from the last cell of the row that contains one.
# find_elements returns an empty list when a cell has no <a>, so no exception
# has to be swallowed. Resetting to None first prevents silently reusing an
# element left over from a previously scraped fund.
ultimo_link = None
for celula in links:
    ancoras = celula.find_elements(By.TAG_NAME, "a")
    if ancoras:
        ultimo_link = ancoras[0]

if ultimo_link is None:
    raise RuntimeError("VINO11: no download link found in the first row of the table.")

In [16]:
# Click the link captured in the previous cell; this is expected to start the
# PDF download in the headless browser.
ultimo_link.click()

# Fixed 10-second wait before the next cell picks up /content/*.pdf.
# NOTE(review): nothing here checks that the download finished — confirm 10 s is enough.
time.sleep(10)

In [17]:
# The browser download is expected to land in /content under its original
# name; file it as VINO11.pdf and clear /content so the next fund starts clean.
pdfs_baixados = sorted(glob.glob("/content/*.pdf"), key=os.path.getmtime)

if pdfs_baixados:
    # Newest file wins. With a shell wildcard, more than one match made `cp`
    # fail while `rm` still deleted every download.
    shutil.copyfile(pdfs_baixados[-1], "/content/relatorios/VINO11.pdf")
    for caminho_pdf in pdfs_baixados:
        os.remove(caminho_pdf)
else:
    print("VINO11: no PDF found in /content; the download may have failed or not finished.")

# VILG11

In [18]:
# Open the VILG11 download centre in the shared Selenium browser session.
driver.get("https://www.vincifundoslistados.com/nossos-fundos/vinci-logistica-fii-vilg11/informacoes-aos-investidores/central-de-downloads/" )

In [19]:
# First row of the table with id="data" on the page currently loaded.
linha = driver.find_element(By.XPATH, "//*[@id='data']/tbody/tr[1]")
links = linha.find_elements(By.TAG_NAME, "td")

# Capture the link from the last cell of the row that contains one.
# find_elements returns an empty list when a cell has no <a>, so no exception
# has to be swallowed. Resetting to None first prevents silently reusing an
# element left over from a previously scraped fund.
ultimo_link = None
for celula in links:
    ancoras = celula.find_elements(By.TAG_NAME, "a")
    if ancoras:
        ultimo_link = ancoras[0]

if ultimo_link is None:
    raise RuntimeError("VILG11: no download link found in the first row of the table.")

In [20]:
# Click the link captured in the previous cell; this is expected to start the
# PDF download in the headless browser.
ultimo_link.click()

# Fixed 10-second wait before the next cell picks up /content/*.pdf.
# NOTE(review): nothing here checks that the download finished — confirm 10 s is enough.
time.sleep(10)

In [21]:
# The browser download is expected to land in /content under its original
# name; file it as VILG11.pdf and clear /content so the next fund starts clean.
pdfs_baixados = sorted(glob.glob("/content/*.pdf"), key=os.path.getmtime)

if pdfs_baixados:
    # Newest file wins. With a shell wildcard, more than one match made `cp`
    # fail while `rm` still deleted every download.
    shutil.copyfile(pdfs_baixados[-1], "/content/relatorios/VILG11.pdf")
    for caminho_pdf in pdfs_baixados:
        os.remove(caminho_pdf)
else:
    print("VILG11: no PDF found in /content; the download may have failed or not finished.")

# HGRU11

In [22]:
# Open the HGRU periodic-reports page in the shared Selenium browser session.
driver.get("https://imobiliario.cshg.com.br/central-de-downloads/relatorios-periodicos/hgru/")

In [23]:
# First row of the table with id="data" on the page currently loaded.
linha = driver.find_element(By.XPATH, "//*[@id='data']/tbody/tr[1]")
links = linha.find_elements(By.TAG_NAME, "td")

# Capture the link from the last cell of the row that contains one.
# find_elements returns an empty list when a cell has no <a>, so no exception
# has to be swallowed. Resetting to None first prevents silently reusing an
# element left over from a previously scraped fund.
ultimo_link = None
for celula in links:
    ancoras = celula.find_elements(By.TAG_NAME, "a")
    if ancoras:
        ultimo_link = ancoras[0]

if ultimo_link is None:
    raise RuntimeError("HGRU11: no download link found in the first row of the table.")

# Unlike the click-based flow, this fund is downloaded directly from the
# anchor's URL, so the href must be present.
link_baixar = ultimo_link.get_attribute("href")
if not link_baixar:
    raise RuntimeError("HGRU11: the report link has no href to download from.")

In [24]:
# Save the report whose URL was captured in the previous cell.
# NOTE(review): stored as "HGRU.pdf" while the other funds use the full ticker
# (e.g. "XPML11.pdf") — confirm whether this should be "HGRU11.pdf".
baixarPDFporURL(link_baixar, "HGRU.pdf")

# Fixed 10-second pause after the download.
# NOTE(review): presumably meant to space out requests — confirm it is still needed.
time.sleep(10)

Successfully downloaded /content/relatorios/HGRU.pdf.


Mata o driver

In [25]:
# End the Selenium browser session; none of the cells shown after this one use `driver`.
driver.quit()

# Google Drive

## Remove os arquivos do Drive

In [26]:
!rm /content/drive/MyDrive/Financeiro/Investimentos/Relatorios\ Gerenciais\ para\ Ler/*

rm: cannot remove '/content/drive/MyDrive/Financeiro/Investimentos/Relatorios Gerenciais para Ler/*': No such file or directory


## Copia os .pdf para o meu Drive

In [27]:
!cp /content/relatorios/*.pdf /content/drive/MyDrive/Financeiro/Investimentos/Relatorios\ Gerenciais\ para\ Ler