# Data Collector (Patentes)

### Importando bibliotecas

Bibliotecas que serão utilizadas no decorrer da aplicação. 

In [3]:
import time
import os
from bs4 import BeautifulSoup
import requests

### Variáveis Globais

Definindo os parâmetros para a extração de dados

In [1]:
# Global settings used to collect data from Google Patents.

# Search URL, including the query and the filters carried in its parameters.
url = 'https://patents.google.com/?q=("Smart+mobility")&country=US&language=ENGLISH&type=PATENT&num=100'

# Upper bound on how many result pages are visited and saved.
page_numbers = 2

# Output folders: saved search-result pages, then per-patent HTML and PDF files.
htmls_folder = 'htmlsGooglePatents'
patentsHTML_folder = 'patentsHTML'
patentsPDF_folder = 'patentsPDF'

### URL referente ao filtro das patentes

Utilizando o webdriver (Chrome) da biblioteca Selenium para acessar o link do Google Patents com um filtro específico. Além de delimitar o filtro, também é possível delimitar a quantidade de páginas que serão baixadas.

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.common.by import By

# Open a Chrome session on the filtered Google Patents search (`url`) and save
# the HTML of up to `page_numbers` result pages into `htmls_folder`.
service = Service()
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

# XPath of the "next page" control in the search-results pager.
next_page_xpath = '/html/body/search-app/search-results/search-ui/div/div/div/div/div/div[1]/div[6]/search-paging/state-modifier[3]/a/paper-icon-button/iron-icon'

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before the first page is written.
os.makedirs(htmls_folder, exist_ok=True)

try:
    driver.get(url)

    for i in range(page_numbers):

        # Fixed pause before reading the page source (presumably to let the
        # results finish rendering). It also keeps the second-resolution
        # timestamps used in the file names distinct between iterations.
        time.sleep(3)

        html = driver.page_source

        file_name = 'paginaPatentes' + time.strftime("%Y%m%d_%H%M%S") + '.html'
        file_path = os.path.join(htmls_folder, file_name)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(html)

        # The last requested page has just been saved: no need to navigate on.
        if i == page_numbers - 1:
            break

        next_link = None
        try:
            next_link = driver.find_element(By.XPATH, next_page_xpath)
        except NoSuchElementException:
            # No pager control found: there is no further page to visit.
            break

        if not (next_link.is_displayed() and next_link.is_enabled()):
            # The browser would stay on the current page, so continuing the
            # loop would only save duplicates of it.
            break

        try:
            next_link.click()
        except ElementNotInteractableException:
            print("Elemento de próxima página não está interagível.")
            break
finally:
    # Always close the browser, even when saving or navigating raised.
    driver.quit()

### Coletando HTML e PDF de cada patente

Utilizando o BeautifulSoup para extrair o HTML e o PDF de cada patente.

In [6]:
# For every search-result page saved in `htmls_folder`, download the HTML page
# and the PDF of each patent listed on it.

# IDs of the patents whose HTML page is available locally (downloaded now or
# already present from a previous run), accumulated across all result pages.
file_name = []

# open() does not create missing directories, so create the output folders.
os.makedirs(patentsHTML_folder, exist_ok=True)
os.makedirs(patentsPDF_folder, exist_ok=True)

# Seconds to wait for a server response; without it a stalled connection
# would block the whole run indefinitely.
request_timeout = 30

# Sorted so the processing order does not depend on os.listdir's arbitrary order.
for filename in sorted(os.listdir(htmls_folder)):
    if not filename.endswith('.html'):
        continue

    filepath = os.path.join(htmls_folder, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Patent IDs of THIS page only. Keeping them page-local (instead of
    # indexing into the global list) prevents names from one page being
    # applied to the patents of another.
    spans = soup.find_all('span', {'data-proto': 'OPEN_PATENT_PDF'})
    page_patent_ids = [span.text.strip() for span in spans]

    for patent_id in page_patent_ids:
        patent_link = 'https://patents.google.com/patent/' + patent_id
        patent_html_path = os.path.join(patentsHTML_folder, patent_id + '.html')

        # Skip the request entirely when the page was saved by an earlier run.
        if os.path.exists(patent_html_path):
            file_name.append(patent_id)
            continue

        try:
            patent_html_response = requests.get(patent_link, timeout=request_timeout)
        except requests.RequestException:
            # One unreachable patent must not abort the remaining downloads.
            print(f"Erro ao baixar HTML da patente {patent_link}")
            continue

        if patent_html_response.status_code == 200:
            with open(patent_html_path, 'w', encoding='utf-8') as patent_html_file:
                patent_html_file.write(patent_html_response.text)
            file_name.append(patent_id)
        else:
            print(f"Erro ao baixar HTML da patente {patent_link}")

    # Pair each PDF link with the patent ID at the same position on the page.
    # NOTE(review): assumes every result contributes one ID span and one
    # 'pdfLink' anchor, in the same order - confirm against the saved HTML.
    linksPDF = soup.find_all('a', {'class': 'pdfLink'})
    for patent_id, link in zip(page_patent_ids, linksPDF):
        pdf_url = link.get('href')
        if not pdf_url:
            # Anchor without a target: nothing to download.
            continue

        pdf_path = os.path.join(patentsPDF_folder, patent_id + '.pdf')
        if os.path.exists(pdf_path):
            continue

        try:
            response = requests.get(pdf_url, timeout=request_timeout)
        except requests.RequestException as error:
            print(f"Erro ao baixar o PDF {pdf_url}: {error}")
            continue

        if response.status_code == 200:
            with open(pdf_path, 'wb') as pdf_file:
                pdf_file.write(response.content)
        else:
            print(f"Erro ao baixar o PDF {pdf_url}. Status code: {response.status_code}")