# Data Collector (Patentes)

### Importando bibliotecas

Bibliotecas que serão utilizadas no decorrer da aplicação. 

In [1]:
import time
import os
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.by import By

### Variáveis Globais

Definindo os parâmetros para a extração de dados

In [2]:
# Global settings for collecting data from Google Patents.

# Search URL carrying the filter (query "Smart mobility", country US,
# language ENGLISH, type PATENT; num=100 is presumably the page size).
url_google_patents = 'https://patents.google.com/?q=("Smart+mobility")&country=US&language=ENGLISH&type=PATENT&num=100'

# Number of result pages to walk through and save.
page_numbers = 2

# Output directories: saved result pages, per-patent HTML, per-patent XML.
htmls_folder = "htmlsGooglePatents"
patentsHTML_folder, patentsXML_folder = 'patentsHTML', 'patentsXML'

### URL referente ao filtro das patentes

Utilizando o webdriver (Chrome) da biblioteca Selenium para acessar o link do Google Patents com um filtro específico. Além de delimitar o filtro, também é possível delimitar a quantidade de páginas que serão baixadas.

In [10]:
# Save the rendered HTML of up to `page_numbers` Google Patents result pages
# into `htmls_folder`, moving forward with the pager's "next" arrow.

# Create the output directory up front so the first write cannot fail on a
# fresh checkout.
os.makedirs(htmls_folder, exist_ok=True)

# Absolute XPath of the "next page" icon inside the result pager.
next_page_xpath = '/html/body/search-app/search-results/search-ui/div/div/div/div/div/div[1]/div[6]/search-paging/state-modifier[3]/a/paper-icon-button/iron-icon'

service = Service()
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

# try/finally guarantees the browser is closed even if a page fails to load,
# a write fails, or the cell is interrupted.
try:
    driver.get(url_google_patents)

    for i in range(page_numbers):

        # Fixed pause before reading the DOM (presumably to let the page
        # finish rendering its results).
        time.sleep(3)

        html = driver.page_source

        # Timestamped name; the 3 s pause above keeps consecutive names unique.
        file_name = 'paginaPatentes' + time.strftime("%Y%m%d_%H%M%S") + '.html'
        file_path = os.path.join(htmls_folder, file_name)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(html)

        try:
            next_link = driver.find_element(By.XPATH, next_page_xpath)
        except NoSuchElementException:
            # No pager arrow: this was the last page of results.
            break

        # A hidden or disabled arrow means we cannot advance; stop instead of
        # saving the same page again on the next iteration.
        if not (next_link.is_displayed() and next_link.is_enabled()):
            print("Elemento de próxima página não está interagível.")
            break

        try:
            next_link.click()
        except ElementNotInteractableException:
            print("Elemento de próxima página não está interagível.")
            break
finally:
    driver.quit()

### Coletando HTML e PDF de cada patente

Utilizando o BeautifulSoup para extrair os HTMLs de cada patente.

In [20]:
# For every saved result page in `htmls_folder`, read the patent identifiers
# from the <span data-proto="OPEN_PATENT_PDF"> elements and download each
# patent's page from Google Patents into `patentsHTML_folder`.
# Patents whose HTML file already exists are not downloaded again.

# Create the output directory up front so the first write cannot fail.
os.makedirs(patentsHTML_folder, exist_ok=True)

# Every patent identifier found, in discovery order (duplicates included).
files_names = []

for filename in os.listdir(htmls_folder):
    if not filename.endswith('.html'):
        continue

    filepath = os.path.join(htmls_folder, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    spans = soup.find_all('span', {'data-proto': 'OPEN_PATENT_PDF'})

    for span in spans:
        patent_id = span.text.strip()
        if not patent_id:
            # An empty span would produce a bogus ".html" file and URL.
            continue

        files_names.append(patent_id)

        patent_filename = patent_id + '.html'
        patent_html_path = os.path.join(patentsHTML_folder, patent_filename)
        if os.path.exists(patent_html_path):
            continue

        patent_link = 'https://patents.google.com/patent/' + patent_id
        try:
            # A timeout keeps one stalled connection from hanging the run.
            patent_html_response = requests.get(patent_link, timeout=30)
        except requests.RequestException as error:
            print("Falha ao baixar " + patent_link + ": " + str(error))
            continue

        if patent_html_response.status_code == 200:
            patent_html_content = patent_html_response.text
            with open(patent_html_path, 'w', encoding='utf-8') as patent_html_file:
                patent_html_file.write(patent_html_content)
        else:
            # Report the failure instead of dropping the patent silently.
            print("Falha ao baixar " + patent_link + ": HTTP " + str(patent_html_response.status_code))

# Coletando dados das patentes
### Os dados coletados serão usados no web scraping.

In [3]:
import re

# Build `patent_application_numbers`: one (file name, title, application
# number) tuple for each patent page in `patentsHTML_folder` whose
# application number matches the expected format.

patent_application_numbers = []
# Matches "US<digits>/<digits>,<digits>" at the start of the text, e.g.
# "US15/269,774"; the three digit groups are joined into "15269774".
patent_application_pattern = r"US(\d+)/(\d+),(\d+)"

for filename in os.listdir(patentsHTML_folder):

    if not filename.endswith('.html'):
        continue

    html_path = os.path.join(patentsHTML_folder, filename)

    with open(html_path, 'r', encoding='utf-8') as file:
        html_file = file.read()

    soup = BeautifulSoup(html_file, 'html.parser')

    patent_application_find = soup.find('dd', itemprop='applicationNumber')
    patent_title_find = soup.find('span', itemprop='title')

    # soup.find returns None when the tag is absent; skip such pages instead
    # of crashing the whole run with an AttributeError.
    if patent_application_find is None or patent_title_find is None:
        print("Dados ausentes em " + filename + "; arquivo ignorado.")
        continue

    patent_application = patent_application_find.get_text().strip()
    patent_title = patent_title_find.get_text().strip()

    patent_number_match = re.match(patent_application_pattern, patent_application)

    if patent_number_match:
        # Concatenate the digit groups, dropping the "/" and "," separators.
        patent_application_number = ''.join(patent_number_match.groups())
        patent_application_numbers.append((filename, patent_title , patent_application_number))

# Web Scraping na plataforma WIPO

In [8]:
# For each (file name, title, application number) tuple in
# `patent_application_numbers`, search WIPO PATENTSCOPE by title and
# application number, open the "Documents" tab, follow the "XML" link and
# save the resulting page source into `patentsXML_folder`.

url_wipo = 'https://patentscope.wipo.int/search/en/search.jsf'

# Absolute XPaths of the search box and its submit button on the search page.
search_input_xpath = '/html/body/div[2]/div[5]/div/div[2]/form/div/div[1]/div[2]/div/div/div[1]/div[1]/input'
search_button_xpath = '/html/body/div[2]/div[5]/div/div[2]/form/div/div[1]/div[2]/div/div/div[1]/div[2]/button'

# Create the output directory up front so the first write cannot fail.
os.makedirs(patentsXML_folder, exist_ok=True)

service = Service()
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

# try/finally guarantees the browser is closed when the run ends, fails, or
# is interrupted from the keyboard.
try:
    for patent in patent_application_numbers:
        driver.get(url_wipo)
        time.sleep(10)

        try:
            input_element = driver.find_element(By.XPATH, search_input_xpath)
            # NOTE(review): a title containing a double quote would likely
            # break this query string; confirm against real data.
            input_element.send_keys("EN_TI:(\"" + patent[1] + "\") AND AN:("+ patent[2] +")")

            button_element = driver.find_element(By.XPATH, search_button_xpath)
            button_element.click()
            time.sleep(10)

            document_element = driver.find_element(By.LINK_TEXT, 'Documents')
            document_element.click()
            time.sleep(5)

            link_element = driver.find_element(By.LINK_TEXT, 'XML')
        except (NoSuchElementException, ElementNotInteractableException):
            # One patent without a usable result must not abort the batch.
            print("Patente não encontrada ou sem XML no WIPO: " + patent[0])
            continue

        link_url = link_element.get_attribute('href')
        if not link_url:
            print("Link de XML sem href no WIPO: " + patent[0])
            continue

        driver.get(link_url)
        page_content = driver.page_source

        # Reuse the patent's HTML file name with an .xml extension.
        filename_xml = patent[0].replace(".html", ".xml")
        xml_path = os.path.join(patentsXML_folder, filename_xml)
        with open(xml_path, 'w', encoding='utf-8') as file:
            file.write(page_content)
        time.sleep(3)
finally:
    driver.quit()


KeyboardInterrupt: 

In [5]:
# Print one line per collected patent: file name, title, application number.
for patent in patent_application_numbers:
    print(f"{patent[0]} {patent[1]} {patent[2]}")

US10037662B2.html Merchandise activity sensor system and methods of using same 15269774
US10149659B1.html Hand-held X-ray sensor with gesture-activated wake function 15719568
US10208977B2.html Intelligent LED bulb and vent method, apparatus and system 15487167
US10336391B1.html Steering system of small mobility 16176908
US10345115B2.html Terminal device, vehicle, personal mobility device, method for controlling the terminal device and method for controlling the vehicle 15485998
US10395502B2.html Smart mobility assistance device 16154213
US10440452B2.html Apparatus and method for transmitting and receiving environmental information in wireless communication system 15612568
US10499294B1.html System, method, and computer program for mitigation of user-specific ping-pong handover 15955535
US10826210B2.html Base module and aviation computer system having the base module 15809554
US10845593B2.html Display device for displaying virtual images 15980289
US11049392B2.html Position capture method