# Data Collector (Patentes)

### Importando bibliotecas

Bibliotecas que serão utilizadas no decorrer da aplicação. 

In [2]:
import time
import os
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.by import By

### Variáveis Globais

Definindo os parâmetros para a extração de dados

In [45]:
# Global settings for collecting data from Google Patents
# Folder where the saved search-result pages are written and later read back
htmls_folder = "htmlsGooglePatents"
# Upper bound on the number of result pages saved by the pagination loop
page_numbers = 2
# Search URL opened in the browser; the query and its filters are encoded in the query string
url_google_patents = 'https://patents.google.com/?q=("Smart+mobility")&country=US&language=ENGLISH&type=PATENT&num=100'
# Folder holding one downloaded HTML page per patent
patentsHTML_folder = 'patentsHTML'
# Folder holding one downloaded XML file per patent
patentsXML_folder = 'patentsXML'
# MongoDB database name and collection name used when storing the patents
db_name = "NER-Patents-DB"
db_collection = "patents"

### URL referente ao filtro das patentes

Utilizando o webdriver (Chrome) da biblioteca Selenium para acessar o link do Google Patents com um filtro específico. Além de delimitar o filtro, também é possível delimitar a quantidade de páginas que serão baixadas.

In [10]:
# Save up to `page_numbers` Google Patents result pages as HTML files in
# `htmls_folder`, following the "next page" control between saves.

# The output folder must exist before the first open(..., 'w').
os.makedirs(htmls_folder, exist_ok=True)

service = Service()
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

try:
    driver.get(url_google_patents)

    for i in range(page_numbers):

        # Fixed pause before reading page_source so the page has time to
        # load (no explicit wait condition is used).
        time.sleep(3)

        html = driver.page_source

        # The timestamp (second resolution) keeps file names unique across pages.
        file_name = 'paginaPatentes' + time.strftime("%Y%m%d_%H%M%S") + '.html'
        file_path = os.path.join(htmls_folder, file_name)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(html)

        # Nothing else will be saved after the last requested page, so there
        # is no reason to navigate further.
        if i == page_numbers - 1:
            break

        next_link = None
        try:
            next_link = driver.find_element(By.XPATH, '/html/body/search-app/search-results/search-ui/div/div/div/div/div/div[1]/div[6]/search-paging/state-modifier[3]/a/paper-icon-button/iron-icon')
        except NoSuchElementException:
            # No "next page" control: this was the last page of results.
            break

        # If the control cannot be used, stop instead of looping again and
        # saving the very same page a second time.
        if not (next_link.is_displayed() and next_link.is_enabled()):
            print("Elemento de próxima página não está interagível.")
            break

        try:
            next_link.click()
        except ElementNotInteractableException:
            print("Elemento de próxima página não está interagível.")
            break
finally:
    # Always release the browser, even when saving or navigation fails.
    driver.quit()

### Coletando HTML e PDF de cada patente

Utilizando o BeautifulSoup para extrair os HTMLs de cada patente.

In [20]:
# Read every saved result page in `htmls_folder`, collect the patent
# identifiers found in <span data-proto="OPEN_PATENT_PDF"> elements and
# download each patent page into `patentsHTML_folder` (skipping files that
# already exist, so the cell can be re-run).
files_names = []

# The output folder must exist before the first open(..., 'w').
os.makedirs(patentsHTML_folder, exist_ok=True)

for filename in os.listdir(htmls_folder):
    if not filename.endswith('.html'):
        continue

    filepath = os.path.join(htmls_folder, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    spans = soup.find_all('span', {'data-proto': 'OPEN_PATENT_PDF'})

    for span in spans:
        patent_id = span.text.strip()
        if not patent_id:
            # An empty identifier would yield a file named ".html" and a
            # request to the bare /patent/ URL.
            continue

        files_names.append(patent_id)
        patent_filename = patent_id + '.html'
        patent_html_path = os.path.join(patentsHTML_folder, patent_filename)
        if os.path.exists(patent_html_path):
            continue

        patent_link = 'https://patents.google.com/patent/' + patent_id
        try:
            # A timeout keeps one stalled connection from hanging the run.
            patent_html_response = requests.get(patent_link, timeout=30)
        except requests.RequestException as e:
            print("Falha ao baixar", patent_link, ":", e)
            continue

        if patent_html_response.status_code == 200:
            patent_html_content = patent_html_response.text
            with open(patent_html_path, 'w', encoding='utf-8') as patent_html_file:
                patent_html_file.write(patent_html_content)
        else:
            print("Falha ao baixar", patent_link, ": status", patent_html_response.status_code)

### Coletando dados das patentes
Os dados coletados dos HTMLs (título e número da aplicação) serão utilizados para o web scraping da plataforma WIPO.

In [3]:
import re

# Build a list of (html file name, patent title, application number digits)
# tuples from the patent pages saved in `patentsHTML_folder`.
patent_application_numbers = []
# Matches text such as "US12/345,678"; the three digit groups are
# concatenated below to form the application number.
patent_application_pattern = r"US(\d+)/(\d+),(\d+)"
# Compiled once instead of on every loop iteration.
patent_application_regex = re.compile(patent_application_pattern)

for filename in os.listdir(patentsHTML_folder):

    if not filename.endswith('.html'):
        continue

    html_path = os.path.join(patentsHTML_folder, filename)

    with open(html_path, 'r', encoding='utf-8') as file:
        html_file = file.read()

    soup = BeautifulSoup(html_file, 'html.parser')

    patent_application_find = soup.find('dd', itemprop='applicationNumber')
    patent_title_find = soup.find('span', itemprop='title')

    # find() returns None when the element is absent; skip such pages
    # instead of failing with AttributeError on .get_text().
    if patent_application_find is None or patent_title_find is None:
        print("Dados ausentes (número da aplicação ou título) em:", filename)
        continue

    patent_application = patent_application_find.get_text().strip()
    patent_number_match = patent_application_regex.match(patent_application)

    patent_title = patent_title_find.get_text().strip()

    if patent_number_match:
        patent_application_number = ''.join(patent_number_match.groups())
        patent_application_numbers.append((filename, patent_title, patent_application_number))

### Web Scraping na plataforma WIPO

In [10]:
# For every (file name, title, application number) tuple collected earlier,
# search WIPO PATENTSCOPE, open the "Documents" tab, follow the "XML" link
# and save the resulting page source into `patentsXML_folder`. Patents whose
# XML file already exists are skipped, so the cell can be re-run.
url_wipo = 'https://patentscope.wipo.int/search/en/search.jsf'

# The output folder must exist before the first open(..., 'w').
os.makedirs(patentsXML_folder, exist_ok=True)

service = Service()
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

try:
    for patent in patent_application_numbers:

        filename_xml = patent[0].replace(".html", ".xml")
        xml_path = os.path.join(patentsXML_folder, filename_xml)

        if os.path.exists(xml_path):
            continue

        try:
            driver.get(url_wipo)
            time.sleep(10)

            input_element = driver.find_element(By.XPATH, '/html/body/div[2]/div[5]/div/div[2]/form/div/div[1]/div[2]/div/div/div[1]/div[1]/input')
            # Query: exact English title AND application number.
            input_element.send_keys("EN_TI:(\"" + patent[1] + "\") AND AN:("+ patent[2] +")")

            button_element = driver.find_element(By.XPATH, '/html/body/div[2]/div[5]/div/div[2]/form/div/div[1]/div[2]/div/div/div[1]/div[2]/button')
            button_element.click()
            time.sleep(10)

            document_element = driver.find_element(By.LINK_TEXT, 'Documents')
            document_element.click()
            time.sleep(5)

            link_element = driver.find_element(By.LINK_TEXT, 'XML')
            link_url = link_element.get_attribute('href')
        except NoSuchElementException as e:
            # One patent without the expected page elements must not abort
            # the whole batch.
            print("Elemento não encontrado para", patent[0], ":", e)
            continue

        if not link_url:
            print("Link do XML ausente para", patent[0])
            continue

        driver.get(link_url)
        page_content = driver.page_source

        with open(xml_path, 'w', encoding='utf-8') as file:
            file.write(page_content)
        time.sleep(3)
finally:
    # Always release the browser, even when a step fails.
    driver.quit()


### Classe Patente

In [41]:
from datetime import datetime

class Patent:
    """Bibliographic data of one patent, filled from a patent XML document.

    Scalar fields start as ``None`` and list fields start empty;
    ``extract_data_xml`` populates them and ``to_document`` exports them as
    a plain dict.
    """

    def __init__(self):
        self.patent_title = None
        self.patent_number = None
        self.date_of_patent = None
        self.date_filed = None
        self.applicants = []
        self.inventors = []
        self.assignees = []
        self.classifications = []
        self.agents = []
        self.abstract = None
        self.claims = []

    @staticmethod
    def _find_text(node, tag):
        """Return the text of the first ``tag`` under ``node``, or None.

        None is returned when ``node`` itself is None or when it has no
        such descendant, so callers never dereference a missing element.
        """
        if node is None:
            return None
        found = node.find(tag)
        return found.text if found is not None else None

    @staticmethod
    def _join_name(first_name, last_name):
        """Join the name parts that are present; None when both are missing."""
        parts = [part for part in (first_name, last_name) if part]
        return " ".join(parts) if parts else None

    def _convert_optional_date(self, date_str):
        """Like ``convert_dated`` but yields None for a missing or malformed date."""
        if date_str is None:
            return None
        try:
            return self.convert_dated(date_str)
        except ValueError:
            return None

    def convert_dated(self, date_str):
        """Convert a ``YYYYMMDD`` string to ``DD/MM/YYYY``.

        Surrounding whitespace is ignored.

        Raises:
            ValueError: if ``date_str`` is not a valid ``YYYYMMDD`` date.
        """
        date_obj = datetime.strptime(date_str.strip(), '%Y%m%d')

        return date_obj.strftime('%d/%m/%Y')

    def extract_info(self, element):
        """Read the party fields of ``element``.

        Returns:
            A tuple ``(orgname, last_name, first_name, city, state,
            country)``; each item is the text of the matching child tag or
            None when that tag is absent.
        """
        tags = ('orgname', 'last-name', 'first-name', 'city', 'state', 'country')
        return tuple(self._find_text(element, tag) for tag in tags)

    def process_elements(self, elements, type):
        """Register every element as an inventor, assignee or applicant.

        Args:
            elements: iterable of elements accepted by ``extract_info``.
            type: ``'INV'``, ``'ASS'`` or ``'APP'``; any other value is
                ignored. (The name shadows the builtin but is kept so
                existing keyword callers continue to work.)
        """
        handlers = {
            'INV': self.add_inventor,
            'ASS': self.add_assignee,
            'APP': self.add_applicant,
        }
        handler = handlers.get(type)
        if handler is None:
            return

        for element in elements:
            orgname, last_name, first_name, city, state, country = self.extract_info(element)

            # Prefer the organisation name; otherwise use whichever personal
            # name parts exist (concatenating with a missing part used to
            # raise TypeError).
            if orgname is not None:
                name = orgname
            else:
                name = self._join_name(first_name, last_name)

            handler(name, city, state, country)

    def extract_data_xml(self, xml_content, filename):
        """Populate this instance from ``xml_content``.

        Args:
            xml_content: the XML document as a string.
            filename: file name of the document; its stem (name without
                extension) becomes ``patent_number``.

        Missing elements leave the corresponding field as None (scalars) or
        simply add nothing (lists) instead of raising.
        """
        soup = BeautifulSoup(xml_content, 'html.parser')

        self.patent_title = self._find_text(soup, 'invention-title')

        self.patent_number = os.path.splitext(filename)[0]

        # Date of patent: <date> under the first <document-id> in the file.
        self.date_of_patent = self._convert_optional_date(
            self._find_text(soup.find('document-id'), 'date'))

        # Filing date: <date> under <application-reference>.
        self.date_filed = self._convert_optional_date(
            self._find_text(soup.find('application-reference'), 'date'))

        self.process_elements(soup.find_all('inventor'), 'INV')
        self.process_elements(soup.find_all('assignee'), 'ASS')
        self.process_elements(soup.find_all('us-applicant'), 'APP')

        for classification in soup.find_all('classification-ipcr'):
            parts = [
                self._find_text(classification, tag)
                for tag in ('section', 'class', 'subclass', 'main-group', 'subgroup')
            ]
            if all(part is None for part in parts):
                continue
            # Missing components become empty strings rather than the
            # literal text "None".
            section, class_, subclass, main_group, subgroup = (part or '' for part in parts)

            classification_str = f"{section}{class_}{subclass}{main_group}/{subgroup}"
            self.classifications.append(classification_str)

        for agent in soup.find_all('agent'):
            orgname = self._find_text(agent, 'orgname')
            if orgname is not None:
                self.agents.append(orgname)
                continue

            agent_name = self._join_name(
                self._find_text(agent, 'first-name'),
                self._find_text(agent, 'last-name'))
            if agent_name is not None:
                self.agents.append(agent_name)

        self.abstract = self._find_text(soup, 'abstract')

        for claim in soup.find_all('claim'):
            claim_text = self._find_text(claim, 'claim-text')
            if claim_text is not None:
                self.claims.append(claim_text)

    def add_applicant(self, name, city, state, country):
        """Append an applicant record."""
        self.applicants.append({
            "name": name,
            "city": city,
            "state": state,
            "country": country
        })

    def add_inventor(self, name, city, state, country):
        """Append an inventor record."""
        self.inventors.append({
            "name": name,
            "city": city,
            "state": state,
            "country": country
        })

    def add_assignee(self, name, city, state, country):
        """Append an assignee record."""
        self.assignees.append({
            "name": name,
            "city": city,
            "state": state,
            "country": country
        })

    def print_patent_info(self):
        """Print every field of the patent to standard output."""
        print("Patent Title:", self.patent_title)
        print("Patent Number:", self.patent_number)
        print("Date Filed:", self.date_filed)
        print("Date of Patent:", self.date_of_patent)
        print("Applicants:")
        for applicant in self.applicants:
            print(applicant)
        print("Inventors:")
        for inventor in self.inventors:
            print(inventor)
        print("Assignees:")
        for assignee in self.assignees:
            print(assignee)
        print("Classifications:", self.classifications)
        print("Agents:", self.agents)
        print("Abstract:", self.abstract)
        print("Claims:")
        print(self.claims)

    def to_document(self):
        """Return the patent as a dict with one key per field."""
        document = {
            "patent_title": self.patent_title,
            "patent_number": self.patent_number,
            "date_of_patent": self.date_of_patent,
            "date_filed": self.date_filed,
            "applicants": self.applicants,
            "inventors": self.inventors,
            "assignees": self.assignees,
            "classifications": self.classifications,
            "agents": self.agents,
            "abstract": self.abstract,
            "claims": self.claims,
        }

        return document

### Coletando dados do XML

Armazenando os dados presentes no XML em uma lista da classe patentes.

In [43]:
# Parse every XML file in `patentsXML_folder` into a Patent object.
patents = []
# sorted() gives a reproducible order; os.listdir() order is unspecified.
for filename in sorted(os.listdir(patentsXML_folder)):
    # Only XML files are patents; subdirectories and stray files (for
    # example notebook checkpoint folders) would otherwise break open()
    # or be parsed as if they were patents.
    if not filename.endswith('.xml'):
        continue

    filepath = os.path.join(patentsXML_folder, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        xml_file = file.read()

    patent = Patent()
    patent.extract_data_xml(xml_file, filename)
    patents.append(patent)

### Banco de Dados

Criação e inserção das patentes no banco de dados.

In [46]:
import pymongo

# Connect to the local MongoDB server and select the target collection.
connection_string = "mongodb://localhost:27017/"
try:
    client = pymongo.MongoClient(connection_string)
    # MongoClient connects lazily, so constructing it never raises
    # ConnectionFailure; a cheap "ping" forces a real round trip so an
    # unreachable server is reported here.
    client.admin.command('ping')
    db = client[db_name]
    collection = db[db_collection]
except pymongo.errors.ConnectionFailure as e:
    print("Falha na conexão com o MongoDB:", e)
    # Re-raise: continuing would leave `collection` undefined and the
    # failure would resurface later as an unrelated NameError.
    raise


In [47]:
# Store every patent as one document. An upsert keyed on "patent_number"
# replaces an existing document for the same patent instead of inserting a
# duplicate, so re-running this cell does not multiply the data.
for patent in patents:
    new_document = patent.to_document()
    result = collection.replace_one(
        {"patent_number": new_document["patent_number"]},
        new_document,
        upsert=True,
    )