# Introduction: Downloading Patents

The purpose of this notebook is to download patent abstracts. Using a saved query that includes the links to the patents, we can download and save the abstracts to use for machine learning and deep learning.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
# Load the saved patent search results and preview the first rows.
search_results = pd.read_csv(
    filepath_or_buffer='../data/gp-search-20181013-123852.csv',
)
search_results.head()

Unnamed: 0,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link
0,US-5828812-A,Recurrent neural network-based fuzzy logic sys...,National Semiconductor Corporation,"Emdadur Rahman Khan, William Shields Neely",1993-03-24,1995-05-02,1998-10-27,1998-10-27,https://patents.google.com/patent/US5828812A/en,https://patentimages.storage.googleapis.com/pa...
1,US-5649065-A,Optimal filtering by neural networks with rang...,Maryland Technology Corporation,"James Ting-Ho Lo, Lei Yu",1993-05-28,1993-08-09,1997-07-15,1997-07-15,https://patents.google.com/patent/US5649065A/en,https://patentimages.storage.googleapis.com/pa...
2,US-5182794-A,Recurrent neural networks teaching system,"Allen-Bradley Company, Inc.","Michael L. Gasperi, Wesley Davis",1990-07-12,1992-03-27,1993-01-26,1993-01-26,https://patents.google.com/patent/US5182794A/en,https://patentimages.storage.googleapis.com/pa...
3,US-5606646-A,Recurrent neural network-based fuzzy logic sys...,National Semiconductor Corporation,"Emdadur R. Khan, Faith A. Unal",1993-03-24,1994-06-24,1997-02-25,1997-02-25,https://patents.google.com/patent/US5606646A/en,https://patentimages.storage.googleapis.com/pa...
4,US-5129039-A,Recurrent neural network with variable size in...,Sony Corporation,Atsunobu Hiraiwa,1988-09-17,1991-07-10,1992-07-07,1992-07-07,https://patents.google.com/patent/US5129039A/en,https://patentimages.storage.googleapis.com/pa...


In [28]:
# Fetch a single patent page, parse it with lxml, and show the text of
# its first <h1> element as a quick sanity check.
url = 'https://patents.google.com/patent/US9378733B1/en'
r = requests.get(url)
soup = BeautifulSoup(markup=r.content, features='lxml')
soup.h1.text

'US9378733B1 - Keyword detection without decoding \n        - Google Patents'

In [30]:
# Look up the first <aside> element of the parsed page (None when absent).
soup.aside

In [22]:
# Display the whole parsed document to inspect the structure of the page.
soup

<!DOCTYPE html>
<html>
<head>
<title>CN103606006B -   Sludge settling index Soft tissue measurements from t-s Fuzzy Neural Network   - Google Patents</title>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta charset="utf-8"/>
<meta content="origin-when-crossorigin" name="referrer"/>
<link href="https://patents.google.com/patent/CN103606006B/en" rel="canonical"/>
<meta content="    基于自组织T‑S模糊神经网络的污泥沉降指数软测量方法既属于控制领域，又属于污水处理领域。   污泥沉降指数SVI的准确预测是污水处理过程正常运行的保证，本发明首先以规则层的输出量，即规则层的空间激活强度作为判定模糊规则是否增加的依据；其次，在生成新的模糊规则的基础上，以隶属函数层输出量作为判定模糊集是否增加的依据；最后，利用梯度下降算法调整模型的权值参数和高斯函数的中心值和宽度，获得一种自组织T‑S模糊递归神经网络，并基于SOTSFEN建立了SVI的在线软测量模型，实现了SVI的实时检测，为预防污泥膨胀提供了一种有效方法。     " name="description"/>
<meta content="patent" name="DC.type"/>
<meta content="  Sludge settling index Soft tissue measurements from t-s Fuzzy Neural Network  " name="DC.title"/>
<meta content="2013-11-12" name="DC.date" scheme="dateSubmitted"/>
<meta content="    基于自组织T‑S模糊神经网络的污泥沉降指数软测量方法既属于控制领域，又属于污水处理领域。   污泥沉降指数SVI的

In [21]:
# Search by CSS class. The filter must be passed as `attrs`: a dict given
# as the first positional argument is interpreted as a tag-name filter,
# so the class value would never be compared against anything.
soup.find_all(attrs={'class': 'righthead style-scope patent-result'})

[]

In [3]:
# Repeat the fetch-and-parse check, this time using the link stored in
# the first row of the search results.
url = search_results.at[0, 'result link']
r = requests.get(url)
soup = BeautifulSoup(markup=r.content, features='lxml')
soup.h1.text

'US5828812A - Recurrent neural network-based fuzzy logic system and method \n        - Google Patents'

In [4]:
# Grab the first <div> carrying the "abstract" class and show its text
# with leading/trailing whitespace removed.
abstract = soup.find('div', class_='abstract')
abstract.get_text().strip()

'A recurrent, neural network-based fuzzy logic system includes in a rule base layer and a membership function layer neurons which each have a recurrent architecture with an output-to-input feedback path including a time delay element and a neural weight. Further included is a recurrent, neural network-based fuzzy logic rule generator wherein a neural network receives and fuzzifies input data and provides data corresponding to fuzzy logic membership functions and recurrent fuzzy logic rules.'

In [37]:
def parse_abstract(url, timeout=30):
    """Download a patent page and return the text of its abstract.

    Parameters
    ----------
    url : str
        Address of the patent page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.
        Without a timeout a single unresponsive server can block the
        caller indefinitely.

    Returns
    -------
    str or None
        The whitespace-stripped text of the first ``<div class="abstract">``
        element. ``None`` is returned when the request fails, when the page
        contains an ``<aside>`` element (used here to filter translations),
        or when no abstract element is present.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # One unreachable page must not abort a long batch download;
        # report it the same way as a page without an abstract.
        return None

    soup = BeautifulSoup(r.content, 'lxml')

    # Filter translations
    if soup.find('aside') is not None:
        return None

    # Check for a missing element explicitly rather than catching every
    # exception, so unrelated errors are not silently hidden.
    abstract = soup.find('div', {'class': 'abstract'})
    if abstract is None:
        return None
    return abstract.text.strip()

In [32]:
from IPython import display

In [33]:
# Render the representative figure linked from the first search result.
image_url = search_results.at[0, 'representative figure link']
display.Image(url=image_url)

In [34]:
from tqdm import tqdm_notebook
from timeit import default_timer as timer

In [35]:
titles = search_results['title']
urls = search_results['result link']

results = []
n_urls = len(urls)
start = timer()

# Download the abstracts one page at a time, pairing each with its title.
# Entries whose abstract could not be retrieved hold None.
for i, (title, url) in enumerate(zip(titles, urls), start=1):
    results.append((title, parse_abstract(url)))
    # Report progress after the page is processed and count from 1, so
    # the indicator reflects completed work and finishes at 100%.
    print(f'{round(100 * i / n_urls, 2)}% complete.', end = '\r')
end = timer()

99.87% complete.

In [36]:
# Drop every entry whose second element (the abstract) is None, then
# report how many remain and how long the download took.
results = list(filter(lambda pair: pair[1] is not None, results))
print(f'Found {len(results)} patent abstracts in {round(end - start)} seconds.')

Found 556 patent abstracts in 435 seconds.


In [12]:
# from multiprocessing.dummy import Pool
# import tqdm

# pool = Pool(50)
# r = tqdm_notebook(pool.imap_unordered(parse_abstract, urls), total = len(urls))
# pool.close()
# pool.join()

In [38]:
%%capture
import json
# Save the results as newline-delimited JSON: each entry is serialised
# on its own line.
with open('../data/found_tech_patents.ndjson', 'w') as fout:
    fout.writelines(json.dumps(record) + '\n' for record in results)

In [39]:
# Render the representative figure linked from the row labelled 6.
image_url = search_results.at[6, 'representative figure link']
display.Image(url=image_url)