# Introduction: Downloading Patents

The purpose of this notebook is to download patent abstracts. Starting from a saved query that includes the links to the patents, we download the abstracts and save them for use in machine learning and deep learning.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
import os
# List the files in the patent search directory that are available for processing.
os.listdir('../data/patent_search/')

['gp-search-autonomous-systems.csv',
 'gp-search-artificial-intelligence.csv',
 'gp-search-deep-neural-networks.csv',
 'gp-search-thinking-machines.csv',
 'gp-search-neural-networks.csv',
 'gp-search-machine-learning.csv',
 'gp-search-deep-learning.csv',
 'gp-search-computer-science.csv',
 'gp-search-information-processing.csv',
 'gp-search-recurrent-neural-networks.csv']

In [4]:
# Load one search-result export. skiprows = [0] drops the first line of the file
# (presumably a preamble line that precedes the real header row; confirm against the raw CSV).
search_results = pd.read_csv('../data/patent_search/gp-search-autonomous-systems.csv', 
                             skiprows = [0])
search_results.head()

Unnamed: 0,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link
0,US-6914886-B2,Controlling traffic on links between autonomou...,Radware Ltd.,"Amir Peles, Smadar Fuks",2001-05-03,2001-05-03,2005-07-05,2005-07-05,https://patents.google.com/patent/US6914886B2/en,https://patentimages.storage.googleapis.com/51...
1,US-2008129825-A1,Autonomous Systems And Methods For Still And M...,"Lynx System Developers, Inc.","Douglas J. DeAngelis, Kirk Sigel, Edward Evansen",2006-12-04,2007-12-04,2008-06-05,,https://patents.google.com/patent/US2008012982...,https://patentimages.storage.googleapis.com/US...
2,US-2010066676-A1,Gestural Control of Autonomous and Semi-Autono...,"Oblong Industries, Inc.","Kwindla Hultman Kramer, Tom White, Mattie Ruth...",2006-02-08,2009-09-10,2010-03-18,,https://patents.google.com/patent/US2010006667...,
3,US-5046022-A,Tele-autonomous system and method employing ti...,The Regents Of The University Of Michigan,"Lynn A. Conway, Richard A. Volz, Michael W. Wa...",1988-03-10,1988-03-10,1991-09-03,1991-09-03,https://patents.google.com/patent/US5046022A/en,https://patentimages.storage.googleapis.com/pa...
4,US-7076559-B1,"System, device, and method for establishing la...",Nortel Networks Limited,"Anoop Ghanwani, Andre Fredette, Naganand Doras...",1999-12-28,1999-12-28,2006-07-11,2006-07-11,https://patents.google.com/patent/US7076559B1/en,https://patentimages.storage.googleapis.com/US...


In [7]:
# Fetch the page of the first search result and parse its HTML with the lxml parser.
url = search_results.loc[0, 'result link']
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
# Show the text of the first <h1> as a quick check of which page was retrieved.
soup.find('h1').text

'US6914886B2 - Controlling traffic on links between autonomous systems \n        - Google Patents'

In [8]:
# Locate the first <div class="abstract"> element and show its text without surrounding whitespace.
abstract = soup.find("div",  {"class": 'abstract'})
abstract.text.strip()

'The present invention provides for controlling incoming traffics on the links to an autonomous system. Incoming traffic usage for blocks of IP addresses within an autonomous system and load, congestion and capacity of the links for the incoming traffic is monitored to determine the optimal link for incoming traffic destined for a block of IP addresses. Incoming traffic for a block of IP addresses is biased towards the optimal link by configuring the border routers to announce the block of IP addresses via Border Gateway Protocol (BGP) across the non-optimal links with one or more local AS numbers pre-pended, causing the non-optimal links to look as if they are of a greater routing distance than the optimal link. In addition, outgoing traffic for a session is separately controlled by tagging the packets of the session for a specific link, causing the router to send the packet out the optimal link.'

In [9]:
# Look for an <aside> element on this page; parse_abstract uses its presence to filter out translations.
soup.find('aside')

In [18]:
def parse_abstract(url_title, timeout = 30):
    """Download one patent page and extract its abstract.

    Parameters
    ----------
    url_title : tuple
        Pair of (url, title): the link to the patent page and the patent title.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request, so a
        single stalled connection cannot block a worker thread indefinitely.

    Returns
    -------
    tuple
        (abstract, title). `abstract` is None when the page could not be
        downloaded, when the page contains an <aside> element (treated as a
        translation), or when no <div class="abstract"> element is found.
    """
    url, title = url_title[0], url_title[1]

    # A failed download is reported as a missing abstract rather than raised,
    # so that one bad link does not abort a whole batch.
    try:
        r = requests.get(url, timeout = timeout)
    except requests.exceptions.RequestException:
        return (None, title)

    # Create soup
    soup = BeautifulSoup(r.content, 'lxml')

    # Filter translations
    if soup.find('aside') is not None:
        return (None, title)

    # Pages without an abstract element yield no abstract
    abstract_div = soup.find('div', {'class': 'abstract'})
    if abstract_div is None:
        return (None, title)

    return (abstract_div.text.strip(), title)

In [11]:
from IPython import display

# Render the representative figure of the first search result inline, loaded from its URL.
image_url = search_results.loc[0, 'representative figure link']
display.Image(url = image_url)

In [22]:
# Pair each result link with its title, in the row order of the search results
urls = list(search_results['result link'])
titles = list(search_results['title'])

url_with_titles = list(zip(urls, titles))
len(url_with_titles)

677

In [23]:
# Inspect the first (url, title) pair.
url_with_titles[0]

('https://patents.google.com/patent/US6914886B2/en',
 'Controlling traffic on links between autonomous systems ')

In [24]:
from tqdm import tqdm_notebook
from timeit import default_timer as timer
from multiprocessing.dummy import Pool

# Download all abstracts concurrently with a pool of 20 threads.
results = []
start = timer()

# The context manager shuts the pool down even if a download raises.
with Pool(20) as pool:
    # Results arrive in completion order; counting from 1 reports the share of
    # finished downloads, so the final update reads 100%.
    for i, r in enumerate(pool.imap_unordered(parse_abstract, url_with_titles), start = 1):
        results.append(r)
        # round(value, 2): the closing parenthesis of the percentage must come
        # before the precision argument, otherwise round() receives a tuple.
        print(f'{round(100 * (i / len(url_with_titles)), 2)}% complete.', end = '\r')
end = timer()

99.85228951255539% complete...

In [None]:
# parse_abstract returns (abstract, title) with the abstract set to None on failure,
# so keep only the entries whose abstract (index 0) was actually retrieved.
results = [r for r in results if r[0] is not None]
print(f'Found {len(results)} patent abstracts in {round(end - start)} seconds.')

In [26]:
# Name the output after the file that was loaded into search_results above.
# os.listdir returns entries in arbitrary order, so indexing its first element
# is not guaranteed to pick that same file.
file = 'gp-search-autonomous-systems.csv'
# Strip the 'gp-search-' prefix and the '.csv' extension to get the search term
search_term = file.split('gp-search-')[-1].split('.csv')[0]
search_term

'autonomous-systems'

In [32]:
import json

# Save the results as newline-delimited JSON: one JSON-encoded entry per line
with open(f'../data/patents_parsed/{search_term}_abstracts.ndjson', 'w') as fout:
    fout.writelines(json.dumps(entry) + '\n' for entry in results)

In [None]:
# Iterate through all files
for j, file in enumerate(os.listdir('../data/patent_search/')):
    
    # Read in file
    search_results = pd.read_csv(f'../data/patent_search/{file}', skiprows = [0])
    
    # Determine search term name
    search_term = file.split('gp-search-')[-1].split('.csv')[0]
    print(f'Processing {search_term}.')
    
    # Create list of tuples of urls and titles
    urls = list(search_results.loc[:, 'result link'])
    titles = list(search_results.loc[:, 'title'])
    url_with_titles = [(url, title) for url, title in zip(urls, titles)]
    
    # New list for results
    results = []
    start = timer()
    
    # Pool with threads
    pool = Pool(20)
    
    # Iterate through the urls and titles
    for i, r in enumerate(pool.imap_unordered(parse_abstract, url_with_titles)):
        results.append(r)
        # Progress
        print(f'{round(100 * (i / len(url_with_titles)), 2)}% complete.', end = '\r')
    pool.close()
    pool.join()
    end = timer()
    
    # Results
    results = [r for r in results if r[1] is not None]
    
    # Save results
    with open(f'../data/patents_parsed/{search_term}_abstracts.ndjson', 'w') as fout:
        # Iterate through each list element
        for l in results:
            # Write to file
            fout.write(json.dumps(l) + '\n')
            
    # Progress updates
    print(f'Found {len(results)} patent abstracts in {round(end - start)} seconds.')
    print(f'{j + 1} files processed.')

Processing autonomous-systems.
Found 677 patent abstracts in 1078 seconds.
1 files processed.
Processing artificial-intelligence.
Found 750 patent abstracts in 200 seconds.
2 files processed.
Processing deep-neural-networks.
Found 700 patent abstracts in 193 seconds.
3 files processed.
Processing thinking-machines.
Found 659 patent abstracts in 395 seconds.
4 files processed.
Processing neural-networks.
34.9% complete..

In [None]:
# Display the representative figure for the row labelled 6 of search_results.
# NOTE: search_results is rebound for each file by the processing loop above, so this
# shows a row from whichever file was read last.
image_url = search_results.loc[6, 'representative figure link']
display.Image(url = image_url)