In [1]:
import os
import pickle
import subprocess
import urllib
import urllib.request
import xml.etree.cElementTree as ET
import zipfile

import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree

In [2]:
# Work from /work/: download_applications() writes to the relative path
# 'data/apps/<year>/', which is resolved against the current directory.
os.chdir('/work/')

# 1. Data Download


In [4]:
def download_applications(year):
    """Download every bulk application zip archive listed for ``year``.

    Fetches the USPTO red-book full-text index page for the year, collects
    every link whose ``href`` contains ``.zip`` and saves each archive into
    ``data/apps/<year>/`` (relative to the current working directory),
    creating that directory when it does not exist yet.

    Parameters
    ----------
    year : int or str
        Year whose index page is scraped; it is inserted into the URL and
        the target directory name via ``str(year)``.

    Raises
    ------
    requests.HTTPError
        If the index page request returns an error status.
    """
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko"
               "/20100101 Firefox/57.0"}

    lnk = ('https://bulkdata.uspto.gov/data/patent/application/redbook/fulltext/'
           + str(year) + '/')
    rr = requests.get(lnk, headers=headers)
    # Fail loudly instead of scraping an error page and downloading nothing.
    rr.raise_for_status()
    soup = BeautifulSoup(rr.content, "html.parser")
    # Anchors without an href attribute yield None; treat them as ''.
    urls = [a.get('href') for a in soup.find_all('a')
            if '.zip' in (a.get('href') or '')]

    target_dir = 'data/apps/' + str(year) + '/'
    os.makedirs(target_dir, exist_ok=True)
    for x in urls:
        urllib.request.urlretrieve(lnk + x, target_dir + x)

In [4]:
year = 2017
# The download step is disabled here; uncomment to fetch the archives for `year`.
#download_applications(year)
# Remember the current directory, then switch into this year's data folder
# (split_files() operates on the current working directory).
olddir = os.getcwd()
newdir = '/work/data/apps/' + str(year) + '/'
os.chdir(newdir)

# 2. Split Files

In [3]:
def unzip_file(fnm):
    """Extract a zip archive into the current directory.

    Parameters
    ----------
    fnm : str
        Path of the ``.zip`` archive to extract.

    Returns
    -------
    str
        The archive's base name (text after the last ``/``) with ``.zip``
        replaced by ``.xml``.
    """
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(fnm, 'r') as zip_ref:
        zip_ref.extractall('.')
    return fnm.split('/')[-1].replace('.zip', '.xml')

def split_files():
    """Unpack and split every ``.zip`` archive in the current directory.

    For each archive: extract it with ``unzip_file``, print the expected XML
    file name, run ``/work/split_apps.sh`` on that name and, once the script
    has succeeded, delete the archive.

    Raises
    ------
    subprocess.CalledProcessError
        If the split script exits with a non-zero status. The archive is
        left in place in that case so the step can be retried.
    """
    for fnm in [x for x in os.listdir() if x.split('.')[-1] == 'zip']:
        fnm_xml = unzip_file(fnm)
        print(fnm_xml)
        # Argument list instead of a shell line: the file name is passed to
        # the script verbatim and is never parsed by a shell.
        proc = subprocess.run(['bash', '/work/split_apps.sh', fnm_xml],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT,
                              universal_newlines=True)
        # Echo the script's output so it still shows up in the cell output.
        print(proc.stdout, end='')
        proc.check_returncode()
        # Only reached when the split succeeded.
        os.remove(fnm)

# 3. Metadata Extraction

In [4]:
def extract_singles_from_xpath(tree, xpath):
    """Return the text of the first node matched by ``xpath``.

    The result is every string yielded by the first match's ``itertext()``
    joined together, or ``''`` when the query matches nothing.
    """
    matches = tree.xpath(xpath)
    if matches != []:
        first = matches[0]
        return ''.join(first.itertext())
    return ''

def extract_dict_from_xpath(tree, xpath):
    """Return the children of the first node matched by ``xpath`` as a dict.

    Each direct child contributes ``tag -> text``. The extra key ``'len'``
    holds the total number of nodes the query matched. An empty dict is
    returned when nothing matches.
    """
    matches = tree.xpath(xpath)
    if matches == []:
        return {}

    fields = {}
    for child in matches[0].iterchildren():
        fields[child.tag] = child.text
    fields['len'] = len(matches)
    return fields

def extract_all_singles_from_xpath(tree, xpaths):
    """Run ``extract_singles_from_xpath`` for several queries at once.

    ``xpaths`` maps XPath expression -> output key; the returned dict maps
    each output key to the text extracted for its expression.
    """
    result = {}
    for path, label in xpaths.items():
        result[label] = extract_singles_from_xpath(tree, path)
    return result
    
def extract_all_xpath_dicts(tree, xpaths):
    """Run ``extract_dict_from_xpath`` for several queries and merge the results.

    ``xpaths`` maps XPath expression -> prefix. Every key produced for an
    expression is stored in the merged dict as ``<prefix>_<key>``.
    """
    merged = {}
    for path, prefix in xpaths.items():
        for name, value in extract_dict_from_xpath(tree, path).items():
            merged[prefix + '_' + name] = value
    return merged

In [5]:
def get_details_from_xml(fnm):
    """Parse one application XML file and return its details as a flat dict.

    The dict holds the text found at the assignee, title and us-applicant
    paths, the ``app_*`` / ``pub_*`` entries built from the application and
    publication ``document-id`` elements, and the input path under ``'fnm'``.
    """
    bib = '/us-patent-application/us-bibliographic-data-application/'

    # XPath -> output key, for values read as plain text.
    single_paths = {
        bib + 'assignees/assignee/addressbook/orgname': 'assignee',
        bib + 'invention-title': 'title',
        bib + 'us-parties/us-applicants/us-applicant/addressbook/orgname':
            'us-applicant',
    }
    # XPath -> key prefix, for elements expanded child by child.
    dict_paths = {
        bib + 'application-reference/document-id': 'app',
        bib + 'publication-reference/document-id': 'pub',
    }

    doc = etree.parse(fnm)
    details = extract_all_singles_from_xpath(doc, single_paths)
    details.update(extract_all_xpath_dicts(doc, dict_paths))
    details['fnm'] = fnm
    return details

def get_details_from_folder(file_dir):
    """Collect the details of every file in ``file_dir`` into a DataFrame.

    Parameters
    ----------
    file_dir : str
        Directory whose entries are each passed to ``get_details_from_xml``.
        A trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        One row per directory entry, built from the dicts returned by
        ``get_details_from_xml``.
    """
    # os.path.join gives the same path as plain concatenation when file_dir
    # ends with a separator, and a correct one when it does not.
    records = [get_details_from_xml(os.path.join(file_dir, name))
               for name in os.listdir(file_dir)]
    return pd.DataFrame(records)

In [6]:
def get_details_from_all_folders(basedir, year):
    """Extract details from every sub-folder of ``basedir`` and save them as CSV.

    Parameters
    ----------
    basedir : str
        Directory whose sub-directories are each processed with
        ``get_details_from_folder``.
    year : int or str
        Used only to name the output file
        ``/work/data/apps/<year>_all_apps.csv``.

    Returns
    -------
    pandas.DataFrame
        The concatenation of all per-folder frames (also written to the CSV).
    """
    folders = [os.path.join(basedir, x) for x in os.listdir(basedir)]
    # Skip plain files: listing a non-directory would raise inside
    # get_details_from_folder.
    frames = [get_details_from_folder(folder + '/')
              for folder in folders if os.path.isdir(folder)]
    df = pd.concat(frames)
    df.to_csv('/work/data/apps/' + str(year) + '_all_apps.csv')
    return df
    

In [7]:
def make_year(year):
    """Run the full pipeline for one year: download, split, extract.

    Changes to ``/work/``, downloads the year's archives, changes into
    ``/work/data/apps/<year>/``, splits the archives there and extracts the
    details of all resulting folders. The working directory is left at the
    year's data folder.

    Parameters
    ----------
    year : int or str
        Year to process.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``get_details_from_all_folders``.
    """
    os.chdir('/work/')
    download_applications(year)
    newdir = '/work/data/apps/' + str(year) + '/'
    print(newdir)
    os.chdir(newdir)
    split_files()
    # Keep the result: the original discarded it and returned an undefined name.
    meta_df = get_details_from_all_folders(newdir, year)
    return meta_df

In [8]:
def dsplit_year(year):
    """Download one year's archives and change into its data folder.

    Changes to ``/work/``, downloads the archives for ``year``, prints the
    year's data folder ``/work/data/apps/<year>/`` and makes it the working
    directory. The archives are not split here.

    Parameters
    ----------
    year : int or str
        Year to download.
    """
    os.chdir('/work/')
    download_applications(year)
    newdir = '/work/data/apps/' + str(year) + '/'
    print(newdir)
    os.chdir(newdir)

In [10]:
# Years 2018 down to 2006, newest first.
list(range(2018, 2005, -1))

[2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006]

In [None]:
%%time
# Re-extract the details for each year from 2018 down to 2006; every pass
# writes that year's CSV via get_details_from_all_folders.
for y in range(2018, 2005, -1):
    newdir = '/work/data/apps/' + str(y) + '/'
    os.chdir(newdir)
    get_details_from_all_folders(newdir, y)

In [None]:
# Download the archives for 2012 down to 2005, newest first.
[dsplit_year(x) for x in range(2012, 2004, -1)]

/work/data/apps/2012/
/work/data/apps/2011/
/work/data/apps/2010/
/work/data/apps/2009/


In [None]:
%%time
# Full pipeline for 2018 down to 2005, newest first; one result per year.
outdfs = [make_year(x) for x in range(2018, 2004, -1)]

In [14]:
# Years 2015 down to 2005, newest first.
list(range(2015, 2004, -1))

[2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005]

# 4. Looking at the doc-numbers

In [None]:
from lxml import etree

In [None]:
def extract_singles_from_xpath(tree, xpath):
    """Return the text of the first node matched by ``xpath``.

    The result is every string yielded by the first match's ``itertext()``
    joined together, or ``''`` when the query matches nothing.

    Note: this repeats the definition from the "Meta Data Extraction"
    section; running this cell rebinds the name to this copy.
    """
    matches = tree.xpath(xpath)
    if matches != []:
        first = matches[0]
        return ''.join(first.itertext())
    return ''

def extract_dict_from_xpath(tree, xpath):
    """Return the children of the first node matched by ``xpath`` as a dict.

    Each direct child contributes ``tag -> text``. The extra key ``'len'``
    holds the total number of nodes the query matched. An empty dict is
    returned when nothing matches.

    Note: this repeats the definition from the "Meta Data Extraction"
    section; running this cell rebinds the name to this copy.
    """
    matches = tree.xpath(xpath)
    if matches == []:
        return {}

    fields = {}
    for child in matches[0].iterchildren():
        fields[child.tag] = child.text
    fields['len'] = len(matches)
    return fields

def extract_all_singles_from_xpath(tree, xpaths):
    """Run ``extract_singles_from_xpath`` for several queries at once.

    ``xpaths`` maps XPath expression -> output key; the returned dict maps
    each output key to the text extracted for its expression.

    Note: this repeats the definition from the "Meta Data Extraction"
    section; running this cell rebinds the name to this copy.
    """
    result = {}
    for path, label in xpaths.items():
        result[label] = extract_singles_from_xpath(tree, path)
    return result
    
def extract_all_xpath_dicts(tree, xpaths):
    """Run ``extract_dict_from_xpath`` for several queries and merge the results.

    ``xpaths`` maps XPath expression -> prefix. Every key produced for an
    expression is stored in the merged dict as ``<prefix>_<key>``.

    Note: this repeats the definition from the "Meta Data Extraction"
    section; running this cell rebinds the name to this copy.
    """
    merged = {}
    for path, prefix in xpaths.items():
        for name, value in extract_dict_from_xpath(tree, path).items():
            merged[prefix + '_' + name] = value
    return merged

In [None]:
def get_details_from_xml(fnm):
    """Parse one application XML file and return its details as a flat dict.

    The dict holds the text found at the assignee, title and us-applicant
    paths, the ``app_*`` / ``pub_*`` entries built from the application and
    publication ``document-id`` elements, and the input path under ``'fnm'``.

    Note: this repeats the definition from the "Meta Data Extraction"
    section; running this cell rebinds the name to this copy.
    """
    bib = '/us-patent-application/us-bibliographic-data-application/'

    # XPath -> output key, for values read as plain text.
    single_paths = {
        bib + 'assignees/assignee/addressbook/orgname': 'assignee',
        bib + 'invention-title': 'title',
        bib + 'us-parties/us-applicants/us-applicant/addressbook/orgname':
            'us-applicant',
    }
    # XPath -> key prefix, for elements expanded child by child.
    dict_paths = {
        bib + 'application-reference/document-id': 'app',
        bib + 'publication-reference/document-id': 'pub',
    }

    doc = etree.parse(fnm)
    details = extract_all_singles_from_xpath(doc, single_paths)
    details.update(extract_all_xpath_dicts(doc, dict_paths))
    details['fnm'] = fnm
    return details
    

In [None]:
from tqdm import tqdm

In [None]:
def get_details_from_folder(file_dir):
    """Collect the details of every file in ``file_dir`` into a DataFrame.

    Note: this repeats the definition from the "Meta Data Extraction"
    section; running this cell rebinds the name to this copy.

    Parameters
    ----------
    file_dir : str
        Directory whose entries are each passed to ``get_details_from_xml``.
        A trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        One row per directory entry, built from the dicts returned by
        ``get_details_from_xml``.
    """
    # os.path.join gives the same path as plain concatenation when file_dir
    # ends with a separator, and a correct one when it does not.
    records = [get_details_from_xml(os.path.join(file_dir, name))
               for name in os.listdir(file_dir)]
    return pd.DataFrame(records)

In [None]:
%%time
# Timed run on a single folder of the 2017 data.
all_df = get_details_from_folder('/work/data/apps/2017/ipa170202/')

In [None]:
basedir = '/work/data/apps/2017/'
# Process each sub-directory of the 2017 folder. Plain files are skipped,
# because listing a non-directory inside get_details_from_folder would raise.
dfs = [get_details_from_folder(basedir + x + '/') for x in os.listdir(basedir)
       if os.path.isdir(basedir + x)]
# Combine the per-folder frames and store them as a single CSV.
pd.concat(dfs).to_csv('/work/data/apps/2017_all_apps.csv')

In [None]:
# Reload the combined 2017 CSV. The file was written with its index, so the
# first column is that saved index; .iloc[:, 1:] drops it.
all_apps = pd.read_csv('/work/data/apps/2017_all_apps.csv', low_memory=False).iloc[:, 1:]
# Preview the first rows.
all_apps.head()