In [2]:
import pandas as pd
import numpy as np
import bs4
from lxml import etree
import re
from os import walk
import pickle

In [3]:
# Output path templates. The "{}" placeholder is filled in with str.format
# using the patent id when each file is written.
claims_filepath = "./data/pat_claims/{}.txt"  # destination for extracted claims text
text_filepath = "./data/pat_text/{}.txt"      # destination for extracted description text

In [6]:
def _first_or_nan(nodes):
    """Return the first XPath match in ``nodes``, or NaN when nothing matched."""
    return nodes[0] if nodes else np.nan


def _heading_count(dom, heading_id):
    """Return the first integer found in the text of ``<h3 id=heading_id>``.

    Returns 0 when the heading is absent, has no direct text, or contains no
    digits. The result is always an ``int`` so the resulting DataFrame column
    has a single, consistent type.
    """
    try:
        heading = dom.xpath(f"//h3[@id='{heading_id}']")[0]
        return int(re.search('[0-9]+', heading.text).group())
    except (IndexError, TypeError, AttributeError):
        # IndexError: no such heading; TypeError: heading.text is None;
        # AttributeError: re.search found no digits and returned None.
        return 0


filenames = next(walk("./data/googpat_pages"), (None, None, []))[2]  # [] if no file
pats_dat = {}
for cur_file in filenames:
    # Skip stray non-HTML entries so they neither crash the run nor cause a
    # page to be parsed twice.
    if not cur_file.endswith(".html"):
        continue
    print(cur_file)
    patid = cur_file.split(".")[0]

    # Parse page. The context manager closes the handle once BeautifulSoup has
    # consumed it; the soup is then re-serialised and handed to lxml for XPath.
    with open(f'./data/googpat_pages/{patid}.html', 'rb') as page_file:
        html_soup = bs4.BeautifulSoup(page_file, 'html.parser')
    dom = etree.HTML(str(html_soup))

    # Write out text and claims (all text nodes joined with single spaces).
    pat_text = text_filepath.format(patid)
    with open(pat_text, 'wb') as outfile:
        outfile.write(" ".join(dom.xpath("//patent-text[@name='description']//text()")).encode())
    pat_claims = claims_filepath.format(patid)
    with open(pat_claims, 'wb') as outfile:
        outfile.write(" ".join(dom.xpath("//patent-text[@name='claims']//text()")).encode())

    # Get cpcs and their descriptions.
    cpcs = [node.text for node in
            dom.xpath("//concept-mention[contains(@class,'classification-tree')]//a")]
    # Drop spans whose text is exactly a newline; spans with no text (None)
    # are kept, as before.
    cpc_desc = [node.text for node in
                dom.xpath("//concept-mention[contains(@class,'classification-tree')]//span")
                if node.text != '\n']

    # Citations data: integer counts taken from the section headings.
    citations_cnt = _heading_count(dom, 'patentCitations')
    citedby_cnt = _heading_count(dom, 'citedBy')

    # Assignee data: the last <dd> text node of the "important-people" list.
    # A single match is deliberately treated as missing (same rule as before).
    people_text = dom.xpath("//section[contains(@class, 'knowledge-card')]//dl[contains(@class, 'important-people')]//dd//text()")
    cur_assignee = people_text[-1].strip() if len(people_text) > 1 else np.nan

    # Status
    status = _first_or_nan(
        dom.xpath("//section[contains(@class, 'knowledge-card')]//div[@current]//span//text()"))

    # Grant date
    grant = _first_or_nan(
        dom.xpath("//section[contains(@class, 'knowledge-card')]//div[contains(@class, 'granted') and @date]//text()"))

    pats_dat[patid] = {'text_path': pat_text,
                       'claims_path': pat_claims,
                       'cited_by': citedby_cnt,
                       'citations': citations_cnt,
                       'cur_assignee': cur_assignee,
                       'status': status,
                       'grantdt': grant,
                       'cpcs': cpcs,
                       'cpc_desc': cpc_desc}

# One row per patent, indexed by patent id.
pats_df = pd.DataFrame.from_dict(pats_dat, orient='index')

AU1271101A.html
AU1520901A.html
AU1524001A.html
AU1763800A.html
AU2001261358B2.html
AU2001279159A2.html
AU2002210952B2.html
AU2002352247B2.html
AU2002356733A1.html
AU2003217606B8.html
AU2003236646B2.html
AU2003293675B2.html
AU2004220736A1.html
AU2004260061A1.html
AU2004280636A1.html
AU2005248981C1.html
AU2005263416B2.html
AU2006275261A1.html
AU2007203090A1.html
AU2007203093A1.html
AU2007218960A1.html
AU2007219010A1.html
AU2007222842A1.html
AU2007222843A1.html
AU2007278791A1.html
AU2007304533B2.html
AU2008209280A1.html
AU2008209282A1.html
AU2008239833A2.html
AU2008247285A1.html
AU2008260731A1.html
AU2008288639A1.html
AU2008288645B2.html
AU2008291657A1.html
AU2009212099A1.html
AU2010303020B2.html
AU2011224057B2.html
AU2013200148A1.html
AU2013202753A1.html
AU2013204668A1.html
AU2013263855B2.html
AU2013366490B9.html
AU2013377774A1.html
AU2014100641A4.html
AU2014200798B2.html
AU2014345342B2.html
AU2015200833B2.html
AU2015331160A1.html
AU2016376965A1.html
AU2016380614B2.html
AU2016384265A1.h

In [7]:
# Promote the patent-id index to a regular 'patid' column.
# Guarded so that re-running this cell is a no-op: calling reset_index() a
# second time would raise ValueError because 'patid' already exists as a column.
if 'patid' not in pats_df.columns:
    pats_df = pats_df.rename_axis('patid').reset_index()

In [8]:
# Persist the assembled patents frame to disk as a pickle for later use.
with open('./data/pats_df.pkl', 'wb') as out_handle:
    pickle.dump(pats_df, out_handle)