In [129]:
import os
import sys
import re
#update with the path to your config file
sys.path.append("F:\Caitlin_PV\Config_File")
import pv_config as config

from lxml import etree
import csv
import unittest
from collections import defaultdict

## Clean Code

#### Helper Functions

In [130]:
def get_xml(valid_xml_file):
    '''
    Return a xml object with patent data
    :param valid_xml_file: a valid xml object
    :return the root object of the xml file, ready to parse
    '''
    tree = etree.parse(valid_xml_file)
    root = tree.getroot()
    return root

In [131]:
def process_patent_numbers(raw_patent_num):
    '''
    Helper function ot transform patent ids into thier final format
    :param raw_patent_num: patent number extracted from the raw XML
    :return cleaned patent id.
    '''
    num = re.findall('\d+', raw_patent_num)[0] #get the just numbers in string form
    if num[0].startswith("0"):
        num = num[1:]
        let = re.findall('[a-zA-Z]+', raw_patent_num) #get the letter prefixes
    if let:
        let = let[0]#list to string
        clean_patent_num = let + num
    else:
        clean_patent_num = num
    return clean_patent_num
    

In [132]:
def process_date(date): 
    '''
    Takes a date formated as 6 numbers and returns it with dashes and days that are 00 replaced with 01
    :params date: a date object formatted as 6 numbers
    :returns cleaned up date
    '''
    if date is not None:
        if date[6:] != "00":
            date = date[:4]+'-'+date[4:6]+'-'+date[6:]
        else:
            date = date[:4]+'-'+date[4:6]+'-'+'01'
    return date

In [133]:
def recursive_children(xml_element, parent_field=""):
    '''
    :params xml_element: xml object can be nested
    :params parent_field: parent of nested xml object
    :returns a dictionary of the tags and texts of nested xml object
    '''
    test_list = []
    if len(xml_element)==0:
        if parent_field:
            test_list.append((parent_field+"-"+xml_element.tag, xml_element.text))
        else:
            test_list.append((xml_element.tag, xml_element.text)) 
        #print xml_element.tag, xml_element.text
    else:
        parent_field = xml_element.tag
        for element in xml_element:
            test_list += recursive_children(element, parent_field)

    return test_list

#### Data Functions

In [134]:
def get_patent(patent):
    '''
    :params patent: take the xml object representing a patent
    :returns a default diction with all the data for the patent table
    '''
    #publication reference information
    patent_data = defaultdict(lambda x : "")
    pub_ref_data = patent.findall('.//publication-reference/document-id/')
    for field in pub_ref_data:
        patent_data[field.tag] = field.text
    patent_data['doc-number'] = process_patent_numbers(patent_data['doc-number'])
    patent_data['date'] = process_date(patent_data['date'])
    #abstract
    abst = patent.findall('abstract/p')
    if abst: #not all patents have abstracts
        #this grabs only the first a paragraph and doesn't overwrite it with none if there is a dangling 'None' paragraph
        patent_data['abstract'] = abst[0].text
    #title
    for patent in patents:
        title_data = patent.find('.//invention-title')
        if title_data is not None:
            patent_data['title'] = title_data.text
    return patent_data

In [135]:
def get_examiner(patent):
    '''
    :params patent: take the xml object representing a patent
    :returns a list of default dictionary for primary and assistant examiner
    '''
    #examiner information
    examiners_list=[]
    examiners_xml = patent.findall('.//examiners/') #this has both primary and assistant examiner objects
    for examiner in examiners_xml:
        examiners_data = defaultdict(lambda x : "")
        examiners_data["role"] = examiner.tag
        for field in examiner:
            examiners_data[field.tag] = field.text
        examiners_list.append(examiners_data)
    return examiners_list

In [293]:
def get_single_entity(patent, entity_name):
    '''
    :params patent: take the xml object representing a patent
    :params entity_name: a string with the xml tag for the entity
    :returns a default dictionary with all the data for the entity and processes dates
    '''
    data = defaultdict(lambda x : 0)
    xml = patent.findall('.//'+entity_name+"/")
    for field in xml:
        data = {}
        data[field.tag] = field.text
        for field in xml:
            results_list = recursive_children(field)
            data.update(dict(results_list))
        for key in data.keys():
            if 'date' in key:
                data[key] = process_date(data[key])
    return data

In [220]:
def get_multiple_entity(patent, entity_name, attribute_list=None):
    '''
    :params patent: take the xml object representing a patent
    :params entity_name: a string with the xml tag for an entity with multiple entities
    :returns a list of default dictionaries with all the data for the entity and processes dates
    '''
    var_list=[]
    xml = patent.findall('.//'+entity_name+"/")
    for field in xml:
        data={}
        if attribute_list:
            for attribute in attribute_list:
                data[attribute]=field.attrib[attribute]
        #recursive function modifies data dictionary defined above
        results_list = recursive_children(field)
        data.update(dict(results_list))
        for key in data.keys():
            if 'date' in key:
                data[key] = process_date(data[key])
        var_list.append(data)
    return var_list

### Processed Entities

In [57]:
#Single
#number-of-claims
#us-application-series code
#figures
#pct-or-regional-publishing-data
#us-term-of-grant
#pct-or-regional-filing-data

#Multiple
#us-exemplary-claim
#classifications-ipcr
#classification-national
#inventors
#us-references-cited
#assignees
#'us-applicants', attribute_list=['sequence', 'app-type', 'designation', 'applicant-authority-category'
#'inventors', attribute_list=['sequence', 'designation']
#'agents', attribute_list=['sequence', 'rep-type']
#us_related_docs


## To Run Process

In [138]:
patents = get_xml("D:\Caitlin_PV\Data\ipg171212_clean.xml")

## Working Area

In [143]:
#TODO
#get_multiple_entity(patent, 'priority-claim', attribute_list=['sequence', 'kind'])
#take out guts of function and debug


In [None]:
#Questions: RELAPP, description-of-drawings, BRFSUM, DETDESC: Sarah building

In [None]:
#still need to do:
#application-reference, invention-title, abstract, publication-reference, us-botanic

In [139]:
patents #is an xml-tree object that has all the patents an information
patents[104] #is the element that represnts the 104th patent 

<Element us-patent-grant at 0x12ce154c8>

In [12]:
#The main highest level grouings are things like 'abstract','figures', and 'us-bibliographic-data-grant
#'us-bibliographic-data-grant' has most of the fields we are working with

In [296]:
patent = patents[2010]

### PostProcessing

In [14]:
application_reference = get_single_entity(patent, 'application-reference/document-id/')
application_reference['date'] = process_date(application_reference['date'])

In [15]:
classification_icpr_data = get_multiple_entity(patent, 'classifications-ipcr')
print classification_icpr_data
for classification in classification_icpr_data:
    classification['action-date-date'] = process_date(classification['action-date-date'])
    classification['ipc-version-indicator-date'] = process_date(classification['ipc-version-indicator-date'])
print classification_icpr_data

[{'classification-ipcr-section': 'C', 'classification-ipcr-subclass': 'L', 'classification-ipcr-class': '08', 'classification-ipcr-main-group': '21', 'classification-ipcr-classification-data-source': 'H', 'classification-ipcr-classification-status': 'B', 'classification-ipcr-classification-level': 'A', 'classification-ipcr-symbol-position': 'F', 'ipc-version-indicator-date': '20060101', 'generating-office-country': 'US', 'action-date-date': '20171212', 'classification-ipcr-subgroup': '00', 'classification-ipcr-classification-value': 'I'}]
[{'classification-ipcr-section': 'C', 'classification-ipcr-subclass': 'L', 'classification-ipcr-class': '08', 'classification-ipcr-main-group': '21', 'classification-ipcr-classification-data-source': 'H', 'classification-ipcr-classification-status': 'B', 'classification-ipcr-classification-level': 'A', 'classification-ipcr-symbol-position': 'F', 'ipc-version-indicator-date': '2006-01-01', 'generating-office-country': 'US', 'action-date-date': '2017-12

### To do

In [None]:
#need to do claims
claims_data = get_multiple_entity(patent, 'claims/claim')
print claims_data

In [None]:
#classification-national post processing, line 556